using System.Collections.Generic;

// Handle to the CUDA-side LayerNorm instance and the reusable shape list.
long m_hLayerNorm = 0;
List<int> m_rgShape = new List<int>(4);

// Constructor: allocate the internal blobs used by the local computation path.
m_blobWork = new Blob<T>(cuda, log);
m_blobMu = new Blob<T>(cuda, log);
m_blobXmu = new Blob<T>(cuda, log);
m_blobXmuSq = new Blob<T>(cuda, log);
m_blobVar = new Blob<T>(cuda, log);
m_blobStdev = new Blob<T>(cuda, log);
m_blobStdevFull = new Blob<T>(cuda, log);
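The blob names mirror the intermediate terms of the standard layer-normalization forward pass. As a hedged reading of the excerpt (the epsilon term would come from the LayerNormParameter and is an assumption here), each sample of inner size D is normalized as

\mu = \frac{1}{D}\sum_i x_i, \qquad x_\mu = x - \mu, \qquad \sigma^2 = \frac{1}{D}\sum_i (x_i - \mu)^2, \qquad y = \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}}

so m_blobMu holds \mu, m_blobXmu holds x - \mu, m_blobXmuSq holds (x - \mu)^2, m_blobVar holds \sigma^2, m_blobStdev holds the per-sample standard deviation, and m_blobStdevFull holds that standard deviation broadcast back to the full input shape.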
// dispose: release the CUDA LayerNorm instance if one was created.
if (m_hLayerNorm != 0)
    m_cuda.FreeLayerNorm(m_hLayerNorm);
// setup_internal_blobs: register the internal blobs with the collection provided.
col.Add(m_blobStdev);
col.Add(m_blobStdevFull);
// LayerSetUp: warn when the debug-only passthrough mode is enabled.
m_log.WriteLine("WARNING: LayerNormLayer '" + m_param.name + "' is using passthrough mode which is only used when debugging.");
// Reshape: derive the outer, channel and inner dimensions from the bottom blob.
int nAxes = colBottom[0].num_axes;
m_nCount = colBottom[0].count();
m_nOuterNum = colBottom[0].num;
m_nChannels = (nAxes == 2) ? 1 : colBottom[0].channels;
m_nInnerNum = (nAxes == 2) ? colBottom[0].channels : colBottom[0].count(2);

// Re-create the CUDA LayerNorm instance when needed.
if (m_hLayerNorm == 0 || colBottom[0].count() != m_nCount || colBottom[0].num != m_nOuterNum || colBottom[0].channels != m_nChannels || colBottom[0].count(2) != m_nInnerNum)
{
    if (m_hLayerNorm != 0)
        m_cuda.FreeLayerNorm(m_hLayerNorm);

    int nGpuID = m_cuda.GetDeviceID();
    // ... the call that creates the CUDA LayerNorm instance is not shown in this excerpt ...

    if (m_hLayerNorm == 0)
        m_log.FAIL("Failed to create CUDA version LayerNorm!");
}

// Size the internal blobs; the per-sample statistics use an (outer, channels) shape.
m_blobWork.ReshapeLike(colBottom[0]);
m_rgShape.Add(m_nOuterNum);
m_rgShape.Add(m_nChannels);
m_blobStdev.Reshape(m_rgShape);
// forward: the fused CUDA LayerNorm kernel and the local fallback path.
m_cuda.LayerNormForward(m_hLayerNorm, colBottom[0].gpu_data, colTop[0].mutable_gpu_data);
forward_local(colBottom, colTop);

// forward_local: first step, compute the per-sample mean of the input into m_blobMu.
m_cuda.channel_mean(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, colBottom[0].gpu_data, m_blobMu.mutable_gpu_data);
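The excerpt shows both the fused kernel call and the local path without the surrounding control flow. A minimal sketch of the implied dispatch follows; the selection condition is an assumption (in the library it may be a LayerNormParameter setting rather than the handle check used here).

// Hedged sketch, not verbatim library code: use the fused CUDA kernel when a
// LayerNorm instance was created, otherwise fall back to the step-by-step local path.
if (m_hLayerNorm != 0)
    m_cuda.LayerNormForward(m_hLayerNorm, colBottom[0].gpu_data, colTop[0].mutable_gpu_data);
else
    forward_local(colBottom, colTop);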
// backward: gradients are only computed when propagation to the bottom is requested.
// Only selected statements of the method appear in this excerpt; the branching between
// them (e.g., the passthrough copy, the CUDA kernel, or the local path) is not shown.
if (rgbPropagateDown[0])
colBottom[0].CopyFrom(colTop[0], true);
m_cuda.LayerNormBackward(m_hLayerNorm, colTop[0].gpu_data, colTop[0].gpu_diff, colBottom[0].mutable_gpu_diff);
backward_local(colTop, rgbPropagateDown, colBottom);
// backward_local: gradient computed from the normalized output y (top data) and the
// incoming gradient dy (top diff). The fills of m_blobStdevFull between these calls
// (broadcasting the per-sample means back to the full shape) are not shown.
m_blobWork.ReshapeLike(colTop[0]);
m_cuda.mul(m_nCount, colTop[0].gpu_data, colTop[0].gpu_diff, m_blobWork.mutable_gpu_diff);                                // y * dy
m_cuda.channel_mean(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, m_blobWork.gpu_diff, m_blobVar.mutable_gpu_diff);    // per-sample mean(y * dy)
m_cuda.channel_mean(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, colTop[0].gpu_diff, m_blobStdev.mutable_gpu_diff);   // per-sample mean(dy)
m_cuda.mul(m_nCount, colTop[0].gpu_data, m_blobStdevFull.gpu_diff, m_blobWork.mutable_gpu_diff);                          // y * broadcast mean(y * dy)
m_cuda.add(m_nCount, m_blobWork.gpu_diff, m_blobStdevFull.gpu_diff, m_blobWork.mutable_gpu_diff);                         // + broadcast mean(dy)
m_cuda.sub(m_nCount, colTop[0].gpu_diff, m_blobWork.gpu_diff, m_blobWork.mutable_gpu_diff);                               // dy - (y * mean(y*dy) + mean(dy))
m_cuda.div(m_nCount, m_blobWork.gpu_diff, m_blobStdevFull.gpu_data, colBottom[0].mutable_gpu_diff);                       // divide by the broadcast stdev
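Taken together, and under the assumption stated above about the broadcast fills, these calls compute the standard layer-normalization gradient expressed through the normalized output y = (x - \mu)/\sigma and the incoming gradient dy:

\frac{\partial L}{\partial x} = \frac{1}{\sigma}\left( dy - \operatorname{mean}(dy) - y \odot \operatorname{mean}(y \odot dy) \right)

where the means are per-sample means over the normalized dimension: m_blobVar.diff receives mean(y \odot dy), m_blobStdev.diff receives mean(dy), and the final division by the broadcast standard deviation in m_blobStdevFull produces the bottom diff.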
The Log class provides general output in text form.
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
void FAIL(string str)
Causes a failure which throws an exception with the descriptive text.
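A small, hedged illustration of the two Log members, in the style of the excerpts above (the m_log field is documented in the Layer section below):

// WriteLine emits informational output; FAIL throws, aborting the current operation.
m_log.WriteLine("Creating the CUDA LayerNorm instance.");
if (m_hLayerNorm == 0)
    m_log.FAIL("Failed to create CUDA version LayerNorm!");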
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void ReshapeLike(BlobCollection< T > src)
Reshapes all blobs in the collection to the sizes of the source.
void CopyFrom(BlobCollection< T > bSrc, bool bCopyDiff=false)
Copy the data or diff from another BlobCollection into this one.
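A short, hedged sketch using only the BlobCollection members listed above (the default constructor is an assumption):

BlobCollection<T> col = new BlobCollection<T>();
col.Add(m_blobStdev);              // add blobs to the collection
col.Add(m_blobStdevFull);
int nCount = col.Count;            // number of blobs held
colBottom.CopyFrom(colTop, true);  // copy the diff (bCopyDiff = true) between collections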
The Blob is the main holder of data that moves through the Layers of the Net.
int channels
DEPRECATED; legacy shape accessor channels: use shape(1) instead.
long mutable_gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
DEPRECATED; use
void add_scalar(double dfVal)
Adds a scalar value to the Blob.
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
string Name
Get/set the name of the Blob.
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
int num
DEPRECATED; legacy shape accessor num: use shape(0) instead.
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
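A short, hedged sketch exercising the Blob members listed above (blob and variable names are illustrative):

Blob<T> b = new Blob<T>(cuda, log);     // constructed as in the LayerNormLayer excerpt
b.Name = "stdev";                       // get/set the blob name
b.Reshape(nOuterNum, nChannels, 1, 1);  // legacy 4-axis reshape
b.ReshapeLike(blobOther);               // or adopt another blob's shape
b.add_scalar(1e-10);                    // e.g., add a small epsilon to the data
long hData = b.gpu_data;                // GPU handles handed to CudaDnn kernels
long hDiff = b.mutable_gpu_diff;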
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
An interface for the units of computation which can be composed into a Net.
Log m_log
Specifies the Log for output.
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
bool shareLayerBlob(Blob< T > b, List< int > rgMinShape)
Attempts to share a Layer Blob if another parameter Blob with the same name and acceptable size is fo...
BlobCollection< T > m_colInternalBlobs
Specifies internal blobs used by the layer.
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
LayerParameter.LayerType m_type
Specifies the Layer type.
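A hedged illustration of how a derived layer such as LayerNormLayer uses these protected members (a sketch, not library code):

int nGpuID = m_cuda.GetDeviceID();    // m_cuda: the low-level CUDA connection
m_log.WriteLine("Setting up layer '" + m_param.name + "' on GPU " + nGpuID.ToString() + ".");
m_colInternalBlobs.Add(m_blobStdev);  // internal blobs registered with the base Layer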
The LayerNormalizationLayer performs layer normalization similar to the PyTorch LayerNorm layer; a usage sketch follows the member list below.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Computes the forward calculation.
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the error gradient w.r.t the inputs.
LayerNormLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The LayerNormalizationLayer constructor.
override void setup_internal_blobs(BlobCollection< T > col)
Derivative layers should add all internal blobs to the 'col' provided.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: norm
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override void dispose()
Releases all GPU and host resources used by the Layer.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
override int ExactNumBottomBlobs
Returns the exact number of required bottom (input) Blobs: data
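A minimal usage sketch built from the members listed above. It is hedged rather than verbatim library code: the LayerParameter constructor taking a LayerType, the blobData/blobNorm blobs, and the statement that the base Layer's public Forward/Backward wrappers drive the forward/backward overrides are assumptions.

// Describe and construct the layer.
LayerParameter p = new LayerParameter(LayerParameter.LayerType.LAYERNORM);
p.name = "ln1";
LayerNormLayer<float> layer = new LayerNormLayer<float>(cuda, log, p);

// One bottom (data) blob and one top (norm) blob, per ExactNumBottomBlobs/ExactNumTopBlobs.
BlobCollection<float> colBottom = new BlobCollection<float>();
colBottom.Add(blobData);
BlobCollection<float> colTop = new BlobCollection<float>();
colTop.Add(blobNorm);

layer.LayerSetUp(colBottom, colTop);
layer.Reshape(colBottom, colTop);
// During network execution the base Layer's public Forward/Backward calls invoke the
// forward/backward overrides documented above (assumption about the wrapper names).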
Specifies the base parameter for all layers.
string name
Specifies the name of this LayerParameter.
LayerNormParameter layer_norm_param
Returns the parameter set when initialized with LayerType.LAYERNORM
LayerType
Specifies the layer type.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
The MyCaffe.common namespace contains common MyCaffe classes.
DIR
Defines the direction of data flow.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...