using System.Collections.Generic;

List<int> m_rgShape = new List<int>() { 1, 1, 1, 1 };

double m_dfAttnDropout;
double m_dfResidDropout;
log.CHECK_EQ(m_nEmbed % m_nHeads, 0, "The embedding size must be divisible by the number of heads.");

if (m_dfAttnDropout > 0)
    ...
if (m_dfResidDropout > 0)
    ...
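The two conditions above guard the creation of the dropout sub-layers; the construction itself is elided from the listing. The sketch below is an assumption pieced together from members documented further down (Layer<T>.Create, convertLayerParam, dropout_param.dropout_ratio), not the verbatim source.

// Hedged sketch: creating the attention dropout sub-layer when the ratio is positive.
if (m_dfAttnDropout > 0)
{
    LayerParameter dropout = new LayerParameter(LayerParameter.LayerType.DROPOUT);
    dropout.dropout_param.dropout_ratio = m_dfAttnDropout;  // probability of zeroing a value
    m_attn_dropout = Layer<T>.Create(cuda, log, convertLayerParam(dropout, p), null);
}
// The residual dropout (m_dfResidDropout / m_resid_dropout) would follow the same pattern.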
// Internal blobs (names from the listing; roles inferred from the forward pass below):
// X0/X1/X2 hold the q, k and v inputs; Q/K/V hold the projected values; Qt/Kt/Vt hold
// the per-head transposed tensors; Kt1 holds K transposed for the score matmul;
// AttA/AttB hold the attention scores before and after the softmax; Work is scratch
// space and Y is the re-assembled output.
m_blobX0 = new Blob<T>(cuda, log);
m_blobX1 = new Blob<T>(cuda, log);
m_blobX2 = new Blob<T>(cuda, log);
m_blobQ = new Blob<T>(cuda, log);
m_blobK = new Blob<T>(cuda, log);
m_blobV = new Blob<T>(cuda, log);
m_blobQt = new Blob<T>(cuda, log);
m_blobKt = new Blob<T>(cuda, log);
m_blobKt1 = new Blob<T>(cuda, log);
m_blobVt = new Blob<T>(cuda, log);
m_blobAttA = new Blob<T>(cuda, log);
m_blobAttB = new Blob<T>(cuda, log);
m_blobWork = new Blob<T>(cuda, log);
m_blobY = new Blob<T>(cuda, log);
if (m_attn_dropout != null)
    ...
if (m_resid_dropout != null)
    ...

base.ReInitializeParameters(target);
// The two addInternal helpers route blobs through the internal bottom/top collections so
// the wrapped sub-layers can be called with the standard Setup/Reshape/Forward/Backward
// signatures. Only the bodies appear in the listing; the method signatures are assumed
// from the call sites.
private void addInternal(Blob<T> bottom, Blob<T> top)
{
    m_colInternalBottom.Clear();
    m_colInternalBottom.Add(bottom);

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}

private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
{
    m_colInternalBottom.Clear();

    for (int i = 0; i < rgBottom.Count; i++)
    {
        m_colInternalBottom.Add(rgBottom[i]);
    }

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}
// LayerSetUp: each head operates on m_nSize = m_nC / m_nHeads channels.
m_nSize = m_nC / m_nHeads;

// Set up the Q, K and V projections.
addInternal(m_blobX0, m_blobQ);
m_c_attnQ.Setup(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX1, m_blobK);
m_c_attnK.Setup(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX2, m_blobV);
m_c_attnV.Setup(m_colInternalBottom, m_colInternalTop);

m_rgShape[1] = m_nHeads;
m_rgShape[3] = m_nSize;

// Set up the transpose used to split the heads out into their own axis.
addInternal(m_blobQ, m_blobQt);
m_transpose.Setup(m_colInternalBottom, m_colInternalTop);

// The attention score blobs are (batch, heads, block, block).
m_blobAttA.Reshape(m_nB, m_nHeads, m_nBlockSize, m_nBlockSize);
m_blobAttB.Reshape(m_nB, m_nHeads, m_nBlockSize, m_nBlockSize);

addInternal(m_blobAttA, m_blobAttB);
m_softmax.Setup(m_colInternalBottom, m_colInternalTop);

if (m_attn_dropout != null)
{
    addInternal(m_blobAttB, m_blobAttB);
    m_attn_dropout.Setup(m_colInternalBottom, m_colInternalTop);
}

// Set up the output projection and, optionally, the residual dropout.
addInternal(m_blobY, colTop[0]);
m_c_proj.Setup(m_colInternalBottom, m_colInternalTop);

if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Setup(m_colInternalBottom, m_colInternalTop);
}
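The projection sub-layers wired above (m_c_attnQ/K/V and m_c_proj) are INNERPRODUCT layers; how they are constructed is not shown in the listing. The following is a hedged sketch using the inner_product_param fields documented below; the axis value and the filler type are illustrative assumptions.

// Hedged sketch (not the verbatim source): building the Q projection sub-layer.
LayerParameter ipQ = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT);
ipQ.inner_product_param.num_output = (uint)m_nEmbed;                    // project back to the embedding size
ipQ.inner_product_param.axis = 2;                                       // assumed: (B, T) are outer axes
ipQ.inner_product_param.bias_term = true;
ipQ.inner_product_param.weight_filler = new FillerParameter("xavier");  // assumed filler type
m_c_attnQ = Layer<T>.Create(cuda, log, convertLayerParam(ipQ, p), null);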
// Reshape: recompute the per-head size and resize the transposed blobs.
m_nSize = m_nC / m_nHeads;

m_rgShape[2] = m_nHeads;
m_rgShape[3] = m_nSize;

addInternal(m_blobK, m_blobKt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobQ, m_blobQt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobV, m_blobVt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);

m_rgShape[1] = m_nHeads;

// Note: width and height are intentionally swapped in the last two axes.
m_rgShape[0] = m_blobVt.num;
m_rgShape[2] = m_blobVt.width;
m_rgShape[3] = m_blobVt.height;

addInternal(m_blobWork, m_blobY);
m_transposeQ.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobY, colTop[0]);
m_c_proj.Reshape(m_colInternalBottom, m_colInternalTop);

if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Reshape(m_colInternalBottom, m_colInternalTop);
}
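To make the reshaping concrete, here is a worked shape trace with assumed example values; none of these numbers appear in the listing.

// Assumed: embed (m_nC) = 192, heads (m_nHeads) = 6, batch (m_nB) = 20, block/sequence (m_nT) = 128.
//   m_nSize = 192 / 6 = 32                                      per-head channel count
//   Q, K, V after projection + Reshape : (20, 128, 6, 32)       (B, T, heads, size)
//   Qt, Kt, Vt after m_transpose       : (20, 6, 128, 32)       (B, heads, T, size)
//   Kt1 (last two axes of Kt swapped)  : (20, 6, 32, 128)
//   AttA, AttB = Qt x Kt1              : (20, 6, 128, 128)      (B, heads, T, T)
//   Work = AttB x Vt                   : (20, 6, 128, 32)
//   Y after transpose + final Reshape  : (20, 128, 192, 1)      (B, T, C)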
// forward: compute multi-head attention over the q, k, v and mask bottom blobs.
Blob<T> blobMask = colBottom[3];

// Project the inputs to Q, K and V.
addInternal(m_blobX0, m_blobQ);
m_c_attnQ.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX1, m_blobK);
m_c_attnK.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX2, m_blobV);
m_c_attnV.Forward(m_colInternalBottom, m_colInternalTop);

// Split the channel axis into (heads, size), then move heads ahead of the time axis.
m_blobQ.Reshape(m_nB, m_nT, m_nHeads, m_nSize);
m_blobK.Reshape(m_nB, m_nT, m_nHeads, m_nSize);
m_blobV.Reshape(m_nB, m_nT, m_nHeads, m_nSize);

addInternal(m_blobQ, m_blobQt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobK, m_blobKt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobV, m_blobVt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);

// Transpose K once more so the score matmul sees K^T.
addInternal(m_blobKt, m_blobKt1);
m_transposeQ.Forward(m_colInternalBottom, m_colInternalTop);

// Attention score scale factor (1/sqrt of the per-head size); the scaling and the
// mask application are applied in lines elided from this listing.
double dfScale = 1.0 / Math.Sqrt(m_nSize);
m_blobAttA.MatMul(m_blobQt, m_blobKt1);

// Softmax (and optional dropout) over the scores.
addInternal(m_blobAttA, m_blobAttB);
m_softmax.Forward(m_colInternalBottom, m_colInternalTop);

if (m_attn_dropout != null)
{
    addInternal(m_blobAttB, m_blobAttB);
    m_attn_dropout.Forward(m_colInternalBottom, m_colInternalTop);
}

// Weighted sum of the values.
m_blobWork.MatMul(m_blobAttB, m_blobVt);

// Re-assemble the heads and apply the output projection (and optional residual dropout).
addInternal(m_blobWork, m_blobY);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
m_blobY.Reshape(m_nB, m_nT, m_nC, 1);

addInternal(m_blobY, colTop[0]);
m_c_proj.Forward(m_colInternalBottom, m_colInternalTop);

if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Forward(m_colInternalBottom, m_colInternalTop);
}
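In equation form, the forward pass above is the standard scaled dot-product attention computed per head, with m_blobQt, m_blobKt and m_blobVt in the roles of Q, K and V, m_nSize as d_k, the pre-softmax scores in m_blobAttA, the post-softmax weights in m_blobAttB and the weighted values in m_blobWork:

\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}} + M\right) V

Here M stands for the mask supplied in colBottom[3]; the scaling and the mask application fall in lines elided from this listing, so their exact form is an assumption.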
// backward: run the forward steps in reverse, propagating colTop[0]'s diff back to q, k and v.
if (rgbPropagateDown[0])
{
    List<bool> rgbPropagate = new List<bool>() { true, true };

    if (m_resid_dropout != null)
    {
        addInternal(colTop[0], colTop[0]);
        m_resid_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    }

    addInternal(m_blobY, colTop[0]);
    m_c_proj.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(m_blobWork, m_blobY);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    m_blobY.CopyFrom(m_blobWork, true, true);

    // Propagate the diff now in Y through Work = AttB x Vt, producing diffs for AttB and Vt.
    m_blobY.MatMulGrad(m_blobAttB, m_blobVt, m_blobWork);

    if (m_attn_dropout != null)
    {
        addInternal(m_blobAttB, m_blobAttB);
        m_attn_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    }

    addInternal(m_blobAttA, m_blobAttB);
    m_softmax.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    // Propagate through AttA = Qt x Kt1 (scaled), producing diffs for Qt and Kt1.
    double dfScale = 1.0 / Math.Sqrt(m_nSize);
    m_blobAttA.MatMulGrad(m_blobQt, m_blobKt1, m_blobWork, dfScale);

    addInternal(m_blobKt, m_blobKt1);
    m_transposeQ.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(m_blobQ, m_blobQt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobK, m_blobKt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobV, m_blobVt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(m_blobX0, m_blobQ);
    m_c_attnQ.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobX1, m_blobK);
    m_c_attnK.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobX2, m_blobV);
    m_c_attnV.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    // Copy the accumulated diffs back to the bottom blobs, depending on whether
    // q, k and v share the same underlying gpu_diff memory.
    if (colBottom[0].gpu_diff == colBottom[1].gpu_diff && colBottom[0].gpu_diff == colBottom[2].gpu_diff)
        ...
    else if (colBottom[1].gpu_diff == colBottom[2].gpu_diff)
    {
        colBottom[0].CopyFrom(m_blobX0, true);
        ...
    }
    else
    {
        colBottom[0].CopyFrom(m_blobX0, true);
        colBottom[1].CopyFrom(m_blobX1, true);
        colBottom[2].CopyFrom(m_blobX2, true);
    }
}
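Below is a hedged end-to-end usage sketch assembled from the member signatures documented in the remainder of this page (Layer<T>.Create, Setup, Reshape, Forward) and the documented bottom/top contract (q, k, v, mask in; attn out). It assumes a generic context with the same T as the layer code, pre-allocated input blobs, and illustrative parameter values; it is not taken from the source.

// Hedged sketch: configuring and running the layer by hand (values are assumptions).
LayerParameter p = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION);
p.multihead_attention_param.heads = 6;
p.multihead_attention_param.embed = 192;
p.multihead_attention_param.block_size = 128;

Layer<T> attn = Layer<T>.Create(cuda, log, p, null);

// blobQ, blobK, blobV, blobMask and blobAttn are assumed, pre-allocated Blob<T> instances.
BlobCollection<T> colBottom = new BlobCollection<T>();
colBottom.Add(blobQ);
colBottom.Add(blobK);
colBottom.Add(blobV);
colBottom.Add(blobMask);

BlobCollection<T> colTop = new BlobCollection<T>();
colTop.Add(blobAttn);

attn.Setup(colBottom, colTop);     // one-time creation of the internal sub-layers
attn.Reshape(colBottom, colTop);   // size the internal buffers to the current bottoms
attn.Forward(colBottom, colTop);   // colTop[0] now holds the attention output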
The Log class provides general output in text form.
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
void CopyFrom(BlobCollection< T > bSrc, bool bCopyDiff=false)
Copy the data or diff from another BlobCollection into this one.
The Blob is the main holder of data that moves through the Layers of the Net.
int channels
DEPRECATED; legacy shape accessor channels: use shape(1) instead.
void MatMul(Blob< T > blobA, Blob< T > blobB, bool bReshape=false, bool bTransA=false, bool bTransB=false, double dfScale=1.0, bool bADiff=false, bool bBDiff=false, bool bCDiff=false)
MatMul blobA with blobB and place the result in this blob (e.g. this = matmul(A, B))....
int height
DEPRECATED; legacy shape accessor height: use shape(2) instead.
void MatMulGrad(Blob< T > blobA, Blob< T > blobB, Blob< T > blobWork, double dfScale=1.0)
Calculates and propagates the gradient for blobA and BlobB given the input gradient in this blob's di...
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
DEPRECATED; use
void CopyFrom(Blob< T > src, int nSrcOffset, int nDstOffset, int nCount, bool bCopyData, bool bCopyDiff)
Copy from a source Blob.
void scale_data(double df)
Scale the data by a scaling factor.
int width
DEPRECATED; legacy shape accessor width: use shape(3) instead.
List< int > shape()
Returns an array where each element contains the shape of an axis of the Blob.
int count()
Returns the total number of items in the Blob.
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
string Name
Get/set the name of the Blob.
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
int num
DEPRECATED; legacy shape accessor num: use shape(0) instead.
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
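The attention code leans most heavily on two of the Blob members above, MatMul and MatMulGrad. A hedged sketch of the convention implied by the listing (the product lands in the calling blob's data on the forward pass, and that same blob's diff is the incoming gradient on the backward pass; blobA, blobB, blobC, blobWork and dfScale are placeholder names):

// Forward: C = A x B, batched over the leading axes.
blobC.MatMul(blobA, blobB);

// Backward: with dC already stored in blobC's diff, write dA and dB into the
// diffs of blobA and blobB, using blobWork as scratch. The optional scale
// mirrors the 1/sqrt(d_k) factor used on the attention scores.
blobC.MatMulGrad(blobA, blobB, blobWork, dfScale);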
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
An interface for the units of computation which can be composed into a Net.
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
void convert(BlobCollection< T > col)
Convert a collection of blobs from / to half size.
bool shareLayerBlob(Blob< T > b, List< int > rgMinShape)
Attempts to share a Layer Blob if another parameter Blob with the same name and acceptable size is fo...
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
virtual bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accommodate the shapes of the bottom blobs.
BlobCollection< T > m_colInternalBlobs
Specifies internal blobs used by the layer.
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
void Setup(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Implements common Layer setup functionality.
static Layer< T > Create(CudaDnn< T > cuda, Log log, LayerParameter p, CancelEvent evtCancel, IXDatabaseBase db=null, TransferInput trxinput=null)
Create a new Layer based on the LayerParameter.
LayerParameter.LayerType m_type
Specifies the Layer type.
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
BlobCollection< T > internal_blobs
Returns the collection of internal Blobs used by the Layer.
LayerParameter convertLayerParam(LayerParameter pChild, LayerParameter pParent)
Called to convert a parent LayerParameterEx, used in blob sharing, with a child layer parameter.
The MultiheadAttentionLayer provides a vanilla multi-head attention layer.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
override void setup_internal_blobs(BlobCollection< T > col)
Derived layers should add all internal blobs to the 'col' provided.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: attn
override void dispose()
Releases all GPU and host resources used by the Layer.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
MultiheadAttentionLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The MultiheadAttention constructor.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
The forward computation.
override int ExactNumBottomBlobs
Returns the exact number of required bottom (input) Blobs: q, k, v, mask
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the loss error gradient w.r.t the outputs.
double dropout_ratio
Specifies the dropout ratio. (e.g. the probability that values will be dropped out and set to zero....
Specifies whether to use the NVIDIA cuDnn version or Caffe version of a given forward/backward operat...
Engine engine
Specifies the Engine in use.
Engine
Defines the type of engine to use.
Specifies the filler parameters used to create each Filler.
FillerParameter weight_filler
The filler for the weights.
int axis
Specifies the first axis to be lumped into a single inner product computation; all preceding axes are...
FillerParameter bias_filler
The filler for the bias.
uint num_output
The number of outputs for the layer.
bool bias_term
Whether to have bias terms or not.
Specifies the base parameter for all layers.
List< ParamSpec > parameters
Specifies the ParamSpec parameters of the LayerParameter.
string name
Specifies the name of this LayerParameter.
SoftmaxParameter softmax_param
Returns the parameter set when initialized with LayerType.SOFTMAX
MultiheadAttentionParameter multihead_attention_param
Returns the parameter set when initialized with LayerType.MULTIHEAD_ATTENTION
InnerProductParameter inner_product_param
Returns the parameter set when initialized with LayerType.INNERPRODUCT
TransposeParameter transpose_param
Returns the parameter set when initialized with LayerType.TRANSPOSE
LayerType
Specifies the layer type.
DropoutParameter dropout_param
Returns the parameter set when initialized with LayerType.DROPOUT
Specifies training parameters (multipliers on global learning constants, and the name of other settin...
int axis
The axis along which to perform the softmax – may be negative to index from the end (e....
Specifies the parameters for the MultiheadAttentionLayer.
WEIGHT_INIT
Defines the weight initialization strategy.
double attn_dropout
Specifies dropout probability used on the attention weights.
uint block_size
Specifies size of the block.
uint heads
The number of heads used.
uint layers
The number of layers (transformer blocks) used.
double resid_dropout
Specifies dropout probability used on the residual weights.
uint embed
Specifies size of the embed.
WEIGHT_INIT weight_init
Specifies the weight initialization strategy (default = ENCODER_DECODER).
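A hedged continuation of the configuration sketch shown after the source listing, covering the remaining knobs described above; the values and the nested enum spelling are assumptions:

p.multihead_attention_param.attn_dropout = 0.1;    // dropout on the softmax'd attention weights
p.multihead_attention_param.resid_dropout = 0.1;   // dropout after the output projection
p.multihead_attention_param.layers = 6;            // number of transformer blocks
p.multihead_attention_param.weight_init = MultiheadAttentionParameter.WEIGHT_INIT.ENCODER_DECODER;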
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
The MyCaffe.common namespace contains common MyCaffe classes.
WEIGHT_TARGET
Defines the type of weight to target in re-initializations.
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...