using System.Collections.Generic;

List<int> m_rgShape = new List<int>() { 1, 1, 1, 1 };

double m_dfAttnDropout;
double m_dfResidDropout;
log.CHECK_EQ(m_nEmbed % m_nHeads, 0, "The embedding size must be divisible by the number of heads.");

if (m_dfAttnDropout > 0)
    ...
if (m_dfResidDropout > 0)
    ...
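The two conditions above guard the creation of the dropout sub-layers; the construction itself is elided from the listing. The sketch below is an assumption pieced together from members documented further down (Layer<T>.Create, convertLayerParam, dropout_param.dropout_ratio), not the verbatim source.

// Hedged sketch: creating the attention dropout sub-layer when the ratio is positive.
if (m_dfAttnDropout > 0)
{
    LayerParameter dropout = new LayerParameter(LayerParameter.LayerType.DROPOUT);
    dropout.dropout_param.dropout_ratio = m_dfAttnDropout;  // probability of zeroing a value
    m_attn_dropout = Layer<T>.Create(cuda, log, convertLayerParam(dropout, p), null);
}
// The residual dropout (m_dfResidDropout / m_resid_dropout) would follow the same pattern.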
// Internal blobs (names from the listing; roles inferred from the forward pass below):
// X0/X1/X2 hold the q, k and v inputs; Q/K/V hold the projected values; Qt/Kt/Vt hold
// the per-head transposed tensors; Kt1 holds K transposed for the score matmul;
// AttA/AttB hold the attention scores before and after the softmax; Work is scratch
// space and Y is the re-assembled output.
m_blobX0 = new Blob<T>(cuda, log);
m_blobX1 = new Blob<T>(cuda, log);
m_blobX2 = new Blob<T>(cuda, log);
m_blobQ = new Blob<T>(cuda, log);
m_blobK = new Blob<T>(cuda, log);
m_blobV = new Blob<T>(cuda, log);
m_blobQt = new Blob<T>(cuda, log);
m_blobKt = new Blob<T>(cuda, log);
m_blobKt1 = new Blob<T>(cuda, log);
m_blobVt = new Blob<T>(cuda, log);
m_blobAttA = new Blob<T>(cuda, log);
m_blobAttB = new Blob<T>(cuda, log);
m_blobWork = new Blob<T>(cuda, log);
m_blobY = new Blob<T>(cuda, log);
if (m_attn_dropout != null)
    ...
if (m_resid_dropout != null)
    ...

base.ReInitializeParameters(target);
// The two addInternal helpers route blobs through the internal bottom/top collections so
// the wrapped sub-layers can be called with the standard Setup/Reshape/Forward/Backward
// signatures. Only the bodies appear in the listing; the method signatures are assumed
// from the call sites.
private void addInternal(Blob<T> bottom, Blob<T> top)
{
    m_colInternalBottom.Clear();
    m_colInternalBottom.Add(bottom);

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}

private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
{
    m_colInternalBottom.Clear();

    for (int i = 0; i < rgBottom.Count; i++)
    {
        m_colInternalBottom.Add(rgBottom[i]);
    }

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}
// LayerSetUp: each head operates on m_nSize = m_nC / m_nHeads channels.
m_nSize = m_nC / m_nHeads;

// Set up the Q, K and V projections.
addInternal(m_blobX0, m_blobQ);
m_c_attnQ.Setup(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX1, m_blobK);
m_c_attnK.Setup(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX2, m_blobV);
m_c_attnV.Setup(m_colInternalBottom, m_colInternalTop);

m_rgShape[1] = m_nHeads;
m_rgShape[3] = m_nSize;

// Set up the transpose used to split the heads out into their own axis.
addInternal(m_blobQ, m_blobQt);
m_transpose.Setup(m_colInternalBottom, m_colInternalTop);

// The attention score blobs are (batch, heads, block, block).
m_blobAttA.Reshape(m_nB, m_nHeads, m_nBlockSize, m_nBlockSize);
m_blobAttB.Reshape(m_nB, m_nHeads, m_nBlockSize, m_nBlockSize);

addInternal(m_blobAttA, m_blobAttB);
m_softmax.Setup(m_colInternalBottom, m_colInternalTop);

if (m_attn_dropout != null)
{
    addInternal(m_blobAttB, m_blobAttB);
    m_attn_dropout.Setup(m_colInternalBottom, m_colInternalTop);
}

// Set up the output projection and, optionally, the residual dropout.
addInternal(m_blobY, colTop[0]);
m_c_proj.Setup(m_colInternalBottom, m_colInternalTop);

if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Setup(m_colInternalBottom, m_colInternalTop);
}
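The projection sub-layers wired above (m_c_attnQ/K/V and m_c_proj) are INNERPRODUCT layers; how they are constructed is not shown in the listing. The following is a hedged sketch using the inner_product_param fields documented below; the axis value and the filler type are illustrative assumptions.

// Hedged sketch (not the verbatim source): building the Q projection sub-layer.
LayerParameter ipQ = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT);
ipQ.inner_product_param.num_output = (uint)m_nEmbed;                    // project back to the embedding size
ipQ.inner_product_param.axis = 2;                                       // assumed: (B, T) are outer axes
ipQ.inner_product_param.bias_term = true;
ipQ.inner_product_param.weight_filler = new FillerParameter("xavier");  // assumed filler type
m_c_attnQ = Layer<T>.Create(cuda, log, convertLayerParam(ipQ, p), null);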
// Reshape: recompute the per-head size and resize the transposed blobs.
m_nSize = m_nC / m_nHeads;

m_rgShape[2] = m_nHeads;
m_rgShape[3] = m_nSize;

addInternal(m_blobK, m_blobKt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobQ, m_blobQt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobV, m_blobVt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);

m_rgShape[1] = m_nHeads;

// Note: width and height are intentionally swapped in the last two axes.
m_rgShape[0] = m_blobVt.num;
m_rgShape[2] = m_blobVt.width;
m_rgShape[3] = m_blobVt.height;

addInternal(m_blobWork, m_blobY);
m_transposeQ.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobY, colTop[0]);
m_c_proj.Reshape(m_colInternalBottom, m_colInternalTop);

if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Reshape(m_colInternalBottom, m_colInternalTop);
}
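To make the reshaping concrete, here is a worked shape trace with assumed example values; none of these numbers appear in the listing.

// Assumed: embed (m_nC) = 192, heads (m_nHeads) = 6, batch (m_nB) = 20, block/sequence (m_nT) = 128.
//   m_nSize = 192 / 6 = 32                                      per-head channel count
//   Q, K, V after projection + Reshape : (20, 128, 6, 32)       (B, T, heads, size)
//   Qt, Kt, Vt after m_transpose       : (20, 6, 128, 32)       (B, heads, T, size)
//   Kt1 (last two axes of Kt swapped)  : (20, 6, 32, 128)
//   AttA, AttB = Qt x Kt1              : (20, 6, 128, 128)      (B, heads, T, T)
//   Work = AttB x Vt                   : (20, 6, 128, 32)
//   Y after transpose + final Reshape  : (20, 128, 192, 1)      (B, T, C)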
// forward: compute multi-head attention over the q, k, v and mask bottom blobs.
Blob<T> blobMask = colBottom[3];

// Project the inputs to Q, K and V.
addInternal(m_blobX0, m_blobQ);
m_c_attnQ.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX1, m_blobK);
m_c_attnK.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobX2, m_blobV);
m_c_attnV.Forward(m_colInternalBottom, m_colInternalTop);

// Split the channel axis into (heads, size), then move heads ahead of the time axis.
m_blobQ.Reshape(m_nB, m_nT, m_nHeads, m_nSize);
m_blobK.Reshape(m_nB, m_nT, m_nHeads, m_nSize);
m_blobV.Reshape(m_nB, m_nT, m_nHeads, m_nSize);

addInternal(m_blobQ, m_blobQt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobK, m_blobKt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobV, m_blobVt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);

// Transpose K once more so the score matmul sees K^T.
addInternal(m_blobKt, m_blobKt1);
m_transposeQ.Forward(m_colInternalBottom, m_colInternalTop);

// Attention score scale factor (1/sqrt of the per-head size); the scaling and the
// mask application are applied in lines elided from this listing.
double dfScale = 1.0 / Math.Sqrt(m_nSize);
m_blobAttA.MatMul(m_blobQt, m_blobKt1);

// Softmax (and optional dropout) over the scores.
addInternal(m_blobAttA, m_blobAttB);
m_softmax.Forward(m_colInternalBottom, m_colInternalTop);

if (m_attn_dropout != null)
{
    addInternal(m_blobAttB, m_blobAttB);
    m_attn_dropout.Forward(m_colInternalBottom, m_colInternalTop);
}

// Weighted sum of the values.
m_blobWork.MatMul(m_blobAttB, m_blobVt);

// Re-assemble the heads and apply the output projection (and optional residual dropout).
addInternal(m_blobWork, m_blobY);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
m_blobY.Reshape(m_nB, m_nT, m_nC, 1);

addInternal(m_blobY, colTop[0]);
m_c_proj.Forward(m_colInternalBottom, m_colInternalTop);

if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Forward(m_colInternalBottom, m_colInternalTop);
}
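In equation form, the forward pass above is the standard scaled dot-product attention computed per head, with m_blobQt, m_blobKt and m_blobVt in the roles of Q, K and V, m_nSize as d_k, the pre-softmax scores in m_blobAttA, the post-softmax weights in m_blobAttB and the weighted values in m_blobWork:

\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}} + M\right) V

Here M stands for the mask supplied in colBottom[3]; the scaling and the mask application fall in lines elided from this listing, so their exact form is an assumption.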
// backward: run the forward steps in reverse, propagating colTop[0]'s diff back to q, k and v.
if (rgbPropagateDown[0])
{
    List<bool> rgbPropagate = new List<bool>() { true, true };

    if (m_resid_dropout != null)
    {
        addInternal(colTop[0], colTop[0]);
        m_resid_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    }

    addInternal(m_blobY, colTop[0]);
    m_c_proj.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(m_blobWork, m_blobY);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    m_blobY.CopyFrom(m_blobWork, true, true);

    // Propagate the diff now in Y through Work = AttB x Vt, producing diffs for AttB and Vt.
    m_blobY.MatMulGrad(m_blobAttB, m_blobVt, m_blobWork);

    if (m_attn_dropout != null)
    {
        addInternal(m_blobAttB, m_blobAttB);
        m_attn_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    }

    addInternal(m_blobAttA, m_blobAttB);
    m_softmax.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    // Propagate through AttA = Qt x Kt1 (scaled), producing diffs for Qt and Kt1.
    double dfScale = 1.0 / Math.Sqrt(m_nSize);
    m_blobAttA.MatMulGrad(m_blobQt, m_blobKt1, m_blobWork, dfScale);

    addInternal(m_blobKt, m_blobKt1);
    m_transposeQ.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(m_blobQ, m_blobQt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobK, m_blobKt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobV, m_blobVt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(m_blobX0, m_blobQ);
    m_c_attnQ.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobX1, m_blobK);
    m_c_attnK.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobX2, m_blobV);
    m_c_attnV.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    // Copy the accumulated diffs back to the bottom blobs, depending on whether
    // q, k and v share the same underlying gpu_diff memory.
    if (colBottom[0].gpu_diff == colBottom[1].gpu_diff && colBottom[0].gpu_diff == colBottom[2].gpu_diff)
        ...
    else if (colBottom[1].gpu_diff == colBottom[2].gpu_diff)
    {
        colBottom[0].CopyFrom(m_blobX0, true);
        ...
    }
    else
    {
        colBottom[0].CopyFrom(m_blobX0, true);
        colBottom[1].CopyFrom(m_blobX1, true);
        colBottom[2].CopyFrom(m_blobX2, true);
    }
}
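Below is a hedged end-to-end usage sketch assembled from the member signatures documented in the remainder of this page (Layer<T>.Create, Setup, Reshape, Forward) and the documented bottom/top contract (q, k, v, mask in; attn out). It assumes a generic context with the same T as the layer code, pre-allocated input blobs, and illustrative parameter values; it is not taken from the source.

// Hedged sketch: configuring and running the layer by hand (values are assumptions).
LayerParameter p = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION);
p.multihead_attention_param.heads = 6;
p.multihead_attention_param.embed = 192;
p.multihead_attention_param.block_size = 128;

Layer<T> attn = Layer<T>.Create(cuda, log, p, null);

// blobQ, blobK, blobV, blobMask and blobAttn are assumed, pre-allocated Blob<T> instances.
BlobCollection<T> colBottom = new BlobCollection<T>();
colBottom.Add(blobQ);
colBottom.Add(blobK);
colBottom.Add(blobV);
colBottom.Add(blobMask);

BlobCollection<T> colTop = new BlobCollection<T>();
colTop.Add(blobAttn);

attn.Setup(colBottom, colTop);     // one-time creation of the internal sub-layers
attn.Reshape(colBottom, colTop);   // size the internal buffers to the current bottoms
attn.Forward(colBottom, colTop);   // colTop[0] now holds the attention output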
The Log class provides general output in text form.
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
void CopyFrom(BlobCollection< T > bSrc, bool bCopyDiff=false)
Copy the data or diff from another BlobCollection into this one.
The Blob is the main holder of data that moves through the Layers of the Net.
int channels
DEPRECATED; legacy shape accessor channels: use shape(1) instead.
void MatMul(Blob< T > blobA, Blob< T > blobB, bool bReshape=false, bool bTransA=false, bool bTransB=false, double dfScale=1.0, bool bADiff=false, bool bBDiff=false, bool bCDiff=false)
MatMul blobA with blobB and place the result in this blob (e.g. this = matmul(A, B))....
int height
DEPRECATED; legacy shape accessor height: use shape(2) instead.
void MatMulGrad(Blob< T > blobA, Blob< T > blobB, Blob< T > blobWork, double dfScale=1.0)
Calculates and propagates the gradient for blobA and BlobB given the input gradient in this blob's di...
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
DEPRECATED; use
void CopyFrom(Blob< T > src, int nSrcOffset, int nDstOffset, int nCount, bool bCopyData, bool bCopyDiff)
Copy from a source Blob.
void scale_data(double df)
Scale the data by a scaling factor.
int width
DEPRECATED; legacy shape accessor width: use shape(3) instead.
List< int > shape()
Returns an array where each element contains the shape of an axis of the Blob.
int count()
Returns the total number of items in the Blob.
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
string Name
Get/set the name of the Blob.
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
int num
DEPRECATED; legacy shape accessor num: use shape(0) instead.
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
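The attention code leans most heavily on two of the Blob members above, MatMul and MatMulGrad. A hedged sketch of the convention implied by the listing (the product lands in the calling blob's data on the forward pass, and that same blob's diff is the incoming gradient on the backward pass; blobA, blobB, blobC, blobWork and dfScale are placeholder names):

// Forward: C = A x B, batched over the leading axes.
blobC.MatMul(blobA, blobB);

// Backward: with dC already stored in blobC's diff, write dA and dB into the
// diffs of blobA and blobB, using blobWork as scratch. The optional scale
// mirrors the 1/sqrt(d_k) factor used on the attention scores.
blobC.MatMulGrad(blobA, blobB, blobWork, dfScale);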
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
An interface for the units of computation which can be composed into a Net.
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
void convert(BlobCollection< T > col)
Convert a collection of blobs from / to half size.
bool shareLayerBlob(Blob< T > b, List< int > rgMinShape)
Attempts to share a Layer Blob if another parameter Blob with the same name and acceptable size is fo...
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
virtual bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accommodate the shapes of the bottom blobs.
BlobCollection< T > m_colInternalBlobs
Specifies internal blobs used by the layer.
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
void Setup(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Implements common Layer setup functionality.
static Layer< T > Create(CudaDnn< T > cuda, Log log, LayerParameter p, CancelEvent evtCancel, IXDatabaseBase db=null, TransferInput trxinput=null)
Create a new Layer based on the LayerParameter.
LayerParameter.LayerType m_type
Specifies the Layer type.
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
BlobCollection< T > internal_blobs
Returns the collection of internal Blobs used by the Layer.
LayerParameter convertLayerParam(LayerParameter pChild, LayerParameter pParent)
Called to convert a parent LayerParameterEx, used in blob sharing, with a child layer parameter.
The MultiheadAttentionLayer provides a vanilla multi-head attention layer.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
override void setup_internal_blobs(BlobCollection< T > col)
Derived layers should add all internal blobs to the 'col' provided.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: attn
override void dispose()
Releases all GPU and host resources used by the Layer.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
MultiheadAttentionLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The MultiheadAttention constructor.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
The forward computation.
override int ExactNumBottomBlobs
Returns the exact number of required bottom (input) Blobs: q, k, v, mask
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the loss error gradient w.r.t the outputs.
double dropout_ratio
Specifies the dropout ratio. (e.g. the probability that values will be dropped out and set to zero....
Specifies whether to use the NVIDIA cuDnn version or Caffe version of a given forward/backward operat...
Engine engine
Specifies the Engine in use.
Engine
Defines the type of engine to use.
Specifies the filler parameters used to create each Filler.
FillerParameter weight_filler
The filler for the weights.
int axis
Specifies the first axis to be lumped into a single inner product computation; all preceding axes are...
FillerParameter bias_filler
The filler for the bias.
uint num_output
The number of outputs for the layer.
bool bias_term
Whether to have bias terms or not.
Specifies the base parameter for all layers.
List< ParamSpec > parameters
Specifies the ParamSpec parameters of the LayerParameter.
string name
Specifies the name of this LayerParameter.
SoftmaxParameter softmax_param
Returns the parameter set when initialized with LayerType.SOFTMAX
MultiheadAttentionParameter multihead_attention_param
Returns the parameter set when initialized with LayerType.MULTIHEAD_ATTENTION
InnerProductParameter inner_product_param
Returns the parameter set when initialized with LayerType.INNERPRODUCT
TransposeParameter transpose_param
Returns the parameter set when initialized with LayerType.TRANSPOSE
LayerType
Specifies the layer type.
DropoutParameter dropout_param
Returns the parameter set when initialized with LayerType.DROPOUT
Specifies training parameters (multipliers on global learning constants, and the name of other settin...
int axis
The axis along which to perform the softmax – may be negative to index from the end (e....
Specifies the parameters for the MultiheadAttentionLayer.
WEIGHT_INIT
Defines the weight initialization strategy.
double attn_dropout
Specifies dropout probability used on the attention weights.
uint block_size
Specifies size of the block.
uint heads
The number of heads used.
uint layers
The number of layers (transformer blocks) used.
double resid_dropout
Specifies dropout probability used on the residual weights.
uint embed
Specifies size of the embed.
WEIGHT_INIT weight_init
Specifies the weight initialization strategy (default = ENCODER_DECODER).
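A hedged continuation of the configuration sketch shown after the source listing, covering the remaining knobs described above; the values and the nested enum spelling are assumptions:

p.multihead_attention_param.attn_dropout = 0.1;    // dropout on the softmax'd attention weights
p.multihead_attention_param.resid_dropout = 0.1;   // dropout after the output projection
p.multihead_attention_param.layers = 6;            // number of transformer blocks
p.multihead_attention_param.weight_init = MultiheadAttentionParameter.WEIGHT_INIT.ENCODER_DECODER;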
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
The MyCaffe.common namespace contains common MyCaffe classes.
WEIGHT_TARGET
Defines the type of weight to target in re-initializations.
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...