using System.Collections.Generic;
List<int> m_rgShape = new List<int>() { 1, 1, 1, 1 };
double m_dfAttnDropout;
double m_dfResidDropout;
double m_dfIgnoreVal = -1e+29;
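// Roles (inferred from how they are used below): m_dfAttnDropout and m_dfResidDropout hold the
// attention and residual dropout ratios, and m_dfIgnoreVal is the large negative value written
// into masked (future) positions so that they vanish after the softmax.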
log.CHECK_EQ(m_nEmbed % m_nHeads, 0, "The embedding size must be divisible by the number of heads.");
if (m_dfAttnDropout > 0)
{
    // presumably creates the attention dropout layer (body not shown in this excerpt)
}

if (m_dfResidDropout > 0)
{
    // presumably creates the residual dropout layer (body not shown in this excerpt)
}
m_blobBias = new Blob<T>(cuda, log);
List<int> rgShape = new List<int>() { 1, 1, m_nBlockSize, m_nBlockSize };
m_blobBias.Reshape(rgShape);   // give the causal mask its (1, 1, block_size, block_size) shape
fillBias(m_blobBias);
m_blobQ = new Blob<T>(cuda, log);
m_blobK = new Blob<T>(cuda, log);
m_blobV = new Blob<T>(cuda, log);
m_blobQt = new Blob<T>(cuda, log);
m_blobKt = new Blob<T>(cuda, log);
m_blobKt1 = new Blob<T>(cuda, log);
m_blobVt = new Blob<T>(cuda, log);
m_blobAttA = new Blob<T>(cuda, log);
m_blobAttB = new Blob<T>(cuda, log);
m_blobWork = new Blob<T>(cuda, log);
m_blobIpAttn = new Blob<T>(cuda, log);
m_blobY = new Blob<T>(cuda, log);
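// Working blobs (roles inferred from forward() below): m_blobQ/K/V receive the query, key and
// value slices of the m_c_attn output (m_blobIpAttn); m_blobQt/Kt/Vt are their per-head
// transposed views; m_blobKt1 is the re-transposed key used for q @ k^T; m_blobAttA/AttB hold
// the attention scores before and after the softmax; m_blobWork and m_blobY hold the weighted
// values and the re-assembled output.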
col.Add(m_blobIpAttn);
if (m_attn_dropout != null)
if (m_resid_dropout != null)
private void fillBias(Blob<T> b)
{
    // Assumed surrounding steps (omitted from the excerpt): the blob arrives filled with ones
    // and its data is brought to the host as rgBiasData, then written back after the loop.
    for (int i = 0; i < b.height; i++)
    {
        for (int j = i + 1; j < b.width; j++)
        {
            rgBiasData[i * b.width + j] = 0;
        }
    }
}
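// For a block size of 4, the resulting causal mask is lower-triangular, so row i (the query
// position) can only attend to columns 0..i (earlier key positions):
//   1 0 0 0
//   1 1 0 0
//   1 1 1 0
//   1 1 1 1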
base.ReInitializeParameters(target);
m_colInternalBottom.Clear();
m_colInternalBottom.Add(bottom);

m_colInternalTop.Clear();
m_colInternalTop.Add(top);
m_colInternalBottom.Clear();

for (int i = 0; i < rgBottom.Count; i++)
{
    m_colInternalBottom.Add(rgBottom[i]);
}

m_colInternalTop.Clear();
m_colInternalTop.Add(top);
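// addInternal simply re-points the reusable m_colInternalBottom / m_colInternalTop collections
// at the given blobs, so the sub-layers (m_c_attn, m_transpose, m_transposeQ, m_softmax, the
// dropouts and m_c_proj) can be driven through their Setup/Reshape/Forward/Backward calls below.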
m_nSize = m_nC / (int)m_nHeads;
m_nDataSize = blobX.count(3);
m_nSize *= m_nDataSize;

addInternal(blobX, m_blobIpAttn);
m_c_attn.Setup(m_colInternalBottom, m_colInternalTop);

m_rgShape[2] = m_nHeads;
m_rgShape[3] = m_nSize;
addInternal(m_blobQ, m_blobQt);
m_transpose.Setup(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobAttA, m_blobAttB);
m_softmax.Setup(m_colInternalBottom, m_colInternalTop);
if (m_attn_dropout != null)
{
    addInternal(m_blobAttB, m_blobAttB);
    m_attn_dropout.Setup(m_colInternalBottom, m_colInternalTop);
}
m_rgShape[3] = m_nDataSize;

addInternal(m_blobY, colTop[0]);
m_c_proj.Setup(m_colInternalBottom, m_colInternalTop);
if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Setup(m_colInternalBottom, m_colInternalTop);
}
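// Taken together, setup wires the pipeline:
//   x -> m_c_attn (joint q,k,v projection) -> per-head transposes -> scaled dot-product scores
//     -> m_softmax (+ optional attention dropout) -> weighted values -> m_c_proj
//     (+ optional residual dropout) -> top.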
m_nSize = m_nC / m_nHeads;
m_nDataSize = blobX.count(3);
m_nSize *= m_nDataSize;

m_rgShape[2] = m_nHeads;
m_rgShape[3] = m_nSize;
addInternal(m_blobK, m_blobKt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobQ, m_blobQt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);
m_blobV.Reshape(m_nB, m_nT, m_nHeads, m_nSize);
addInternal(m_blobV, m_blobVt);
m_transpose.Reshape(m_colInternalBottom, m_colInternalTop);
m_rgShape[1] = m_nHeads;
m_rgShape[0] = m_blobVt.num;
m_rgShape[2] = m_blobVt.width;
m_rgShape[3] = m_blobVt.height;
addInternal(m_blobWork, m_blobY);
m_transposeQ.Reshape(m_colInternalBottom, m_colInternalTop);
m_rgShape[3] = m_nDataSize;

addInternal(m_blobY, colTop[0]);
m_c_proj.Reshape(m_colInternalBottom, m_colInternalTop);
if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Reshape(m_colInternalBottom, m_colInternalTop);
}
if (m_blobBias.height != m_nT || m_blobBias.width != m_nT)
{
    List<int> rgShape = new List<int>() { 1, 1, m_nT, m_nT };
    m_blobBias.Reshape(rgShape);   // resize the causal mask to the current sequence length
    fillBias(m_blobBias);
}
addInternal(blobX, m_blobIpAttn);
m_c_attn.Forward(m_colInternalBottom, m_colInternalTop);

int nCount = m_blobQ.count();
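// m_c_attn produces the concatenated q, k, v projection in m_blobIpAttn; lines elided here
// presumably copy its three nCount-sized slices into m_blobQ, m_blobK and m_blobV before the
// per-head transposes below.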
addInternal(m_blobK, m_blobKt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobQ, m_blobQt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobV, m_blobVt);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobKt, m_blobKt1);
m_transposeQ.Forward(m_colInternalBottom, m_colInternalTop);

double dfScale = 1.0 / Math.Sqrt(m_nSize);
m_blobAttA.MatMul(m_blobQt, m_blobKt1);
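// Scaled dot-product scores: att = (q @ k^T) * dfScale with dfScale = 1/sqrt(head size).
// The elided lines are assumed to apply dfScale and the causal mask (writing m_dfIgnoreVal into
// positions where m_blobBias is zero) before the softmax below.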
addInternal(m_blobAttA, m_blobAttB);
m_softmax.Forward(m_colInternalBottom, m_colInternalTop);

if (m_attn_dropout != null)
{
    addInternal(m_blobAttB, m_blobAttB);
    m_attn_dropout.Forward(m_colInternalBottom, m_colInternalTop);
}
m_blobWork.MatMul(m_blobAttB, m_blobVt);

addInternal(m_blobWork, m_blobY);
m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
m_blobY.Reshape(m_nB, m_nT, m_nC, m_nDataSize);
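// y = att @ v per head; transposing back and reshaping to (B, T, C) lays the head outputs side
// by side again, ready for the output projection m_c_proj.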
addInternal(m_blobY, colTop[0]);
m_c_proj.Forward(m_colInternalBottom, m_colInternalTop);

if (m_resid_dropout != null)
{
    addInternal(colTop[0], colTop[0]);
    m_resid_dropout.Forward(m_colInternalBottom, m_colInternalTop);
}
if (rgbPropagateDown[0])
{
    List<bool> rgbPropagate = new List<bool>() { true, true };
    if (m_resid_dropout != null)
    {
        addInternal(colTop[0], colTop[0]);
        m_resid_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    }
    addInternal(m_blobY, colTop[0]);
    m_c_proj.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(m_blobWork, m_blobY);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    m_blobY.CopyFrom(m_blobWork, true, true);

    m_blobY.MatMulGrad(m_blobAttB, m_blobVt, m_blobWork);
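    // MatMulGrad propagates the gradient held in m_blobY's diff back into m_blobAttB (att) and
    // m_blobVt (v), since y = att @ v; m_blobWork is used as scratch space.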
    if (m_attn_dropout != null)
    {
        addInternal(m_blobAttB, m_blobAttB);
        m_attn_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    }
    addInternal(m_blobAttA, m_blobAttB);
    m_softmax.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    double dfScale = 1.0 / Math.Sqrt(m_nSize);
    m_blobAttA.MatMulGrad(m_blobQt, m_blobKt1, m_blobWork, dfScale);

    addInternal(m_blobKt, m_blobKt1);
    m_transposeQ.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
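    // Because att = (q @ k^T) * dfScale, MatMulGrad scales the incoming gradient by dfScale while
    // pushing it into m_blobQt and m_blobKt1; the m_transposeQ backward then maps the m_blobKt1
    // gradient back onto m_blobKt.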
    addInternal(m_blobK, m_blobKt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobQ, m_blobQt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobV, m_blobVt);
    m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    int nCount = m_blobQ.count();
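    // Lines elided here presumably pack the q, k and v gradients back into m_blobIpAttn's diff
    // before running the joint projection backward.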
    addInternal(colBottom[0], m_blobIpAttn);
    m_c_attn.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
}
The CausalSelfAttentionLayer provides a vanilla multi-head, causally masked self-attention layer with a projection at the end. Its documented members are:

CausalSelfAttentionLayer(CudaDnn<T> cuda, Log log, LayerParameter p) - the CausalSelfAttention constructor.
override int ExactNumBottomBlobs - returns the exact number of required bottom (input) Blobs: input.
override int ExactNumTopBlobs - returns the exact number of required top (output) Blobs: attn.
override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop) - set up the layer.
override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop) - reshape the bottom (input) and top (output) blobs.
override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop) - the forward computation.
override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom) - computes the loss error gradient w.r.t. the outputs.
override bool ReInitializeParameters(WEIGHT_TARGET target) - re-initialize the parameters of the layer.
override void setup_internal_blobs(BlobCollection<T> col) - derived layers should add all internal blobs to the 'col' provided.
override void dispose() - releases all GPU and host resources used by the layer.
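For orientation, a minimal construction sketch follows. It assumes the usual MyCaffe.basecode, MyCaffe.common and MyCaffe.param usings, and it assumes CausalSelfAttentionParameter exposes settings matching the fields used above (heads, embedding size, block size); those property names are assumptions, while the layer type, the Create/Setup/Forward signatures and the Blob/BlobCollection calls appear in the listing and member summary above.

// Hypothetical usage sketch; the causal_self_attention_param property names are assumptions.
CudaDnn<float> cuda = new CudaDnn<float>(0);
Log log = new Log("csa_example");

LayerParameter p = new LayerParameter(LayerParameter.LayerType.CAUSAL_SELF_ATTENTION);
p.causal_self_attention_param.heads = 6;          // assumed property name
p.causal_self_attention_param.embed = 192;        // assumed property name
p.causal_self_attention_param.block_size = 128;   // assumed property name

Blob<float> blobX = new Blob<float>(cuda, log);
blobX.Reshape(new List<int>() { 1, 128, 192 });   // (batch, tokens, channels)

BlobCollection<float> colBottom = new BlobCollection<float>();
colBottom.Add(blobX);
BlobCollection<float> colTop = new BlobCollection<float>();
colTop.Add(new Blob<float>(cuda, log));

Layer<float> attn = Layer<float>.Create(cuda, log, p, new CancelEvent());
attn.Setup(colBottom, colTop);     // validates the configuration and shapes the internal blobs
attn.Forward(colBottom, colTop);   // colTop[0] now holds the (1, 128, 192) attention output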