using System.Collections.Generic;
using System.Runtime.InteropServices.WindowsRuntime;
// Internal blobs used by the block; m_blobAttn2 and m_blobLn3 serve the
// encoder-decoder (cross) attention path and may be left unallocated when it is not used.
m_blobLn1 = new Blob<T>(cuda, log);
m_blobAttn1 = new Blob<T>(cuda, log);
m_blobLn2 = new Blob<T>(cuda, log);
m_blobMlp = new Blob<T>(cuda, log);
m_blobMlpOut = new Blob<T>(cuda, log);
m_blobX = new Blob<T>(cuda, log);

m_blobAttn2 = new Blob<T>(cuda, log);
m_blobLn3 = new Blob<T>(cuda, log);
bool? bEnableBert = null;

if (bEnableBert.HasValue)
col.Add(m_blobAttn1);

if (m_blobAttn2 != null)
    col.Add(m_blobAttn2);

if (m_blobLn3 != null)
    // ...

col.Add(m_blobMlpOut);

if (m_dropout != null)
    // ...
base.ReInitializeParameters(target);
// Single-bottom overload: stage one bottom and one top blob in the internal collections.
m_colInternalBottom.Clear();
m_colInternalBottom.Add(bottom);

m_colInternalTop.Clear();
m_colInternalTop.Add(top);

// List-bottom overload: stage several bottom blobs and one top blob.
m_colInternalBottom.Clear();

for (int i = 0; i < rgBottom.Count; i++)
    m_colInternalBottom.Add(rgBottom[i]);

m_colInternalTop.Clear();
m_colInternalTop.Add(top);
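Taken together, the fragments above suggest the shape of the internal wiring helper. The following is a hedged reconstruction, not the verbatim source; the addInternal signatures and the BlobCollection field declarations are assumptions inferred from how the helper is called in the setup, reshape, forward and backward excerpts below.

// Sketch (assumed signatures): stage a single bottom blob, or a list of bottom
// blobs, plus one top blob into the shared internal collections.
private void addInternal(Blob<T> bottom, Blob<T> top)
{
    m_colInternalBottom.Clear();
    m_colInternalBottom.Add(bottom);

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}

private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
{
    m_colInternalBottom.Clear();

    for (int i = 0; i < rgBottom.Count; i++)
        m_colInternalBottom.Add(rgBottom[i]);

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}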
m_blobLn1.ReshapeLike(colBottom[0]);

if (m_blobAttn2 != null)
    // ...
if (m_blobLn3 != null)
    // ...

// First layer-norm and self-attention setup.
addInternal(colBottom[0], m_blobLn1);
m_ln1.LayerSetUp(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobLn1, m_blobAttn1);
m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);

// Second layer-norm and the encoder-decoder (cross) attention setup.
addInternal(colTop[0], m_blobLn2);
m_ln2.LayerSetUp(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn2, colBottom[2], colBottom[2], colBottom[3] }, m_blobAttn2);
m_attn2.LayerSetUp(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobAttn2, m_blobLn3);
m_ln3.LayerSetUp(m_colInternalBottom, m_colInternalTop);

// MLP setup: inner product -> activation -> projection, with optional dropout.
addInternal(blobLn, m_blobMlp);
m_fc.LayerSetUp(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobLn2, m_blobMlp);
m_fc.Reshape(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobMlp, m_blobMlp);
m_act.LayerSetUp(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobMlp, m_blobMlpOut);
m_proj.LayerSetUp(m_colInternalBottom, m_colInternalTop);

if (m_dropout != null)
{
    addInternal(m_blobMlpOut, m_blobMlpOut);
    m_dropout.LayerSetUp(m_colInternalBottom, m_colInternalTop);
}
// Reshape mirrors the setup wiring, sizing each child layer's top blobs.
m_blobLn1.ReshapeLike(colBottom[0]);

if (m_blobAttn2 != null)
    // ...
if (m_blobLn3 != null)
    // ...

addInternal(colBottom[0], m_blobLn1);
m_ln1.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobLn1, m_blobAttn1);
m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(colTop[0], m_blobLn2);
m_ln2.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn2, colBottom[2], colBottom[2], colBottom[3] }, m_blobAttn2);
m_attn2.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobAttn2, m_blobLn3);
m_ln3.Reshape(m_colInternalBottom, m_colInternalTop);

addInternal(blobLn, m_blobMlp);
m_fc.Reshape(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobMlp, m_blobMlp);
m_act.Reshape(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobMlp, m_blobMlpOut);
m_proj.Reshape(m_colInternalBottom, m_colInternalTop);

if (m_dropout != null)
{
    addInternal(m_blobMlpOut, m_blobMlpOut);
    m_dropout.Reshape(m_colInternalBottom, m_colInternalTop);
}
int nCount = colBottom[0].count();

// Optional inputs: the self-attention mask plus the encoder output and mask for cross attention.
Blob<T> blobXMask = (colBottom.Count > 1) ? colBottom[1] : null;
Blob<T> blobEncOut = (colBottom.Count > 3) ? colBottom[2] : null;
Blob<T> blobEncMask = (colBottom.Count > 3) ? colBottom[3] : null;

// x -> layer-norm -> self-attention.
addInternal(blobX, m_blobLn1);
m_ln1.Forward(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobLn1, m_blobAttn1);
m_attn1.Forward(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
m_attn1.Forward(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
m_attn1.Forward(m_colInternalBottom, m_colInternalTop);

// x -> layer-norm -> encoder-decoder (cross) attention over the encoder output.
addInternal(m_blobX, m_blobLn2);
m_ln2.Forward(m_colInternalBottom, m_colInternalTop);

addInternal(new List<Blob<T>>() { m_blobLn2, blobEncOut, blobEncOut, blobEncMask }, m_blobAttn2);
m_attn2.Forward(m_colInternalBottom, m_colInternalTop);

addInternal(m_blobX, m_blobLn3);
m_ln3.Forward(m_colInternalBottom, m_colInternalTop);

// MLP: inner product -> activation -> projection -> optional dropout.
addInternal(blobLn, m_blobMlp);
m_fc.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobMlp, m_blobMlp);
m_act.Forward(m_colInternalBottom, m_colInternalTop);
addInternal(m_blobMlp, m_blobMlpOut);
m_proj.Forward(m_colInternalBottom, m_colInternalTop);

if (m_dropout != null)
{
    addInternal(m_blobMlpOut, m_blobMlpOut);
    m_dropout.Forward(m_colInternalBottom, m_colInternalTop);
}
int nCount = colBottom[0].count();

Blob<T> blobXMask = (colBottom.Count > 1) ? colBottom[1] : null;
Blob<T> blobEncOut = (colBottom.Count > 3) ? colBottom[2] : null;
Blob<T> blobEncMask = (colBottom.Count > 3) ? colBottom[3] : null;

if (rgbPropagateDown[0])
{
    List<bool> rgbPropagate = new List<bool>() { true, true };

    // Back-propagate through the MLP: dropout -> projection -> activation -> inner product.
    if (m_dropout != null)
    {
        addInternal(m_blobMlpOut, m_blobMlpOut);
        m_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    }

    addInternal(m_blobMlp, m_blobMlpOut);
    m_proj.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
    addInternal(m_blobMlp, m_blobMlp);
    m_act.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(blobLn, m_blobMlp);
    m_fc.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    // Copy from m_blobX (including diff), then back-propagate the third layer-norm.
    m_blobAttn2.CopyFrom(m_blobX, true);
    addInternal(m_blobX, m_blobLn3);
    m_ln3.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    m_blobAttn2.CopyFrom(m_blobX, true);

    // Back-propagate through the encoder-decoder (cross) attention.
    addInternal(new List<Blob<T>>() { m_blobLn2, blobEncOut, blobEncOut, blobEncMask }, m_blobAttn2);
    m_attn2.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    m_blobAttn1.CopyFrom(m_blobX, true);
    addInternal(m_blobX, m_blobLn2);
    m_ln2.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    m_blobAttn1.CopyFrom(m_blobX, true);

    // Back-propagate through the self-attention and the first layer-norm.
    addInternal(m_blobLn1, m_blobAttn1);
    m_attn1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
    m_attn1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

    addInternal(blobX, m_blobLn1);
    m_ln1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
}
The CancelEvent provides an extension to the manual cancel event that allows for overriding the manua...
The Log class provides general output in text form.
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
void ReshapeLike(BlobCollection< T > src)
Reshapes all blobs in the collection to the sizes of the source.
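A minimal sketch of how these BlobCollection members combine, assuming blobInput and blobOutput are existing Blob<T> instances; the collection names are illustrative, not taken from the listing above.

// Stage two blobs into collections, mirror the shapes of one collection onto
// the other, then empty the first collection again.
BlobCollection<T> colBtm = new BlobCollection<T>();
BlobCollection<T> colTop = new BlobCollection<T>();
colBtm.Add(blobInput);          // Add: append a Blob to the collection
colTop.Add(blobOutput);
colTop.ReshapeLike(colBtm);     // ReshapeLike: size every blob like the source collection
int nItems = colBtm.Count;      // Count: number of blobs currently held
colBtm.Clear();                 // Clear: remove all items (optionally disposing them)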
The Blob is the main holder of data that moves through the Layers of the Net.
long mutable_gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
void CopyFrom(Blob< T > src, int nSrcOffset, int nDstOffset, int nCount, bool bCopyData, bool bCopyDiff)
Copy from a source Blob.
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
string Name
Get/set the name of the Blob.
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
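The backward excerpt above uses a shorter CopyFrom overload (CopyFrom(m_blobX, true)); the members listed here support the same pattern explicitly. A hedged sketch, assuming blobSrc and blobDst are existing, already-shaped Blob<T> instances:

// Give blobDst the same shape as blobSrc, copy both data and diff, and grab
// the GPU handles for use with the CudaDnn connection.
blobDst.ReshapeLike(blobSrc);
blobDst.CopyFrom(blobSrc, 0, 0, blobSrc.count(), true, true);
blobDst.Name = blobSrc.Name + "_copy";
long hData = blobDst.gpu_data;            // read-only data handle
long hDiff = blobDst.mutable_gpu_diff;    // writable diff handle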
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
An interface for the units of computation which can be composed into a Net.
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
abstract void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Performs Layer specific setup. Derived layers should override this function as well as the Reshape fu...
bool shareLayerBlob(Blob< T > b, List< int > rgMinShape)
Attempts to share a Layer Blob if another parameter Blob with the same name and acceptable size is fo...
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
virtual bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accommodate the shapes of the bottom blobs.
BlobCollection< T > m_colInternalBlobs
Specifies internal blobs used by the layer.
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
static Layer< T > Create(CudaDnn< T > cuda, Log log, LayerParameter p, CancelEvent evtCancel, IXDatabaseBase db=null, TransferInput trxinput=null)
Create a new Layer based on the LayerParameter.
LayerParameter.LayerType m_type
Specifies the Layer type.
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
BlobCollection< T > internal_blobs
Returns the collection of internal Blobs used by the Layer.
LayerParameter convertLayerParam(LayerParameter pChild, LayerParameter pParent)
Called to convert a parent LayerParameterEx, used in blob sharing, with a child layer parameter.
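The transformer block drives every internal layer through the same lifecycle documented here. A hedged sketch of that lifecycle for one child layer; Layer<T>.Create follows the signature above, while m_log, m_evtCancel and the parameter object p are assumed names used only for this illustration:

// Create a child layer from its LayerParameter, then set up, reshape and run it
// against the shared internal bottom/top collections, as the excerpts above do.
Layer<T> attn = Layer<T>.Create(m_cuda, m_log, convertLayerParam(p, m_param), m_evtCancel);
addInternal(m_blobLn1, m_blobAttn1);
attn.LayerSetUp(m_colInternalBottom, m_colInternalTop);   // one-time setup
attn.Reshape(m_colInternalBottom, m_colInternalTop);      // size the top blobs
double dfLoss = attn.Forward(m_colInternalBottom, m_colInternalTop);
attn.Backward(m_colInternalTop, new List<bool>() { true }, m_colInternalBottom);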
double dropout_ratio
Specifies the dropout ratio. (e.g. the probability that values will be dropped out and set to zero....
Specifies the filler parameters used to create each Filler.
FillerParameter weight_filler
The filler for the weights.
int axis
Specifies the first axis to be lumped into a single inner product computation; all preceding axes are...
FillerParameter bias_filler
The filler for the bias.
uint num_output
The number of outputs for the layer.
bool bias_term
Whether to have bias terms or not.
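These fields configure the MLP portion of the block. A hedged sketch of filling an inner-product parameter and a dropout parameter; the numeric values, filler choices and the nEmbed variable are illustrative, not taken from the source:

// Fully-connected expansion of the MLP (illustrative 4x widening) plus dropout.
LayerParameter fc = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT);
fc.inner_product_param.num_output = (uint)(4 * nEmbed);
fc.inner_product_param.axis = 2;
fc.inner_product_param.bias_term = true;
fc.inner_product_param.weight_filler = new FillerParameter("gaussian", 0, 0, 0.02);
fc.inner_product_param.bias_filler = new FillerParameter("constant", 0);

LayerParameter drop = new LayerParameter(LayerParameter.LayerType.DROPOUT);
drop.dropout_param.dropout_ratio = 0.1;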
Specifies the base parameter for all layers.
List< ParamSpec > parameters
Specifies the ParamSpec parameters of the LayerParameter.
string name
Specifies the name of this LayerParameter.
MultiheadAttentionParameter multihead_attention_param
Returns the parameter set when initialized with LayerType.MULTIHEAD_ATTENTION
LayerNormParameter layer_norm_param
Returns the parameter set when initialized with LayerType.LAYERNORM
CausalSelfAttentionParameter causal_self_attention_param
Returns the parameter set when initialized with LayerType.CAUSAL_SELF_ATTENTION
InnerProductParameter inner_product_param
Returns the parameter set when initialized with LayerType.INNERPRODUCT
TransformerBlockParameter transformer_block_param
Returns the parameter set when initialized with LayerType.TRANSFORMER_BLOCK
LayerType
Specifies the layer type.
GeluParameter gelu_param
Returns the parameter set when initialized with LayerType.GELU
DropoutParameter dropout_param
Returns the parameter set when initialized with LayerType.DROPOUT
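These are the parameter accessors the block draws on when creating its internal layers. A hedged sketch of building two of them; the naming scheme and values are illustrative, and the field attributions follow the descriptions below:

// Layer normalization and GELU activation parameters for two internal layers.
LayerParameter ln = new LayerParameter(LayerParameter.LayerType.LAYERNORM);
ln.name = m_param.name + ".ln1";
ln.layer_norm_param.enable_cuda_impl = false;   // use the managed LayerNorm path

LayerParameter act = new LayerParameter(LayerParameter.LayerType.GELU);
act.name = m_param.name + ".act";
act.gelu_param.enable_bert_version = true;      // illustrative: BERT-style GELU used in GPT models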
Specifies training parameters (multipliers on global learning constants, and the name of other settin...
uint embed
Specifies the size of the embedding.
uint heads
The number of heads used.
uint block_size
Specifies the size of the block.
double resid_dropout
Specifies dropout probability used on the residual weights.
uint layers
The number of layers (transformer blocks) used.
double attn_dropout
Specifies dropout probability used on the attention weights.
bool enable_bert_version
Specifies to use the special BERT version used in GPT models.
bool enable_cuda_impl
Specifies to use the low-level full cuda implementation of LayerNorm (default = false).
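A hedged sketch of configuring a TRANSFORMER_BLOCK layer with the fields listed above; the numeric values are illustrative:

// Decoder-style transformer block: 8 heads, 512-dim embedding, 200-token block.
LayerParameter p = new LayerParameter(LayerParameter.LayerType.TRANSFORMER_BLOCK);
p.transformer_block_param.heads = 8;
p.transformer_block_param.embed = 512;
p.transformer_block_param.block_size = 200;
p.transformer_block_param.layers = 6;
p.transformer_block_param.attn_dropout = 0.1;
p.transformer_block_param.resid_dropout = 0.1;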
Specifies the parameters for the MultiheadAttentionLayer.
WEIGHT_INIT
Defines the weight initialization strategy.
double attn_dropout
Specifies dropout probability used on the attention weights.
uint block_size
Specifies the size of the block.
uint heads
The number of heads used.
uint layers
The number of layers (transformer blocks) used.
double resid_dropout
Specifies dropout probability used on the residual weights.
uint embed
Specifies the size of the embedding.
WEIGHT_INIT weight_init
Specifies the weight initialization strategy (default = ENCODER_DECODER).
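Likewise, a hedged sketch of configuring a MULTIHEAD_ATTENTION layer from the fields listed for the MultiheadAttentionLayer; the values are illustrative and the WEIGHT_INIT enum path is an assumption:

// Multi-head attention as used for the encoder-decoder attention path.
LayerParameter mh = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION);
mh.multihead_attention_param.heads = 8;
mh.multihead_attention_param.embed = 512;
mh.multihead_attention_param.block_size = 200;
mh.multihead_attention_param.layers = 6;
mh.multihead_attention_param.attn_dropout = 0.1;
mh.multihead_attention_param.resid_dropout = 0.1;
mh.multihead_attention_param.weight_init = MultiheadAttentionParameter.WEIGHT_INIT.ENCODER_DECODER;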
The MyCaffe.basecode namespace contains all generic types used throughout MyCaffe.
The MyCaffe.common namespace contains common MyCaffe classes.
WEIGHT_TARGET
Defines the type of weight to target in re-initializations.
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...