2using System.Collections.Generic;
49 const int CUDNN_STREAMS_PER_GROUP = 3;
51 long[] m_rghCudnn =
null;
52 long[] m_rghStream =
null;
59 List<long> m_rghBottomDesc =
new List<long>();
60 List<long> m_rghTopDesc =
new List<long>();
62 long m_hFilterDesc = 0;
63 List<long> m_rghConvDesc =
new List<long>();
64 int m_nBottomOffset = 0;
66 int m_nBiasOffset = 0;
68 ulong[] m_rglWorkspaceFwdSizes =
null;
69 ulong[] m_rglWorkspaceBwdFilterSizes =
null;
70 ulong[] m_rglWorkspaceBwdDataSizes =
null;
71 ulong[] m_rglWorkspaceFwdOffsets =
null;
72 ulong[] m_rglWorkspaceBwdFilterOffsets =
null;
73 ulong[] m_rglWorkspaceBwdDataOffsets =
null;
74 bool m_bUseTensorCores =
false;
127 for (
int i = 0; i < m_rghBottomDesc.Count; i++)
129 m_cuda.FreeTensorDesc(m_rghBottomDesc[i]);
130 m_cuda.FreeTensorDesc(m_rghTopDesc[i]);
131 m_cuda.FreeConvolutionDesc(m_rghConvDesc[i]);
134 m_rghBottomDesc.Clear();
135 m_rghTopDesc.Clear();
136 m_rghConvDesc.Clear();
138 if (m_hBiasDesc != 0)
140 m_cuda.FreeTensorDesc(m_hBiasDesc);
144 if (m_hFilterDesc != 0)
146 m_cuda.FreeFilterDesc(m_hFilterDesc);
150 for (
int g = 0; g < (
m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
152 if (m_rghStream !=
null && m_rghStream[g] != 0)
153 m_cuda.FreeStream(m_rghStream[g]);
155 if (m_rghCudnn !=
null && m_rghCudnn[g] != 0)
156 m_cuda.FreeCuDNN(m_rghCudnn[g]);
172 base.LayerSetUp(colBottom, colTop);
178 m_rghStream =
new long[
m_nGroup * CUDNN_STREAMS_PER_GROUP];
179 m_rghCudnn =
new long[
m_nGroup * CUDNN_STREAMS_PER_GROUP];
187 m_rglWorkspaceFwdSizes =
new ulong[colBottom.
Count];
188 m_rglWorkspaceBwdFilterSizes =
new ulong[colBottom.
Count];
189 m_rglWorkspaceBwdDataSizes =
new ulong[colBottom.
Count];
190 m_rglWorkspaceFwdOffsets =
new ulong[
m_nGroup * CUDNN_STREAMS_PER_GROUP];
191 m_rglWorkspaceBwdFilterOffsets =
new ulong[
m_nGroup * CUDNN_STREAMS_PER_GROUP];
192 m_rglWorkspaceBwdDataOffsets =
new ulong[
m_nGroup * CUDNN_STREAMS_PER_GROUP];
194 for (
int i = 0; i < colBottom.
Count; i++)
202 m_rglWorkspaceFwdSizes[i] = 0;
203 m_rglWorkspaceBwdFilterSizes[i] = 0;
204 m_rglWorkspaceBwdDataSizes[i] = 0;
207 for (
int g = 0; g <
m_nGroup * CUDNN_STREAMS_PER_GROUP; g++)
209 m_rghStream[g] =
m_cuda.CreateStream(
false, g);
210 m_rghCudnn[g] =
m_cuda.CreateCuDNN(m_rghStream[g]);
211 m_rglWorkspaceFwdOffsets[g] = 0;
212 m_rglWorkspaceBwdFilterOffsets[g] = 0;
213 m_rglWorkspaceBwdDataOffsets[g] = 0;
217 if (typeof(T) == typeof(
double))
219 m_log.
WriteLine(
"WARNING: Tensor cores are only supported with the 'float' base type. Tensor core use will be disabled for the 'double' base type.");
220 m_bUseTensorCores =
false;
228 m_hFilterDesc =
m_cuda.CreateFilterDesc();
232 for (
int i = 0; i < colBottom.
Count; i++)
234 m_rghBottomDesc.Add(
m_cuda.CreateTensorDesc());
235 m_rghTopDesc.Add(
m_cuda.CreateTensorDesc());
236 m_rghConvDesc.Add(
m_cuda.CreateConvolutionDesc());
241 m_hBiasDesc =
m_cuda.CreateTensorDesc();
276 base.Reshape(colBottom, colTop);
285 m_log.
CHECK_EQ(2,
m_nNumSpatialAxes,
"cuDNN Convolution input must have 2 spatial axes (e.g., height and width). Use 'engine: CAFFE' for general ND convolution.");
301 for (
int i = 0; i < colBottom.
Count; i++)
305 m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width, szDilation.Height, szDilation.Width, m_bUseTensorCores,
m_bUseHalfSize);
311 ulong lWsSizeFwd = 0;
312 ulong lWsSizeBwdFilter = 0;
313 ulong lWsSizeBwdData = 0;
315 m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghBottomDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghTopDesc[i], lWorkspaceLimitBytes, m_bUseTensorCores, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData);
316 m_rgfwdAlgo[i] = algoFwd;
317 m_rglWorkspaceFwdSizes[i] = lWsSizeFwd;
318 m_rgbwdFilterAlgo[i] = algoBwdFilter;
319 m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
320 m_rgbwdDataAlgo[i] = algoBwdData;
321 m_rglWorkspaceBwdDataSizes[i] = lWsSizeBwdData;
325 ulong lTotalWsFwd = 0;
326 ulong lTotalWsBwdFilter = 0;
327 ulong lTotalWsBwdData = 0;
329 for (
int i = 0; i < colBottom.
Count; i++)
331 lTotalWsFwd = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
332 lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
333 lTotalWsBwdData = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
337 ulong lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));
340 ulong lTotalMaxWorkspace = (ulong)lMaxWorkspace * (ulong)
m_nGroup * (ulong)CUDNN_STREAMS_PER_GROUP;
341 lTotalMaxWorkspace *= (ulong)CUDNN_STREAMS_PER_GROUP;
350 for (
int g = 0; g < (
m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
352 m_rglWorkspaceFwdOffsets[g] = (ulong)g * lTotalWsFwd;
353 m_rglWorkspaceBwdFilterOffsets[g] = (ulong)g * lTotalWsBwdFilter;
354 m_rglWorkspaceBwdDataOffsets[g] = (ulong)g * lTotalWsBwdData;
385 int nKernel =
val_at(rgKernelShapeData, i);
386 int nStride =
val_at(rgStrideData, i);
387 int nPad =
val_at(rgPadData, i);
388 int nDilation =
val_at(rgDilationData, i);
392 int nKernelExtent = nDilation * (nKernel - 1) + 1;
393 int nOutputDim = (nInputDim + 2 * nPad - nKernelExtent) / nStride + 1;
438 for (
int i = 0; i < colBottom.
Count; i++)
440 long hBottomData = colBottom[i].gpu_data;
441 long hTopData = colTop[i].mutable_gpu_data;
443 for (
int n = 0; n <
m_nNum; n++)
462 long hWeightDiff =
m_colBlobs[0].mutable_gpu_diff;
464 for (
int i = 0; i < colTop.
Count; i++)
466 long hTopDiff = colTop[i].gpu_diff;
471 long hBiasDiff =
m_colBlobs[1].mutable_gpu_diff;
473 for (
int n = 0; n <
m_nNum; n++)
481 long hBottomData = colBottom[i].gpu_data;
482 long hBottomDiff = colBottom[i].mutable_gpu_diff;
484 for (
int n = 0; n <
m_nNum; n++)
491 if (rgbPropagateDown[i])
508 for (
int i = 0; i < colBottom.
Count; i++)
510 long hBottomData = colBottom[i].gpu_data;
511 long hTopData = colTop[i].mutable_gpu_data;
517 m_cuda.ConvolutionForward(m_rghCudnn[g],
520 hBottomData, m_nBottomOffset * g,
525 wsArgs.
WorkspaceData, (
int)m_rglWorkspaceFwdOffsets[g], m_rglWorkspaceFwdSizes[i],
528 hTopData, m_nTopOffset * g,
535 m_cuda.SynchronizeStream(m_rghStream[g]);
545 m_cuda.AddTensor(m_rghCudnn[g],
548 hBiasData, m_nBiasOffset * g,
551 hTopData, m_nTopOffset * g);
557 m_cuda.SynchronizeStream(m_rghStream[g]);
576 long hBiasDiff =
m_colBlobs[1].mutable_gpu_diff;
578 for (
int i = 0; i < colTop.
Count; i++)
580 long hTopDiff = colTop[i].mutable_gpu_diff;
586 m_tOne, m_rghTopDesc[i], hTopDiff, m_nTopOffset * g,
587 m_tOne, m_hBiasDesc, hBiasDiff, m_nBiasOffset * g,
601 long hWeightDiff =
m_colBlobs[0].mutable_gpu_diff;
603 for (
int i = 0; i < colTop.
Count; i++)
605 long hTopDiff = colTop[i].mutable_gpu_diff;
606 long hBottomData = colBottom[i].gpu_data;
613 m_rghBottomDesc[i], hBottomData, m_nBottomOffset * g,
614 m_rghTopDesc[i], hTopDiff, m_nTopOffset * g,
616 m_rgbwdFilterAlgo[i],
618 m_rglWorkspaceBwdFilterSizes[i],
634 for (
int i=0; i<colTop.
Count; i++)
636 if (rgbPropagateDown[i])
638 long hTopDiff = colTop[i].mutable_gpu_diff;
639 long hBottomDiff = colBottom[i].mutable_gpu_diff;
647 m_rghTopDesc[i], hTopDiff, m_nTopOffset * g,
651 m_rglWorkspaceBwdDataSizes[i],
653 m_rghBottomDesc[i], hBottomDiff, m_nBottomOffset * g,
The Log class provides general output in text form.
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
The BlobCollection contains a list of Blobs.
int Count
Returns the number of items in the collection.
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
The WorkspaceArgs are passed to both the Layer::OnSetWorkspace and Layer::OnGetWorkspace events.
long WorkspaceData
Get/set the handle to workspace data in GPU memory.
The BaseConvolutionLayer is an abstract base class that factors out BLAS code common to ConvolutionLayer and DeconvolutionLayer.
int m_nBottomDim
The bottom dimension.
int m_nNumOutput
The number of outputs.
int m_nTopDim
The top dimension.
List< int > m_rgOutputShape
The spatial dimensions of the output.
void backward_bias(long hBias, long hInput, int nInputOffset)
Helper function that abstracts away the column buffer and gemm arguments.
int m_nOutSpatialDim
The output spatial dimension.
int m_nChannelAxis
The channel axis.
int m_nChannels
The number of channels in each item.
void forward_gemm(long hInput, int nInputOffset, long hWeights, long hOutput, int nOutputOffset, bool bSkipIm2Col=false)
Helper function that abstracts away the column buffer and gemm arguments.
override WorkspaceArgs getWorkspace()
Returns the WorkspaceArgs containing the workspace used by this Layer.
int m_nNumSpatialAxes
The number of spatial axes.
ulong getWorkspaceLimitInBytes(bool bUseTensorCores=false)
Returns the workspace limit in bytes based on the cudnn_workspace_limit setting.
void weight_gemm(long hInput, int nInputOffset, long hOutput, int nOutputOffset, long hWeights)
Helper function that abstracts away the column buffer and gemm arguments.
Blob< T > m_blobStride
The spatial dimensions of the stride.
Blob< T > m_blobDilation
The spatial dimensions of the dilation.
Blob< T > m_blobKernelShape
The spatial dimensions of the filter kernel.
int m_nWeightOffset
The weight offset used.
Blob< T > m_blobPad
The spatial dimensions of the padding.
int m_nNum
The number of items in the batch.
void forward_bias(long hOutput, int nOutputOffset, long hBias)
Helper function that abstracts away the column buffer and gemm arguments.
override bool setWorkspace(ulong lSizeInBytes)
If not already set, allocates the workspace needed in GPU memory.
void backward_gemm(long hOutput, int nOutputOffset, long hWeights, long hInput, int nInputOffset)
Helper function that abstracts away the column buffer and gemm arguments.
int input_shape(int i)
Returns the spatial dimensions of the input.
bool m_bBiasTerm
Whether or not to use bias.
The ConvolutionLayer convolves the input image with a bank of learned filters, and (optionally) adds biases.
override void dispose()
Releases all GPU and host resources used by the Layer.
void backward_cuda(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Run the Backward computation using the Engine.CAFFE mode as specified in the LayerParameter.
override bool reshapeNeeded(BlobCollection< T > colBottom, BlobCollection< T > colTop, bool bReset=true)
Tests the shapes of both the bottom and top blobs and if they are the same as the previous sizing,...
void backward_cudnn(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Run the Backward computation using the Engine CUDNN mode as specified in the LayerParameter.
void forward_cuda(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Run the Forward computation using the Engine.CAFFE mode as specified in the LayerParameter.
override bool reverse_dimensions()
Returns false, for we want convolution, not deconvolution.
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Run the Backward computation using either the Engine.CAFFE or Engine.CUDNN mode as specified in the LayerParameter.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer for use with both Engine.CAFFE and Engine.CUDNN modes.
ConvolutionLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The ConvolutionLayer constructor.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Run the Forward computation using either the Engine.CAFFE or Engine.CUDNN mode as specified in the LayerParameter.
void forward_cudnn(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Run the Forward computation using the Engine CUDNN mode as specified in the LayerParameter.
override void compute_output_shape()
Computes the output shape used by the BaseConvolutionLayer.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
Log m_log
Specifies the Log for output.
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
void setShapes(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Set the internal shape sizes - used when determining if a Reshape is necessary.
int val_at(T[] rg, int nIdx)
Returns the integer value at a given index in a generic array.
T m_tZero
Specifies a generic type equal to 0.0.
T m_tOne
Specifies a generic type equal to 1.0.
bool compareShapes(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Compare the shapes of the top and bottom and if the same, return true, otherwise false.
bool m_bUseHalfSize
Specifies that the half size of the top (if any) should be converted to the base size.
Size size_at(Blob< T > b)
Returns the Size of a given two element Blob, such as one that stores Blob size information.
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
bool m_bReshapeOnForwardNeeded
Specifies whether or not the reshape on forward is needed or not.
LayerParameter.LayerType m_type
Specifies the Layer type.
BlobCollection< T > m_colBlobs
Specifies the learnable parameter Blobs of the Layer.
DictionaryMap< bool > m_rgbParamPropagateDown
Specifies whether or not to compute the learnable diff of each parameter Blob.
bool cudnn_enable_tensor_cores
Specifies to enable the CUDA tensor cores when performing the convolution which is faster but not supported by all GPUs.
bool useCudnn(int nNumSpatialAxes=2)
Queries whether or not to use NVIDIA's cuDnn.
Specifies the base parameter for all layers.
ConvolutionParameter convolution_param
Returns the parameter set when initialized with LayerType.CONVOLUTION
LayerType
Specifies the layer type.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
The MyCaffe.common namespace contains common MyCaffe classes.
CONV_BWD_FILTER_ALGO
Specifies the cuDnn convolution backward filter algorithm to use.
CONV_FWD_ALGO
Specifies the cuDnn convolution forward algorithm to use.
CONV_BWD_DATA_ALGO
Specifies the cuDnn convolution backward data algorithm to use.
The MyCaffe.layers namespace contains all layers that have a solidified code base,...
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-source project.