using System.Collections.Generic;

// Member fields used by the cuDNN engine path of the DeconvolutionLayer.
const int CUDNN_STREAMS_PER_GROUP = 3;

long[] m_rghCudnn = null;
long[] m_rghStream = null;

List<long> m_rghBottomDesc = new List<long>();
List<long> m_rghTopDesc = new List<long>();
long m_hFilterDesc = 0;
List<long> m_rghConvDesc = new List<long>();
int m_nBottomOffset = 0;
int m_nBiasOffset = 0;

ulong[] m_rglWorkspaceFwdSizes = null;
ulong[] m_rglWorkspaceBwdFilterSizes = null;
ulong[] m_rglWorkspaceBwdDataSizes = null;
ulong[] m_rglWorkspaceFwdOffsets = null;
ulong[] m_rglWorkspaceBwdFilterOffsets = null;
ulong[] m_rglWorkspaceBwdDataOffsets = null;
bool m_bUseTensorCores = false;
// dispose() - release the cuDNN descriptors, streams and handles.
for (int i = 0; i < m_rghBottomDesc.Count; i++)
{
    m_cuda.FreeTensorDesc(m_rghBottomDesc[i]);
    m_cuda.FreeTensorDesc(m_rghTopDesc[i]);
    m_cuda.FreeConvolutionDesc(m_rghConvDesc[i]);
}

m_rghBottomDesc.Clear();
m_rghTopDesc.Clear();
m_rghConvDesc.Clear();

if (m_hBiasDesc != 0)
    m_cuda.FreeTensorDesc(m_hBiasDesc);

if (m_hFilterDesc != 0)
    m_cuda.FreeFilterDesc(m_hFilterDesc);

for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
{
    if (m_rghStream != null && m_rghStream[g] != 0)
        m_cuda.FreeStream(m_rghStream[g]);

    if (m_rghCudnn != null && m_rghCudnn[g] != 0)
        m_cuda.FreeCuDNN(m_rghCudnn[g]);
}
// LayerSetUp() - set up the layer for both Engine.CAFFE and Engine.CUDNN modes.
base.LayerSetUp(colBottom, colTop);

for (int i = 0; i < colBottom.Count; i++)
{
    if (colBottom[i].HalfSize)
        m_log.FAIL("Half sizes are only supported with the CUDNN engine!");
}

m_rghStream = new long[m_nGroup * CUDNN_STREAMS_PER_GROUP];
m_rghCudnn = new long[m_nGroup * CUDNN_STREAMS_PER_GROUP];

m_rglWorkspaceFwdSizes = new ulong[colBottom.Count];
m_rglWorkspaceBwdFilterSizes = new ulong[colBottom.Count];
m_rglWorkspaceBwdDataSizes = new ulong[colBottom.Count];
m_rglWorkspaceFwdOffsets = new ulong[m_nGroup * CUDNN_STREAMS_PER_GROUP];
m_rglWorkspaceBwdFilterOffsets = new ulong[m_nGroup * CUDNN_STREAMS_PER_GROUP];
m_rglWorkspaceBwdDataOffsets = new ulong[m_nGroup * CUDNN_STREAMS_PER_GROUP];

for (int i = 0; i < colBottom.Count; i++)
{
    m_rglWorkspaceFwdSizes[i] = 0;
    m_rglWorkspaceBwdFilterSizes[i] = 0;
    m_rglWorkspaceBwdDataSizes[i] = 0;
}

for (int g = 0; g < m_nGroup * CUDNN_STREAMS_PER_GROUP; g++)
{
    m_rghStream[g] = m_cuda.CreateStream();
    m_rghCudnn[g] = m_cuda.CreateCuDNN(m_rghStream[g]);
    m_rglWorkspaceFwdOffsets[g] = 0;
    m_rglWorkspaceBwdFilterOffsets[g] = 0;
    m_rglWorkspaceBwdDataOffsets[g] = 0;
}

if (typeof(T) == typeof(double))
{
    m_log.WriteLine("WARNING: Tensor cores are only supported with the 'float' base type. Tensor core use will be disabled for the 'double' base type.");
    m_bUseTensorCores = false;
}

m_hFilterDesc = m_cuda.CreateFilterDesc();

for (int i = 0; i < colBottom.Count; i++)
{
    m_rghBottomDesc.Add(m_cuda.CreateTensorDesc());
    m_rghTopDesc.Add(m_cuda.CreateTensorDesc());
    m_rghConvDesc.Add(m_cuda.CreateConvolutionDesc());
}

m_hBiasDesc = m_cuda.CreateTensorDesc();
// Reshape() - reshape the bottom (input) and top (output) blobs, then select
// the cuDNN algorithms and the workspace sizes they require.
base.Reshape(colBottom, colTop);

for (int i = 0; i < colBottom.Count; i++)
{
    if (colBottom[i].HalfSize)
        m_log.FAIL("Half sizes are only supported with the CUDNN engine!");
}

m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Deconvolution input must have 2 spatial axes (e.g., height and width). Use 'engine: CAFFE' for general ND deconvolution.");

for (int i = 0; i < colBottom.Count; i++)
{
    m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width, szDilation.Height, szDilation.Width, m_bUseTensorCores, m_bUseHalfSize);

    ulong lWsSizeFwd = 0;
    ulong lWsSizeBwdFilter = 0;
    ulong lWsSizeBwdData = 0;

    m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghTopDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghBottomDesc[i], lWorkspaceLimitBytes, m_bUseTensorCores, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData, algoFwdPreferred);
    m_rgfwdAlgo[i] = algoFwd;
    m_rglWorkspaceFwdSizes[i] = lWsSizeFwd;
    m_rgbwdFilterAlgo[i] = algoBwdFilter;
    m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
    m_rgbwdDataAlgo[i] = algoBwdData;
    m_rglWorkspaceBwdDataSizes[i] = lWsSizeBwdData;
}

// Find the largest workspace needed over all bottom/top pairs, then size a single
// shared workspace large enough for every group * stream handle.
ulong lTotalWsFwd = 0;
ulong lTotalWsBwdFilter = 0;
ulong lTotalWsBwdData = 0;

for (int i = 0; i < colBottom.Count; i++)
{
    lTotalWsFwd = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
    lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
    lTotalWsBwdData = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
}

ulong lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));
ulong lTotalMaxWorkspace = (ulong)lMaxWorkspace * (ulong)m_nGroup * CUDNN_STREAMS_PER_GROUP;

for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
{
    m_rglWorkspaceFwdOffsets[g] = (ulong)g * lTotalWsFwd;
    m_rglWorkspaceBwdFilterOffsets[g] = (ulong)g * lTotalWsBwdFilter;
    m_rglWorkspaceBwdDataOffsets[g] = (ulong)g * lTotalWsBwdData;
}
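Each of the m_nGroup * CUDNN_STREAMS_PER_GROUP cuDNN handles works inside its own slice of one shared workspace, so the slices must not overlap. The following stand-alone sketch walks through the same arithmetic with assumed example sizes; the variable names mirror the Reshape() listing above, but the numbers are purely illustrative:

using System;

static class WorkspaceSizingExample
{
    static void Main()
    {
        int nGroup = 2;
        const int CUDNN_STREAMS_PER_GROUP = 3;
        int nHandles = nGroup * CUDNN_STREAMS_PER_GROUP;            // 6 concurrent cuDNN handles

        // Assumed per-direction maxima (in bytes) taken over all bottom/top pairs.
        ulong lTotalWsFwd = 6_000_000;
        ulong lTotalWsBwdFilter = 4_000_000;
        ulong lTotalWsBwdData = 8_000_000;

        // One allocation sized by the largest direction covers every handle,
        // since only one direction runs at any given time.
        ulong lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));
        ulong lTotalMaxWorkspace = lMaxWorkspace * (ulong)nHandles;  // 8,000,000 * 6 = 48,000,000 bytes

        // Handle g uses its own non-overlapping slice; for the forward pass the offsets
        // are 0, 6 MB, 12 MB, ..., 30 MB, and the last slice ends well inside the total.
        for (int g = 0; g < nHandles; g++)
            Console.WriteLine("handle {0}: fwd offset = {1:N0} of {2:N0} bytes",
                g, (ulong)g * lTotalWsFwd, lTotalMaxWorkspace);
    }
}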
// compute_output_shape() - the deconvolution output size for each spatial axis.
T[] rgPadData = m_blobPad.update_cpu_data();

int nStride = val_at(rgStrideData, i);
int nKernel = val_at(rgKernelShape, i);
int nPad = val_at(rgPadData, i);
int nDilation = val_at(rgDilationData, i);

int nKernelExtent = nDilation * (nKernel - 1) + 1;
int nOutputDim = nStride * (nInputDim - 1) + nKernelExtent - 2 * nPad;
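This is the convolution output-size formula run in reverse, so a deconvolution undoes the spatial downsampling of a matching convolution. A worked example with assumed values (kernel 4, stride 2, pad 1, dilation 1 - a common upsample-by-2 configuration, not taken from the listing above):

int nInputDim = 16;   // input height (or width)
int nKernel = 4;
int nStride = 2;
int nPad = 1;
int nDilation = 1;

int nKernelExtent = nDilation * (nKernel - 1) + 1;                      // 1 * 3 + 1 = 4
int nOutputDim = nStride * (nInputDim - 1) + nKernelExtent - 2 * nPad;  // 2 * 15 + 4 - 2 = 32

// A convolution with the same kernel/stride/pad maps 32 -> 16, so this
// deconvolution maps 16 -> 32, i.e. upsampling by a factor of 2.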
// forward_cuda() - Forward computation with Engine.CAFFE.
for (int i = 0; i < colBottom.Count; i++)
{
    if (colBottom[i].HalfSize)
        m_log.FAIL("The CAFFE engine does not support half sizes!");

    long hBottomData = colBottom[i].gpu_data;
    long hTopData = colTop[i].mutable_gpu_data;

    for (int n = 0; n < m_nNum; n++)
    {
        // Per-item gemm and optional bias steps are elided from this listing.
    }
}
// backward_cuda() - Backward computation with Engine.CAFFE.
long hWeightDiff = m_colBlobs[0].mutable_gpu_diff;

for (int i = 0; i < colTop.Count; i++)
{
    if (colTop[i].HalfSize || colBottom[i].HalfSize)
        m_log.FAIL("The CAFFE engine does not support half sizes!");

    long hTopDiff = colTop[i].gpu_diff;
    long hBottomData = colBottom[i].gpu_data;
    long hBottomDiff = colBottom[i].mutable_gpu_diff;

    // Bias gradient.
    long hBiasDiff = m_colBlobs[1].mutable_gpu_diff;

    for (int n = 0; n < m_nNum; n++)
    {
        // Per-item bias-gradient step elided from this listing.
    }

    // Weight gradient and, if requested, the data gradient.
    for (int n = 0; n < m_nNum; n++)
    {
        // Per-item weight-gradient step elided from this listing.

        if (rgbPropagateDown[i])
        {
            // Per-item data-gradient step elided from this listing.
        }
    }
}
// forward_cudnn() - Forward computation with Engine.CUDNN. Note that the
// deconvolution forward pass is computed with the cuDNN backward-data routine.
for (int i = 0; i < colBottom.Count; i++)
{
    long hBottomData = colBottom[i].gpu_data;
    long hTopData = colTop[i].mutable_gpu_data;

    // (the per-group loop over g and the remaining arguments are elided from this listing)
    m_cuda.ConvolutionBackwardData(m_rghCudnn[g],
        /* ... */
        hBottomData, m_nBottomOffset * g,
        /* ... */
        wsArgs.WorkspaceData, (int)m_rglWorkspaceBwdDataOffsets[g], m_rglWorkspaceBwdDataSizes[i],
        /* ... */
        hTopData, m_nTopOffset * g);
    m_cuda.SynchronizeStream(m_rghStream[g]);

    // Optionally add the bias (the hBiasData declaration is elided from this listing).
    m_cuda.AddTensor(m_rghCudnn[g],
        /* ... */
        hBiasData, m_nBiasOffset * g,
        /* ... */
        hTopData, m_nTopOffset * g);
    m_cuda.SynchronizeStream(m_rghStream[g]);
}

m_cuda.SynchronizeThread();
// backward_cudnn() - Backward computation with Engine.CUDNN.
long hWeightDiff = 0;

for (int i = 0; i < colTop.Count; i++)
{
    long hTopDiff = colTop[i].gpu_diff;

    // Bias gradient (call elided from this listing); key arguments:
    //     hTopDiff, m_nTopOffset * g,
    //     hBiasDiff, m_nBiasOffset * g

    long hBottomData = colBottom[i].gpu_data;

    // Filter gradient (call elided); key arguments:
    //     hTopDiff, m_nTopOffset * g,
    //     hBottomData, m_nBottomOffset * g,
    //     m_rgbwdFilterAlgo[i],
    //     m_rglWorkspaceBwdFilterSizes[i]

    if (rgbPropagateDown[i])
    {
        long hBottomDiff = colBottom[i].mutable_gpu_diff;

        // Data gradient (call elided); note that it uses the forward workspace
        // (m_rglWorkspaceFwdSizes[i]), since deconvolution swaps the roles of the
        // forward and backward-data passes; key arguments:
        //     hTopDiff, m_nTopOffset * g,
        //     m_rglWorkspaceFwdSizes[i],
        //     hBottomDiff, m_nBottomOffset * g
    }
}

m_cuda.SynchronizeThread();
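As the cuDNN path above shows, a deconvolution (transposed convolution) forward pass is the backward-data pass of an ordinary convolution, and its data gradient is an ordinary convolution forward. The stand-alone sketch below, which is not part of MyCaffe, computes a 1-D single-channel transposed convolution on the CPU to illustrate the operation the layer performs; all names and values are illustrative:

using System;

static class TransposedConv1DExample
{
    // 1-D transposed convolution for one channel. The output length follows the
    // formula used by compute_output_shape() above:
    //     outputLen = stride * (inputLen - 1) + kernelExtent - 2 * pad
    static float[] TransposedConv1D(float[] input, float[] kernel, int stride, int pad, int dilation)
    {
        int kernelExtent = dilation * (kernel.Length - 1) + 1;
        int outputLen = stride * (input.Length - 1) + kernelExtent - 2 * pad;
        float[] output = new float[outputLen];

        // Each input element scatters a scaled copy of the (dilated) kernel into the output,
        // the transpose of the gather performed by a forward convolution.
        for (int i = 0; i < input.Length; i++)
        {
            for (int k = 0; k < kernel.Length; k++)
            {
                int j = i * stride + k * dilation - pad;
                if (j >= 0 && j < outputLen)
                    output[j] += input[i] * kernel[k];
            }
        }

        return output;
    }

    static void Main()
    {
        // 4 inputs, kernel 4, stride 2, pad 1 -> 8 outputs (upsampling by 2).
        float[] x = { 1, 2, 3, 4 };
        float[] w = { 0.25f, 0.5f, 0.5f, 0.25f };
        float[] y = TransposedConv1D(x, w, stride: 2, pad: 1, dilation: 1);
        Console.WriteLine(string.Join(", ", y));
    }
}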
The Log class provides general output in text form.
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
void FAIL(string str)
Causes a failure which throws an exception with the descriptive text.
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
The BlobCollection contains a list of Blobs.
int Count
Returns the number of items in the collection.
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
The WorkspaceArgs are passed to both the Layer::OnSetWorkspace and Layer::OnGetWorkspace events.
long WorkspaceData
Get/set the handle to workspace data in GPU memory.
The BaseConvolutionLayer is an abstract base class that factors out BLAS code common to ConvolutionLa...
int m_nBottomDim
The bottom dimension.
int m_nNumOutput
The number of outputs.
int m_nTopDim
The top dimension.
List< int > m_rgOutputShape
The spatial dimensions of the output.
void backward_bias(long hBias, long hInput, int nInputOffset)
Helper function that abstracts away the column buffer and gemm arguments.
int m_nChannelAxis
The channel axis.
int m_nChannels
The number of channels in each item.
void forward_gemm(long hInput, int nInputOffset, long hWeights, long hOutput, int nOutputOffset, bool bSkipIm2Col=false)
Helper function that abstracts away the column buffer and gemm arguments.
override WorkspaceArgs getWorkspace()
Returns the WorkspaceArgs containing the workspace used by this Layer.
int m_nNumSpatialAxes
The number of spatial axes.
ulong getWorkspaceLimitInBytes(bool bUseTensorCores=false)
Returns the workspace limit in bytes based on the cudnn_workspace_limit setting.
void weight_gemm(long hInput, int nInputOffset, long hOutput, int nOutputOffset, long hWeights)
Helper function that abstracts away the column buffer and gemm arguments.
Blob< T > m_blobStride
The spatial dimensions of the stride.
Blob< T > m_blobDilation
The spatial dimensions of the dilation.
Blob< T > m_blobKernelShape
The spatial dimensions of the filter kernel.
int m_nWeightOffset
The weight offset used.
Blob< T > m_blobPad
The spatial dimensions of the padding.
int m_nNum
The number of items in the batch.
void forward_bias(long hOutput, int nOutputOffset, long hBias)
Helper function that abstracts away the column buffer and gemm arguments.
override bool setWorkspace(ulong lSizeInBytes)
If not already set, allocates the workspace needed in GPU memory.
void backward_gemm(long hOutput, int nOutputOffset, long hWeights, long hInput, int nInputOffset)
Helper function that abstracts away the column buffer and gemm arguments.
int input_shape(int i)
Returns the spatial dimensions of the input.
bool m_bBiasTerm
Whether or not to use bias.
The DeconvolutionLayer convolves the input with a bank of learned filters, and (optionally) adds biases... (see the usage sketch at the end of this section).
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Run the Backward computation.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Run the Forward computation.
DeconvolutionLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The DeconvolutionLayer constructor.
void backward_cuda(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Run the Backward computation using Engine.CAFFE.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer for use with both Engine.CAFFE and Engine.CUDNN modes.
void backward_cudnn(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Run the Backward computation using Engine.CUDNN.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
override bool reverse_dimensions()
Returns true, since we want deconvolution rather than convolution.
override void compute_output_shape()
Computes the output shape used by the BaseConvolutionLayer.
override void dispose()
Releases all GPU and host resources used by the Layer.
void forward_cudnn(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Run the Forward computation with Engine.CUDNN.
void forward_cuda(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Run the Forward computation with Engine.CAFFE.
Log m_log
Specifies the Log for output.
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
int val_at(T[] rg, int nIdx)
Returns the integer value at a given index in a generic array.
T m_tZero
Specifies a generic type equal to 0.0.
T m_tOne
Specifies a generic type equal to 1.0.
bool m_bUseHalfSize
Specifies that the half size of the top (if any) should be converted to the base size.
Size size_at(Blob< T > b)
Returns the Size of a given two element Blob, such as one that stores Blob size information.
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
bool m_bReshapeOnForwardNeeded
Specifies whether or not a reshape is needed on the forward pass.
LayerParameter.LayerType m_type
Specifies the Layer type.
BlobCollection< T > m_colBlobs
Specifies the learnable parameter Blobs of the Layer.
DictionaryMap< bool > m_rgbParamPropagateDown
Specifies whether or not to compute the learnable diff of each parameter Blob.
bool cudnn_enable_tensor_cores
Specifies to enable the CUDA tensor cores when performing the convolution which is faster but not sup...
bool useCudnn(int nNumSpatialAxes=2)
Queries whether or not to use NVIDIA's cuDnn.
Specifies the base parameter for all layers.
ConvolutionParameter convolution_param
Returns the parameter set when initialized with LayerType.CONVOLUTION
LayerType
Specifies the layer type.
The MyCaffe.basecode namespace contains all generic types used throughout MyCaffe.
The MyCaffe.common namespace contains common MyCaffe classes.
CONV_BWD_FILTER_ALGO
Specifies the cuDnn convolution backward filter algorithm to use.
CONV_FWD_ALGO
Specifies the cuDnn convolution forward algorithm to use.
CONV_BWD_DATA_ALGO
Specifies the cuDnn convolution backward data algorithm to use.
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers namespace contains all layers that have a solidified code base,...
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
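For context, a minimal usage sketch of the DeconvolutionLayer documented above. Only the DeconvolutionLayer(cuda, log, p) constructor, the convolution_param property, and the namespace names are taken from this reference; the CudaDnn, Log, Blob, BlobCollection and parameter-field usage follows the typical MyCaffe pattern and is an assumption, so exact constructors and field names may differ between versions:

using MyCaffe.basecode;
using MyCaffe.common;
using MyCaffe.layers;
using MyCaffe.param;

// Assumed setup: GPU device 0 and a named Log.
CudaDnn<float> cuda = new CudaDnn<float>(0);
Log log = new Log("deconv_example");

// Standard Caffe-style deconvolution settings (field names assumed):
// 1 output channel, kernel 4, stride 2, pad 1 -> spatial upsampling by 2.
LayerParameter p = new LayerParameter(LayerParameter.LayerType.DECONVOLUTION);
p.convolution_param.num_output = 1;
p.convolution_param.kernel_size.Add(4);
p.convolution_param.stride.Add(2);
p.convolution_param.pad.Add(1);

DeconvolutionLayer<float> layer = new DeconvolutionLayer<float>(cuda, log, p);

// One bottom blob of shape 1 x 1 x 16 x 16; the layer reshapes the top blob to
// 1 x 1 x 32 x 32 per the output-shape formula shown earlier.
BlobCollection<float> colBottom = new BlobCollection<float>();
BlobCollection<float> colTop = new BlobCollection<float>();
colBottom.Add(new Blob<float>(cuda, log, 1, 1, 16, 16));
colTop.Add(new Blob<float>(cuda, log));

layer.Setup(colBottom, colTop);     // runs LayerSetUp and Reshape
layer.Forward(colBottom, colTop);

layer.Dispose();                    // releases the GPU resources shown in dispose()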