2using System.Collections.Generic;
11using System.Runtime.Remoting.Channels;
13using System.Security.Cryptography.X509Certificates;
735#pragma warning disable 1591
/// <summary>
/// Device-level operations on the low-level Cuda kernel: device selection,
/// seeding, synchronization and device-capability queries.
/// </summary>
/// <remarks>
/// Handles passed between these methods are opaque <c>long</c> values owned by
/// the native CudaDnnDll; member-level XML docs are suppressed (warning 1591).
/// </remarks>
public interface ICudaDevice
{
    void SetDeviceID(int nDeviceID, DEVINIT flags = DEVINIT.NONE, long? lSeed = null);
    void SetRandomSeed(long lSeed);
    int GetDeviceCount();
    void SynchronizeDevice();
    string GetDeviceName(int nDeviceID);
    string GetDeviceP2PInfo(int nDeviceID);
    string GetRequiredCompute(out int nMinMajor, out int nMinMinor);
}
/// <summary>
/// GPU and pinned-host memory management on the low-level Cuda kernel.
/// </summary>
/// <remarks>
/// All allocations return opaque <c>long</c> handles into the native DLL's
/// handle table; every Alloc* has a matching Free* that must be called to
/// avoid leaking device memory.  Member-level XML docs are suppressed (1591).
/// </remarks>
public interface ICudaMemory
{
    // -- Device memory allocation / release --
    long AllocMemory(long lCount, bool bHalf = false);
    long AllocMemory(List<double> rg);
    long AllocMemory(List<float> rg);
    long AllocMemory(double[] rgSrc, long hStream = 0);
    long AllocMemory(float[] rgSrc, long hStream = 0);
    void FreeMemory(long hMem);

    // -- Device memory access (host <-> device transfer) --
    double[] GetMemoryDouble(long hMem, long lCount = -1);
    float[] GetMemoryFloat(long hMem, long lCount = -1);
    void SetMemory(long hMem, List<double> rg);
    void SetMemory(long hMem, List<float> rg);
    void SetMemory(long hMem, double[] rgSrc, long hStream = 0);
    void SetMemory(long hMem, float[] rgSrc, long hStream = 0);
    void SetMemoryAt(long hMem, double[] rgSrc, int nOffset);
    void SetMemoryAt(long hMem, float[] rgSrc, int nOffset);

    // -- Pinned host buffers --
    long AllocHostBuffer(long lCount);
    void FreeHostBuffer(long hMem);
    double[] GetHostMemoryDouble(long hMem);
    float[] GetHostMemoryFloat(long hMem);

    // -- Memory pointers (offset views into an existing allocation) --
    long CreateMemoryPointer(long hData, long lOffset, long lCount);
    void FreeMemoryPointer(long hMem);
}
/// <summary>
/// cuDNN-backed operations: streams, contexts, tensor/filter/convolution/
/// pooling/LRN/RNN descriptors and the RNN forward/backward passes.
/// </summary>
/// <remarks>
/// Each Create* returns an opaque <c>long</c> descriptor handle that must be
/// released with the matching Free*.  Member-level XML docs are suppressed
/// (warning 1591).
/// </remarks>
public interface ICudaCuDnn
{
    // -- Streams and cuDNN contexts --
    long CreateStream(bool bNonBlocking = false, int nIndex = -1);
    void FreeStream(long h);
    void SynchronizeStream(long h = 0);
    void SynchronizeThread();
    long CreateCuDNN(long hStream = 0);
    void FreeCuDNN(long h);

    // -- Tensor descriptors --
    long CreateTensorDesc();
    void FreeTensorDesc(long h);
    void SetTensorNdDesc(long hHandle, int[] rgDim, int[] rgStride, bool bHalf = false);
    void SetTensorDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false);
    void SetTensorDesc(long hHandle, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, bool bHalf = false);
    void AddTensor(long hHandle, long hSrcDesc, long hSrc, int nSrcOffset, long hDstDesc, long hDst, int nDstOffset);

    // -- Batch normalization --
    void DeriveBatchNormDesc(long hFwdScaleBiasMeanVarDesc, long hFwdBottomDesc, long hBwdScaleBiasMeanVarDesc, long hBwdBottomDesc, BATCHNORM_MODE mode);

    // -- Filter descriptors --
    long CreateFilterDesc();
    void FreeFilterDesc(long h);
    void SetFilterNdDesc(long hHandle, int[] rgDim, bool bHalf = false);
    void SetFilterDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false);

    // -- Convolution descriptors --
    long CreateConvolutionDesc();
    void FreeConvolutionDesc(long h);
    void SetConvolutionDesc(long hHandle, int hPad, int wPad, int hStride, int wStride, int hDilation, int wDilation, bool bUseTensorCores, bool bHalf = false);

    // -- Pooling descriptors --
    long CreatePoolingDesc();
    void FreePoolingDesc(long h);
    void SetPoolingDesc(long hHandle, PoolingMethod method, int h, int w, int hPad, int wPad, int hStride, int wStride);

    // -- LRN descriptors --
    long CreateLRNDesc();
    void FreeLRNDesc(long h);
    void SetLRNDesc(long hHandle, uint nSize, double fAlpha, double fBeta, double fK);

    // -- RNN data descriptors --
    long CreateRnnDataDesc();
    void FreeRnnDataDesc(long h);
    void SetRnnDataDesc(long hRnnDataDesc, RNN_DATALAYOUT layout, int nMaxSeqLen, int nBatchSize, int nVectorSize, bool bBidirectional = false, int[] rgSeqLen = null);

    // -- RNN descriptors and queries --
    long CreateRnnDesc();
    void FreeRnnDesc(long h);
    void SetRnnDesc(long hHandle, long hRnnDesc, int nHiddenSize, int nNumLayers, long hDropoutDesc, RNN_MODE mode, bool bUseTensorCores, RNN_DIRECTION direction = RNN_DIRECTION.RNN_UNIDIRECTIONAL);
    int GetRnnParamCount(long hHandle, long hRnnDesc, long hXDesc);
    ulong GetRnnWorkspaceCount(long hHandle, long hRnnDesc, long hXDesc, out ulong nReservedCount);
    void GetRnnLinLayerParams(long hHandle, long hRnnDesc, int nLayer, long hXDesc, long hWtDesc, long hWtData, int nLinLayer, out int nWtCount, out long hWt, out int nBiasCount, out long hBias);

    // -- RNN execution --
    void RnnForward(long hHandle, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hWtDesc, long hWtData, long hYDesc, long hYData, long hHyDesc, long hHyData, long hCyDesc, long hCyData, long hWorkspace, ulong nWsCount, long hReserved, ulong hResCount, bool bTraining);
    void RnnBackwardData(long hHandle, long hRnnDesc, long hYDesc, long hYData, long hYDiff, long hHyDesc, long hHyDiff, long hCyDesc, long hCyDiff, long hWtDesc, long hWtData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hXDesc, long hXDiff, long hdHxDesc, long hHxDiff, long hdCxDesc, long hCxDiff, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount);
    void RnnBackwardWeights(long hHandle, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hYDesc, long hYData, long hWorkspace, ulong nWsCount, long hWtDesc, long hWtDiff, long hReserved, ulong nResCount);
}
/// <summary>
/// Math primitives on GPU memory handles: element-wise ops, channel ops,
/// BLAS-style routines (gemm/gemv/axpy/...), reductions, masking and
/// im2col/col2im transforms.
/// </summary>
/// <remarks>
/// Handles (<c>hX</c>, <c>hY</c>, ...) refer to device allocations made via
/// <see cref="ICudaMemory"/>.  Member-level XML docs are suppressed (1591).
/// </remarks>
public interface ICudaMath
{
    // -- Set / get / copy --
    void set(int nCount, long hHandle, double fVal, int nIdx = -1);
    void set(int nCount, long hHandle, float fVal, int nIdx = -1);
    double[] get_double(int nCount, long hHandle, int nIdx = -1);
    float[] get_float(int nCount, long hHandle, int nIdx = -1);
    void copy(int nCount, long hSrc, long hDst, int nSrcOffset = 0, int nDstOffset = 0, long hAsyncStream = -1, bool? bSrcHalfOverride = null, bool? bDstHalfOverride = null);
    void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert = false);
    void copy_expand(int n, int nNum, int nDim, long hSrc, long hDs);
    void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst);
    void sort(int nCount, long hY);

    // -- Channel operations (N-outer x C-channel x inner layout) --
    void channel_compare(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
    void channel_fill(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, int nLabelDim, long hLabels, long hY);
    void channel_fillfrom(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, DIR dir);
    void channel_scale(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY);
    void channel_mulv(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hC);
    // NOTE(review): 'nChanalesY' spelling is part of the existing public
    // contract (named-argument compatibility) and is intentionally kept.
    void channel_sum(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bSumAcrossChannels = true, DIR dir = DIR.FWD, int nChanalesY = -1);
    void channel_mean(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
    void channel_copy(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir);
    void channel_copyall(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
    void channel_duplicate(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
    void channel_percentile(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, double dfPercentile);
    void channel_op_fwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, long hA, long hB, long hY);
    void channel_op_bwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, int nCy, int nSDy, long hA, long hB, long hY, long hAd, long hBd, long hYd, long hWork);

    // -- BLAS-style routines (double and float overloads) --
    void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC);
    void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC);
    void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY);
    void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY);
    void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC);
    void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC);
    void ger(int m, int n, double fAlpha, long hX, long hY, long hA);
    void ger(int m, int n, float fAlpha, long hX, long hY, long hA);
    void axpy(int n, double fAlpha, long hX, long hY);
    void axpy(int n, float fAlpha, long hX, long hY);
    void axpby(int n, double fAlpha, long hX, double fBeta, long hY);
    void axpby(int n, float fAlpha, long hX, float fBeta, long hY);
    void scal(int n, double fAlpha, long hX, int nXOff = 0);
    void scal(int n, float fAlpha, long hX, int nXOff = 0);
    double dot_double(int n, long hX, long hY);
    float dot_float(int n, long hX, long hY);
    double asum_double(int n, long hX, int nXOff = 0);
    float asum_float(int n, long hX, int nXOff = 0);
    void scale(int n, double fAlpha, long hX, long hY);
    void scale(int n, float fAlpha, long hX, long hY);

    // -- Element-wise arithmetic --
    void add_scalar(int n, double fAlpha, long hY);
    void add_scalar(int n, float fAlpha, long hY);
    void add(int n, long hA, long hB, long hY);
    void add(int n, long hA, long hB, long hY, double dfAlpha);
    void add(int n, long hA, long hB, long hY, float fAlpha);
    void sub(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0, int nB = 0);
    void mul(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0);
    void mul_scalar(int n, double fAlpha, long hY);
    void mul_scalar(int n, float fAlpha, long hY);
    void div(int n, long hA, long hB, long hY);
    void abs(int n, long hA, long hY);
    void exp(int n, long hA, long hY);
    void log(int n, long hA, long hY);
    void powx(int n, long hA, double fAlpha, long hY, int nAOff = 0, int nYOff = 0);
    void powx(int n, long hA, float fAlpha, long hY, int nAOff = 0, int nYOff = 0);
    void sign(int n, long hX, long hY, int nXOff = 0, int nYOff = 0);

    // -- Reductions --
    double min(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0);
    double max(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0);
    double sumsq(int n, long hW, long hA, int nAOff = 0);
    double sumsqdiff(int n, long hW, long hA, long hB, int nAOff = 0, int nBOff = 0);
    void sqrt(int n, long hA, long hY);
    void sqrt_scale(int n, long hA, long hY);

    // -- Masking --
    void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY);
    void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY);
    void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY);
    void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY);

    // -- im2col / col2im transforms --
    void im2col(long hDataIm, int nDataImOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataCol, int nDataColOffset);
    void im2col_nd(long hDataIm, int nDataImOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataCol, int nDataColOffset);
    void col2im(long hDataCol, int nDataColOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataIm, int nDataImOffset);
    void col2im_nd(long hDataCol, int nDataColOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataIm, int nDataImOffset);
}
/// <summary>
/// Random-number generation into GPU memory handles: uniform, gaussian and
/// bernoulli fills, in double and float overloads.
/// </summary>
/// <remarks>
/// Member-level XML docs are suppressed (warning 1591).
/// </remarks>
public interface ICudaRandom
{
    void rng_setseed(long lSeed);
    void rng_uniform(int n, double fMin, double fMax, long hY);
    void rng_uniform(int n, float fMin, float fMax, long hY);
    void rng_gaussian(int n, double fMu, double fSigma, long hY);
    void rng_gaussian(int n, float fMu, float fSigma, long hY);
    void rng_bernoulli(int n, double fNonZeroProb, long hY);
    void rng_bernoulli(int n, float fNonZeroProb, long hY);
}
// Aggregate surface of the low-level Cuda kernel: combines the device,
// memory, cuDNN, math and random sub-interfaces into the single interface
// implemented by CudaDnn<T> below.
 954 public interface ICudaDnn : ICudaDevice, ICudaMemory, ICudaCuDnn, ICudaMath, ICudaRandom
958#pragma warning restore 1591
968 public class CudaDnn<T> : ICudaDnn, IDisposable
973 string m_strPath =
"";
974 static int s_nIdxSeed = 0;
975 static string s_strCudaPath =
"";
976 CudaControlLib.ICudaKernel m_cuda;
983 long m_nGhostMemoryIndex = 1000;
984 Dictionary<long, T[]> m_rgGhostMemory =
null;
985 bool m_bGhostMemoryEnabled =
false;
986 bool m_bOwner =
true;
987 object m_memSync =
new object();
988 bool m_bEnableRnnExtendedVersion =
false;
989 static object m_createSync =
new object();
990 static object m_getconvSync =
new object();
991 static ulong m_lBaseSize = (ulong)((typeof(T) == typeof(
float)) ?
sizeof(float) :
sizeof(
double));
1006 DEVICE_P2P_INFO = 1001,
1013#pragma warning disable 1591
1024 KERNEL_MEMCOPY = -4,
1026 KERNEL_COPY_NCCL = -10,
1032 SYNCHRONIZEDEVICE = 5,
1034 CHECKMEMORYATTRIB = 7,
1035 GETDEVICEMEMORY = 8,
1036 GETREQUIREDCOMPUTE = 9,
1038 DEVICE_CANACCESSPEER = 10,
1039 DEVICE_ENABLEPEERACCESS = 11,
1040 DEVICE_DISABLEPEERACCESS = 12,
1042 COPY_DEVICE_TO_HOST = 14,
1043 COPY_HOST_TO_DEVICE = 15,
1045 CREATE_MEMORYPOINTER = 16,
1046 FREE_MEMORYPOINTER = 17,
1055 ALLOCHOSTBUFFER = 25,
1056 FREEHOSTBUFFER = 26,
1059 GETHOSTBUFFERCAPACITY = 29,
1063 SYNCRHONIZE_STREAM = 32,
1064 SYNCHRONIZE_THREAD = 33,
1066 CREATE_MEMTEST = 34,
1070 CREATE_IMAGEOP = 37,
1072 DISTORTIMAGE_IMAGEOP = 39,
1076 NCCL_INIT_SINGLEPROCESS = 42,
1077 NCCL_INIT_MULTIPROCESS = 43,
1078 NCCL_BROADCAST = 44,
1079 NCCL_ALLREDUCE = 45,
1086 CREATE_TENSORDESC = 50,
1087 FREE_TENSORDESC = 51,
1088 SET_TENSORDESC = 52,
1090 SET_TENSORNDDESC = 54,
1092 CREATE_FILTERDESC = 60,
1093 FREE_FILTERDESC = 61,
1094 SET_FILTERDESC = 62,
1095 SET_FILTERNDDESC = 63,
1097 CREATE_EXTENSION = 67,
1098 FREE_EXTENSION = 68,
1101 CREATE_CONVDESC = 70,
1107 BWD_CONV_FILTER = 76,
1110 CREATE_POOLDESC = 80,
1120 CREATE_LRNDESC = 90,
1124 GET_DROPOUT_INFO = 94,
1125 CREATE_DROPOUTDESC = 95,
1126 FREE_DROPOUTDESC = 96,
1127 SET_DROPOUTDESC = 97,
1152 CREATE_RNN_DATA_DESC = 130,
1153 FREE_RNN_DATA_DESC = 131,
1154 SET_RNN_DATA_DESC = 132,
1157 CREATE_RNN_DATA_DESCEX = 135,
1158 FREE_RNN_DATA_DESCEX = 136,
1159 SET_RNN_DATA_DESCEX = 137,
1162 CREATE_RNN_DESC = 140,
1163 FREE_RNN_DESC = 141,
1165 GET_RNN_PARAMCOUNT = 143,
1166 GET_RNN_WORKSPACECOUNT = 144,
1167 GET_RNN_LINLAYERPARAMS = 145,
1172 RNN8_IS_SUPPORTED = 150,
1176 RNN8_GET_MEMORY_SIZES = 154,
1177 RNN8_INIT_WEIGHTS = 155,
1184 CUDA_COPY_SIM = 203,
1185 CUDA_COPY_FILL = 204,
1187 CUDA_COPY_BATCH = 206,
1188 CUDA_COPY_SEQUENCE = 207,
1189 CUDA_COPY_EXPAND = 208,
1190 CUDA_COPY_SEQUENCE2 = 209,
1203 CUDA_ADD_SCALAR = 228,
1207 CUDA_MUL_SCALAR = 232,
1215 CUDA_RECIPROCOL = 240,
1217 CUDA_LOGISTIC1 = 242,
1218 CUDA_LOGISTIC2 = 243,
1220 CUDA_COMPARE_SIGNS = 245,
1224 CUDA_SUMSQDIFF = 249,
1226 CUDA_CONTAINS_POINT = 251,
1228 CUDA_SUB_AND_DOT = 253,
1229 CUDA_MINMAXVAL = 254,
1231 CUDA_SQRT_SCALE = 256,
1233 CUDA_SET_BOUNDS = 259,
1234 CUDA_MINMAXVEC = 260,
1235 CUDA_TRANSPOSE = 261,
1236 CUDA_SCALE_TO_RANGE = 262,
1241 CUDA_MASK_BATCH = 266,
1242 CUDA_TRANSPOSE_HW = 267,
1250 CUDA_MAX_BWD2 = 272,
1253 CUDA_IM2COL_ND = 281,
1255 CUDA_COL2IM_ND = 283,
1257 CUDA_ACCURACY_FWD = 286,
1259 CUDA_CHANNEL_MEAN = 287,
1260 CUDA_CHANNEL_MIN = 289,
1261 CUDA_CHANNEL_MAX = 290,
1262 CUDA_CHANNEL_SUB = 291,
1263 CUDA_CHANNEL_SUM = 292,
1264 CUDA_CHANNEL_DIV = 293,
1265 CUDA_CHANNEL_DOT = 294,
1266 CUDA_CHANNEL_MUL = 295,
1267 CUDA_CHANNEL_COMPARE = 296,
1268 CUDA_CHANNEL_FILL = 297,
1269 CUDA_CHANNEL_SCALE = 298,
1270 CUDA_CHANNEL_MULV = 299,
1271 CUDA_CHANNEL_COPY = 300,
1272 CUDA_CHANNEL_FILLFROM = 301,
1273 CUDA_CHANNEL_COPYALL = 302,
1274 CUDA_CHANNEL_DUP = 303,
1275 CUDA_CHANNEL_ADD = 304,
1276 CUDA_CHANNEL_PERCENTILE = 305,
1277 CUDA_CHANNEL_OP_FWD = 306,
1278 CUDA_CHANNEL_OP_BWD = 307,
1280 CUDA_RNG_SETSEED = 349,
1281 CUDA_RNG_UNIFORM = 350,
1282 CUDA_RNG_GAUSSIAN = 351,
1285 CUDA_BATCHREIDX_FWD = 386,
1286 CUDA_BATCHREIDX_BWD = 387,
1288 CUDA_EMBED_FWD = 390,
1289 CUDA_EMBED_BWD = 391,
1291 CUDA_CLIP_FWD = 394,
1292 CUDA_CLIP_BWD = 395,
1294 CUDA_POOL_FWD = 400,
1295 CUDA_POOL_BWD = 401,
1297 CUDA_UNPOOL_FWD = 410,
1298 CUDA_UNPOOL_BWD = 411,
1300 CUDA_TANH_FWD = 420,
1301 CUDA_TANH_BWD = 421,
1303 CUDA_MISH_FWD = 422,
1304 CUDA_MISH_BWD = 423,
1306 CUDA_SIGMOID_FWD = 424,
1307 CUDA_SIGMOID_BWD = 425,
1309 CUDA_SWISH_BWD = 427,
1311 CUDA_RELU_FWD = 428,
1312 CUDA_RELU_BWD = 429,
1317 CUDA_DROPOUT_FWD = 432,
1318 CUDA_DROPOUT_BWD = 433,
1320 CUDA_BNLL_FWD = 435,
1321 CUDA_BNLL_BWD = 436,
1323 CUDA_PRELU_FWD = 438,
1324 CUDA_PRELU_BWD = 439,
1325 CUDA_PRELU_BWD_PARAM = 440,
1327 CUDA_NLLLOSS_FWD = 442,
1328 CUDA_NLLLOSS_BWD = 443,
1330 CUDA_SOFTMAXLOSS_FWD = 444,
1331 CUDA_SOFTMAXLOSS_BWD = 445,
1339 CUDA_CROP_FWD = 450,
1340 CUDA_CROP_BWD = 451,
1342 CUDA_CONCAT_FWD = 452,
1343 CUDA_CONCAT_BWD = 453,
1345 CUDA_SLICE_FWD = 455,
1346 CUDA_SLICE_BWD = 456,
1348 CUDA_TILE_FWD = 457,
1349 CUDA_TILE_BWD = 458,
1351 CUDA_BIAS_FWD = 460,
1353 CUDA_SCALE_FWD = 461,
1355 CUDA_THRESHOLD_FWD = 462,
1359 CUDA_LRN_FILLSCALE = 465,
1360 CUDA_LRN_COMPUTEOUTPUT = 466,
1361 CUDA_LRN_COMPUTEDIFF = 467,
1363 CUDA_SMOOTHL1_FWD = 470,
1364 CUDA_SMOOTHL1_BWD = 471,
1366 CUDA_SERF_FWD = 472,
1367 CUDA_SERF_BWD = 473,
1371 CUDA_GATHER_FWD = 476,
1372 CUDA_GATHER_BWD = 477,
1374 CUDA_LSTM_FWD = 480,
1375 CUDA_LSTM_BWD = 481,
1377 CUDA_LSTM_UNIT_FWD = 482,
1378 CUDA_LSTM_UNIT_BWD = 483,
1380 CUDA_MATH_FWD = 487,
1381 CUDA_MATH_BWD = 488,
1383 CUDA_COEFF_SUM_FWD = 490,
1384 CUDA_COEFF_SUM_BWD = 491,
1386 CUDA_COEFF_SUB_FWD = 492,
1387 CUDA_COEFF_SUB_BWD = 493,
1389 CUDA_MEAN_ERROR_LOSS_BWD = 495,
1391 CUDA_SIGMOID_CROSS_ENTROPY_FWD = 496,
1392 CUDA_SIGMOID_CROSS_ENTROPY_BWD = 497,
1393 CUDA_SOFTMAX_CROSS_ENTROPY_FWD = 498,
1394 CUDA_SOFTMAX_CROSS_ENTROPY_BWD = 499,
1396 CUDA_SGD_UPDATE = 500,
1397 CUDA_NESTEROV_UPDATE = 501,
1398 CUDA_ADAGRAD_UPDATE = 502,
1399 CUDA_ADADELTA_UPDATE = 503,
1400 CUDA_ADAM_UPDATE = 504,
1401 CUDA_RMSPROP_UPDATE = 505,
1402 CUDA_ADAMW_UPDATE = 506,
1404 CUDA_COMBINE_DATA = 550,
1406 CUDA_GELU_FWD = 600,
1407 CUDA_GELU_BWD = 601,
1409 CUDA_SILU_FWD = 605,
1410 CUDA_SILU_BWD = 606,
1412 CUDA_SOFTPLUS_FWD = 610,
1413 CUDA_SOFTPLUS_BWD = 611,
1415 CUDA_LECUN_FWD = 615,
1416 CUDA_LECUN_BWD = 616,
1418 CUDA_MTX_SET_DIAGONAL = 700,
1419 CUDA_MTX_SET_DIAGONAL2 = 701,
1420 CUDA_MTX_ADD_VECTOR = 702,
1421 CUDA_MTX_TRANSPOSE_OPERATION = 703,
1422 CUDA_MTX_AGGREGATE_COLS = 704,
1423 CUDA_MTX_AGGREGATE_ROWS = 705,
1424 CUDA_MTX_TRANSPOSE = 706,
1425 CUDA_MTX_MEANCENTER_BY_COL = 707,
1426 CUDA_MTX_MEANCENTER_BY_ROW = 708,
1427 CUDA_MTX_EUCLIDEAN_DIST = 709,
1429 CUDA_MTX_MEAN = 711,
1430 CUDA_MTX_STDEV = 712,
1431 CUDA_MTX_CORRELATIONS = 714,
1433 CUDA_CREATE_PCA = 800,
1435 CUDA_FREE_PCA = 802,
1437 CUDA_TSNE_UPDATE = 850,
1438 CUDA_TSNE_UPDATE_GRAD = 851,
1439 CUDA_TSNE_COMPUTE_EXACT_ERROR = 852,
1440 CUDA_TSNE_COMPUTE_SQUARED_EUCLIDEAN_DISTANCE = 854,
1441 CUDA_TSNE_COMPUTE_Q_MATRIX = 855,
1442 CUDA_TSNE_COMPUTE_EXACT_GRADIENT = 856,
1443 CUDA_TSNE_SYMMETRIZE_MATRIX = 858,
1444 CUDA_TSNE_COMPUTE_KNN_BOUNDS = 859,
1446 CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY = 870,
1447 CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY = 871,
1448 CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY = 872,
1450 CUDA_TSNE_CREATE = 875,
1451 CUDA_TSNE_FREE = 876,
1452 CUDA_TSNE_COMPUTE_GRADIENT1 = 877,
1453 CUDA_TSNE_COMPUTE_ERROR1 = 878,
1455 CUDA_GUASSIAN_BLUR = 900,
1456 CUDA_HAMMING_DIFF = 901,
1457 CUDA_CALC_BATCH_DIST = 902,
1458 CUDA_CALC_DFT = 903,
1460 CUDA_CREATE_SSD = 950,
1461 CUDA_FREE_SSD = 951,
1462 CUDA_SETUP_SSD = 952,
1463 CUDA_SSD_FWD_MULTIBOXLOSS = 955,
1464 CUDA_SSD_ENCODE_LOCPRED = 958,
1465 CUDA_SSD_ENCODE_CONFPRED = 959,
1467 CUDA_CREATE_LAYERNORM = 970,
1468 CUDA_FREE_LAYERNORM = 971,
1469 CUDA_LAYERNORM_FWD = 975,
1470 CUDA_LAYERNORM_BWD = 976,
1475#pragma warning restore 1591
1488 public CudaDnn(
int nDeviceID,
DEVINIT flags = (
DEVINIT.CUBLAS |
DEVINIT.CURAND),
long? lSeed =
null,
string strPath =
"",
bool bResetFirst =
false,
bool bEnableMemoryTrace =
false)
1491 m_nDeviceId = nDeviceID;
1492 m_nIdx = get_index();
1494 if (strPath ==
null || strPath.Length == 0)
1495 strPath = s_strCudaPath;
1497 m_strPath = strPath;
1502 m_cuda =
new CudaControlLib.CudaKernel();
1504 catch (Exception excpt)
1506 throw new Exception(
"The CudaControl is not registered! Make sure that you are using the 'x64' build and if so, run 'regsvr32 CudaControl.dll' from a CMD window with Administrative privileges to register.", excpt);
1511 if (
string.IsNullOrEmpty(strPath))
1514 m_strPath = strPath;
1516 string strDir =
System.IO.Path.GetDirectoryName(strPath);
1517 string strCurDir = Directory.GetCurrentDirectory();
1518 Directory.SetCurrentDirectory(strDir);
1520 m_cuda.Load(strPath);
1522 Directory.SetCurrentDirectory(strCurDir);
1524 catch (Exception excpt)
1526 if (excpt.Message !=
null && excpt.Message.Length > 0)
1529 throw new Exception(
"The CudaDnnDll.x.dll at '" + strPath +
"' failed to load. The error code = 0x" + excpt.HResult.ToString(
"X"));
1538 double[] rg = m_cuda.RunDouble(0, (
int)CUDAFN.INITIALIZE, m_param.AsDouble(nDeviceID, (
int)flags));
1539 m_hKernel = (long)rg[0];
1543 float[] rg = m_cuda.RunFloat(0, (
int)CUDAFN.INITIALIZE, m_param.AsFloat(nDeviceID, (
int)flags));
1544 m_hKernel = (long)rg[0];
1548 catch (Exception excpt)
1550 if (excpt.Message !=
null && excpt.Message.Length > 0)
1553 throw new Exception(
"CudaDnn failed to initialize. You may need to reboot or reset the Cuda GPU #" + nDeviceID.ToString() +
". The error code = 0x" + excpt.HResult.ToString(
"X"));
1564 double[] rg = m_cuda.RunDouble(0, (
int)CUDAFN.INITIALIZE, m_param.AsDouble(nDeviceID, (
int)flags));
1565 m_hKernel = (long)rg[0];
1569 float[] rg = m_cuda.RunFloat(0, (
int)CUDAFN.INITIALIZE, m_param.AsFloat(nDeviceID, (
int)flags));
1570 m_hKernel = (long)rg[0];
1578 m_tOne = (T)Convert.ChangeType(1.0, typeof(T));
1579 m_tZero = (T)Convert.ChangeType(0.0, typeof(T));
1589 m_nDeviceId = cuda.m_nDeviceId;
1590 m_nIdx = get_index();
1592 m_strPath = cuda.m_strPath;
1594 m_cuda = cuda.m_cuda;
1595 m_hKernel = cuda.m_hKernel;
1596 m_tOne = cuda.m_tOne;
1597 m_tZero = cuda.m_tZero;
1599 if (bEnableGhostMemory)
1601 m_rgGhostMemory =
new Dictionary<long, T[]>();
1602 m_bGhostMemoryEnabled =
true;
1614 if (m_bOwner && m_hKernel != 0)
1617 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CLEANUP,
null);
1619 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CLEANUP,
null);
1640 FileInfo fi =
new FileInfo(Process.GetCurrentProcess().MainModule.FileName);
1642 string strPath = fi.DirectoryName +
"\\cuda_12.1\\CudaDnnDll.12.1.dll";
1644 if (!File.Exists(strPath))
1646 strPath = fi.DirectoryName +
"\\CudaDnnDll.12.1.dll";
1647 if (!File.Exists(strPath))
1649 strPath = fi.DirectoryName +
"\\cuda_12.0\\CudaDnnDll.12.0.dll";
1650 if (!File.Exists(strPath))
1652 strPath = fi.DirectoryName +
"\\CudaDnnDll.12.0.dll";
1653 if (!File.Exists(strPath))
1655 if (!File.Exists(strPath))
1657 strPath = fi.DirectoryName +
"\\cuda_11.8\\CudaDnnDll.11.8.dll";
1658 if (!File.Exists(strPath))
1660 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.8.dll";
1661 if (!File.Exists(strPath))
1663 strPath = fi.DirectoryName +
"\\cuda_11.7\\CudaDnnDll.11.7.dll";
1664 if (!File.Exists(strPath))
1666 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.7.dll";
1667 if (!File.Exists(strPath))
1669 strPath = fi.DirectoryName +
"\\cuda_11.6\\CudaDnnDll.11.6.dll";
1670 if (!File.Exists(strPath))
1672 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.6.dll";
1673 if (!File.Exists(strPath))
1675 strPath = fi.DirectoryName +
"\\cuda_11.5\\CudaDnnDll.11.5.dll";
1676 if (!File.Exists(strPath))
1678 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.5.dll";
1679 if (!File.Exists(strPath))
1681 strPath = fi.DirectoryName +
"\\cuda_11.4\\CudaDnnDll.11.4.dll";
1682 if (!File.Exists(strPath))
1684 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.4.dll";
1685 if (!File.Exists(strPath))
1687 strPath = fi.DirectoryName +
"\\cuda_11.3\\CudaDnnDll.11.3.dll";
1688 if (!File.Exists(strPath))
1690 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.3.dll";
1691 if (!File.Exists(strPath))
1693 strPath = fi.DirectoryName +
"\\cuda_11.2\\CudaDnnDll.11.2.dll";
1694 if (!File.Exists(strPath))
1696 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.2.dll";
1697 if (!File.Exists(strPath))
1699 strPath = fi.DirectoryName +
"\\cuda_11.1\\CudaDnnDll.11.1.dll";
1700 if (!File.Exists(strPath))
1702 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.1.dll";
1703 if (!File.Exists(strPath))
1705 strPath = fi.DirectoryName +
"\\cuda_11.0\\CudaDnnDll.11.0.dll";
1706 if (!File.Exists(strPath))
1708 strPath = fi.DirectoryName +
"\\CudaDnnDll.11.0.dll";
1709 if (!File.Exists(strPath))
1711 strPath = fi.DirectoryName +
"\\cuda_10.2\\CudaDnnDll.10.2.dll";
1712 if (!File.Exists(strPath))
1714 strPath = fi.DirectoryName +
"\\CudaDnnDll.10.2.dll";
1715 if (!File.Exists(strPath))
1717 strPath = fi.DirectoryName +
"\\cuda_10.2.3_5\\CudaDnnDll.10.2.dll";
1718 if (!File.Exists(strPath))
1720 strPath = fi.DirectoryName +
"\\CudaDnnDll.10.2.3_5.dll";
1721 if (!File.Exists(strPath))
1723 strPath = fi.DirectoryName +
"\\CudaDnnDll.10.1.dll";
1724 if (!File.Exists(strPath))
1726 strPath = fi.DirectoryName +
"\\CudaDnnDll.10.0.dll";
1727 if (!File.Exists(strPath))
1729 strPath = fi.DirectoryName +
"\\CudaDnnDll.9.2.dll";
1730 if (!File.Exists(strPath))
1732 strPath = fi.DirectoryName +
"\\CudaDnnDll.9.1.dll";
1733 if (!File.Exists(strPath))
1735 if (!File.Exists(strPath))
1736 strPath = fi.DirectoryName +
"\\CudaDnnDll.8.dll";
1777 m_bGhostMemoryEnabled =
false;
1785 if (m_rgGhostMemory !=
null)
1786 m_bGhostMemoryEnabled =
true;
1788 m_bGhostMemoryEnabled =
false;
1794 public ulong TotalMemoryUsed
1802 public string TotalMemoryUsedAsText
1811 public long KernelHandle
1813 get {
return m_hKernel; }
1829 public void KernelCopy(
int nCount,
long hSrc,
int nSrcOffset,
long hDstKernel,
long hDst,
int nDstOffset,
long hHostBuffer,
long hHostKernel = -1,
long hStream = -1,
long hSrcKernel = -1)
1831 if (hSrcKernel == -1)
1832 hSrcKernel = m_hKernel;
1835 m_cuda.RunDouble((
int)hSrcKernel, (
int)CUDAFN.KERNEL_MEMCOPY, m_param.AsDouble(nCount, hSrc, nSrcOffset, hDstKernel, hDst, nDstOffset, hHostBuffer, hHostKernel, hStream));
1837 m_cuda.RunFloat((
int)hSrcKernel, (
int)CUDAFN.KERNEL_MEMCOPY, m_param.AsFloat(nCount, hSrc, nSrcOffset, hDstKernel, hDst, nDstOffset, hHostBuffer, hHostKernel, hStream));
1848 public void KernelAdd(
int nCount,
long hA,
long hDstKernel,
long hB,
long hC)
1851 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.KERNEL_ADD, m_param.AsDouble(nCount, hA, hDstKernel, hB, hC));
1853 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.KERNEL_ADD, m_param.AsFloat(nCount, hA, hDstKernel, hB, hC));
1870 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.KERNEL_COPY_NCCL, m_param.AsDouble(hSrcKernel, hSrcNccl));
1875 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.KERNEL_COPY_NCCL, m_param.AsFloat(hSrcKernel, hSrcNccl));
1880 private static int get_index()
1892 s_strCudaPath = strPath;
1904 if (typeof(T) == typeof(
float))
// Converts a generic T value (T is float or double for this class) to a
// double via Convert.ChangeType, for marshaling into the native RunDouble
// parameter arrays.
 1910 private double convertD(T fVal)
 1912 return (
double)Convert.ChangeType(fVal, typeof(
double));
// Converts a generic T value to a float via Convert.ChangeType, for
// marshaling into the native RunFloat parameter arrays.  NOTE(review): when
// T is double this narrows the value — presumably acceptable for the float
// kernel path; confirm against callers.
 1915 private float convertF(T fVal)
 1917 return (
float)Convert.ChangeType(fVal, typeof(
float));
1925 get {
return m_strPath; }
1931 public static string DefaultPath
1933 get {
return s_strCudaPath; }
1936#pragma warning disable 1591
1938 public void CombineData(
int nCount,
long hOriginal,
long hUpdated,
double dfUpdatedPct,
long hServer,
double dfServerPct,
long hNewData)
1941 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COMBINE_DATA, m_param.AsDouble(dfUpdatedPct, dfServerPct), m_param.AsLong(nCount, hOriginal, hUpdated, 0, hServer, 0, hNewData));
1943 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COMBINE_DATA, m_param.AsFloat((
float)dfUpdatedPct, (
float)dfServerPct), m_param.AsLong(nCount, hOriginal, hUpdated, 0, hServer, 0, hNewData));
1946#pragma warning restore 1591
1952 #region ICudaDevice Methods
1962 if (m_cuda ==
null || m_hKernel <= 0)
1963 throw new Exception(
"CudaDnn has already nbeen disposed!");
1965 if (nDeviceID == -1)
1966 nDeviceID = m_nDeviceId;
1968 m_nDeviceId = nDeviceID;
1973 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.SETDEVICE, m_param.AsDouble(nDeviceID, (
int)flags, lSeed.Value));
1975 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.SETDEVICE, m_param.AsDouble(nDeviceID, (
int)flags));
1980 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.SETDEVICE, m_param.AsFloat(nDeviceID, (
int)flags, lSeed.Value));
1982 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.SETDEVICE, m_param.AsFloat(nDeviceID, (
int)flags));
1992 if (m_cuda ==
null || m_hKernel <= 0)
1993 throw new Exception(
"CudaDnn has already nbeen disposed!");
1996 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.SETRANDOMSEED, m_param.AsDouble(lSeed));
1998 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.SETRANDOMSEED, m_param.AsFloat(lSeed));
2004 public int OriginalDeviceID
2006 get {
return m_nDeviceId; }
2015 if (m_cuda ==
null || m_hKernel <= 0)
2016 throw new Exception(
"CudaDnn has already nbeen disposed!");
2020 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.GETDEVICE,
null);
2025 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.GETDEVICE,
null);
2037 if (m_cuda ==
null || m_hKernel <= 0)
2038 throw new Exception(
"CudaDnn has already nbeen disposed!");
2040 string[] rgstr = m_cuda.QueryString((
int)m_hKernel, (
int)CUDAQRY.DEVICE_NAME,
new int[] { nDeviceID });
2051 if (m_cuda ==
null || m_hKernel <= 0)
2052 throw new Exception(
"CudaDnn has already nbeen disposed!");
2054 string[] rgstr = m_cuda.QueryString((
int)m_hKernel, (
int)CUDAQRY.DEVICE_P2P_INFO,
new int[] { nDeviceID });
2066 if (m_cuda ==
null || m_hKernel <= 0)
2067 throw new Exception(
"CudaDnn has already nbeen disposed!");
2069 string[] rgstr = m_cuda.QueryString((
int)m_hKernel, (
int)CUDAQRY.DEVICE_INFO,
new int[] { nDeviceID, (bVerbose) ? 1 : 0 });
2081 if (m_cuda ==
null || m_hKernel <= 0)
2082 throw new Exception(
"CudaDnn has already nbeen disposed!");
2085 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.RESETDEVICE,
null);
2087 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.RESETDEVICE,
null);
2095 if (m_cuda ==
null || m_hKernel <= 0)
2096 throw new Exception(
"CudaDnn has already nbeen disposed!");
2099 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.SYNCHRONIZEDEVICE,
null);
2101 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.SYNCHRONIZEDEVICE,
null);
2113 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.GETDEVICEPROP, m_param.AsDouble(nDeviceID, (
int)
DEVPROP.MULTIGPUBOARDGROUPID));
2118 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.GETDEVICEPROP, m_param.AsFloat(nDeviceID, (
int)
DEVPROP.MULTIGPUBOARDGROUPID));
2129 if (m_cuda ==
null || m_hKernel <= 0)
2136 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.GETDEVICEPROP, m_param.AsDouble(0, (
int)
DEVPROP.DEVICECOUNT));
2141 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.GETDEVICEPROP, m_param.AsFloat(0, (
int)
DEVPROP.DEVICECOUNT));
2164 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CHECKMEMORYATTRIB, m_param.AsDouble(hSrc, nSrcDeviceID, hDst, nDstDeviceID));
2165 return (rg[0] == 0) ? false :
true;
2169 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CHECKMEMORYATTRIB, m_param.AsFloat(hSrc, nSrcDeviceID, hDst, nDstDeviceID));
2170 return (rg[0] == 0) ? false :
true;
// Query total/free/used memory for a device via the GETDEVICEMEMORY kernel
// call; rg[0] = total, rg[1] = free, rg[2] = used, rg[3] = non-zero when the
// native CUDA call was used (per the float path below — the double path's
// dfFree/dfUsed assignments were dropped by the extraction).
// NOTE(review): lossy extraction — editor line numbers fused into statements
// and the double/float dispatch lines are missing; units of the returned
// values (bytes vs. GB) are not visible here — TODO confirm against caller.
2182 public double GetDeviceMemory(out
double dfFree, out
double dfUsed, out
bool bCudaCallUsed,
int nDeviceID = -1)
// Default to the device this instance is bound to.
2184 if (nDeviceID == -1)
2185 nDeviceID = m_nDeviceId;
2189 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.GETDEVICEMEMORY, m_param.AsDouble(nDeviceID));
2192 bCudaCallUsed = (rg[3] == 0) ?
false :
true;
2197 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (int)CUDAFN.GETDEVICEMEMORY, m_param.AsFloat(nDeviceID));
2198 dfFree = (double)rg[1];
2199 dfUsed = (double)rg[2];
2200 bCudaCallUsed = (rg[3] == 0) ?
false :
true;
2201 return (
double)rg[0];
2220 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.GETREQUIREDCOMPUTE,
null);
2221 nMinMajor = (int)rg[0];
2222 nMinMinor = (int)rg[1];
2226 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (int)CUDAFN.GETREQUIREDCOMPUTE,
null);
2227 nMinMajor = (int)rg[0];
2228 nMinMinor = (int)rg[1];
2244 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.DEVICE_CANACCESSPEER, m_param.AsDouble(nSrcDeviceID, nPeerDeviceID));
2245 return (rg[0] == 0) ? false :
true;
2249 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.DEVICE_CANACCESSPEER, m_param.AsFloat(nSrcDeviceID, nPeerDeviceID));
2250 return (rg[0] == 0) ? false :
true;
2261 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.DEVICE_ENABLEPEERACCESS, m_param.AsDouble(nPeerDeviceID));
2263 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.DEVICE_ENABLEPEERACCESS, m_param.AsFloat(nPeerDeviceID));
2273 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.DEVICE_DISABLEPEERACCESS, m_param.AsDouble(nPeerDeviceID));
2275 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.DEVICE_DISABLEPEERACCESS, m_param.AsFloat(nPeerDeviceID));
2283 #region ICudaMemory Methods
2293 return AllocMemory(rg.ToArray());
2304 return AllocMemory(rg.ToArray());
2316 return AllocMemory(convert(rgSrc), hStream);
2328 return AllocMemory(convert(rgSrc), hStream);
// Allocate GPU memory initialized from rgSrc, optionally on a stream and
// (float base type only) in half precision. When "ghost memory" is enabled
// the data is kept host-side in m_rgGhostMemory under a synthetic handle
// instead of calling into the kernel. The returned handle is registered with
// m_memTracker so allocations can be audited/freed per kernel+device.
// NOTE(review): lossy extraction — editor line numbers fused into statements;
// the null check guarded by line 2341, the bHalfSize checks, the try that
// pairs with the catch at 2417, and the double/float dispatch were dropped.
2338 public long AllocMemory(T[] rgSrc,
long hStream = 0,
bool bHalfSize =
false)
2341 throw new ArgumentNullException();
2343 if (rgSrc.Length == 0)
2344 throw new ArgumentOutOfRangeException();
// Half precision is only meaningful for the float instantiation of T.
2351 throw new Exception(
"Half sizes are only supported with the 'float' base type.");
// rgInput: payload header (count [, stream]) + data; rgInput2: long-typed args.
2353 List<double> rgInput =
new List<double>() { rgSrc.Length };
2354 List<long> rgInput2 =
new List<long>() { rgSrc.Length };
2358 rgInput.Add(hStream);
2359 rgInput2.Add(hStream);
2362 rgInput.AddRange(convertD(rgSrc));
2368 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
2370 rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCMEM, rgInput.ToArray(), rgInput2.ToArray());
// Ghost path: fabricate a handle and keep a host-side clone of the data.
2374 m_nGhostMemoryIndex++;
2375 m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(
Utility.Clone<
double>(rgInput).ToArray()));
2376 rg =
new double[] { m_nGhostMemoryIndex };
2379 return m_memTracker.
AllocMemory(m_hKernel, m_nDeviceId, (
long)rg[0], (ulong)rgInput.Count, bHalfSize);
// Float path mirrors the double path; ALLOCMEM_HALF is used when bHalfSize.
2384 List<float> rgInput =
new List<float>() { rgSrc.Length };
2385 List<long> rgInput2 =
new List<long>() { rgSrc.Length };
2389 rgInput.Add(hStream);
2390 rgInput2.Add(hStream);
2393 rgInput.AddRange(convertF(rgSrc));
2399 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
2402 rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCMEM_HALF, rgInput.ToArray(), rgInput2.ToArray());
2404 rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCMEM, rgInput.ToArray(), rgInput2.ToArray());
2408 m_nGhostMemoryIndex++;
2409 m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(
Utility.Clone<
float>(rgInput).ToArray()));
2410 rg =
new float[] { m_nGhostMemoryIndex };
2413 return m_memTracker.
AllocMemory(m_hKernel, m_nDeviceId, (
long)rg[0], (ulong)rgInput.Count, bHalfSize);
// On failure, enrich the OOM report with current usage and device name.
2417 catch (Exception excpt)
2420 string strDevice = GetDeviceName(m_nDeviceId);
2421 throw new Exception(
"Out of memory! You are currently using " + strMemory +
" of memory on " + strDevice +
". You may need to use a different GPU that has more memory.", excpt);
2428 public static ulong BaseSize
2430 get {
return m_lBaseSize; }
2440 return ulSizeInBytes / m_lBaseSize;
2452 throw new ArgumentOutOfRangeException();
2454 long[] rgIn =
new long[] { lCapacity };
2461 throw new Exception(
"Half sizes are only supported with the 'float' base type.");
2466 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
2468 rgOut = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCMEM,
null, rgIn);
2472 m_nGhostMemoryIndex++;
2473 m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(
Utility.
Create<
double>((
int)lCapacity, 0).ToArray()));
2474 rgOut =
new double[] { m_nGhostMemoryIndex };
2477 return m_memTracker.
AllocMemory(m_hKernel, m_nDeviceId, (
long)rgOut[0], (ulong)lCapacity, bHalfSize);
2485 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
2488 rgOut = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCMEM_HALF,
null, rgIn);
2490 rgOut = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCMEM,
null, rgIn);
2494 m_nGhostMemoryIndex++;
2495 m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(
Utility.
Create<
float>((
int)lCapacity, 0).ToArray()));
2496 rgOut =
new float[] { m_nGhostMemoryIndex };
2499 return m_memTracker.
AllocMemory(m_hKernel, m_nDeviceId, (
long)rgOut[0], (ulong)lCapacity, bHalfSize);
2503 catch (Exception excpt)
2506 string strDevice = GetDeviceName(m_nDeviceId);
2507 long lMb = (lCapacity * (int)basetype_size(
false)) / 1000000;
2509 throw new Exception(
"Out of memory! There is not enough memory to allocate the requested " + lMb.ToString(
"N0") +
" MB of memory. You are currently using " + strMemory +
" of memory on " + strDevice +
". You may need to use a different GPU that has more memory.", excpt);
2519 if (m_cuda ==
null || m_hKernel <= 0)
2521 Trace.WriteLine(
"WARNING: CudaDnn has already been disposed, cannot free memory.");
2529 m_memTracker.
FreeMemory(m_hKernel, m_nDeviceId, hMem);
2531 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
2532 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FREEMEM,
null, m_param.AsLong(hMem));
2534 m_rgGhostMemory.Remove(hMem);
2538 m_memTracker.
FreeMemory(m_hKernel, m_nDeviceId, hMem);
2540 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
2541 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FREEMEM,
null, m_param.AsLong(hMem));
2543 m_rgGhostMemory.Remove(hMem);
2557 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.COPY_DEVICE_TO_HOST,
null, m_param.AsLong(lCount, hGpuSrc, hHostDst));
2559 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.COPY_DEVICE_TO_HOST,
null, m_param.AsLong(lCount, hGpuSrc, hHostDst));
2571 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.COPY_HOST_TO_DEVICE,
null, m_param.AsLong(lCount, hHostSrc, hGpuDst));
2573 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.COPY_HOST_TO_DEVICE,
null, m_param.AsLong(lCount, hHostSrc, hGpuDst));
2584 throw new ArgumentOutOfRangeException();
2588 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCHOSTBUFFER,
null, m_param.AsLong(lCapacity));
2593 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ALLOCHOSTBUFFER,
null, m_param.AsLong(lCapacity));
2604 if (m_cuda ==
null || m_hKernel <= 0)
2606 Trace.WriteLine(
"WARNING: CudaDnn has already been disposed, cannot free memory.");
2611 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FREEHOSTBUFFER,
null, m_param.AsLong(hMem));
2613 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FREEHOSTBUFFER,
null, m_param.AsLong(hMem));
2625 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GETHOSTBUFFERCAPACITY,
null, m_param.AsLong(hMem));
2630 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.GETHOSTBUFFERCAPACITY,
null, m_param.AsLong(hMem));
2643 return convertD(GetHostMemory(hMem));
2654 return convertF(GetHostMemory(hMem));
2665 return convert(m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GETHOSTMEM,
null, m_param.AsLong(hMem)));
2667 return convert(m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.GETHOSTMEM,
null, m_param.AsLong(hMem)));
2679 return convertD(GetMemory(hMem, lCount));
2691 return convertF(GetMemory(hMem, lCount));
2704 if (m_rgGhostMemory ==
null)
2706 double[] rgr = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GETMEM,
null, m_param.AsLong(hMem, lCount));
2707 return convert(rgr);
2711 return m_rgGhostMemory[hMem];
2716 if (m_rgGhostMemory ==
null)
2718 float[] rgr = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.GETMEM,
null, m_param.AsLong(hMem, lCount));
2719 return convert(rgr);
2723 return m_rgGhostMemory[hMem];
2736 SetMemory(hMem, rg.ToArray());
2747 SetMemory(hMem, rg.ToArray());
2757 public void SetMemory(
long hMem,
double[] rgSrc,
long hStream = 0)
2759 SetMemory(hMem, convert(rgSrc), hStream);
2769 public void SetMemory(
long hMem,
float[] rgSrc,
long hStream = 0)
2771 SetMemory(hMem, convert(rgSrc), hStream);
// Copy rgSrc (first nCount items; -1 = all) into the GPU memory at hMem via
// the SETMEM kernel call, or into the host-side ghost-memory table when ghost
// memory is active. The payload array `rg` carries a small header
// (nDataCount beyond nCount — exact header layout dropped by extraction)
// followed by the converted data starting at nIdx.
// NOTE(review): lossy extraction — editor line numbers fused into statements;
// the nCount == -1 guard for line 2784, header-filling lines, and the
// double/float dispatch are missing.
2781 public void SetMemory(
long hMem, T[] rgSrc,
long hStream = 0,
int nCount = -1)
2784 nCount = rgSrc.Length;
2786 if (rgSrc ==
null || nCount == 0)
2787 throw new ArgumentOutOfRangeException(
"There are no data items to set!");
// Ghost path: just clone the data host-side; no kernel call.
2791 if (m_rgGhostMemory !=
null)
2793 m_rgGhostMemory[hMem] =
Utility.Clone<T>(rgSrc);
2804 nDataCount += nCount;
2806 double[] rg =
new double[nDataCount];
2818 long[] rgIn =
new long[] { hMem, nCount };
2820 convertD(rgSrc, rg, nIdx, nCount);
2821 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SETMEM, rg, rgIn);
// Float path mirrors the double path.
2830 nDataCount += nCount;
2832 float[] rg =
new float[nDataCount];
2844 long[] rgIn =
new long[] { hMem, nCount };
2846 convertF(rgSrc, rg, nIdx, nCount);
2847 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SETMEM, rg, rgIn);
// SetMemoryAt overload bodies (double[]/float[] convert-and-forward) plus the
// generic SetMemoryAt<T>: write rgSrc into hMem starting at element nOffset
// via the SETMEMAT kernel call. Payload layout: 3-element header (rg[1] =
// element count; rg[0]/rg[2] assignments dropped by extraction) followed by
// the converted data at index 3. Not supported for ghost memory.
// NOTE(review): lossy extraction — signatures of the two forwarding overloads
// and the double/float dispatch lines are missing.
2862 SetMemoryAt(hMem, convert(rgSrc), nOffset);
2874 SetMemoryAt(hMem, convert(rgSrc), nOffset);
2885 if (rgSrc ==
null || rgSrc.Length == 0)
2886 throw new ArgumentOutOfRangeException(
"There are no data items to set!");
2890 if (m_rgGhostMemory !=
null)
2891 throw new Exception(
"Ghost memory does not support SetMemoryAt.");
2895 int nDataCount = 3 + rgSrc.Length;
2896 double[] rg =
new double[nDataCount];
2899 rg[1] = rgSrc.Length;
2902 long[] rgIn =
new long[] { hMem, rgSrc.Length, nOffset };
2904 convertD(rgSrc, rg, 3);
2905 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SETMEMAT, rg, rgIn);
// Float path mirrors the double path.
2909 int nDataCount = 3 + rgSrc.Length;
2910 float[] rg =
new float[nDataCount];
2913 rg[1] = rgSrc.Length;
2916 long[] rgIn =
new long[] { hMem, rgSrc.Length, nOffset };
2918 convertF(rgSrc, rg, 3);
2919 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SETPIXEL, rg, rgIn);
// Set individual (index, value) pixels inside the memory at hMem via the
// SETPIXEL kernel call, optionally returning the original values
// (bReturnOriginal). Payload: 5-element header (rg[2] = return-original flag,
// rg[4] = pixel count; rg[0]/rg[1]/rg[3] assignments dropped by extraction)
// followed by interleaved index/value pairs written at nIdx.
// NOTE(review): lossy extraction — editor line numbers fused into statements,
// nIdx initialization/increments, the return statements, and the double/float
// dispatch are missing.
2933 public T[]
SetPixel(
long hMem,
int nCount,
bool bReturnOriginal,
int nOffset, params Tuple<int, T>[] rgPixel)
2935 if (rgPixel.Length == 0)
2936 throw new Exception(
"You must specify at least one pixel!");
2940 double[] rg =
new double[5 + rgPixel.Length * 2];
2944 rg[2] = (bReturnOriginal) ? 1 : 0;
2946 rg[4] = rgPixel.Length;
2949 for (
int i = 0; i < rgPixel.Length; i++)
2951 rg[nIdx] = rgPixel[i].Item1;
2953 rg[nIdx] = convertD1(rgPixel[i].Item2);
2957 rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.SETPIXEL, rg);
// Float path mirrors the double path.
2965 float[] rg =
new float[5 + rgPixel.Length * 2];
2969 rg[2] = (bReturnOriginal) ? 1 : 0;
2971 rg[4] = rgPixel.Length;
2974 for (
int i = 0; i < rgPixel.Length; i++)
2976 rg[nIdx] = rgPixel[i].Item1;
2978 rg[nIdx] = convertF1(rgPixel[i].Item2);
2982 rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.SETPIXEL, rg);
2999 int nDataCount = 2 + rgSrc.Length;
3000 double[] rg =
new double[nDataCount];
3003 rg[1] = rgSrc.Length;
3005 convertD(rgSrc, rg, 2);
3006 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SETHOSTMEM, rg, m_param.AsLong(hMem, rgSrc.Length));
3010 int nDataCount = 2 + rgSrc.Length;
3011 float[] rg =
new float[nDataCount];
3014 rg[1] = rgSrc.Length;
3016 convertF(rgSrc, rg, 2);
3017 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SETHOSTMEM, rg, m_param.AsLong(hMem, rgSrc.Length));
3032 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CREATE_MEMORYPOINTER,
null, m_param.AsLong(hData, lOffset, lCount));
3037 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CREATE_MEMORYPOINTER,
null, m_param.AsLong(hData, lOffset, lCount));
3048 if (m_cuda ==
null || m_hKernel <= 0)
3050 Trace.WriteLine(
"WARNING: CudaDnn has already been disposed, cannot free memory pointer.");
3055 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FREE_MEMORYPOINTER,
null, m_param.AsLong(hData));
3057 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FREE_MEMORYPOINTER,
null, m_param.AsLong(hData));
// Create a GPU memory test via the CREATE_MEMTEST kernel call, allocating
// dfPctToAllocate (0..1) of available memory. Result layout: rg[1] = total
// block count, rg[2] = GB allocated, rg[3] = start address, rg[4] = block
// size; rg[0] is presumably the test handle returned by the (dropped) return
// statement — TODO confirm.
// NOTE(review): lossy extraction — editor line numbers fused into statements;
// the double/float dispatch and return lines are missing.
3069 public long CreateMemoryTest(out ulong ulTotalNumBlocks, out
double dfMemAllocatedInGB, out ulong ulMemStartAddr, out ulong ulBlockSize,
double dfPctToAllocate = 1.0)
3073 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_MEMTEST, m_param.AsDouble(dfPctToAllocate))
3074 ulTotalNumBlocks = (ulong)rg[1];
3075 dfMemAllocatedInGB = (double)rg[2];
3076 ulMemStartAddr = (ulong)rg[3];
3077 ulBlockSize = (ulong)rg[4];
// Float path mirrors the double path (percentage narrowed to float).
3082 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (int)CUDAFN.CREATE_MEMTEST, m_param.AsFloat((
float)dfPctToAllocate));
3083 ulTotalNumBlocks = (ulong)rg[1];
3084 dfMemAllocatedInGB = (double)rg[2];
3085 ulMemStartAddr = (ulong)rg[3];
3086 ulBlockSize = (ulong)rg[4];
3098 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_MEMTEST, m_param.AsDouble(h));
3100 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_MEMTEST, m_param.AsFloat(h));
// Run a memory test previously created with CreateMemoryTest over
// ulBlockCount blocks starting at ulBlockStartOffset, selecting the
// write/read-write/read phases via the boolean flags. The raw result array is
// reinterpreted as T[] via Convert.ChangeType.
// NOTE(review): lossy extraction — editor line numbers fused into statements;
// rgErrorAddresses is declared but its population is not visible here, and
// the double/float dispatch lines are missing.
3123 public T[]
RunMemoryTest(
long h,
MEMTEST_TYPE type, ulong ulBlockStartOffset, ulong ulBlockCount,
bool bVerbose,
bool bWrite,
bool bReadWrite,
bool bRead)
3125 List<ulong> rgErrorAddresses =
new List<ulong>();
3129 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RUN_MEMTEST,
null, m_param.AsLong(h, (
long)type, (
long)ulBlockStartOffset, (
long)ulBlockCount, (bVerbose) ? 1 : 0, (bWrite) ? 1 : 0, (bReadWrite) ? 1 : 0, (bRead) ? 1 : 0));
3130 return (T[])Convert.ChangeType(rg, typeof(T[]));
// Float path mirrors the double path.
3134 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.RUN_MEMTEST,
null, m_param.AsLong(h, (
long)type, (
long)ulBlockStartOffset, (
long)ulBlockCount, (bVerbose) ? 1 : 0, (bWrite) ? 1 : 0, (bReadWrite) ? 1 : 0, (bRead) ? 1 : 0));
3135 return (T[])Convert.ChangeType(rg, typeof(T[]));
// Create an image-distortion operator (brightness/contrast/saturation, each
// with its own probability and range) via the CREATE_IMAGEOP kernel call;
// lRandomSeed = 0 presumably means "no fixed seed" — TODO confirm.
// NOTE(review): lossy extraction — editor line numbers fused into statements;
// the double/float dispatch and the return of the created handle are missing.
3153 public long CreateImageOp(
int nNum,
double dfBrightnessProb,
double dfBrightnessDelta,
double dfContrastProb,
double dfContrastLower,
double dfContrastUpper,
double dfSaturationProb,
double dfSaturationLower,
double dfSaturationUpper,
long lRandomSeed = 0)
3157 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_IMAGEOP, m_param.AsDouble(nNum, dfBrightnessProb, dfBrightnessDelta, dfContrastProb, dfContrastLower, dfContrastUpper, dfSaturationProb, dfSaturationLower, dfSaturationUpper, lRandomSeed));
// Float path narrows each probability/range argument to float.
3162 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_IMAGEOP, m_param.AsFloat(nNum, (
float)dfBrightnessProb, (
float)dfBrightnessDelta, (
float)dfContrastProb, (
float)dfContrastLower, (
float)dfContrastUpper, (
float)dfSaturationProb, (
float)dfSaturationLower, (
float)dfSaturationUpper, lRandomSeed));
3174 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_IMAGEOP, m_param.AsDouble(h));
3176 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_IMAGEOP, m_param.AsFloat(h));
// Apply the image-distortion operator h to the nNum images of dimension nDim
// stored at hX, writing the result to hY, via DISTORTIMAGE_IMAGEOP.
// NOTE(review): lossy extraction — the double/float dispatch lines between
// the two Run*Ex2 calls are missing.
3188 public void DistortImage(
long h,
int nCount,
int nNum,
int nDim,
long hX,
long hY)
3191 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.DISTORTIMAGE_IMAGEOP,
null, m_param.AsLong(h, nCount, nNum, nDim, hX, hY));
3193 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.DISTORTIMAGE_IMAGEOP,
null, m_param.AsLong(h, nCount, nNum, nDim, hX, hY));
3201 #region ICudaDnn Methods
3213 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_STREAM, m_param.AsDouble((bNonBlocking) ? 1 : 0, nIndex));
3218 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_STREAM, m_param.AsFloat((bNonBlocking) ? 1 : 0, nIndex));
3230 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_STREAM, m_param.AsDouble(h));
3232 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_STREAM, m_param.AsFloat(h));
3242 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.SYNCRHONIZE_STREAM, m_param.AsDouble(h));
3244 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.SYNCRHONIZE_STREAM, m_param.AsFloat(h));
3253 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.SYNCHRONIZE_THREAD,
null);
3255 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.SYNCHRONIZE_THREAD,
null);
3267 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_CUDNN, m_param.AsDouble(hStream));
3272 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_CUDNN, m_param.AsFloat(hStream));
3284 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_CUDNN, m_param.AsDouble(h));
3286 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_CUDNN, m_param.AsFloat(h));
// Create an NCCL handle for multi-GPU communication: parameters are
// (deviceId, count, rank) followed by the GUID encoded as a numeric array
// (length-prefixed), passed to the CREATE_NCCL kernel call.
// NOTE(review): lossy extraction — the double/float dispatch and the return
// of the created handle are missing.
3297 public long CreateNCCL(
int nDeviceId,
int nCount,
int nRank, Guid guid)
3301 List<double> rgParam =
new List<double>() { nDeviceId, nCount, nRank };
3302 List<double> rgGuid = guidToArrayDouble(guid);
// Length-prefix the GUID segments so the native side knows how many follow.
3304 rgParam.Add(rgGuid.Count);
3305 rgParam.AddRange(rgGuid);
3307 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_NCCL, rgParam.ToArray());
// Float path mirrors the double path.
3312 List<float> rgParam =
new List<float>() { nDeviceId, nCount, nRank };
3313 List<float> rgGuid = guidToArrayFloat(guid);
3315 rgParam.Add(rgGuid.Count);
3316 rgParam.AddRange(rgGuid);
3318 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_NCCL, rgParam.ToArray());
3323 private List<double> guidToArrayDouble(Guid guid)
3325 List<double> rgdf =
new List<double>();
3326 string str = guid.ToString();
3327 string[] rgstr = str.Split(
'-');
3329 foreach (
string str1
in rgstr)
3331 long val = Convert.ToInt64(str1, 16);
3338 private List<float> guidToArrayFloat(Guid guid)
3340 List<double> rgDf = guidToArrayDouble(guid);
3341 List<float> rg =
new List<float>();
3343 foreach (
double df
in rgDf)
3358 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_NCCL, m_param.AsDouble(hNccl));
3360 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_NCCL, m_param.AsFloat(hNccl));
3374 List<double> rg =
new List<double>() { 0, rghNccl.Length };
3376 for (
int i = 0; i < rghNccl.Length; i++)
3381 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.NCCL_INIT_SINGLEPROCESS, rg.ToArray());
3385 List<float> rg =
new List<float>() { 0, rghNccl.Length };
3387 for (
int i = 0; i < rghNccl.Length; i++)
3392 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.NCCL_INIT_SINGLEPROCESS, rg.ToArray());
3406 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.NCCL_INIT_MULTIPROCESS, m_param.AsDouble(hNccl));
3408 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.NCCL_INIT_MULTIPROCESS, m_param.AsFloat(hNccl));
3423 Trace.WriteLine(
"Broadcasting from device ID " + GetDeviceID().ToString());
3425 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.NCCL_BROADCAST,
null, m_param.AsLong(hNccl, hStream, hX, nCount));
3427 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.NCCL_BROADCAST,
null, m_param.AsLong(hNccl, hStream, hX, nCount));
3445 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.NCCL_ALLREDUCE, m_param.AsDouble(dfScale), m_param.AsLong(hNccl, hStream, hX, nCount, (
int)op, 0));
3447 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.NCCL_ALLREDUCE, m_param.AsFloat((
float)dfScale), m_param.AsLong(hNccl, hStream, hX, nCount, (
int)op, 0));
3460 double[] rg = m_cuda.RunDoubleEx((
int)m_hKernel, (
int)CUDAFN.CREATE_EXTENSION,
null, strExtensionDllPath);
3465 float[] rg = m_cuda.RunFloatEx((
int)m_hKernel, (
int)CUDAFN.CREATE_EXTENSION,
null, strExtensionDllPath);
3477 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_EXTENSION, m_param.AsDouble(hExtension));
3479 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_EXTENSION, m_param.AsFloat(hExtension));
3493 List<double> rgdf =
new List<double>() { hExtension, lfnIdx };
3495 if (rgParam !=
null)
3498 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.EXTENSION_RUN, rgdf.ToArray());
3503 List<float> rgf =
new List<float>() { hExtension, lfnIdx };
3505 if (rgParam !=
null)
3506 rgf.AddRange(
Utility.ConvertVecF<T>(rgParam));
3508 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.EXTENSION_RUN, rgf.ToArray());
3522 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_TENSORDESC,
null);
3527 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_TENSORDESC,
null);
3539 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_TENSORDESC, m_param.AsDouble(h));
3541 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_TENSORDESC, m_param.AsFloat(h));
// Configure an N-dimensional cuDNN tensor descriptor: argument layout is
// (handle, halfFlag, rank, dims..., strides...), sent via SET_TENSORNDDESC.
// rgDim and rgStride must be the same length (one stride per dimension).
// NOTE(review): lossy extraction — the double/float dispatch lines between
// the two identical argument-marshalling sections are missing.
3551 public void SetTensorNdDesc(
long hHandle,
int[] rgDim,
int[] rgStride,
bool bHalf =
false)
3553 if (rgDim.Length != rgStride.Length)
3554 throw new Exception(
"The stride and dim arrays must have the same length.");
3558 List<long> rgArg =
new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3560 for (
int i = 0; i < rgDim.Length; i++)
3562 rgArg.Add(rgDim[i]);
3565 for (
int i = 0; i < rgStride.Length; i++)
3567 rgArg.Add(rgStride[i]);
3570 m_cuda.RunDoubleEx2((
int)m_hKernel, (int)CUDAFN.SET_TENSORNDDESC,
null, rgArg.ToArray());
// Float path rebuilds the same argument list.
3574 List<long> rgArg =
new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3576 for (
int i = 0; i < rgDim.Length; i++)
3578 rgArg.Add(rgDim[i]);
3581 for (
int i = 0; i < rgStride.Length; i++)
3583 rgArg.Add(rgStride[i]);
3586 m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.SET_TENSORNDDESC,
null, rgArg.ToArray());
3599 public void SetTensorDesc(
long hHandle,
int n,
int c,
int h,
int w,
bool bHalf =
false)
3602 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_TENSORDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3604 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_TENSORDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3620 public void SetTensorDesc(
long hHandle,
int n,
int c,
int h,
int w,
int nStride,
int cStride,
int hStride,
int wStride,
bool bHalf =
false)
3623 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_TENSORDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w, nStride, cStride, hStride, wStride));
3625 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_TENSORDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w, nStride, cStride, hStride, wStride));
3638 public void AddTensor(
long hCuDnn,
long hSrcDesc,
long hSrc,
int nSrcOffset,
long hDstDesc,
long hDst,
int nDstOffset)
3640 AddTensor(hCuDnn, m_tOne, hSrcDesc, hSrc, nSrcOffset, m_tOne, hDstDesc, hDst, nDstOffset);
3655 public void AddTensor(
long hCuDnn, T fAlpha,
long hSrcDesc,
long hSrc,
int nSrcOffset, T fBeta,
long hDstDesc,
long hDst,
int nDstOffset)
3658 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.ADD_TENSOR, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hSrcDesc, hSrc, nSrcOffset, 0, hDstDesc, hDst, nDstOffset));
3660 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ADD_TENSOR, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hSrcDesc, hSrc, nSrcOffset, 0, hDstDesc, hDst, nDstOffset));
3672 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_FILTERDESC,
null);
3677 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_FILTERDESC,
null);
3689 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_FILTERDESC, m_param.AsDouble(h));
3691 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_FILTERDESC, m_param.AsFloat(h));
3704 List<long> rgArg =
new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3706 for (
int i = 0; i < rgDim.Length; i++)
3708 rgArg.Add(rgDim[i]);
3711 m_cuda.RunDoubleEx2((
int)m_hKernel, (int)CUDAFN.SET_FILTERNDDESC,
null, rgArg.ToArray());
3715 List<long> rgArg =
new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3717 for (
int i = 0; i < rgDim.Length; i++)
3719 rgArg.Add(rgDim[i]);
3722 m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.SET_FILTERNDDESC,
null, rgArg.ToArray());
3735 public void SetFilterDesc(
long hHandle,
int n,
int c,
int h,
int w,
bool bHalf =
false)
3738 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_FILTERDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3740 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_FILTERDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3751 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_CONVDESC,
null);
3756 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_CONVDESC,
null);
3768 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_CONVDESC, m_param.AsDouble(h));
3770 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_CONVDESC, m_param.AsFloat(h));
3785 public void SetConvolutionDesc(
long hHandle,
int hPad,
int wPad,
int hStride,
int wStride,
int hDilation,
int wDilation,
bool bUseTensorCores,
bool bHalf =
false)
3788 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_CONVDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, hPad, wPad, hStride, wStride, hDilation, wDilation, (bUseTensorCores) ? 1 : 0));
3790 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_CONVDESC,
null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, hPad, wPad, hStride, wStride, hDilation, wDilation, (bUseTensorCores) ? 1 : 0));
// Query cuDNN for the best forward / backward-filter / backward-data
// convolution algorithms and their workspace sizes, under a workspace size
// limit, via GET_CONVINFO. Result layout: rg[1]/rg[3]/rg[5] are the three
// workspace sizes; rg[0]/rg[2]/rg[4] presumably carry the three algo enums
// (their assignments were dropped by the extraction — TODO confirm).
// Serialized via m_getconvSync, suggesting the underlying query is not
// thread-safe.
3810 public void GetConvolutionInfo(
long hCuDnn,
long hBottomDesc,
long hFilterDesc,
long hConvDesc,
long hTopDesc, ulong lWorkspaceSizeLimitInBytes,
bool bUseTensorCores, out
CONV_FWD_ALGO algoFwd, out ulong lWsSizeFwd, out
CONV_BWD_FILTER_ALGO algoBwdFilter, out ulong lWsSizeBwdFilter, out
CONV_BWD_DATA_ALGO algoBwdData, out ulong lWsSizeBwdData,
CONV_FWD_ALGO preferredFwdAlgo =
CONV_FWD_ALGO.NONE)
3812 lock (m_getconvSync)
3816 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GET_CONVINFO,
null, m_param.AsLong(hCuDnn, hBottomDesc, hFilterDesc, hConvDesc, hTopDesc, (
long)lWorkspaceSizeLimitInBytes, (bUseTensorCores) ? 1 : 0, (
int)preferredFwdAlgo));
3818 lWsSizeFwd = (ulong)rg[1];
3820 lWsSizeBwdFilter = (ulong)rg[3];
3822 lWsSizeBwdData = (ulong)rg[5];
// Float path mirrors the double path.
3826 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.GET_CONVINFO,
null, m_param.AsLong(hCuDnn, hBottomDesc, hFilterDesc, hConvDesc, hTopDesc, (
long)lWorkspaceSizeLimitInBytes, (bUseTensorCores) ? 1 : 0, (
int)preferredFwdAlgo));
3828 lWsSizeFwd = (ulong)rg[1];
3830 lWsSizeBwdFilter = (ulong)rg[3];
3832 lWsSizeBwdData = (ulong)rg[5];
/// <summary>
/// Perform a cuDNN convolution forward pass with alpha = 1 and beta = 0,
/// delegating to the full alpha/beta overload.
/// </summary>
/// <param name="hCuDnn">Handle to the cuDNN instance.</param>
/// <param name="hBottomDesc">Handle to the bottom (input) tensor descriptor.</param>
/// <param name="hBottomData">Handle to the bottom data in GPU memory.</param>
/// <param name="nBottomOffset">Element offset into the bottom data.</param>
/// <param name="hFilterDesc">Handle to the filter descriptor.</param>
/// <param name="hWeight">Handle to the filter weights in GPU memory.</param>
/// <param name="nWeightOffset">Element offset into the weights.</param>
/// <param name="hConvDesc">Handle to the convolution descriptor.</param>
/// <param name="algoFwd">Forward algorithm to use.</param>
/// <param name="hWorkspace">Handle to the workspace memory in GPU memory.</param>
/// <param name="nWorkspaceOffset">Element offset into the workspace.</param>
/// <param name="lWorkspaceSize">Workspace size in bytes.</param>
/// <param name="hTopDesc">Handle to the top (output) tensor descriptor.</param>
/// <param name="hTopData">Handle to the top data in GPU memory.</param>
/// <param name="nTopOffset">Element offset into the top data.</param>
/// <param name="bSyncStream">When true (default), synchronize the stream after the operation.</param>
public void ConvolutionForward(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream = true)
{
    // BUG FIX: the original forwarded hWeight/nWeightOffset in the
    // hWorkspace/nWorkspaceOffset argument positions, silently ignoring the
    // caller-supplied workspace (compare the correct forwarding in
    // ConvolutionBackwardFilter and ConvolutionBackwardData).
    ConvolutionForward(hCuDnn, m_tOne, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, algoFwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, m_tZero, hTopDesc, hTopData, nTopOffset, bSyncStream);
}
3882 public void ConvolutionForward(
long hCuDnn, T fAlpha,
long hBottomDesc,
long hBottomData,
int nBottomOffset,
long hFilterDesc,
long hWeight,
int nWeightOffset,
long hConvDesc,
CONV_FWD_ALGO algoFwd,
long hWorkspace,
int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta,
long hTopDesc,
long hTopData,
int nTopOffset,
bool bSyncStream =
true)
3885 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FWD_CONV, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, (
long)algoFwd, hWorkspace, nWorkspaceOffset, (
long)lWorkspaceSize, 0, hTopDesc, hTopData, nTopOffset, (bSyncStream) ? 1 : 0));
3887 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FWD_CONV, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, (
long)algoFwd, hWorkspace, nWorkspaceOffset, (
long)lWorkspaceSize, 0, hTopDesc, hTopData, nTopOffset, (bSyncStream) ? 1 : 0));
// Convolution backward pass w.r.t. bias with alpha = 1, delegating to the
// full alpha/beta overload. Note it passes m_tOne for beta as well
// (presumably accumulating into the existing bias diff rather than
// overwriting it — TODO confirm against the alpha/beta overload's semantics).
3901 public void ConvolutionBackwardBias(
long hCuDnn,
long hTopDesc,
long hTopDiff,
int nTopOffset,
long hBiasDesc,
long hBiasDiff,
int nBiasOffset,
bool bSyncStream =
true)
3903 ConvolutionBackwardBias(hCuDnn, m_tOne, hTopDesc, hTopDiff, nTopOffset, m_tOne, hBiasDesc, hBiasDiff, nBiasOffset, bSyncStream);
3919 public void ConvolutionBackwardBias(
long hCuDnn, T fAlpha,
long hTopDesc,
long hTopDiff,
int nTopOffset, T fBeta,
long hBiasDesc,
long hBiasDiff,
int nBiasOffset,
bool bSyncStream =
true)
3922 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_CONV_BIAS, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDesc, hTopDiff, nTopOffset, 0, hBiasDesc, hBiasDiff, nBiasOffset, (bSyncStream) ? 1 : 0));
3924 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_CONV_BIAS, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDesc, hTopDiff, nTopOffset, 0, hBiasDesc, hBiasDiff, nBiasOffset, (bSyncStream) ? 1 : 0));
// Convolution backward pass w.r.t. the filter weights with alpha = 1 and
// beta = 1 (accumulating into hWeightDiff), delegating to the full
// alpha/beta overload with the caller's workspace forwarded correctly.
3946 public void ConvolutionBackwardFilter(
long hCuDnn,
long hBottomDesc,
long hBottomData,
int nBottomOffset,
long hTopDesc,
long hTopDiff,
int nTopOffset,
long hConvDesc,
CONV_BWD_FILTER_ALGO algoBwd,
long hWorkspace,
int nWorkspaceOffset, ulong lWorkspaceSize,
long hFilterDesc,
long hWeightDiff,
int nWeightOffset,
bool bSyncStream)
3948 ConvolutionBackwardFilter(hCuDnn, m_tOne, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, algoBwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, m_tOne, hFilterDesc, hWeightDiff, nWeightOffset, bSyncStream);
3972 public void ConvolutionBackwardFilter(
long hCuDnn, T fAlpha,
long hBottomDesc,
long hBottomData,
int nBottomOffset,
long hTopDesc,
long hTopDiff,
int nTopOffset,
long hConvDesc,
CONV_BWD_FILTER_ALGO algoBwd,
long hWorkspace,
int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta,
long hFilterDesc,
long hWeightDiff,
int nWeightOffset,
bool bSyncStream =
true)
3975 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_CONV_FILTER, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (
long)algoBwd, hWorkspace, nWorkspaceOffset, (
long)lWorkspaceSize, 0, hFilterDesc, hWeightDiff, nWeightOffset, (bSyncStream) ? 1 : 0));
3977 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_CONV_FILTER, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (
long)algoBwd, hWorkspace, nWorkspaceOffset, (
long)lWorkspaceSize, 0, hFilterDesc, hWeightDiff, nWeightOffset, (bSyncStream) ? 1 : 0));
// Convolution backward pass w.r.t. the input data with alpha = 1 and
// beta = 0 (overwriting hBottomDiff), delegating to the full alpha/beta
// overload with the caller's workspace forwarded correctly.
3999 public void ConvolutionBackwardData(
long hCuDnn,
long hFilterDesc,
long hWeight,
int nWeightOffset,
long hTopDesc,
long hTopDiff,
int nTopOffset,
long hConvDesc,
CONV_BWD_DATA_ALGO algoBwd,
long hWorkspace,
int nWorkspaceOffset, ulong lWorkspaceSize,
long hBottomDesc,
long hBottomDiff,
int nBottomOffset,
bool bSyncStream =
true)
4001 ConvolutionBackwardData(hCuDnn, m_tOne, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, algoBwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, m_tZero, hBottomDesc, hBottomDiff, nBottomOffset, bSyncStream);
// Dispatches the cuDNN convolution backward-data pass (BWD_CONV_DATA): computes the
// bottom gradient (hBottomDiff) from the weights and top diffs using the chosen
// algorithm and caller-provided workspace.
// NOTE(review): the enclosing braces and the double/float dispatch conditional were
// dropped by extraction - restore from the full file before compiling.
4025 public void ConvolutionBackwardData(
long hCuDnn, T fAlpha,
long hFilterDesc,
long hWeight,
int nWeightOffset,
long hTopDesc,
long hTopDiff,
int nTopOffset,
long hConvDesc,
CONV_BWD_DATA_ALGO algoBwd,
long hWorkspace,
int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta,
long hBottomDesc,
long hBottomDiff,
int nBottomOffset,
bool bSyncStream =
true)
// Double-precision path.
4028 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_CONV_DATA, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (
long)algoBwd, hWorkspace, nWorkspaceOffset, (
long)lWorkspaceSize, 0, hBottomDesc, hBottomDiff, nBottomOffset, (bSyncStream) ? 1 : 0));
// Single-precision path.
4030 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_CONV_DATA, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (
long)algoBwd, hWorkspace, nWorkspaceOffset, (
long)lWorkspaceSize, 0, hBottomDesc, hBottomDiff, nBottomOffset, (bSyncStream) ? 1 : 0));
// Fragment (signatures lost in extraction): presumably the bodies of
// CreatePoolingDesc (CREATE_POOLDESC, returning the new handle from rg[0]) and
// FreePoolingDesc (FREE_POOLDESC, releasing handle h) - TODO confirm against the
// full file.
4041 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_POOLDESC,
null);
4046 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_POOLDESC,
null);
// Free the pooling descriptor handle h (double, then float path).
4058 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_POOLDESC, m_param.AsDouble(h));
4060 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_POOLDESC, m_param.AsFloat(h));
// Fragment (signature lost in extraction): presumably the body of SetPoolingDesc -
// configures pooling descriptor hHandle with method, window (h, w), padding
// (hPad, wPad) and stride (hStride, wStride). TODO confirm against the full file.
4077 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_POOLDESC,
null, m_param.AsLong(hHandle, (
int)method, h, w, hPad, wPad, hStride, wStride));
// Single-precision path (same arguments).
4079 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_POOLDESC,
null, m_param.AsLong(hHandle, (
int)method, h, w, hPad, wPad, hStride, wStride));
// Dispatches the cuDNN pooling forward pass (FWD_POOL) from bottom to top using the
// pooling descriptor; fAlpha/fBeta are the cuDNN blend scalars.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4093 public void PoolingForward(
long hCuDnn,
long hPoolingDesc, T fAlpha,
long hBottomDesc,
long hBottomData, T fBeta,
long hTopDesc,
long hTopData)
// Double-precision path.
4096 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FWD_POOL, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
// Single-precision path.
4098 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FWD_POOL, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
// Dispatches the cuDNN pooling backward pass (BWD_POOL): computes hBottomDiff from
// top data/diff and bottom data under the given pooling descriptor.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4116 public void PoolingBackward(
long hCuDnn,
long hPoolingDesc, T fAlpha,
long hTopDataDesc,
long hTopData,
long hTopDiffDesc,
long hTopDiff,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4119 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_POOL, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Single-precision path.
4121 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_POOL, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Fragment (signature lost in extraction): presumably the body of a
// DeriveBatchNormDesc-style method (DERIVE_BNDESC) relating the forward and backward
// scale/bias/mean/var descriptors for the given BATCHNORM_MODE - TODO confirm.
4135 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.DERIVE_BNDESC,
null, m_param.AsLong(hFwdScaleBiasMeanVarDesc, hFwdBottomDesc, hBwdScaleBiasMeanVarDesc, hBwdBottomDesc, (
int)mode));
// Single-precision path (same arguments).
4137 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.DERIVE_BNDESC,
null, m_param.AsLong(hFwdScaleBiasMeanVarDesc, hFwdBottomDesc, hBwdScaleBiasMeanVarDesc, hBwdBottomDesc, (
int)mode));
// Dispatches the cuDNN batch-norm forward pass (FWD_BN). Passes scale/bias, running
// mean/var (hGlobalMean/hGlobalVar with exponential-average factor dfFactor), epsilon,
// and the saved mean / inverse-variance buffers; bTraining selects the training vs
// inference kernel behavior.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4161 public void BatchNormForward(
long hCuDnn,
BATCHNORM_MODE mode, T fAlpha, T fBeta,
long hFwdBottomDesc,
long hBottomData,
long hFwdTopDesc,
long hTopData,
long hFwdScaleBiasMeanVarDesc,
long hScaleData,
long hBiasData,
double dfFactor,
long hGlobalMean,
long hGlobalVar,
double dfEps,
long hSaveMean,
long hSaveInvVar,
bool bTraining)
// Double-precision path.
4164 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FWD_BN, m_param.AsDouble(convertD(fAlpha), convertD(fBeta), dfFactor, dfEps), m_param.AsLong(hCuDnn, (
int)mode, 0, 0, hFwdBottomDesc, hBottomData, hFwdTopDesc, hTopData, hFwdScaleBiasMeanVarDesc, hScaleData, hBiasData, 0, hGlobalMean, hGlobalVar, 0, hSaveMean, hSaveInvVar, (bTraining) ? 1 : 0));
// Single-precision path (factor/eps down-cast to float).
4166 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FWD_BN, m_param.AsFloat(convertF(fAlpha), convertF(fBeta), (
float)dfFactor, (
float)dfEps), m_param.AsLong(hCuDnn, (
int)mode, 0, 0, hFwdBottomDesc, hBottomData, hFwdTopDesc, hTopData, hFwdScaleBiasMeanVarDesc, hScaleData, hBiasData, 0, hGlobalMean, hGlobalVar, 0, hSaveMean, hSaveInvVar, (bTraining) ? 1 : 0));
// Dispatches the cuDNN batch-norm backward pass (BWD_BN): computes bottom, scale and
// bias gradients using the mean/inv-variance saved during the forward pass. Separate
// alpha/beta blend pairs are supplied for the data gradient and the parameter
// gradients.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4191 public void BatchNormBackward(
long hCuDnn,
BATCHNORM_MODE mode, T fAlphaDiff, T fBetaDiff, T fAlphaParamDiff, T fBetaParamDiff,
long hBwdBottomDesc,
long hBottomData,
long hTopDiffDesc,
long hTopDiff,
long hBottomDiffDesc,
long hBottomDiff,
long hBwdScaleBiasMeanVarDesc,
long hScaleData,
long hScaleDiff,
long hBiasDiff,
double dfEps,
long hSaveMean,
long hSaveInvVar)
// Double-precision path.
4194 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_BN, m_param.AsDouble(convertD(fAlphaDiff), convertD(fBetaDiff), convertD(fAlphaParamDiff), convertD(fBetaParamDiff), dfEps), m_param.AsLong(hCuDnn, (
int)mode, 0, 0, 0, 0, hBwdBottomDesc, hBottomData, hTopDiffDesc, hTopDiff, hBottomDiffDesc, hBottomDiff, hBwdScaleBiasMeanVarDesc, hScaleData, hScaleDiff, hBiasDiff, 0, hSaveMean, hSaveInvVar));
// Single-precision path (eps down-cast to float).
4196 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_BN, m_param.AsFloat(convertF(fAlphaDiff), convertF(fBetaDiff), convertF(fAlphaParamDiff), convertF(fBetaParamDiff), (
float)dfEps), m_param.AsLong(hCuDnn, (
int)mode, 0, 0, 0, 0, hBwdBottomDesc, hBottomData, hTopDiffDesc, hTopDiff, hBottomDiffDesc, hBottomDiff, hBwdScaleBiasMeanVarDesc, hScaleData, hScaleDiff, hBiasDiff, 0, hSaveMean, hSaveInvVar));
// Fragment (signatures lost in extraction): presumably the bodies of
// CreateDropoutDesc (CREATE_DROPOUTDESC, new handle from rg[0]) and FreeDropoutDesc
// (FREE_DROPOUTDESC, releasing handle h) - TODO confirm against the full file.
4207 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_DROPOUTDESC,
null);
4212 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_DROPOUTDESC,
null);
// Free the dropout descriptor handle h (double, then float path).
4224 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_DROPOUTDESC, m_param.AsDouble(h));
4226 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_DROPOUTDESC, m_param.AsFloat(h));
// Configures dropout descriptor hDropoutDesc (SET_DROPOUTDESC) with the dropout
// probability, a kernel-side states buffer handle and the RNG seed.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4237 public void SetDropoutDesc(
long hCuDnn,
long hDropoutDesc,
double dfDropout,
long hStates,
long lSeed)
// Double-precision path.
4240 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_DROPOUTDESC, m_param.AsDouble(dfDropout), m_param.AsLong(hCuDnn, hDropoutDesc, 0, hStates, lSeed));
// Single-precision path (probability down-cast to float).
4242 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_DROPOUTDESC, m_param.AsFloat((
float)dfDropout), m_param.AsLong(hCuDnn, hDropoutDesc, 0, hStates, lSeed));
// Queries dropout buffer sizes (GET_DROPOUT_INFO). The kernel returns byte counts in
// rg[0]/rg[1]; these are divided by sizeof(element) and rounded away from zero to
// produce element counts for the states and reserved buffers.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4252 public void GetDropoutInfo(
long hCuDnn,
long hBottomDesc, out ulong ulStateCount, out ulong ulReservedCount)
// Double-precision path: counts are in sizeof(double) units.
4256 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GET_DROPOUT_INFO,
null, m_param.AsLong(hCuDnn, hBottomDesc));
4257 ulStateCount = (ulong)Math.Round(rg[0] /
sizeof(
double), 0, MidpointRounding.AwayFromZero);
4258 ulReservedCount = (ulong)Math.Round(rg[1] /
sizeof(
double), 0, MidpointRounding.AwayFromZero);
// Single-precision path: counts are in sizeof(float) units.
4262 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.GET_DROPOUT_INFO,
null, m_param.AsLong(hCuDnn, hBottomDesc));
4263 ulStateCount = (ulong)Math.Round(rg[0] /
sizeof(
float), 0, MidpointRounding.AwayFromZero);
4264 ulReservedCount = (ulong)Math.Round(rg[1] /
sizeof(
float), 0, MidpointRounding.AwayFromZero);
// Dispatches the cuDNN dropout forward pass (FWD_DROPOUT) from bottom to top, using
// the reserved buffer handle for state carried to the backward pass.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4278 public void DropoutForward(
long hCuDnn,
long hDropoutDesc,
long hBottomDesc,
long hBottomData,
long hTopDesc,
long hTopData,
long hReserved)
// Double-precision path.
4281 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FWD_DROPOUT,
null, m_param.AsLong(hCuDnn, hDropoutDesc, hBottomDesc, hBottomData, hTopDesc, hTopData, hReserved));
// Single-precision path.
4283 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FWD_DROPOUT,
null, m_param.AsLong(hCuDnn, hDropoutDesc, hBottomDesc, hBottomData, hTopDesc, hTopData, hReserved));
// Dispatches the cuDNN dropout backward pass (BWD_DROPOUT) from top to bottom using
// the reserved state captured during the forward pass.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4296 public void DropoutBackward(
long hCuDnn,
long hDropoutDesc,
long hTopDesc,
long hTop,
long hBottomDesc,
long hBottom,
long hReserved)
// Double-precision path.
4299 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_DROPOUT,
null, m_param.AsLong(hCuDnn, hDropoutDesc, hTopDesc, hTop, hBottomDesc, hBottom, hReserved));
// Single-precision path.
4301 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_DROPOUT,
null, m_param.AsLong(hCuDnn, hDropoutDesc, hTopDesc, hTop, hBottomDesc, hBottom, hReserved));
// Fragment (signatures lost in extraction): presumably the bodies of CreateLRNDesc
// (CREATE_LRNDESC, new handle from rg[0]) and FreeLRNDesc (FREE_LRNDESC, releasing
// handle h) - TODO confirm against the full file.
4312 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_LRNDESC,
null);
4317 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_LRNDESC,
null);
// Free the LRN descriptor handle h (double, then float path).
4329 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_LRNDESC, m_param.AsDouble(h));
4331 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_LRNDESC, m_param.AsFloat(h));
// Configures LRN descriptor hHandle (SET_LRNDESC) with local size nSize and the
// alpha/beta/k normalization parameters.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4342 public void SetLRNDesc(
long hHandle, uint nSize,
double fAlpha,
double fBeta,
double fK)
// Double-precision path.
4345 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_LRNDESC, m_param.AsDouble(fAlpha, fBeta, fK), m_param.AsLong(hHandle, nSize, 0, 0, 0));
// Single-precision path (parameters down-cast to float).
4347 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_LRNDESC, m_param.AsFloat((
float)fAlpha, (
float)fBeta, (
float)fK), m_param.AsLong(hHandle, nSize, 0, 0, 0));
// Dispatches the cuDNN LRN cross-channel forward pass (LRN_CC_FWD) from bottom to top
// under the given normalization descriptor.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4361 public void LRNCrossChannelForward(
long hCuDnn,
long hNormDesc, T fAlpha,
long hBottomDesc,
long hBottomData, T fBeta,
long hTopDesc,
long hTopData)
// Double-precision path.
4364 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.LRN_CC_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
// Single-precision path.
4366 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.LRN_CC_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
// Dispatches the cuDNN LRN cross-channel backward pass (LRN_CC_BWD): computes
// hBottomDiff from top data/diff and bottom data.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4384 public void LRNCrossChannelBackward(
long hCuDnn,
long hNormDesc, T fAlpha,
long hTopDataDesc,
long hTopData,
long hTopDiffDesc,
long hTopDiff,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4387 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.LRN_CC_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Single-precision path.
4389 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.LRN_CC_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Dispatches the divisive normalization forward pass (LCN_CC_FWD); hTemp1/hTemp2 are
// scratch memory handles used by the kernel.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4408 public void DivisiveNormalizationForward(
long hCuDnn,
long hNormDesc, T fAlpha,
long hBottomDataDesc,
long hBottomData,
long hTemp1,
long hTemp2, T fBeta,
long hTopDataDesc,
long hTopData)
// Double-precision path.
4411 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.LCN_CC_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTemp1, hTemp2, 0, hTopDataDesc, hTopData));
// Single-precision path.
4413 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.LCN_CC_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTemp1, hTemp2, 0, hTopDataDesc, hTopData));
// Dispatches the divisive normalization backward pass (LCN_CC_BWD): computes
// hBottomDiff from bottom data and top diff, using hTemp1/hTemp2 as scratch.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4433 public void DivisiveNormalizationBackward(
long hCuDnn,
long hNormDesc, T fAlpha,
long hBottomDataDesc,
long hBottomData,
long hTopDiff,
long hTemp1,
long hTemp2, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4436 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.LCN_CC_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTopDiff, hTemp1, hTemp2, 0, hBottomDiffDesc, hBottomDiff));
// Single-precision path.
4438 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.LCN_CC_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTopDiff, hTemp1, hTemp2, 0, hBottomDiffDesc, hBottomDiff));
// Dispatches the cuDNN tanh activation forward pass (TANH_FWD) from bottom to top.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4451 public void TanhForward(
long hCuDnn, T fAlpha,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hTopDataDesc,
long hTopData)
// Double-precision path.
4454 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.TANH_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Single-precision path.
4456 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.TANH_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Dispatches the cuDNN tanh activation backward pass (TANH_BWD): computes hBottomDiff
// from top data/diff and bottom data.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4473 public void TanhBackward(
long hCuDnn, T fAlpha,
long hTopDataDesc,
long hTopData,
long hTopDiffDesc,
long hTopDiff,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4476 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.TANH_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Single-precision path.
4478 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.TANH_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Dispatches the cuDNN ELU activation forward pass (ELU_FWD) from bottom to top.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4491 public void EluForward(
long hCuDnn, T fAlpha,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hTopDataDesc,
long hTopData)
// Double-precision path.
4494 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.ELU_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Single-precision path.
4496 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ELU_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Dispatches the cuDNN ELU activation backward pass (ELU_BWD): computes hBottomDiff
// from top data/diff and bottom data.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4513 public void EluBackward(
long hCuDnn, T fAlpha,
long hTopDataDesc,
long hTopData,
long hTopDiffDesc,
long hTopDiff,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4516 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.ELU_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Single-precision path.
4518 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.ELU_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Dispatches the cuDNN sigmoid activation forward pass (SIGMOID_FWD) from bottom to
// top.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4531 public void SigmoidForward(
long hCuDnn, T fAlpha,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hTopDataDesc,
long hTopData)
// Double-precision path.
4534 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SIGMOID_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Single-precision path.
4536 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SIGMOID_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Dispatches the cuDNN sigmoid activation backward pass (SIGMOID_BWD): computes
// hBottomDiff from top data/diff and bottom data.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4553 public void SigmoidBackward(
long hCuDnn, T fAlpha,
long hTopDataDesc,
long hTopData,
long hTopDiffDesc,
long hTopDiff,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4556 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SIGMOID_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Single-precision path.
4558 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SIGMOID_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Dispatches the cuDNN ReLU activation forward pass (RELU_FWD) from bottom to top.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4576 public void ReLUForward(
long hCuDnn, T fAlpha,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hTopDataDesc,
long hTopData)
// Double-precision path.
4579 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RELU_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Single-precision path.
4581 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.RELU_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData));
// Dispatches the cuDNN ReLU activation backward pass (RELU_BWD): computes hBottomDiff
// from top data/diff and bottom data.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4598 public void ReLUBackward(
long hCuDnn, T fAlpha,
long hTopDataDesc,
long hTopData,
long hTopDiffDesc,
long hTopDiff,
long hBottomDataDesc,
long hBottomData, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4601 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RELU_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Single-precision path.
4603 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.RELU_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
// Fragment (signature lost in extraction): presumably the body of SoftmaxForward
// (SOFTMAX_FWD) - runs softmax from bottom to top with the given algorithm and mode.
// TODO confirm against the full file.
4621 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SOFTMAX_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData, (
int)alg, (
int)mode));
// Single-precision path.
4623 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SOFTMAX_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData, (
int)alg, (
int)mode));
// Dispatches the cuDNN softmax backward pass (SOFTMAX_BWD): computes hBottomDiff from
// top data/diff for the given softmax algorithm and mode.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4640 public void SoftmaxBackward(
long hCuDnn,
SOFTMAX_ALGORITHM alg,
SOFTMAX_MODE mode, T fAlpha,
long hTopDataDesc,
long hTopData,
long hTopDiffDesc,
long hTopDiff, T fBeta,
long hBottomDiffDesc,
long hBottomDiff)
// Double-precision path.
4643 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SOFTMAX_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, 0, hBottomDiffDesc, hBottomDiff, (
int)alg, (
int)mode));
// Single-precision path.
4645 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SOFTMAX_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, 0, hBottomDiffDesc, hBottomDiff, (
int)alg, (
int)mode));
// Fragment (signatures lost in extraction): presumably the bodies of
// CreateRnnDataDesc and FreeRnnDataDesc. Each selects the extended (…EX) or legacy
// kernel function id based on m_bEnableRnnExtendedVersion. TODO confirm against the
// full file.
4654 int nFn = (m_bEnableRnnExtendedVersion) ? (
int)CUDAFN.CREATE_RNN_DATA_DESCEX : (int)CUDAFN.CREATE_RNN_DATA_DESC;
4658 double[] rg = m_cuda.RunDouble((
int)m_hKernel, nFn,
null);
4663 float[] rg = m_cuda.RunFloat((
int)m_hKernel, nFn,
null);
// Free the RNN data descriptor handle h (extended or legacy variant).
4674 int nFn = (m_bEnableRnnExtendedVersion) ? (
int)CUDAFN.FREE_RNN_DATA_DESCEX : (int)CUDAFN.FREE_RNN_DATA_DESC;
4677 m_cuda.RunDouble((
int)m_hKernel, nFn, m_param.AsDouble(h));
4679 m_cuda.RunFloat((
int)m_hKernel, nFn, m_param.AsFloat(h));
// Configures an RNN data descriptor: layout, max sequence length, batch size, vector
// size, optional bidirectional flag and optional per-sequence lengths. Only the
// extended kernel path supports layouts other than RNN_SEQ_MAJOR_UNPACKED. The
// variable-length per-sequence values are appended after the fixed arguments.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
4692 public void SetRnnDataDesc(
long hRnnDataDesc,
RNN_DATALAYOUT layout,
int nMaxSeqLen,
int nBatchSize,
int nVectorSize,
bool bBidirectional =
false,
int[] rgSeqLen =
null)
// Guard: legacy (non-extended) kernel only accepts sequence-major unpacked layout.
4694 if (!m_bEnableRnnExtendedVersion && layout !=
RNN_DATALAYOUT.RNN_SEQ_MAJOR_UNPACKED)
4695 throw new Exception(
"The non-extended functions only support RNN_SEQ_MAJOR ordering.")
4697 int nFn = (m_bEnableRnnExtendedVersion) ? (
int)CUDAFN.SET_RNN_DATA_DESCEX : (int)CUDAFN.SET_RNN_DATA_DESC;
// Double-precision path: fixed args then optional per-sequence lengths.
4701 List<long> rgArg =
new List<long>() { hRnnDataDesc, (long)layout, nMaxSeqLen, nBatchSize, nVectorSize, (bBidirectional) ? 1 : 0 };
4703 if (rgSeqLen !=
null)
4705 for (
int i = 0; i < rgSeqLen.Length; i++)
4707 rgArg.Add(rgSeqLen[i]);
4711 m_cuda.RunDoubleEx2((
int)m_hKernel, nFn,
null, rgArg.ToArray());
// Single-precision path: same argument construction.
4715 List<long> rgArg =
new List<long>() { hRnnDataDesc, (long)layout, nMaxSeqLen, nBatchSize, nVectorSize, (bBidirectional) ? 1 : 0 };
4717 if (rgSeqLen !=
null)
4719 for (
int i = 0; i < rgSeqLen.Length; i++)
4721 rgArg.Add(rgSeqLen[i]);
4725 m_cuda.RunFloatEx2((
int)m_hKernel, nFn,
null, rgArg.ToArray());
// Fragment (signatures lost in extraction): presumably the bodies of CreateRnnDesc
// (CREATE_RNN_DESC, new handle from rg[0]) and FreeRnnDesc (FREE_RNN_DESC, releasing
// handle h) - TODO confirm against the full file.
4737 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CREATE_RNN_DESC,
null);
4742 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CREATE_RNN_DESC,
null);
// Free the RNN descriptor handle h (double, then float path).
4754 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.FREE_RNN_DESC, m_param.AsDouble(h));
4756 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.FREE_RNN_DESC, m_param.AsFloat(h));
// Fragment (signature lost in extraction): presumably the body of SetRnnDesc
// (SET_RNN_DESC) - configures hidden count, layer count, dropout descriptor, mode,
// tensor-core usage and direction. TODO confirm against the full file.
4773 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.SET_RNN_DESC,
null, m_param.AsLong(hCuDnn, hRnnDesc, nHiddenCount, nNumLayers, hDropoutDesc, (
int)mode, (bUseTensorCores) ? 1 : 0, (
long)direction));
// Single-precision path (same arguments).
4775 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.SET_RNN_DESC,
null, m_param.AsLong(hCuDnn, hRnnDesc, nHiddenCount, nNumLayers, hDropoutDesc, (
int)mode, (bUseTensorCores) ? 1 : 0, (
long)direction));
// Fragment (signature and return lost in extraction): presumably the body of
// GetRnnParamCount (GET_RNN_PARAMCOUNT); the extended-version flag is passed as the
// final argument. TODO confirm against the full file.
4789 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GET_RNN_PARAMCOUNT,
null, m_param.AsLong(hCuDnn, hRnnDesc, hXDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0));
// Single-precision path.
4794 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.GET_RNN_PARAMCOUNT,
null, m_param.AsLong(hCuDnn, hRnnDesc, hXDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0));
// Fragment (signature lost in extraction): presumably the body of
// GetRnnWorkspaceCount (GET_RNN_WORKSPACECOUNT) - returns the workspace count (rg[0])
// and writes the reserved count (rg[1]) to the out parameter. TODO confirm.
4811 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GET_RNN_WORKSPACECOUNT,
null, m_param.AsLong(hCuDnn, hRnnDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0, hXDesc));
4812 nReservedCount = (ulong)rg[1];
4813 return (ulong)rg[0];
// Single-precision path (same result layout).
4817 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.GET_RNN_WORKSPACECOUNT,
null, m_param.AsLong(hCuDnn, hRnnDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0, hXDesc));
4818 nReservedCount = (ulong)rg[1];
4819 return (ulong)rg[0];
// Queries a specific RNN linear layer's parameters (GET_RNN_LINLAYERPARAMS): returns
// weight count/handle and bias count/handle via out parameters. Result layout:
// rg[0]=weight count, rg[1]=weight handle, rg[2]=bias count, rg[3]=bias handle.
// NOTE(review): enclosing braces, the double/float dispatch conditional and the
// hWt assignment lines (orig. 4843/4851) were dropped by extraction.
4837 public void GetRnnLinLayerParams(
long hCuDnn,
long hRnnDesc,
int nLayer,
long hXDesc,
long hWtDesc,
long hWtData,
int nLinLayer, out
int nWtCount, out
long hWt, out
int nBiasCount, out
long hBias)
// Double-precision path.
4841 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.GET_RNN_LINLAYERPARAMS,
null, m_param.AsLong(hCuDnn, hRnnDesc, nLayer, hXDesc, hWtDesc, hWtData, nLinLayer, (m_bEnableRnnExtendedVersion) ? 1 : 0));
4842 nWtCount = (int)rg[0];
4844 nBiasCount = (int)rg[2];
4845 hBias = (long)rg[3];
// Single-precision path (same result layout).
4849 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.GET_RNN_LINLAYERPARAMS,
null, m_param.AsLong(hCuDnn, hRnnDesc, nLayer, hXDesc, hWtDesc, hWtData, nLinLayer, (m_bEnableRnnExtendedVersion) ? 1 : 0));
4850 nWtCount = (int)rg[0];
4852 nBiasCount = (int)rg[2];
4853 hBias = (long)rg[3];
// Dispatches the RNN forward pass (FWD_RNN): input x, hidden/cell state (hx/cx),
// weights, outputs y and final states (hy/cy), plus workspace and reserved buffers;
// bTraining selects training vs inference behavior kernel-side.
// NOTE(review): extraction dropped the enclosing braces, the double/float dispatch
// conditional, the rgArg.Add lines for the descriptor/data handles (orig. 4886-4905 /
// 4920-4939) and the extended-version branch bodies (after orig. 4912/4946) -
// restore from the full file before compiling.
4881 public void RnnForward(
long hCuDnn,
long hRnnDesc,
long hXDesc,
long hXData,
long hHxDesc,
long hHxData,
long hCxDesc,
long hCxData,
long hWtDesc,
long hWtData,
long hYDesc,
long hYData,
long hHyDesc,
long hHyData,
long hCyDesc,
long hCyData,
long hWorkspace, ulong nWsCount,
long hReserved, ulong nResCount,
bool bTraining)
// Double-precision path: build the variable argument list.
4885 List<long> rgArg =
new List<long>() { hCuDnn, hRnnDesc };
4906 rgArg.Add(hWorkspace);
4907 rgArg.Add((
long)nWsCount);
4908 rgArg.Add(hReserved);
4909 rgArg.Add((
long)nResCount);
4910 rgArg.Add((bTraining) ? 1 : 0);
4912 if (m_bEnableRnnExtendedVersion)
4915 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.FWD_RNN,
null, rgArg.ToArray());
// Single-precision path: same argument construction.
4919 List<long> rgArg =
new List<long>() { hCuDnn, hRnnDesc };
4940 rgArg.Add(hWorkspace);
4941 rgArg.Add((
long)nWsCount);
4942 rgArg.Add(hReserved);
4943 rgArg.Add((
long)nResCount);
4944 rgArg.Add((bTraining) ? 1 : 0);
4946 if (m_bEnableRnnExtendedVersion)
4949 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.FWD_RNN,
null, rgArg.ToArray());
// Dispatches the RNN backward-data pass (BWD_RNN_DATA): computes input and state
// gradients (hXDiff, hHxDiff, hCxDiff) from output data/diffs, weights and saved
// state, using the workspace and reserved buffers from the forward pass.
// NOTE(review): extraction dropped the enclosing braces, the double/float dispatch
// conditional and most rgArg.Add lines (orig. 4986-5016 / 5025-5055 are only
// partially present) - restore from the full file before compiling.
4981 public void RnnBackwardData(
long hCuDnn,
long hRnnDesc,
long hYDesc,
long hYData,
long hYDiff,
long hHyDesc,
long hHyDiff,
long hCyDesc,
long hCyDiff,
long hWtDesc,
long hWtData,
long hHxDesc,
long hHxData,
long hCxDesc,
long hCxData,
long hXDesc,
long hXDiff,
long hdHxDesc,
long hHxDiff,
long hdCxDesc,
long hCxDiff,
long hWorkspace, ulong nWsCount,
long hReserved, ulong nResCount)
// Double-precision path: build the variable argument list.
4985 List<long> rgArg =
new List<long>() { hCuDnn, hRnnDesc };
5007 rgArg.Add(hdHxDesc);
5009 rgArg.Add(hdCxDesc);
5012 rgArg.Add(hWorkspace);
5013 rgArg.Add((
long)nWsCount);
5014 rgArg.Add(hReserved);
5015 rgArg.Add((
long)nResCount);
5017 if (m_bEnableRnnExtendedVersion)
5020 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_RNN_DATA,
null, rgArg.ToArray());
// Single-precision path: same argument construction.
5024 List<long> rgArg =
new List<long>() { hCuDnn, hRnnDesc };
5046 rgArg.Add(hdHxDesc);
5048 rgArg.Add(hdCxDesc);
5051 rgArg.Add(hWorkspace);
5052 rgArg.Add((
long)nWsCount);
5053 rgArg.Add(hReserved);
5054 rgArg.Add((
long)nResCount);
5056 if (m_bEnableRnnExtendedVersion)
5059 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_RNN_DATA,
null, rgArg.ToArray());
// Dispatches the RNN backward-weights pass (BWD_RNN_WTS): accumulates the weight
// gradient (hWtDiff) from input, initial hidden state and output data, using the
// forward pass's workspace and reserved buffers.
// NOTE(review): extraction dropped the enclosing braces, the double/float dispatch
// conditional and several rgArg.Add lines (orig. 5085-5103 / 5112-5130 partially
// present) - restore from the full file before compiling.
5080 public void RnnBackwardWeights(
long hCuDnn,
long hRnnDesc,
long hXDesc,
long hXData,
long hHxDesc,
long hHxData,
long hYDesc,
long hYData,
long hWorkspace, ulong nWsCount,
long hWtDesc,
long hWtDiff,
long hReserved, ulong nResCount)
// Double-precision path: build the variable argument list.
5084 List<long> rgArg =
new List<long>() { hCuDnn, hRnnDesc };
5095 rgArg.Add(hWorkspace);
5096 rgArg.Add((
long)nWsCount);
5101 rgArg.Add(hReserved);
5102 rgArg.Add((
long)nResCount);
5104 if (m_bEnableRnnExtendedVersion)
5107 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.BWD_RNN_WTS,
null, rgArg.ToArray());
// Single-precision path: same argument construction.
5111 List<long> rgArg =
new List<long>() { hCuDnn, hRnnDesc };
5122 rgArg.Add(hWorkspace);
5123 rgArg.Add((
long)nWsCount);
5128 rgArg.Add(hReserved);
5129 rgArg.Add((
long)nResCount);
5131 if (m_bEnableRnnExtendedVersion)
5134 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.BWD_RNN_WTS,
null, rgArg.ToArray());
// Fragment (signatures lost in extraction): presumably the bodies of an
// IsRnn8Supported-style query (RNN8_IS_SUPPORTED, returning rg[0] == 1), CreateRnn8
// (RNN8_CREATE, new handle from rg[0]) and FreeRnn8 (RNN8_FREE, releasing handle h).
// TODO confirm against the full file.
5146 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.RNN8_IS_SUPPORTED,
null);
5147 return (rg[0] == 1) ? true :
false;
// Single-precision variant of the support query.
5151 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.RNN8_IS_SUPPORTED,
null);
5152 return (rg[0] == 1) ? true :
false;
// Create a new Rnn8 instance (double, then float path).
5164 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.RNN8_CREATE,
null);
5169 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.RNN8_CREATE,
null);
// Free the Rnn8 handle h (double, then float path).
5181 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.RNN8_FREE, m_param.AsDouble(h));
5183 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.RNN8_FREE, m_param.AsFloat(h));
// Configures an Rnn8 instance (RNN8_SET): training flag, data layout, cell/bias
// modes, sequence/batch/feature dimensions, layer count, dropout probability, RNG
// seed and optional bidirectional flag.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
5205 public void SetRnn8(
long hCuDnn,
long hRnn,
bool bTraining,
RNN_DATALAYOUT layout,
RNN_MODE cellMode,
RNN_BIAS_MODE biasMode,
int nSequenceLen,
int nBatchSize,
int nInputs,
int nHidden,
int nOutputs,
int nProjection,
int nNumLayers,
float fDropout, ulong lSeed,
bool bBidirectional =
false)
// Double-precision path (dropout up-cast to double).
5208 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_SET, m_param.AsDouble((
double)fDropout), m_param.AsLong(hCuDnn, hRnn, (bTraining) ? 1 : 0, (
int)layout, (
int)cellMode, (
int)biasMode, nSequenceLen, nBatchSize, nInputs, nHidden, nOutputs, nProjection, nNumLayers, (
long)lSeed, (bBidirectional) ? 1 : 0));
// Single-precision path.
5210 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_SET, m_param.AsFloat(fDropout), m_param.AsLong(hCuDnn, hRnn, (bTraining) ? 1 : 0, (
int)layout, (
int)cellMode, (
int)biasMode, nSequenceLen, nBatchSize, nInputs, nHidden, nOutputs, nProjection, nNumLayers, (
long)lSeed, (bBidirectional) ? 1 : 0));
// Queries Rnn8 memory requirements (RNN8_GET_MEMORY_SIZES): rg[0]=weight count,
// rg[1]=workspace size, rg[2]=reserved size, all returned via out parameters.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
5221 public void GetRnn8MemorySizes(
long hCuDnn,
long hRnn, out ulong szWtCount, out ulong szWorkSize, out ulong szReservedSize)
// Double-precision path.
5225 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_GET_MEMORY_SIZES,
null, m_param.AsLong(hCuDnn, hRnn));
5226 szWtCount = (ulong)rg[0];
5227 szWorkSize = (ulong)rg[1];
5228 szReservedSize = (ulong)rg[2];
// Single-precision path (same result layout).
5232 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.RNN8_GET_MEMORY_SIZES,
null, m_param.AsLong(hCuDnn, hRnn));
5233 szWtCount = (ulong)rg[0];
5234 szWorkSize = (ulong)rg[1];
5235 szReservedSize = (ulong)rg[2];
// Fragment (signature lost in extraction): presumably the body of an Rnn8 weight
// initializer (RNN8_INIT_WEIGHTS) - fills weights/biases of hRnn according to the
// given fill types and value pairs. TODO confirm against the full file.
5254 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_INIT_WEIGHTS, m_param.AsDouble(fWtVal, fWtVal2, fBiasVal, fBiasVal2), m_param.AsLong(hCuDnn, hRnn, hWt, (
int)wtFt, (
int)biasFt));
// Single-precision path (values down-cast to float).
5256 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_INIT_WEIGHTS, m_param.AsFloat((
float)fWtVal, (
float)fWtVal2, (
float)fBiasVal, (
float)fBiasVal2), m_param.AsLong(hCuDnn, hRnn, hWt, (
int)wtFt, (
int)biasFt));
// Dispatches the Rnn8 forward pass (RNN8_FWD): input x, hidden/cell state (hhX/hcX),
// weights, outputs y and final states (hhY/hcY), with workspace and reserved buffers.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
5273 public void Rnn8Forward(
long hCuDnn,
long hRnn,
long hX,
long hY,
long hhX,
long hhY,
long hcX,
long hcY,
long hWts,
long hWork,
long hReserved)
// Double-precision path.
5276 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_FWD,
null, m_param.AsLong(hCuDnn, hRnn, hX, hY, hhX, hhY, hcX, hcY, hWts, hWork, hReserved));
// Single-precision path.
5278 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_FWD,
null, m_param.AsLong(hCuDnn, hRnn, hX, hY, hhX, hhY, hcX, hcY, hWts, hWork, hReserved));
// Dispatches the Rnn8 backward pass (RNN8_BWD): computes data, state and weight
// gradients (hdX, hdhX, hdcX, hdWt) from outputs/diffs and saved forward state.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
5300 public void Rnn8Backward(
long hCuDnn,
long hRnn,
long hY,
long hdY,
long hX,
long hdX,
long hhX,
long hdhY,
long hdhX,
long hcX,
long hdcY,
long hdcX,
long hWt,
long hdWt,
long hWork,
long hReserved)
// Double-precision path.
5303 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_BWD,
null, m_param.AsLong(hCuDnn, hRnn, hY, hdY, hX, hdX, hhX, hdhY, hdhX, hcX, hdcY, hdcX, hWt, hdWt, hWork, hReserved));
// Single-precision path.
5305 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.RNN8_BWD,
null, m_param.AsLong(hCuDnn, hRnn, hY, hdY, hX, hdX, hhX, hdhY, hdhX, hcX, hdcY, hdcX, hWt, hdWt, hWork, hReserved));
// Fragment (signatures lost in extraction): presumably the tail return statements of
// several AllocMemory/AllocHostBuffer convenience overloads that compute nCount and
// delegate to the count-based allocators - TODO confirm against the full file.
5322 return AllocMemory(nCount);
5339 return AllocMemory(nCount);
5356 return AllocMemory(nCount);
5373 return AllocHostBuffer(nCount);
// Creates a kernel-side PCA solver (CUDA_CREATE_PCA) over an nM x nN data matrix,
// extracting nK components with an iteration cap; optional residual and eigenvalue
// output handles default to 0 (none).
// NOTE(review): enclosing braces, the double/float dispatch conditional and the
// return statements (handle presumably from rg[0]) were dropped by extraction.
5392 public long CreatePCA(
int nMaxIterations,
int nM,
int nN,
int nK,
long hData,
long hScoresResult,
long hLoadsResult,
long hResiduals = 0,
long hEigenvalues = 0)
// Double-precision path.
5396 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CREATE_PCA,
null, m_param.AsLong(nMaxIterations, nM, nN, nK, hData, hScoresResult, hLoadsResult, hResiduals, hEigenvalues));
// Single-precision path.
5401 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CREATE_PCA,
null, m_param.AsLong(nMaxIterations, nM, nN, nK, hData, hScoresResult, hLoadsResult, hResiduals, hEigenvalues));
// Runs nSteps iterations of the PCA solver (CUDA_RUN_PCA). Result layout:
// rg[0]=done flag (1 when converged/complete), rg[1]=current iteration, rg[2]=current
// K; the completion flag is presumably returned via bDone at the dropped tail.
// NOTE(review): enclosing braces, the double/float dispatch conditional, the bDone
// declaration and the final return were dropped by extraction.
5417 public bool RunPCA(
long hPCA,
int nSteps, out
int nCurrentK, out
int nCurrentIteration)
// Double-precision path.
5423 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_RUN_PCA,
null, m_param.AsLong(hPCA, nSteps));
5424 bDone = (rg[0] == 1.0) ? true :
false;
5425 nCurrentIteration = (int)rg[1];
5426 nCurrentK = (int)rg[2];
// Single-precision path (same result layout).
5430 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_RUN_PCA,
null, m_param.AsLong(hPCA, nSteps));
5431 bDone = (rg[0] == 1.0f) ?
true :
false;
5432 nCurrentIteration = (int)rg[1];
5433 nCurrentK = (int)rg[2];
// Fragment (signature lost in extraction): presumably the body of FreePCA
// (CUDA_FREE_PCA) - releases the PCA solver handle. TODO confirm.
5449 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_FREE_PCA, m_param.AsDouble(hPCA));
5451 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_FREE_PCA, m_param.AsFloat(hPCA));
// Creates a kernel-side SSD (single-shot detector) loss helper (CUDA_CREATE_SSD).
// All configuration (class counts, matching/mining types, loss types, hard-negative
// ratios, optional NMS parameters) is flattened into a numeric argument list; bool
// flags become 1/0 and enums are cast to int. When bNmsParam is set, an NMS
// threshold is mandatory and the optional top-K/eta fall back to -1 and 1.
// NOTE(review): enclosing braces, the double/float dispatch conditional, the
// if (bNmsParam) blocks and the return statements (handle presumably from rg[0])
// were dropped by extraction - restore from the full file before compiling.
5482 public long CreateSSD(
int nNumClasses,
bool bShareLocation,
int nLocClasses,
int nBackgroundLabelId,
bool bUseDiffcultGt,
SSD_MINING_TYPE miningType,
SSD_MATCH_TYPE matchType,
float fOverlapThreshold,
bool bUsePriorForMatching,
SSD_CODE_TYPE codeType,
bool bEncodeVariantInTgt,
bool bBpInside,
bool bIgnoreCrossBoundaryBbox,
bool bUsePriorForNms,
SSD_CONF_LOSS_TYPE confLossType,
SSD_LOC_LOSS_TYPE locLossType,
float fNegPosRatio,
float fNegOverlap,
int nSampleSize,
bool bMapObjectToAgnostic,
bool bNmsParam,
float? fNmsThreshold =
null,
int? nNmsTopK =
null,
float? fNmsEta =
null)
5484 int nGpuID = GetDeviceID();
// Double-precision path: flatten configuration into the argument list in order.
5488 List<double> rgArg =
new List<double>();
5493 rgArg.Add(nNumClasses);
5495 rgArg.Add((bShareLocation) ? 1 : 0);
5497 rgArg.Add(nLocClasses);
5499 rgArg.Add(nBackgroundLabelId);
5501 rgArg.Add((bUseDiffcultGt) ? 1 : 0);
5503 rgArg.Add((
int)miningType);
5505 rgArg.Add((
int)matchType);
5507 rgArg.Add(fOverlapThreshold);
5509 rgArg.Add((bUsePriorForMatching) ? 1 : 0);
5511 rgArg.Add((
int)codeType);
5513 rgArg.Add((bEncodeVariantInTgt) ? 1 : 0);
5515 rgArg.Add((bBpInside) ? 1 : 0);
5517 rgArg.Add((bIgnoreCrossBoundaryBbox) ? 1 : 0);
5519 rgArg.Add((bUsePriorForNms) ? 1 : 0);
5521 rgArg.Add((
int)confLossType);
5523 rgArg.Add((
int)locLossType);
5525 rgArg.Add(fNegPosRatio);
5527 rgArg.Add(fNegOverlap);
5529 rgArg.Add(nSampleSize);
5531 rgArg.Add((bMapObjectToAgnostic) ? 1 : 0);
5533 rgArg.Add((bNmsParam) ? 1 : 0);
// NMS parameters: threshold is required, top-K/eta have defaults.
5537 if (!fNmsThreshold.HasValue)
5538 throw new Exception(
"An NMS threshold must be specified when the 'bNmsParam' is true.")
5541 rgArg.Add(fNmsThreshold.GetValueOrDefault(0));
5543 rgArg.Add(nNmsTopK.GetValueOrDefault(-1));
5545 rgArg.Add(fNmsEta.GetValueOrDefault(1));
5548 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_CREATE_SSD, rgArg.ToArray());
// Single-precision path: identical flattening with float arguments.
5553 List<float> rgArg =
new List<float>();
5558 rgArg.Add(nNumClasses);
5560 rgArg.Add((bShareLocation) ? 1 : 0);
5562 rgArg.Add(nLocClasses);
5564 rgArg.Add(nBackgroundLabelId);
5566 rgArg.Add((bUseDiffcultGt) ? 1 : 0);
5568 rgArg.Add((
int)miningType);
5570 rgArg.Add((
int)matchType);
5572 rgArg.Add(fOverlapThreshold);
5574 rgArg.Add((bUsePriorForMatching) ? 1 : 0);
5576 rgArg.Add((
int)codeType);
5578 rgArg.Add((bEncodeVariantInTgt) ? 1 : 0);
5580 rgArg.Add((bBpInside) ? 1 : 0);
5582 rgArg.Add((bIgnoreCrossBoundaryBbox) ? 1 : 0);
5584 rgArg.Add((bUsePriorForNms) ? 1 : 0);
5586 rgArg.Add((
int)confLossType);
5588 rgArg.Add((
int)locLossType);
5590 rgArg.Add(fNegPosRatio);
5592 rgArg.Add(fNegOverlap);
5594 rgArg.Add(nSampleSize);
5596 rgArg.Add((bMapObjectToAgnostic) ? 1 : 0);
5598 rgArg.Add((bNmsParam) ? 1 : 0);
5602 if (!fNmsThreshold.HasValue)
5603 throw new Exception(
"An NMS threshold must be specified when the 'bNmsParam' is true.")
5606 rgArg.Add(fNmsThreshold.GetValueOrDefault(0));
5608 rgArg.Add(nNmsTopK.GetValueOrDefault(-1));
5610 rgArg.Add(fNmsEta.GetValueOrDefault(1));
5613 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_CREATE_SSD, rgArg.ToArray());
// Sizes the SSD helper (CUDA_SETUP_SSD) for a batch: number of images, priors per
// image and ground-truth entries.
// NOTE(review): enclosing braces and the double/float dispatch conditional were
// dropped by extraction.
5625 public void SetupSSD(
long hSSD,
int nNum,
int nNumPriors,
int nNumGt)
// Double-precision path.
5628 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_SETUP_SSD, m_param.AsDouble(hSSD, nNum, nNumPriors, nNumGt));
// Single-precision path.
5630 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_SETUP_SSD, m_param.AsFloat(hSSD, nNum, nNumPriors, nNumGt));
5640 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_FREE_SSD, m_param.AsDouble(hSSD));
5642 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_FREE_SSD, m_param.AsFloat(hSSD));
5661 public int SsdMultiBoxLossForward(
long hSSD,
int nLocDataCount,
long hLocGpuData,
int nConfDataCount,
long hConfGpuData,
int nPriorDataCount,
long hPriorGpuData,
int nGtDataCount,
long hGtGpuData, out List<DictionaryMap<List<int>>> rgAllMatchIndices, out List<List<int>> rgrgAllNegIndices, out
int nNumNegs)
5664 int nMatchCount = 0;
5665 rgAllMatchIndices =
new List<DictionaryMap<List<int>>>();
5666 rgrgAllNegIndices =
new List<List<int>>();
5670 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (int)CUDAFN.CUDA_SSD_FWD_MULTIBOXLOSS,
null, m_param.AsLong(hSSD, nLocDataCount, hLocGpuData, nConfDataCount, hConfGpuData, nPriorDataCount, hPriorGpuData, nGtDataCount, hGtGpuData));
5671 nMatchCount = (int)rg[nIdx];
5673 nNumNegs = (int)rg[nIdx];
5677 int nNumAllMatchIndices = (int)rg[nIdx];
5679 for (
int i = 0; i < nNumAllMatchIndices; i++)
5681 DictionaryMap<List<int>> map =
new DictionaryMap<List<int>>(
null);
5683 int nMapCount = (int)rg[nIdx];
5685 for (
int j = 0; j < nMapCount; j++)
5687 int nLabel = (int)rg[nIdx];
5689 List<int> rgIdx =
new List<int>();
5691 int nItemCount = (int)rg[nIdx];
5693 for (
int k = 0; k < nItemCount; k++)
5695 int nItemIdx = (int)rg[nIdx];
5697 rgIdx.Add(nItemIdx);
5700 map[nLabel] = rgIdx;
5703 rgAllMatchIndices.Add(map);
5707 int nNegListCount = (int)rg[nIdx];
5709 for (
int i = 0; i < nNegListCount; i++)
5711 int nItemCount = (int)rg[nIdx];
5713 List<int> rgItems =
new List<int>();
5715 for (
int j = 0; j < nItemCount; j++)
5717 int nItemIdx = (int)rg[nIdx];
5719 rgItems.Add(nItemIdx);
5722 rgrgAllNegIndices.Add(rgItems);
5727 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_SSD_FWD_MULTIBOXLOSS,
null, m_param.AsLong(hSSD, nLocDataCount, hLocGpuData, nConfDataCount, hConfGpuData, nPriorDataCount, hPriorGpuData, nGtDataCount, hGtGpuData));
5728 nMatchCount = (int)rg[nIdx];
5730 nNumNegs = (int)rg[nIdx];
5734 int nMapListCount = (int)rg[nIdx];
5736 for (
int i = 0; i < nMapListCount; i++)
5738 DictionaryMap<List<int>> map =
new DictionaryMap<List<int>>(
null);
5740 int nMapCount = (int)rg[nIdx];
5742 for (
int j = 0; j < nMapCount; j++)
5744 int nLabel = (int)rg[nIdx];
5746 List<int> rgIdx =
new List<int>();
5748 int nItemCount = (int)rg[nIdx];
5750 for (
int k = 0; k < nItemCount; k++)
5752 int nItemIdx = (int)rg[nIdx];
5754 rgIdx.Add(nItemIdx);
5757 map[nLabel] = rgIdx;
5760 rgAllMatchIndices.Add(map);
5764 int nNegListCount = (int)rg[nIdx];
5766 for (
int i = 0; i < nNegListCount; i++)
5768 int nItemCount = (int)rg[nIdx];
5770 List<int> rgItems =
new List<int>();
5772 for (
int j = 0; j < nItemCount; j++)
5774 int nItemIdx = (int)rg[nIdx];
5776 rgItems.Add(nItemIdx);
5779 rgrgAllNegIndices.Add(rgItems);
5797 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SSD_ENCODE_LOCPRED,
null, m_param.AsLong(hSSD, nLocPredCount, hLocPred, nLocGtCount, hLocGt));
5799 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SSD_ENCODE_LOCPRED,
null, m_param.AsLong(hSSD, nLocPredCount, hLocPred, nLocGtCount, hLocGt));
5813 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SSD_ENCODE_CONFPRED,
null, m_param.AsLong(hSSD, nConfPredCount, hConfPred, nConfGtCount, hConfGt));
5815 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SSD_ENCODE_CONFPRED,
null, m_param.AsLong(hSSD, nConfPredCount, hConfPred, nConfGtCount, hConfGt));
/// <summary>
/// Create an instance of the LayerNorm GPU support.
/// </summary>
/// <param name="nGpuID">GPU device id on which the instance runs.</param>
/// <param name="nCount">Total number of items.</param>
/// <param name="nOuterNum">Outer (batch) count.</param>
/// <param name="nChannels">Channel count.</param>
/// <param name="nInnerNum">Inner (spatial) count.</param>
/// <param name="fEps">Epsilon added for numerical stability (default 1e-10).</param>
/// <returns>The handle to the new LayerNorm instance.</returns>
public long CreateLayerNorm(int nGpuID, int nCount, int nOuterNum, int nChannels, int nInnerNum, float fEps = 1e-10f)
{
    // NOTE(review): m_dt dispatch and the handle-return lines were lost in the source capture;
    // rg[0] is presumed to carry the created handle - confirm against the original source.
    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_LAYERNORM, m_param.AsDouble(fEps), m_param.AsLong(nGpuID, nCount, nOuterNum, nChannels, nInnerNum, 0));
        return (long)rg[0];
    }
    else
    {
        float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_LAYERNORM, m_param.AsFloat(fEps), m_param.AsLong(nGpuID, nCount, nOuterNum, nChannels, nInnerNum, 0));
        return (long)rg[0];
    }
}
5849 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_FREE_LAYERNORM, m_param.AsDouble(hLayerNorm));
5851 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_FREE_LAYERNORM, m_param.AsFloat(hLayerNorm));
5863 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LAYERNORM_FWD,
null, m_param.AsLong(hLayerNorm, hXdata, hYdata));
5865 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LAYERNORM_FWD,
null, m_param.AsLong(hLayerNorm, hXdata, hYdata));
5878 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LAYERNORM_BWD,
null, m_param.AsLong(hLayerNorm, hYdata, hYdiff, hXdiff));
5880 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LAYERNORM_BWD,
null, m_param.AsLong(hLayerNorm, hYdata, hYdiff, hXdiff));
5888 #region ICudaMath Methods
/// <summary>Set 'nCount' items (or the single item at 'nIdx') of GPU memory 'hHandle' to a double value.</summary>
public void set(int nCount, long hHandle, double fVal, int nIdx = -1)
{
    set(nCount, hHandle, (T)Convert.ChangeType(fVal, typeof(T)), nIdx);
}

/// <summary>Set 'nCount' items (or the single item at 'nIdx') of GPU memory 'hHandle' to a float value.</summary>
public void set(int nCount, long hHandle, float fVal, int nIdx = -1)
{
    set(nCount, hHandle, (T)Convert.ChangeType(fVal, typeof(T)), nIdx);
}

/// <summary>
/// Set 'nCount' items of GPU memory 'hHandle' to 'fVal', or only the item at 'nIdx' when nIdx >= 0.
/// When ghost memory is active, the write goes to the host-side ghost buffer instead of the device.
/// </summary>
/// <param name="nCount">Number of items to set.</param>
/// <param name="hHandle">Handle to the GPU memory.</param>
/// <param name="fVal">Value to write.</param>
/// <param name="nIdx">Optional single index to set; -1 (default) sets all items.</param>
/// <param name="nXOff">Optional offset into the memory (default 0).</param>
public void set(int nCount, long hHandle, T fVal, int nIdx = -1, int nXOff = 0)
{
    // NOTE(review): the m_dt dispatch and the 'nIdx >= 0' ghost-memory branch condition were
    // reconstructed (the indexed write is only valid for a non-negative index) - confirm.
    if (m_dt == DataType.DOUBLE)
    {
        if (m_rgGhostMemory == null)
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET, m_param.AsDouble(convertD(fVal)), m_param.AsLong(nCount, hHandle, 0, nIdx, nXOff));
        else if (nIdx >= 0)
            m_rgGhostMemory[hHandle][nIdx] = fVal;
        else
            Utility.Set<T>(m_rgGhostMemory[hHandle], fVal);
    }
    else
    {
        if (m_rgGhostMemory == null)
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET, m_param.AsFloat(convertF(fVal)), m_param.AsLong(nCount, hHandle, 0, nIdx, nXOff));
        else if (nIdx >= 0)
            m_rgGhostMemory[hHandle][nIdx] = fVal;
        else
            Utility.Set<T>(m_rgGhostMemory[hHandle], fVal);
    }
}
/// <summary>Query GPU memory as double values; when nIdx >= 0 only that item is returned.</summary>
public double[] get_double(int nCount, long hHandle, int nIdx = -1)
{
    return convertD(get(nCount, hHandle, nIdx));
}

/// <summary>Query GPU memory as float values; when nIdx >= 0 only that item is returned.</summary>
public float[] get_float(int nCount, long hHandle, int nIdx = -1)
{
    return convertF(get(nCount, hHandle, nIdx));
}

/// <summary>
/// Query 'nCount' items of GPU memory 'hHandle' as the base type; when nIdx >= 0 only that item is returned.
/// </summary>
/// <param name="nCount">Number of items to read.</param>
/// <param name="hHandle">Handle to the GPU memory.</param>
/// <param name="nIdx">Optional single index to read; -1 (default) reads all items.</param>
/// <returns>The data read back from the device.</returns>
public T[] get(int nCount, long hHandle, int nIdx = -1)
{
    // NOTE(review): m_dt dispatch reconstructed from the paired RunDoubleEx2/RunFloatEx2 calls - confirm.
    if (m_dt == DataType.DOUBLE)
        return convert(m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GET, null, m_param.AsLong(nCount, hHandle, nIdx)));
    else
        return convert(m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GET, null, m_param.AsLong(nCount, hHandle, nIdx)));
}
/// <summary>
/// Copy 'nCount' items from GPU memory 'hSrc' to 'hDst', optionally at offsets, on an optional stream,
/// with optional half-size overrides for the source and/or destination buffers.
/// </summary>
/// <param name="nCount">Number of items to copy.</param>
/// <param name="hSrc">Handle to the source memory.</param>
/// <param name="hDst">Handle to the destination memory.</param>
/// <param name="nSrcOffset">Offset into the source (default 0).</param>
/// <param name="nDstOffset">Offset into the destination (default 0).</param>
/// <param name="hStream">Optional stream handle; -1 (default) uses the default stream.</param>
/// <param name="bSrcHalfSizeOverride">Optional source half-size override; null leaves the setting unchanged (-1 sentinel).</param>
/// <param name="bDstHalfSizeOverride">Optional destination half-size override; null leaves the setting unchanged (-1 sentinel).</param>
public void copy(int nCount, long hSrc, long hDst, int nSrcOffset = 0, int nDstOffset = 0, long hStream = -1, bool? bSrcHalfSizeOverride = null, bool? bDstHalfSizeOverride = null)
{
    // -1 = no override, 0 = force full size, 1 = force half size.
    int nSrcHalfSizeOverride = -1;
    int nDstHalfSizeOverride = -1;

    if (bSrcHalfSizeOverride.HasValue)
        nSrcHalfSizeOverride = (bSrcHalfSizeOverride.Value) ? 1 : 0;

    if (bDstHalfSizeOverride.HasValue)
        nDstHalfSizeOverride = (bDstHalfSizeOverride.Value) ? 1 : 0;

    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY, null, m_param.AsLong(nCount, hSrc, hDst, nSrcOffset, nDstOffset, hStream, nSrcHalfSizeOverride, nDstHalfSizeOverride));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY, null, m_param.AsLong(nCount, hSrc, hDst, nSrcOffset, nDstOffset, hStream, nSrcHalfSizeOverride, nDstHalfSizeOverride));
}
/// <summary>
/// Copy from 'hSrc1' or 'hSrc2' into 'hDst' per item, selecting by the similarity values in 'hSimilar'
/// (optionally inverted).
/// </summary>
/// <param name="nCount">Total number of items.</param>
/// <param name="nNum">Number of outer items.</param>
/// <param name="nDim">Dimension of each item.</param>
/// <param name="hSrc1">Handle to the first source.</param>
/// <param name="hSrc2">Handle to the second source.</param>
/// <param name="hDst">Handle to the destination.</param>
/// <param name="hSimilar">Handle to the similarity selector data.</param>
/// <param name="bInvert">When true, inverts the selection.</param>
public void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert = false)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SIM, null, m_param.AsLong(nCount, nNum, nDim, hSrc1, hSrc2, hDst, hSimilar, (bInvert) ? 1 : 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SIM, null, m_param.AsLong(nCount, nNum, nDim, hSrc1, hSrc2, hDst, hSimilar, (bInvert) ? 1 : 0));
}
/// <summary>
/// Copy a batch of labeled items into the destination label cache on the GPU.
/// </summary>
/// <param name="nCount">Total number of items to copy.</param>
/// <param name="nNum">Number of items in the batch.</param>
/// <param name="nDim">Dimension of each item.</param>
/// <param name="hSrcData">Handle to the source data.</param>
/// <param name="hSrcLbl">Handle to the source labels.</param>
/// <param name="nDstCount">Count of the destination cache.</param>
/// <param name="hDstCache">Handle to the destination cache.</param>
/// <param name="hWorkDevData">Handle to device work data.</param>
/// <param name="nLabelStart">First label value.</param>
/// <param name="nLabelCount">Number of labels.</param>
/// <param name="nCacheSize">Size of each label's cache.</param>
/// <param name="hCacheHostCursors">Handle to host-side cache cursors.</param>
/// <param name="hWorkDataHost">Handle to host work data.</param>
public void copy_batch(int nCount, int nNum, int nDim, long hSrcData, long hSrcLbl, int nDstCount, long hDstCache, long hWorkDevData, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, long hWorkDataHost)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_BATCH, null, m_param.AsLong(nCount, nNum, nDim, hSrcData, hSrcLbl, nDstCount, hDstCache, hWorkDevData, nLabelStart, nLabelCount, nCacheSize, hCacheHostCursors, hWorkDataHost));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_BATCH, null, m_param.AsLong(nCount, nNum, nDim, hSrcData, hSrcLbl, nDstCount, hDstCache, hWorkDevData, nLabelStart, nLabelCount, nCacheSize, hCacheHostCursors, hWorkDataHost));
}
/// <summary>
/// Copy a sequence of cached items (anchor, positive/negative and nK comparison items) into the top blobs.
/// </summary>
/// <param name="nK">Number of additional comparison tops; must be in [0,10].</param>
/// <param name="nNum">Number of items in the batch.</param>
/// <param name="nDim">Dimension of each item.</param>
/// <param name="hSrcData">Handle to the source data.</param>
/// <param name="hSrcLbl">Handle to the source labels.</param>
/// <param name="nSrcCacheCount">Count of the source cache.</param>
/// <param name="hSrcCache">Handle to the source cache.</param>
/// <param name="nLabelStart">First label value.</param>
/// <param name="nLabelCount">Number of labels.</param>
/// <param name="nCacheSize">Size of each label's cache.</param>
/// <param name="hCacheHostCursors">Handle to host-side cache cursors.</param>
/// <param name="bOutputLabels">When true, labels are also output (adds one top).</param>
/// <param name="rghTop">Handles of the top blobs; count must match the expected top count.</param>
/// <param name="rgnTopCount">Item counts of each top; must parallel 'rghTop'.</param>
/// <param name="hWorkDataHost">Handle to host work data.</param>
/// <param name="bCombinePositiveAndNegative">When true, combine positive and negative (requires nK == 0).</param>
/// <param name="nSeed">Optional random seed.</param>
public void copy_sequence(int nK, int nNum, int nDim, long hSrcData, long hSrcLbl, int nSrcCacheCount, long hSrcCache, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, bool bOutputLabels, List<long> rghTop, List<int> rgnTopCount, long hWorkDataHost, bool bCombinePositiveAndNegative = false, int nSeed = 0)
{
    int nTopCount = 2 + nK;

    // NOTE(review): the original adjustment of nTopCount for bOutputLabels was lost in the source
    // capture (the validation message references bOutputLabels) - confirm this increment.
    if (bOutputLabels)
        nTopCount++;

    if (bCombinePositiveAndNegative && nK != 0)
        throw new ArgumentOutOfRangeException("nK", "When using 'bCombinePositiveAndNegative', nK should be 0.");

    if (nK < 0 || nK > 10)
        throw new ArgumentOutOfRangeException("nK", "The 'nK' parameter must be within the range [0,10]!");

    if (rghTop.Count != nTopCount)
        throw new ArgumentOutOfRangeException("rghTop", "The 'rghTop' count must equal '" + nTopCount.ToString() + "' given nK = " + nK.ToString() + " and bOutputLabels = " + bOutputLabels.ToString() + "!");

    if (rgnTopCount.Count != rghTop.Count)
        throw new ArgumentOutOfRangeException("rgnTopCount", "The 'rgnTopCount' count must equal the 'rghTop' count!");

    // The argument list is identical for both base types, so build it once
    // (the original duplicated this block in each branch).
    List<long> rgarg = new List<long>() { nK, nNum, nDim, hSrcData, hSrcLbl, nSrcCacheCount, hSrcCache, nLabelStart, nLabelCount, nCacheSize, hCacheHostCursors, (bOutputLabels) ? 1 : 0, hWorkDataHost, (bCombinePositiveAndNegative) ? 1 : 0, nSeed };

    for (int i = 0; i < rghTop.Count; i++)
        rgarg.Add(rghTop[i]);

    for (int i = 0; i < rgnTopCount.Count; i++)
        rgarg.Add(rgnTopCount[i]);

    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE, null, rgarg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE, null, rgarg.ToArray());
}
/// <summary>
/// Copy a strided sequence of items from 'hSrc' to 'hDst' with independent source/destination steps,
/// start indices and spatial-dimension sub-ranges.
/// </summary>
/// <param name="n">Total number of items.</param>
/// <param name="hSrc">Handle to the source memory.</param>
/// <param name="nSrcStep">Step between source items.</param>
/// <param name="nSrcStartIdx">Starting index in the source.</param>
/// <param name="nCopyCount">Number of items to copy.</param>
/// <param name="nCopyDim">Dimension of each copied item.</param>
/// <param name="hDst">Handle to the destination memory.</param>
/// <param name="nDstStep">Step between destination items.</param>
/// <param name="nDstStartIdx">Starting index in the destination.</param>
/// <param name="nSrcSpatialDim">Source spatial dimension.</param>
/// <param name="nDstSpatialDim">Destination spatial dimension.</param>
/// <param name="nSrcSpatialDimStartIdx">Start index within the source spatial dimension (default 0).</param>
/// <param name="nDstSpatialDimStartIdx">Start index within the destination spatial dimension (default 0).</param>
/// <param name="nSpatialDimCount">Number of spatial elements to copy; -1 (default) copies all.</param>
public void copy_sequence(int n, long hSrc, int nSrcStep, int nSrcStartIdx, int nCopyCount, int nCopyDim, long hDst, int nDstStep, int nDstStartIdx, int nSrcSpatialDim, int nDstSpatialDim, int nSrcSpatialDimStartIdx = 0, int nDstSpatialDimStartIdx = 0, int nSpatialDimCount = -1)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE2, null, m_param.AsLong(n, hSrc, nSrcStep, nSrcStartIdx, nCopyCount, nCopyDim, hDst, nDstStep, nDstStartIdx, nSrcSpatialDim, nDstSpatialDim, nSrcSpatialDimStartIdx, nDstSpatialDimStartIdx, nSpatialDimCount));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE2, null, m_param.AsLong(n, hSrc, nSrcStep, nSrcStartIdx, nCopyCount, nCopyDim, hDst, nDstStep, nDstStartIdx, nSrcSpatialDim, nDstSpatialDim, nSrcSpatialDimStartIdx, nDstSpatialDimStartIdx, nSpatialDimCount));
}
6185 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COPY_EXPAND,
null, m_param.AsLong(n, nNum, nDim, hX, hA));
6187 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COPY_EXPAND,
null, m_param.AsLong(n, nNum, nDim, hX, hA));
/// <summary>
/// Fill the destination with repeated copies of 'nDim' items taken from the source at 'nSrcOff'.
/// </summary>
/// <param name="n">Number of fill repetitions.</param>
/// <param name="nDim">Dimension of each copied item.</param>
/// <param name="hSrc">Handle to the source memory.</param>
/// <param name="nSrcOff">Offset into the source.</param>
/// <param name="nCount">Total count of the destination.</param>
/// <param name="hDst">Handle to the destination memory.</param>
public void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_FILL, null, m_param.AsLong(n, nDim, hSrc, nSrcOff, nCount, hDst));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_FILL, null, m_param.AsLong(n, nDim, hSrc, nSrcOff, nCount, hDst));
}
/// <summary>
/// Sort the 'nCount' items of GPU memory 'hY' in place.
/// </summary>
/// <param name="nCount">Number of items to sort.</param>
/// <param name="hY">Handle to the GPU memory sorted in place.</param>
public void sort(int nCount, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    // (A stray empty statement ';' after the float call in the original was removed.)
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SORT, null, m_param.AsLong(nCount, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SORT, null, m_param.AsLong(nCount, hY));
}
/// <summary>GEMM: C = alpha * op(A) * op(B) + beta * C (double alpha/beta convenience overload).</summary>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC)
{
    gemm(bTransA, bTransB, m, n, k, (T)Convert.ChangeType(fAlpha, typeof(T)), hA, hB, (T)Convert.ChangeType(fBeta, typeof(T)), hC);
}

/// <summary>GEMM: C = alpha * op(A) * op(B) + beta * C (float alpha/beta convenience overload).</summary>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC)
{
    gemm(bTransA, bTransB, m, n, k, (T)Convert.ChangeType(fAlpha, typeof(T)), hA, hB, (T)Convert.ChangeType(fBeta, typeof(T)), hC);
}

/// <summary>
/// Core GEMM: C = alpha * op(A) * op(B) + beta * C, with optional per-matrix offsets and group offsets.
/// </summary>
/// <param name="bTransA">When true, A is transposed.</param>
/// <param name="bTransB">When true, B is transposed.</param>
/// <param name="m">Rows of op(A) and C.</param>
/// <param name="n">Columns of op(B) and C.</param>
/// <param name="k">Columns of op(A) / rows of op(B).</param>
/// <param name="fAlpha">Scale applied to A*B.</param>
/// <param name="hA">Handle to matrix A.</param>
/// <param name="hB">Handle to matrix B.</param>
/// <param name="fBeta">Scale applied to C before accumulation.</param>
/// <param name="hC">Handle to matrix C (output).</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset = 0, int nBOffset = 0, int nCOffset = 0, int nGroups = 1, int nGroupOffsetA = 0, int nGroupOffsetB = 0, int nGroupOffsetC = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, k, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset, nGroups, nGroupOffsetA, nGroupOffsetB, nGroupOffsetC));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, k, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset, nGroups, nGroupOffsetA, nGroupOffsetB, nGroupOffsetC));
}

/// <summary>
/// GEMM with explicit leading dimensions (lda/ldb/ldc), mapping to the GEMM2 kernel entry.
/// </summary>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsDouble(fAlpha, fBeta), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsFloat((float)fAlpha, (float)fBeta), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc));
}

/// <summary>
/// Strided, batched GEMM with explicit leading dimensions, per-matrix strides and a batch count.
/// </summary>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc, uint stridea, uint strideb, uint stridec, uint batch_count)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsDouble(fAlpha, fBeta), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc, stridea, strideb, stridec, batch_count));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsFloat((float)fAlpha, (float)fBeta), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc, stridea, strideb, stridec, batch_count));
}
/// <summary>GEAM: C = alpha * op(A) + beta * op(B) (double alpha/beta convenience overload).</summary>
public void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC)
{
    geam(bTransA, bTransB, m, n, (T)Convert.ChangeType(fAlpha, typeof(T)), hA, hB, (T)Convert.ChangeType(fBeta, typeof(T)), hC);
}

/// <summary>GEAM: C = alpha * op(A) + beta * op(B) (float alpha/beta convenience overload).</summary>
public void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC)
{
    geam(bTransA, bTransB, m, n, (T)Convert.ChangeType(fAlpha, typeof(T)), hA, hB, (T)Convert.ChangeType(fBeta, typeof(T)), hC);
}

/// <summary>
/// Core GEAM: C = alpha * op(A) + beta * op(B), with optional per-matrix offsets.
/// </summary>
/// <param name="bTransA">When true, A is transposed.</param>
/// <param name="bTransB">When true, B is transposed.</param>
/// <param name="m">Rows of C.</param>
/// <param name="n">Columns of C.</param>
/// <param name="fAlpha">Scale applied to op(A).</param>
/// <param name="hA">Handle to matrix A.</param>
/// <param name="hB">Handle to matrix B.</param>
/// <param name="fBeta">Scale applied to op(B).</param>
/// <param name="hC">Handle to matrix C (output).</param>
public void geam(bool bTransA, bool bTransB, int m, int n, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset = 0, int nBOffset = 0, int nCOffset = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEAM, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEAM, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong((bTransA) ? 1 : 0, (bTransB) ? 1 : 0, m, n, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset));
}
/// <summary>GEMV: Y = alpha * op(A) * X + beta * Y (double alpha/beta convenience overload).</summary>
public void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY)
{
    gemv(bTransA, m, n, (T)Convert.ChangeType(fAlpha, typeof(T)), hA, hX, (T)Convert.ChangeType(fBeta, typeof(T)), hY);
}

/// <summary>GEMV: Y = alpha * op(A) * X + beta * Y (float alpha/beta convenience overload).</summary>
public void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY)
{
    gemv(bTransA, m, n, (T)Convert.ChangeType(fAlpha, typeof(T)), hA, hX, (T)Convert.ChangeType(fBeta, typeof(T)), hY);
}

/// <summary>
/// Core GEMV: Y = alpha * op(A) * X + beta * Y, with optional offsets into A, X and Y.
/// </summary>
/// <param name="bTransA">When true, A is transposed.</param>
/// <param name="m">Rows of A.</param>
/// <param name="n">Columns of A.</param>
/// <param name="fAlpha">Scale applied to A*X.</param>
/// <param name="hA">Handle to matrix A.</param>
/// <param name="hX">Handle to vector X.</param>
/// <param name="fBeta">Scale applied to Y before accumulation.</param>
/// <param name="hY">Handle to vector Y (output).</param>
public void gemv(bool bTransA, int m, int n, T fAlpha, long hA, long hX, T fBeta, long hY, int nAOffset = 0, int nXOffset = 0, int nYOffset = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMV, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong((bTransA) ? 1 : 0, m, n, 0, hA, hX, 0, hY, nAOffset, nXOffset, nYOffset));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMV, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong((bTransA) ? 1 : 0, m, n, 0, hA, hX, 0, hY, nAOffset, nXOffset, nYOffset));
}
/// <summary>GER rank-1 update: A = alpha * X * Y^T + A (double alpha convenience overload).</summary>
public void ger(int m, int n, double fAlpha, long hX, long hY, long hA)
{
    ger(m, n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, hY, hA);
}

/// <summary>GER rank-1 update: A = alpha * X * Y^T + A (float alpha convenience overload).</summary>
public void ger(int m, int n, float fAlpha, long hX, long hY, long hA)
{
    ger(m, n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, hY, hA);
}

/// <summary>
/// Core GER rank-1 update: A = alpha * X * Y^T + A.
/// </summary>
/// <param name="m">Rows of A / length of X.</param>
/// <param name="n">Columns of A / length of Y.</param>
/// <param name="fAlpha">Scale applied to the outer product.</param>
/// <param name="hX">Handle to vector X.</param>
/// <param name="hY">Handle to vector Y.</param>
/// <param name="hA">Handle to matrix A (updated in place).</param>
public void ger(int m, int n, T fAlpha, long hX, long hY, long hA)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GER, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(m, n, 0, hX, hY, hA));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GER, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(m, n, 0, hX, hY, hA));
}
/// <summary>AXPY: Y = alpha * X + Y (double alpha convenience overload).</summary>
public void axpy(int n, double fAlpha, long hX, long hY)
{
    axpy(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, hY);
}

/// <summary>AXPY: Y = alpha * X + Y (float alpha convenience overload).</summary>
public void axpy(int n, float fAlpha, long hX, long hY)
{
    axpy(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, hY);
}

/// <summary>
/// Core AXPY: Y = alpha * X + Y, with optional offsets into X and Y.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="fAlpha">Scale applied to X.</param>
/// <param name="hX">Handle to vector X.</param>
/// <param name="hY">Handle to vector Y (updated in place).</param>
/// <param name="nXOff">Offset into X (default 0).</param>
/// <param name="nYOff">Offset into Y (default 0).</param>
public void axpy(int n, T fAlpha, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPY, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPY, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
}
/// <summary>AXPBY: Y = alpha * X + beta * Y (double convenience overload).</summary>
public void axpby(int n, double fAlpha, long hX, double fBeta, long hY)
{
    axpby(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, (T)Convert.ChangeType(fBeta, typeof(T)), hY);
}

/// <summary>AXPBY: Y = alpha * X + beta * Y (float convenience overload).</summary>
public void axpby(int n, float fAlpha, long hX, float fBeta, long hY)
{
    axpby(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, (T)Convert.ChangeType(fBeta, typeof(T)), hY);
}

/// <summary>
/// Core AXPBY: Y = alpha * X + beta * Y.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="fAlpha">Scale applied to X.</param>
/// <param name="hX">Handle to vector X.</param>
/// <param name="fBeta">Scale applied to Y before accumulation.</param>
/// <param name="hY">Handle to vector Y (updated in place).</param>
public void axpby(int n, T fAlpha, long hX, T fBeta, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPBY, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(n, 0, hX, 0, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPBY, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(n, 0, hX, 0, hY));
}
/// <summary>
/// Multiply each channel/spatial slice of A by the broadcast vector X, writing to B.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="nAOff">Offset into A.</param>
/// <param name="hX">Handle to the broadcast vector X.</param>
/// <param name="nXOff">Offset into X.</param>
/// <param name="nC">Channel count.</param>
/// <param name="nSpatialDim">Spatial dimension.</param>
/// <param name="bTranspose">When true, the broadcast axis is transposed.</param>
/// <param name="hB">Handle to output B.</param>
/// <param name="nBOff">Offset into B.</param>
public void mulbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MULBSX, null, m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, (bTranspose) ? 1 : 0, hB, nBOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MULBSX, null, m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, (bTranspose) ? 1 : 0, hB, nBOff));
}
/// <summary>
/// Divide each channel/spatial slice of A by the broadcast vector X, writing to B.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="nAOff">Offset into A.</param>
/// <param name="hX">Handle to the broadcast vector X.</param>
/// <param name="nXOff">Offset into X.</param>
/// <param name="nC">Channel count.</param>
/// <param name="nSpatialDim">Spatial dimension.</param>
/// <param name="bTranspose">When true, the broadcast axis is transposed.</param>
/// <param name="hB">Handle to output B.</param>
/// <param name="nBOff">Offset into B.</param>
public void divbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIVBSX, null, m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, (bTranspose) ? 1 : 0, hB, nBOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIVBSX, null, m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, (bTranspose) ? 1 : 0, hB, nBOff));
}
/// <summary>
/// Batched matrix multiply C = scale * op(A) * op(B) over 'nOuterCount' batches, implemented via
/// the strided-batched GEMM overload with A/B swapped (row-major to column-major adaptation).
/// </summary>
/// <param name="nOuterCount">Number of batches.</param>
/// <param name="m">Rows of A and C.</param>
/// <param name="n">Columns of B and C.</param>
/// <param name="k">Columns of A / rows of B.</param>
/// <param name="hA">Handle to matrix A.</param>
/// <param name="hB">Handle to matrix B.</param>
/// <param name="hC">Handle to matrix C (output).</param>
/// <param name="dfScale">Scale applied to the product (default 1.0).</param>
/// <param name="bTransA">When true, A is transposed.</param>
/// <param name="bTransB">When true, B is transposed.</param>
public void matmul(uint nOuterCount, int m, int n, int k, long hA, long hB, long hC, double dfScale = 1.0, bool bTransA = false, bool bTransB = false)
{
    // NOTE(review): the leading-dimension definitions were lost in the source capture; these
    // values are reconstructed from the row-major/column-major swap used in the gemm call - confirm.
    uint lda = (uint)k;
    uint ldb = (uint)n;
    uint ldc = (uint)n;
    uint strideb = (uint)(k * n);
    uint stridea = (uint)(m * k);
    uint stridec = (uint)(m * n);

    gemm(bTransB, bTransA, n, m, k, dfScale, hB, hA, 0.0, hC, ldb, lda, ldc, strideb, stridea, stridec, nOuterCount);
}
/// <summary>
/// Transpose the H and W axes of an N x C x H x W blob from 'hSrc' into 'hDst'.
/// </summary>
/// <param name="n">Outer (batch) count.</param>
/// <param name="c">Channel count.</param>
/// <param name="h">Height.</param>
/// <param name="w">Width.</param>
/// <param name="hSrc">Handle to the source memory.</param>
/// <param name="hDst">Handle to the destination memory.</param>
public void transposeHW(int n, int c, int h, int w, long hSrc, long hDst)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TRANSPOSE_HW, null, m_param.AsLong(n, c, h, w, hSrc, hDst));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TRANSPOSE_HW, null, m_param.AsLong(n, c, h, w, hSrc, hDst));
}
/// <summary>
/// Clamp the 'n' items of GPU memory 'hX' to the range [dfMin, dfMax].
/// In the float path the double bounds are first clamped into the representable float range.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="dfMin">Lower bound.</param>
/// <param name="dfMax">Upper bound.</param>
/// <param name="hX">Handle to the GPU memory clamped in place.</param>
public void set_bounds(int n, double dfMin, double dfMax, long hX)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET_BOUNDS, m_param.AsDouble(dfMin, dfMax), m_param.AsLong(n, 0, 0, hX));
    }
    else
    {
        float fMin = -float.MaxValue;
        float fMax = float.MaxValue;

        if (dfMin > -float.MaxValue && dfMin < float.MaxValue)
            fMin = (float)dfMin;
        else if (dfMin > float.MaxValue)
            fMin = float.MaxValue;

        if (dfMax > -float.MaxValue && dfMax < float.MaxValue)
            fMax = (float)dfMax;
        else if (dfMax < -float.MaxValue)   // BUG FIX: the original tested dfMin here, leaving fMax unclamped when dfMax underflows the float range.
            fMax = -float.MaxValue;

        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET_BOUNDS, m_param.AsFloat(fMin, fMax), m_param.AsLong(n, 0, 0, hX));
    }
}
/// <summary>SCAL: X = alpha * X (double alpha convenience overload).</summary>
public void scal(int n, double fAlpha, long hX, int nXOff = 0)
{
    scal(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, nXOff);
}

/// <summary>SCAL: X = alpha * X (float alpha convenience overload).</summary>
public void scal(int n, float fAlpha, long hX, int nXOff = 0)
{
    scal(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, nXOff);
}

/// <summary>
/// Core SCAL: X = alpha * X, with an optional offset into X.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="fAlpha">Scale factor.</param>
/// <param name="hX">Handle to vector X (scaled in place).</param>
/// <param name="nXOff">Offset into X (default 0).</param>
public void scal(int n, T fAlpha, long hX, int nXOff = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCAL, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, 0, hX, nXOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCAL, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, 0, hX, nXOff));
}
6817 return (
double)Convert.ChangeType(dot(n, hX, hY), typeof(
double));
6832 return (
float)Convert.ChangeType(dot(n, hX, hY), typeof(
float));
/// <summary>
/// DOT: return the dot product of vectors X and Y, with optional offsets.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hX">Handle to vector X.</param>
/// <param name="hY">Handle to vector Y.</param>
/// <param name="nXOff">Offset into X (default 0).</param>
/// <param name="nYOff">Offset into Y (default 0).</param>
/// <returns>The dot product as the base type.</returns>
public T dot(int n, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DOT, null, m_param.AsLong(n, hX, hY, nXOff, nYOff));
        return (T)Convert.ChangeType(rg[0], typeof(T));
    }
    else
    {
        float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DOT, null, m_param.AsLong(n, hX, hY, nXOff, nYOff));
        return (T)Convert.ChangeType(rg[0], typeof(T));
    }
}
6873 return (
double)Convert.ChangeType(asum(n, hX, nXOff), typeof(
double));
6888 return (
float)Convert.ChangeType(asum(n, hX, nXOff), typeof(
float));
/// <summary>
/// ASUM: return the sum of absolute values of vector X, with an optional offset.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hX">Handle to vector X.</param>
/// <param name="nXOff">Offset into X (default 0).</param>
/// <returns>The absolute-value sum as the base type.</returns>
public T asum(int n, long hX, int nXOff = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ASUM, null, m_param.AsLong(n, hX, nXOff));
        return (T)Convert.ChangeType(rg[0], typeof(T));
    }
    else
    {
        float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ASUM, null, m_param.AsLong(n, hX, nXOff));
        return (T)Convert.ChangeType(rg[0], typeof(T));
    }
}
/// <summary>SCALE: Y = alpha * X (double alpha convenience overload).</summary>
public void scale(int n, double fAlpha, long hX, long hY)
{
    scale(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, hY);
}

/// <summary>SCALE: Y = alpha * X (float alpha convenience overload).</summary>
public void scale(int n, float fAlpha, long hX, long hY)
{
    scale(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hX, hY);
}

/// <summary>
/// Core SCALE: Y = alpha * X, with optional offsets into X and Y.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="fAlpha">Scale factor.</param>
/// <param name="hX">Handle to input vector X.</param>
/// <param name="hY">Handle to output vector Y.</param>
/// <param name="nXOff">Offset into X (default 0).</param>
/// <param name="nYOff">Offset into Y (default 0).</param>
public void scale(int n, T fAlpha, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
}
6976 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SCALE_TO_RANGE, m_param.AsDouble(fMin, fMax), m_param.AsLong(n, hX, hY, 0, 0));
6978 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SCALE_TO_RANGE, m_param.AsFloat((
float)fMin, (
float)fMax), m_param.AsLong(n, hX, hY, 0, 0));
/// <summary>
/// Compute the error function erf(x) on the GPU for a double value (delegates to the base-type overload).
/// </summary>
/// <param name="dfVal">Input value.</param>
/// <returns>erf(dfVal) as a double.</returns>
public double erf(double dfVal)
{
    return convertD(erf(convertD1(dfVal)));
}
6998 return convertF(erf(convertF1(fVal)));
7010 double[] rg = m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_ERF, m_param.AsDouble(convertD(fVal)));
7011 return convert(rg)[0];
7015 float[] rg = m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_ERF, m_param.AsFloat(convertF(fVal)));
7016 return convert(rg)[0];
/// <summary>
/// Core MASK: copy X to Y, replacing items whose mask value equals 'fSearch' with 'fReplace'.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="nMaskDim">Dimension of the mask.</param>
/// <param name="fSearch">Mask value to match.</param>
/// <param name="fReplace">Replacement value written where matched.</param>
/// <param name="hX">Handle to input X.</param>
/// <param name="hMask">Handle to the mask.</param>
/// <param name="hY">Handle to output Y.</param>
public void mask(int n, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK, m_param.AsDouble(convertD(fSearch), convertD(fReplace)), m_param.AsLong(n, nMaskDim, 0, 0, hX, hMask, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK, m_param.AsFloat(convertF(fSearch), convertF(fReplace)), m_param.AsLong(n, nMaskDim, 0, 0, hX, hMask, hY));
}

/// <summary>MASK with double search/replace values (delegates to the base-type overload).</summary>
public void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
{
    mask(n, nMaskDim, (T)Convert.ChangeType(fSearch, typeof(T)), (T)Convert.ChangeType(fReplace, typeof(T)), hX, hMask, hY);
}

/// <summary>MASK with float search/replace values (delegates to the base-type overload).</summary>
public void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
{
    mask(n, nMaskDim, (T)Convert.ChangeType(fSearch, typeof(T)), (T)Convert.ChangeType(fReplace, typeof(T)), hX, hMask, hY);
}
/// <summary>
/// Core batched MASK: copy X to Y per batch item, replacing items whose mask value equals 'fSearch'
/// with 'fReplace'.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="nBatch">Batch count.</param>
/// <param name="nMaskDim">Dimension of the mask.</param>
/// <param name="fSearch">Mask value to match.</param>
/// <param name="fReplace">Replacement value written where matched.</param>
/// <param name="hX">Handle to input X.</param>
/// <param name="hMask">Handle to the mask.</param>
/// <param name="hY">Handle to output Y.</param>
public void mask_batch(int n, int nBatch, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK_BATCH, m_param.AsDouble(convertD(fSearch), convertD(fReplace)), m_param.AsLong(n, nBatch, nMaskDim, 0, 0, hX, hMask, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK_BATCH, m_param.AsFloat(convertF(fSearch), convertF(fReplace)), m_param.AsLong(n, nBatch, nMaskDim, 0, 0, hX, hMask, hY));
}

/// <summary>Batched MASK with double search/replace values (delegates to the base-type overload).</summary>
public void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
{
    mask_batch(n, nBatch, nMaskDim, (T)Convert.ChangeType(fSearch, typeof(T)), (T)Convert.ChangeType(fReplace, typeof(T)), hX, hMask, hY);
}

/// <summary>Batched MASK with float search/replace values (delegates to the base-type overload).</summary>
public void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
{
    mask_batch(n, nBatch, nMaskDim, (T)Convert.ChangeType(fSearch, typeof(T)), (T)Convert.ChangeType(fReplace, typeof(T)), hX, hMask, hY);
}
/// <summary>
/// Bi-linear 2D interpolation between a sub-window of 'hData1' and a sub-window of 'hData2'
/// (forward, or backward when 'bBwd' is true). Window origins/sizes are validated against the
/// adjusted (full) dimensions before launching the kernel.
/// </summary>
/// <param name="nChannels">Channel count.</param>
/// <param name="hData1">Handle to the first data buffer.</param>
/// <param name="nX1">Window x origin in data1.</param>
/// <param name="nY1">Window y origin in data1.</param>
/// <param name="nHeight1">Window height in data1.</param>
/// <param name="nWidth1">Window width in data1.</param>
/// <param name="nHeight1A">Full (adjusted) height of data1.</param>
/// <param name="nWidth1A">Full (adjusted) width of data1.</param>
/// <param name="hData2">Handle to the second data buffer.</param>
/// <param name="nX2">Window x origin in data2.</param>
/// <param name="nY2">Window y origin in data2.</param>
/// <param name="nHeight2">Window height in data2.</param>
/// <param name="nWidth2">Window width in data2.</param>
/// <param name="nHeight2A">Full (adjusted) height of data2.</param>
/// <param name="nWidth2A">Full (adjusted) width of data2.</param>
/// <param name="bBwd">When true, runs the backward (gradient) interpolation.</param>
public void interp2(int nChannels, long hData1, int nX1, int nY1, int nHeight1, int nWidth1, int nHeight1A, int nWidth1A, long hData2, int nX2, int nY2, int nHeight2, int nWidth2, int nHeight2A, int nWidth2A, bool bBwd = false)
{
    if (!(nX1 >= 0 && nY1 >= 0 && nHeight1 > 0 && nWidth1 > 0 && nX2 >= 0 && nY2 >= 0 && nHeight2 > 0 && nWidth2 > 0))
        throw new ArgumentOutOfRangeException("interp2: Invalid arguments found.");

    if (!(nWidth1A >= nWidth1 + nX1 && nHeight1A >= nHeight1 + nY1 && nWidth2A >= nWidth2 + nX2 && nHeight2A >= nHeight2 + nY2))
        throw new ArgumentOutOfRangeException("interp2: Invalid arguments found.");

    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_INTERP2, null, m_param.AsLong(nChannels, hData1, nX1, nY1, nHeight1, nWidth1, nHeight1A, nWidth1A, hData2, nX2, nY2, nHeight2, nWidth2, nHeight2A, nWidth2A, (bBwd) ? 1 : 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_INTERP2, null, m_param.AsLong(nChannels, hData1, nX1, nY1, nHeight1, nWidth1, nHeight1A, nWidth1A, hData2, nX2, nY2, nHeight2, nWidth2, nHeight2A, nWidth2A, (bBwd) ? 1 : 0));
}
7163 add_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7177 add_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7193 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_ADD_SCALAR, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, 0, hY, nYOff));
7195 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_ADD_SCALAR, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, 0, hY, nYOff));
/// <summary>ADD3: Y = A + B + C.</summary>
public void add(int n, long hA, long hB, long hC, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD3, null, m_param.AsLong(n, hA, hB, hC, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD3, null, m_param.AsLong(n, hA, hB, hC, hY));
}

/// <summary>ADD: Y = A + B.</summary>
public void add(int n, long hA, long hB, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, null, m_param.AsLong(n, hA, hB, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, null, m_param.AsLong(n, hA, hB, hY));
}

/// <summary>ADD with a scale: Y = A + alpha * B (double alpha).</summary>
public void add(int n, long hA, long hB, long hY, double dfAlpha)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsDouble(dfAlpha), m_param.AsLong(n, hA, hB, hY, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsFloat((float)dfAlpha), m_param.AsLong(n, hA, hB, hY, 0));
}

/// <summary>ADD with a scale: Y = A + alpha * B (float alpha).</summary>
public void add(int n, long hA, long hB, long hY, float fAlpha)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsDouble(fAlpha), m_param.AsLong(n, hA, hB, hY, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsFloat(fAlpha), m_param.AsLong(n, hA, hB, hY, 0));
}

/// <summary>
/// ADD2: Y = alphaA * A + alphaB * B, with optional offsets into A, B and Y.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="hB">Handle to input B.</param>
/// <param name="hY">Handle to output Y.</param>
/// <param name="dfAlphaA">Scale applied to A.</param>
/// <param name="dfAlphaB">Scale applied to B.</param>
/// <param name="nAOff">Offset into A (default 0).</param>
/// <param name="nBOff">Offset into B (default 0).</param>
/// <param name="nYOff">Offset into Y (default 0).</param>
public void add(int n, long hA, long hB, long hY, double dfAlphaA, double dfAlphaB, int nAOff = 0, int nBOff = 0, int nYOff = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD2, m_param.AsDouble(dfAlphaA, dfAlphaB), m_param.AsLong(n, hA, hB, hY, 0, 0, nAOff, nBOff, nYOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD2, m_param.AsFloat((float)dfAlphaA, (float)dfAlphaB), m_param.AsLong(n, hA, hB, hY, 0, 0, nAOff, nBOff, nYOff));
}
/// <summary>
/// SUB: Y = A - B, with optional offsets and an optional broadcast size 'nB' for B.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="hB">Handle to input B.</param>
/// <param name="hY">Handle to output Y.</param>
/// <param name="nAOff">Offset into A (default 0).</param>
/// <param name="nBOff">Offset into B (default 0).</param>
/// <param name="nYOff">Offset into Y (default 0).</param>
/// <param name="nB">Optional count of B for broadcast subtraction; 0 (default) uses full-size B.</param>
public void sub(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0, int nB = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff, nB));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff, nB));
}
/// <summary>
/// MUL: Y = A * B (element-wise), with optional offsets into A, B and Y.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="hB">Handle to input B.</param>
/// <param name="hY">Handle to output Y.</param>
/// <param name="nAOff">Offset into A (default 0).</param>
/// <param name="nBOff">Offset into B (default 0).</param>
/// <param name="nYOff">Offset into Y (default 0).</param>
public void mul(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MUL, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MUL, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff));
}
/// <summary>
/// SUB_AND_DOT: element-wise subtract then dot, over nN outer items of inner size nInnerNum.
/// </summary>
/// <param name="n">Total number of items.</param>
/// <param name="nN">Outer count.</param>
/// <param name="nInnerNum">Inner dimension of each item.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="hB">Handle to input B.</param>
/// <param name="hY">Handle to output Y.</param>
/// <param name="nAOff">Offset into A.</param>
/// <param name="nBOff">Offset into B.</param>
/// <param name="nYOff">Offset into Y.</param>
public void sub_and_dot(int n, int nN, int nInnerNum, long hA, long hB, long hY, int nAOff, int nBOff, int nYOff)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB_AND_DOT, null, m_param.AsLong(n, nN, nInnerNum, hA, hB, hY, nAOff, nBOff, nYOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB_AND_DOT, null, m_param.AsLong(n, nN, nInnerNum, hA, hB, hY, nAOff, nBOff, nYOff));
}
7376 mul_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7390 mul_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7405 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MUL_SCALAR, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, 0, hY));
7407 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MUL_SCALAR, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, 0, hY));
/// <summary>
/// DIV: Y = A / B (element-wise).
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A (numerator).</param>
/// <param name="hB">Handle to input B (denominator).</param>
/// <param name="hY">Handle to output Y.</param>
public void div(int n, long hA, long hB, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIV, null, m_param.AsLong(n, hA, hB, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIV, null, m_param.AsLong(n, hA, hB, hY));
}
/// <summary>
/// ABS: Y = |A| (element-wise).
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="hY">Handle to output Y.</param>
public void abs(int n, long hA, long hY)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ABS, null, m_param.AsLong(n, hA, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ABS, null, m_param.AsLong(n, hA, hY));
}
/// <summary>EXP: Y = exp(A), element-wise with no offsets and beta = 1.0.</summary>
public void exp(int n, long hA, long hY)
{
    exp(n, hA, hY, 0, 0, 1.0);
}

/// <summary>
/// EXP: Y = exp(beta * A), with optional offsets into A and Y.
/// </summary>
/// <param name="n">Number of items.</param>
/// <param name="hA">Handle to input A.</param>
/// <param name="hY">Handle to output Y.</param>
/// <param name="nAOff">Offset into A.</param>
/// <param name="nYOff">Offset into Y.</param>
/// <param name="dfBeta">Scale applied to A inside the exponential.</param>
public void exp(int n, long hA, long hY, int nAOff, int nYOff, double dfBeta)
{
    // NOTE(review): m_dt dispatch reconstructed - confirm.
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_EXP, m_param.AsDouble(dfBeta), m_param.AsLong(n, hA, hY, nAOff, nYOff, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_EXP, m_param.AsFloat((float)dfBeta), m_param.AsLong(n, hA, hY, nAOff, nYOff, 0));
}
// log (convenience overload): forwards to the full log() with dfBeta = 1.0, dfAlpha = 0.0.
7488 public void log(
int n,
long hA,
long hY)
7490 log(n, hA, hY, 1.0, 0.0);
// log: launches CUDA_LOG over n elements with two scalar parameters (dfBeta, dfAlpha)
// in the value array. Float path narrows both scalars to float.
7504 public void log(
int n,
long hA,
long hY,
double dfBeta,
double dfAlpha = 0)
// double-precision path
7507 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LOG, m_param.AsDouble(dfBeta, dfAlpha), m_param.AsLong(n, hA, hY, 0, 0));
// single-precision path
7509 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LOG, m_param.AsFloat((
float)dfBeta, (
float)dfAlpha), m_param.AsLong(n, hA, hY, 0, 0));
// powx (double overload): converts the exponent to T and forwards to powx(T).
7524 public void powx(
int n,
long hA,
double fAlpha,
long hY,
int nAOff = 0,
int nYOff = 0)
7526 powx(n, hA, (T)Convert.ChangeType(fAlpha, typeof(T)), hY, nAOff, nYOff);
// powx (float overload): converts the exponent to T and forwards to powx(T).
7541 public void powx(
int n,
long hA,
float fAlpha,
long hY,
int nAOff = 0,
int nYOff = 0)
7543 powx(n, hA, (T)Convert.ChangeType(fAlpha, typeof(T)), hY, nAOff, nYOff);
// powx (core): launches CUDA_POWX over n elements with scalar exponent fAlpha and
// source/destination offsets; convertD/convertF unwrap the generic T scalar.
7558 public void powx(
int n,
long hA, T fAlpha,
long hY,
int nAOff = 0,
int nYOff = 0)
// double-precision path
7561 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_POWX, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, hA, 0, hY, nAOff, nYOff));
// single-precision path
7563 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_POWX, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, hA, 0, hY, nAOff, nYOff));
// sign: launches CUDA_SIGN over n elements with offsets — presumably Y = sign(X); TODO confirm.
7574 public void sign(
int n,
long hX,
long hY,
int nXOff = 0,
int nYOff = 0)
// double-precision path
7577 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGN,
null, m_param.AsLong(n, hX, hY, nXOff, nYOff));
// single-precision path
7579 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGN,
null, m_param.AsLong(n, hX, hY, nXOff, nYOff));
7582#pragma warning disable 1591
// student: launches CUDA_STUDENT over n elements — semantics not evident from this view
// (the name suggests a Student-t related transform); TODO confirm against the kernel.
7584 public void student(
int n,
long hX,
long hY)
// double-precision path
7587 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_STUDENT,
null, m_param.AsLong(n, hX, hY));
// single-precision path
7589 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_STUDENT,
null, m_param.AsLong(n, hX, hY));
// logistic1: launches CUDA_LOGISTIC1 over n elements — presumably a logistic transform
// variant; TODO confirm against the kernel source.
7592 public void logistic1(
int n,
long hX,
long hY)
// double-precision path
7595 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LOGISTIC1,
null, m_param.AsLong(n, hX, hY));
// single-precision path
7597 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LOGISTIC1,
null, m_param.AsLong(n, hX, hY));
// logistic2: launches CUDA_LOGISTIC2 over n elements — second logistic variant; TODO confirm.
7600 public void logistic2(
int n,
long hX,
long hY)
// double-precision path
7603 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LOGISTIC2,
null, m_param.AsLong(n, hX, hY));
// single-precision path
7605 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_LOGISTIC2,
null, m_param.AsLong(n, hX, hY));
// reciprocol [sic — name kept for API compatibility]: launches CUDA_RECIPROCOL over n
// elements — presumably Y = 1/X; TODO confirm.
7608 public void reciprocol(
int n,
long hX,
long hY)
// double-precision path
7611 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_RECIPROCOL,
null, m_param.AsLong(n, hX, hY));
// single-precision path
7613 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_RECIPROCOL,
null, m_param.AsLong(n, hX, hY));
7616#pragma warning restore 1591
// sqrt: launches CUDA_SQRT over n elements — presumably element-wise Y = sqrt(X).
7624 public void sqrt(
int n,
long hX,
long hY)
// double-precision path
7627 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SQRT,
null, m_param.AsLong(n, hX, hY));
// single-precision path
7629 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SQRT,
null, m_param.AsLong(n, hX, hY));
7641 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SQRT_SCALE,
null, m_param.AsLong(nCount, hX, hY));
7643 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SQRT_SCALE,
null, m_param.AsLong(nCount, hX, hY));
7656 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COMPARE_SIGNS,
null, m_param.AsLong(n, hA, hB, hY));
7658 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COMPARE_SIGNS,
null, m_param.AsLong(n, hA, hB, hY));
// max (element-wise): launches CUDA_MAX over n elements — presumably Y = max(A, B) per element.
7669 public void max(
int n,
long hA,
long hB,
long hY)
// double-precision path
7672 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MAX,
null, m_param.AsLong(n, hA, hB, hY));
// single-precision path
7674 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MAX,
null, m_param.AsLong(n, hA, hB, hY));
// max_bwd: launches CUDA_MAX_BWD2 — backward pass of an element-wise max, routing the top
// gradient (hYdiff) into hAdiff/hBdiff based on the forward data; TODO confirm routing rule.
7686 public void max_bwd(
int n,
long hAdata,
long hBdata,
long hYdiff,
long hAdiff,
long hBdiff)
// double-precision path
7689 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MAX_BWD2,
null, m_param.AsLong(n, hAdata, hBdata, hYdiff, hAdiff, hBdiff));
// single-precision path
7691 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MAX_BWD2,
null, m_param.AsLong(n, hAdata, hBdata, hYdiff, hAdiff, hBdiff));
// min (element-wise): launches CUDA_MIN over n elements — presumably Y = min(A, B) per element.
7702 public void min(
int n,
long hA,
long hB,
long hY)
// double-precision path
7705 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MIN,
null, m_param.AsLong(n, hA, hB, hY));
// single-precision path
7707 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MIN,
null, m_param.AsLong(n, hA, hB, hY));
// max (reduction): launches CUDA_MAXVAL to find the maximum of n elements, returning the
// value and its position via lPos. Four launches are visible: with and without the optional
// hWork buffer, in each precision. NOTE(review): the branch selecting among them, the lPos
// assignment, and the return statement were lost in this extraction — rg presumably holds
// [value, position]; TODO confirm against the original source.
7724 public double max(
int n,
long hA, out
long lPos,
int nAOff = 0,
long hWork = 0)
// double path, pre-allocated work buffer
7730 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MAXVAL,
null, m_param.AsLong(n, hA, nAOff, hWork));
// float path, pre-allocated work buffer
7736 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_MAXVAL,
null, m_param.AsLong(n, hA, nAOff, hWork));
// double path, no work buffer
7745 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (int)CUDAFN.CUDA_MAXVAL,
null, m_param.AsLong(n, hA, nAOff));
// float path, no work buffer
7751 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_MAXVAL,
null, m_param.AsLong(n, hA, nAOff));
// min (reduction): launches CUDA_MINVAL to find the minimum of n elements, returning the
// value and its position via lPos. Mirrors max() above: four launches (with/without hWork,
// double/float). NOTE(review): selection branches, lPos assignment and return statement are
// missing from this extraction — TODO confirm against the original source.
7772 public double min(
int n,
long hA, out
long lPos,
int nAOff = 0,
long hWork = 0)
// double path, pre-allocated work buffer
7778 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MINVAL,
null, m_param.AsLong(n, hA, nAOff, hWork));
// float path, pre-allocated work buffer
7784 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_MINVAL,
null, m_param.AsLong(n, hA, nAOff, hWork));
// double path, no work buffer
7793 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (int)CUDAFN.CUDA_MINVAL,
null, m_param.AsLong(n, hA, nAOff));
// float path, no work buffer
7799 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_MINVAL,
null, m_param.AsLong(n, hA, nAOff));
// minmax: launches CUDA_MINMAXVAL over n elements and returns a 4-tuple built from the
// first four result values (rg[0..3]); bDetectNans is passed as a 0/1 flag and hWork1/hWork2
// are scratch buffers. The exact meaning of the four slots (min/max/nan-count/inf-count?)
// is not visible here — TODO confirm.
7818 public Tuple<double, double, double, double>
minmax(
int n,
long hA,
long hWork1,
long hWork2,
bool bDetectNans =
false,
int nAOff = 0)
// double-precision path
7822 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MINMAXVAL,
null, m_param.AsLong(n, hA, hWork1, hWork2, (bDetectNans) ? 1 : 0, nAOff));
7823 return new Tuple<double, double, double, double>(rg[0], rg[1], rg[2], rg[3]);
// single-precision path (float results widened into the double tuple)
7827 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MINMAXVAL,
null, m_param.AsLong(n, hA, hWork1, hWork2, (bDetectNans) ? 1 : 0, nAOff));
7828 return new Tuple<double, double, double, double>(rg[0], rg[1], rg[2], rg[3]);
// minmax (vector form): launches CUDA_MINMAXVEC writing nK min/max results into hMin/hMax,
// optionally considering non-zero elements only (flag passed as 0/1).
7843 public void minmax(
int n,
long hA,
long hWork1,
long hWork2,
int nK,
long hMin,
long hMax,
bool bNonZeroOnly)
// double-precision path
7846 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MINMAXVEC,
null, m_param.AsLong(n, hA, hWork1, hWork2, nK, hMin, hMax, (bNonZeroOnly) ? 1 : 0));
// single-precision path
7848 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MINMAXVEC,
null, m_param.AsLong(n, hA, hWork1, hWork2, nK, hMin, hMax, (bNonZeroOnly) ? 1 : 0));
// transpose: launches CUDA_TRANSPOSE for an nNumAxes-dimensional permutation described by
// the hXCounts/hYCounts/hMapping device buffers, using hBuffer as scratch.
7862 public void transpose(
int n,
long hX,
long hY,
long hXCounts,
long hYCounts,
long hMapping,
int nNumAxes,
long hBuffer)
// double-precision path
7865 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TRANSPOSE,
null, m_param.AsLong(n, hX, hY, hXCounts, hYCounts, hMapping, nNumAxes, hBuffer));
// single-precision path
7867 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TRANSPOSE,
null, m_param.AsLong(n, hX, hY, hXCounts, hYCounts, hMapping, nNumAxes, hBuffer));
// sumsq: launches CUDA_SUMSQ over n elements of hA (hW presumably a work buffer) and
// returns the scalar sum of squares. NOTE(review): the return statement that extracts the
// value from rg was lost in this extraction — TODO confirm it returns rg[0].
7878 public double sumsq(
int n,
long hW,
long hA,
int nAOff = 0)
// double-precision path
7882 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SUMSQ,
null, m_param.AsLong(n, hW, hA, nAOff));
// single-precision path
7887 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SUMSQ,
null, m_param.AsLong(n, hW, hA, nAOff));
// sumsqdiff: launches CUDA_SUMSQDIFF — presumably sum of squared differences of A and B
// (hW a work buffer). NOTE(review): the return statement was lost in this extraction.
7902 public double sumsqdiff(
int n,
long hW,
long hA,
long hB,
int nAOff = 0,
int nBOff = 0)
// double-precision path
7906 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SUMSQDIFF,
null, m_param.AsLong(n, hW, hA, hB, nAOff, nBOff));
// single-precision path
7911 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SUMSQDIFF,
null, m_param.AsLong(n, hW, hA, hB, nAOff, nBOff));
// width: launches CUDA_WIDTH with scalar dfAlpha — presumably computes per-element widths
// from mean/min/max buffers into hWidth; TODO confirm the formula against the kernel.
7925 public void width(
int n,
long hMean,
long hMin,
long hMax,
double dfAlpha,
long hWidth)
// double-precision path
7928 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_WIDTH, m_param.AsDouble(dfAlpha), m_param.AsLong(n, hMean, hMin, hMax, 0, hWidth));
// single-precision path
7930 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_WIDTH, m_param.AsFloat((
float)dfAlpha), m_param.AsLong(n, hMean, hMin, hMax, 0, hWidth));
// contains_point: launches CUDA_CONTAINS_POINT and returns true iff the kernel's first
// result value is non-zero (presumably whether hX lies within mean +/- width; TODO confirm).
7943 public bool contains_point(
int n,
long hMean,
long hWidth,
long hX,
long hWork,
int nXOff = 0)
// double-precision path
7947 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CONTAINS_POINT,
null, m_param.AsLong(n, hMean, hWidth, hX, hWork, nXOff));
7948 return (rg[0] == 0) ? false :
true;
// single-precision path
7952 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CONTAINS_POINT,
null, m_param.AsLong(n, hMean, hWidth, hX, hWork, nXOff));
7953 return (rg[0] == 0) ? false :
true;
// denan: launches CUDA_DENAN in place on hX with the scalar replacement value — presumably
// replaces NaNs with dfReplacement; TODO confirm whether infinities are also replaced.
7963 public void denan(
int n,
long hX,
double dfReplacement)
// double-precision path
7966 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_DENAN, m_param.AsDouble(dfReplacement), m_param.AsLong(n, hX, 0));
// single-precision path
7968 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_DENAN, m_param.AsFloat((
float)dfReplacement), m_param.AsLong(n, hX, 0));
// im2col: launches CUDA_IM2COL — unfolds a 2-D image (C x H x W, with kernel/pad/stride/
// dilation geometry) into column form for GEMM-based convolution.
7989 public void im2col(
long hDataIm,
int nDataImOffset,
int nChannels,
int nHeight,
int nWidth,
int nKernelH,
int nKernelW,
int nPadH,
int nPadW,
int nStrideH,
int nStrideW,
int nDilationH,
int nDilationW,
long hDataCol,
int nDataColOffset)
// double-precision path
7992 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_IM2COL,
null, m_param.AsLong(hDataIm, nDataImOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataCol, nDataColOffset));
// single-precision path
7994 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_IM2COL,
null, m_param.AsLong(hDataIm, nDataImOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataCol, nDataColOffset));
// im2col_nd: N-dimensional im2col — geometry (shape/kernel/pad/stride/dilation) is passed
// as device buffers rather than scalars.
8013 public void im2col_nd(
long hDataIm,
int nDataImOffset,
int nNumSpatialAxes,
int nImCount,
int nChannelAxis,
long hImShape,
long hColShape,
long hKernelShape,
long hPad,
long hStride,
long hDilation,
long hDataCol,
int nDataColOffset)
// double-precision path
8016 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_IM2COL_ND,
null, m_param.AsLong(hDataIm, nDataImOffset, nNumSpatialAxes, nImCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataCol, nDataColOffset));
// single-precision path
8018 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_IM2COL_ND,
null, m_param.AsLong(hDataIm, nDataImOffset, nNumSpatialAxes, nImCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataCol, nDataColOffset));
// col2im: inverse of im2col — folds the column buffer back into image layout (used in the
// backward pass of GEMM-based convolution).
8039 public void col2im(
long hDataCol,
int nDataColOffset,
int nChannels,
int nHeight,
int nWidth,
int nKernelH,
int nKernelW,
int nPadH,
int nPadW,
int nStrideH,
int nStrideW,
int nDilationH,
int nDilationW,
long hDataIm,
int nDataImOffset)
// double-precision path
8042 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COL2IM,
null, m_param.AsLong(hDataCol, nDataColOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataIm, nDataImOffset));
// single-precision path
8044 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COL2IM,
null, m_param.AsLong(hDataCol, nDataColOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataIm, nDataImOffset));
// col2im_nd: N-dimensional col2im — inverse of im2col_nd, geometry passed as device buffers.
8063 public void col2im_nd(
long hDataCol,
int nDataColOffset,
int nNumSpatialAxes,
int nColCount,
int nChannelAxis,
long hImShape,
long hColShape,
long hKernelShape,
long hPad,
long hStride,
long hDilation,
long hDataIm,
int nDataImOffset)
// double-precision path
8066 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COL2IM_ND,
null, m_param.AsLong(hDataCol, nDataColOffset, nNumSpatialAxes, nColCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataIm, nDataImOffset));
// single-precision path
8068 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COL2IM_ND,
null, m_param.AsLong(hDataCol, nDataColOffset, nNumSpatialAxes, nColCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataIm, nDataImOffset));
// channel_min: launches CUDA_CHANNEL_MIN — per-channel minimum over (outer x channels x inner)
// layout; bReturnIdx (0/1) presumably switches output to argmin indices — TODO confirm.
8081 public void channel_min(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY,
bool bReturnIdx =
false)
// double-precision path
8084 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MIN,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
// single-precision path
8086 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MIN,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
// channel_max: launches CUDA_CHANNEL_MAX — per-channel maximum; mirrors channel_min above.
8099 public void channel_max(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY,
bool bReturnIdx =
false)
// double-precision path
8102 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MAX,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
// single-precision path
8104 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MAX,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
// channel_mean: launches CUDA_CHANNEL_MEAN — per-channel mean over the standard
// (outer x channels x inner) layout.
8116 public void channel_mean(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY)
// double-precision path
8119 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MEAN,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// single-precision path
8121 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MEAN,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// channel_compare: launches CUDA_CHANNEL_COMPARE — per-channel comparison of X into Y;
// exact comparison semantics not visible here — TODO confirm against the kernel.
8133 public void channel_compare(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY)
// double-precision path
8136 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_COMPARE,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// single-precision path
8138 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_COMPARE,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// channel_fillfrom: launches CUDA_CHANNEL_FILLFROM with a DIR direction flag — presumably
// broadcasts/copies channel values between X and Y depending on dir (FWD/BWD).
8152 public void channel_fillfrom(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY,
DIR dir)
// double-precision path
8155 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_FILLFROM,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (
int)dir));
// single-precision path
8157 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_FILLFROM,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (
int)dir));
// channel_fill: launches CUDA_CHANNEL_FILL — fills Y from X selected by the label buffer
// hLabels (nLabelDim wide); exact selection rule not visible — TODO confirm.
8179 public void channel_fill(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
int nLabelDim,
long hLabels,
long hY)
// double-precision path
8182 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_FILL,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, nLabelDim, hLabels, hY));
// single-precision path
8184 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_FILL,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, nLabelDim, hLabels, hY));
// channel_sub (3-operand): launches CUDA_CHANNEL_SUB with hA appended after hX/hY — note
// the argument order to the kernel (X, Y, A) differs from the parameter order (A, X, Y).
8197 public void channel_sub(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hA,
long hX,
long hY)
// double-precision path
8200 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SUB,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, hA));
// single-precision path
8202 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SUB,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, hA));
// channel_sub (2-operand): launches CUDA_CHANNEL_SUB without the extra hA operand —
// presumably subtracts a per-channel value of X from Y in place; TODO confirm.
8214 public void channel_sub(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY)
// double-precision path
8217 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SUB,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// single-precision path
8219 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SUB,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// channel_sum: launches CUDA_CHANNEL_SUM — sums across (or within, per bSumAcrossChannels)
// channels, with a direction flag and optional output channel count nChannelsY (-1 = default).
8236 public void channel_sum(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY,
bool bSumAcrossChannels =
true,
DIR dir =
DIR.FWD,
int nChannelsY = -1)
// double-precision path
8239 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SUM,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bSumAcrossChannels) ? 1 : 0, (
int)dir, nChannelsY));
// single-precision path
8241 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SUM,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bSumAcrossChannels) ? 1 : 0, (
int)dir, nChannelsY));
// channel_div: launches CUDA_CHANNEL_DIV — per-channel division; nMethod selects a kernel
// variant (meaning of 1 vs other values not visible here — TODO confirm).
8254 public void channel_div(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY,
int nMethod = 1)
// double-precision path
8257 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_DIV,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
// single-precision path
8259 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_DIV,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
// channel_mul: launches CUDA_CHANNEL_MUL — per-channel multiplication; nMethod selects a
// kernel variant, mirroring channel_div.
8272 public void channel_mul(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY,
int nMethod = 1)
// double-precision path
8275 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MUL,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
// single-precision path
8277 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MUL,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
// channel_mulv: launches CUDA_CHANNEL_MULV — presumably multiplies A by a per-channel
// vector X, writing into C; TODO confirm operand roles against the kernel.
8290 public void channel_mulv(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hA,
long hX,
long hC)
// double-precision path
8293 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MULV,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hA, hX, hC));
// single-precision path
8295 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_MULV,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hA, hX, hC));
// channel_scale: launches CUDA_CHANNEL_SCALE — presumably scales each channel of X by the
// corresponding value in A, writing Y.
8308 public void channel_scale(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hA,
long hY)
// double-precision path
8311 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SCALE,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
// single-precision path
8313 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_SCALE,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
// channel_dot: launches CUDA_CHANNEL_DOT — presumably a per-channel dot product of X and A
// into Y.
8326 public void channel_dot(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hA,
long hY)
// double-precision path
8329 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_DOT,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
// single-precision path
8331 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_DOT,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
// channel_duplicate: launches CUDA_CHANNEL_DUP — presumably replicates channel data from X
// across Y.
8343 public void channel_duplicate(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY)
// double-precision path
8346 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_DUP,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// single-precision path
8348 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_DUP,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// channel_percentile: launches CUDA_CHANNEL_PERCENTILE with the scalar percentile in the
// value-parameter array; float path narrows it to float.
8361 public void channel_percentile(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY,
double dfPercentile)
// double-precision path
8364 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_PERCENTILE, m_param.AsDouble(dfPercentile), m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// single-precision path
8366 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_PERCENTILE, m_param.AsFloat((
float)dfPercentile), m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// channel_op_fwd: launches CUDA_CHANNEL_OP_FWD applying broadcast binary op `op` across A
// and B into Y. Validates that nCount equals max(nN1, nN2) * nC * max(nSD1, nSD2), i.e. the
// broadcast output size.
8382 public void channel_op_fwd(
OP op,
int nCount,
int nC,
int nN1,
int nSD1,
int nN2,
int nSD2,
long hA,
long hB,
long hY)
// Guard: output count must match the broadcast shape.
8384 int nCount1 = Math.Max(nN1, nN2) * nC * Math.Max(nSD1, nSD2);
8385 if (nCount1 != nCount)
8386 throw new Exception(
"The nCount must equal max(nN1, nN2) x nC x max(nSD1, nSD2).");
// double-precision path
8389 m_cuda.RunDoubleEx2((
int) m_hKernel, (
int) CUDAFN.CUDA_CHANNEL_OP_FWD,
null, m_param.AsLong((
int)op, nCount, nC, nN1, nSD1, nN2, nSD2, hA, hB, hY));
// single-precision path
8391 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_OP_FWD,
null, m_param.AsLong((
int)op, nCount, nC, nN1, nSD1, nN2, nSD2, hA, hB, hY));
// channel_op_bwd: launches CUDA_CHANNEL_OP_BWD — backward pass of channel_op_fwd, routing
// the Y gradient (hYd) into hAd/hBd given the forward operands; hWork is a scratch buffer.
// Same broadcast-size guard as the forward pass.
8413 public void channel_op_bwd(
OP op,
int nCount,
int nC,
int nN1,
int nSD1,
int nN2,
int nSD2,
int nCy,
int nSDy,
long hA,
long hB,
long hY,
long hAd,
long hBd,
long hYd,
long hWork)
// Guard: output count must match the broadcast shape.
8415 int nCount1 = Math.Max(nN1, nN2) * nC * Math.Max(nSD1, nSD2);
8416 if (nCount1 != nCount)
8417 throw new Exception(
"The nCount must equal max(nN1, nN2) x nC x max(nSD1, nSD2).");
// double-precision path
8420 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_OP_BWD,
null, m_param.AsLong((
int)op, nCount, nC, nN1, nSD1, nN2, nSD2, nCy, nSDy, hA, hB, hY, hAd, hBd, hYd, hWork));
// single-precision path
8422 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_OP_BWD,
null, m_param.AsLong((
int)op, nCount, nC, nN1, nSD1, nN2, nSD2, nCy, nSDy, hA, hB, hY, hAd, hBd, hYd, hWork));
// channel_add: launches CUDA_CHANNEL_ADD with block/offset addressing and a DIR flag —
// presumably adds a block of X into Y (or the reverse, per dir); TODO confirm.
8437 public void channel_add(
int nCount,
int nOuterNum,
int nChannels,
int nBlocks,
int nInnerNum,
int nOffset,
long hX,
long hY,
DIR dir)
// double-precision path
8440 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_ADD,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (
int)dir));
// single-precision path
8442 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_ADD,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (
int)dir));
// channel_copy: launches CUDA_CHANNEL_COPY — block-wise channel copy with a DIR flag;
// same addressing scheme as channel_add.
8457 public void channel_copy(
int nCount,
int nOuterNum,
int nChannels,
int nBlocks,
int nInnerNum,
int nOffset,
long hX,
long hY,
DIR dir)
// double-precision path
8460 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_COPY,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (
int)dir));
// single-precision path
8462 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_COPY,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (
int)dir));
// channel_copyall: launches CUDA_CHANNEL_COPYALL — presumably copies all channel data from
// X to Y in one pass.
8474 public void channel_copyall(
int nCount,
int nOuterNum,
int nChannels,
int nInnerNum,
long hX,
long hY)
// double-precision path
8477 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_COPYALL,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// single-precision path
8479 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CHANNEL_COPYALL,
null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
// sum: launches CUDA_SUM over an (outer x inner) layout — presumably sums inner groups of X
// into Y.
8491 public void sum(
int nCount,
int nOuterNum,
int nInnerNum,
long hX,
long hY)
// double-precision path
8494 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SUM,
null, m_param.AsLong(nCount, nOuterNum, nInnerNum, hX, hY));
// single-precision path
8496 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SUM,
null, m_param.AsLong(nCount, nOuterNum, nInnerNum, hX, hY));
8509 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_RNG_SETSEED, m_param.AsDouble(lSeed));
8511 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_RNG_SETSEED, m_param.AsFloat(lSeed));
8526 rng_uniform(n, (T)Convert.ChangeType(fMin, typeof(T)), (T)Convert.ChangeType(fMax, typeof(T)), hY);
8541 rng_uniform(n, (T)Convert.ChangeType(fMin, typeof(T)), (T)Convert.ChangeType(fMax, typeof(T)), hY);
8558 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
8559 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_RNG_UNIFORM, m_param.AsDouble(convertD(fMin), convertD(fMax)), m_param.AsLong(n, 0, 0, hY));
8563 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
8564 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_RNG_UNIFORM, m_param.AsFloat(convertF(fMin), convertF(fMax)), m_param.AsLong(n, 0, 0, hY));
8580 rng_gaussian(n, (T)Convert.ChangeType(fMu, typeof(T)), (T)Convert.ChangeType(fSigma, typeof(T)), hY);
8595 rng_gaussian(n, (T)Convert.ChangeType(fMu, typeof(T)), (T)Convert.ChangeType(fSigma, typeof(T)), hY);
8612 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
8613 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_RNG_GAUSSIAN, m_param.AsDouble(convertD(fMu), convertD(fSigma)), m_param.AsLong(n, 0, 0, hY));
8617 if (m_rgGhostMemory ==
null || !m_bGhostMemoryEnabled)
8618 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_RNG_GAUSSIAN, m_param.AsFloat(convertF(fMu), convertF(fSigma)), m_param.AsLong(n, 0, 0, hY));
8633 rng_bernoulli(n, (T)Convert.ChangeType(fNonZeroProb, typeof(T)), hY);
8647 rng_bernoulli(n, (T)Convert.ChangeType(fNonZeroProb, typeof(T)), hY);
8666 T[] rg = GetMemory(hY);
8667 fill_random(fNonZeroProb, rg);
8671#pragma warning disable 1591
// fill_random: CPU-side Bernoulli fill — sets each element of rg to m_tOne with probability
// fNonZeroProb, else m_tZero. NOTE(review): the declaration of dfRand (the per-element
// random draw) was lost in this extraction — TODO confirm the RNG source used.
8673 public void fill_random(T fNonZeroProb, T[] rg)
8675 double dfNonZeroProb =
Utility.ConvertVal<T>(fNonZeroProb);
8677 for (
int i = 0; i < rg.Length; i++)
8680 rg[i] = (dfRand <= dfNonZeroProb) ? m_tOne : m_tZero;
8684#pragma warning restore 1591
// accuracy_fwd: launches CUDA_ACCURACY_FWD for the accuracy layer forward pass. The
// optional ignore label is appended to the argument list only when present, so the native
// side distinguishes the two arities by argument count.
8700 public void accuracy_fwd(
int nCount,
int nOuterNum,
int nInnerNum,
long hBottomData,
long hBottomLabel,
long hAccData,
long hAccTotals,
int? nIgnoreLabel,
bool bLastElementOnly,
int nBatch)
// double-precision path: build args, append optional ignore label, launch.
8704 List<long> rgArg =
new List<long>() { nCount, nOuterNum, nInnerNum, hBottomData, hBottomLabel, hAccData, hAccTotals, (bLastElementOnly) ? 1 : 0, nBatch };
8705 if (nIgnoreLabel.HasValue)
8706 rgArg.Add(nIgnoreLabel.Value);
8707 m_cuda.RunDoubleEx2((
int)m_hKernel, (int)CUDAFN.CUDA_ACCURACY_FWD,
null, rgArg.ToArray());
// single-precision path: same argument construction.
8711 List<long> rgArg =
new List<long>() { nCount, nOuterNum, nInnerNum, hBottomData, hBottomLabel, hAccData, hAccTotals, (bLastElementOnly) ? 1 : 0, nBatch };
8712 if (nIgnoreLabel.HasValue)
8713 rgArg.Add(nIgnoreLabel.Value);
8714 m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_ACCURACY_FWD,
null, rgArg.ToArray());
// batchreidx_fwd: launches CUDA_BATCHREIDX_FWD — batch re-index forward: gathers items of
// hBottomData into hTopData per the permutation buffer.
8727 public void batchreidx_fwd(
int nCount,
int nInnerDim,
long hBottomData,
long hPermutData,
long hTopData)
// double-precision path
8730 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_BATCHREIDX_FWD,
null, m_param.AsLong(nCount, nInnerDim, hBottomData, hPermutData, hTopData));
// single-precision path
8732 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_BATCHREIDX_FWD,
null, m_param.AsLong(nCount, nInnerDim, hBottomData, hPermutData, hTopData));
// batchreidx_bwd: launches CUDA_BATCHREIDX_BWD — scatters the top gradient back to the
// bottom using the index/begin/count buffers.
8745 public void batchreidx_bwd(
int nCount,
int nInnerDim,
long hTopDiff,
long hTopIdx,
long hBegins,
long hCounts,
long hBottomDiff)
// double-precision path
8748 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_BATCHREIDX_BWD,
null, m_param.AsLong(nCount, nInnerDim, hTopDiff, hTopIdx, hBegins, hCounts, hBottomDiff));
// single-precision path
8750 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_BATCHREIDX_BWD,
null, m_param.AsLong(nCount, nInnerDim, hTopDiff, hTopIdx, hBegins, hCounts, hBottomDiff));
// embed_fwd: launches CUDA_EMBED_FWD — embedding lookup: bottom indices select rows of the
// weight matrix (M x N x K geometry) into the top blob.
8763 public void embed_fwd(
int nCount,
long hBottomData,
long hWeight,
int nM,
int nN,
int nK,
long hTopData)
// double-precision path
8766 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_EMBED_FWD,
null, m_param.AsLong(nCount, hBottomData, hWeight, nM, nN, nK, hTopData));
// single-precision path
8768 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_EMBED_FWD,
null, m_param.AsLong(nCount, hBottomData, hWeight, nM, nN, nK, hTopData));
// embed_bwd: launches CUDA_EMBED_BWD — accumulates the top gradient into the weight
// gradient rows selected by the bottom indices.
8781 public void embed_bwd(
int nCount,
long hBottomData,
long hTopDiff,
int nM,
int nN,
int nK,
long hWeightDiff)
// double-precision path
8784 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_EMBED_BWD,
null, m_param.AsLong(nCount, hBottomData, hTopDiff, nM, nN, nK, hWeightDiff));
// single-precision path
8786 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_EMBED_BWD,
null, m_param.AsLong(nCount, hBottomData, hTopDiff, nM, nN, nK, hWeightDiff));
// pooling_fwd: launches CUDA_POOL_FWD — 2-D pooling forward with the given method
// (max/avg/etc.), geometry, and optional mask outputs (hMask/hTopMask).
8810 public void pooling_fwd(
POOLING_METHOD method,
int nCount,
long hBottomData,
int num,
int nChannels,
int nHeight,
int nWidth,
int nPooledHeight,
int nPooledWidth,
int nKernelH,
int nKernelW,
int nStrideH,
int nStrideW,
int nPadH,
int nPadW,
long hTopData,
long hMask,
long hTopMask)
// double-precision path
8813 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_POOL_FWD,
null, m_param.AsLong((
int)method, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask, hTopMask));
// single-precision path
8815 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_POOL_FWD,
null, m_param.AsLong((
int)method, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask, hTopMask));
// pooling_bwd: launches CUDA_POOL_BWD — 2-D pooling backward, mirroring pooling_fwd's
// geometry; uses the saved masks to route the gradient.
8839 public void pooling_bwd(
POOLING_METHOD method,
int nCount,
long hTopDiff,
int num,
int nChannels,
int nHeight,
int nWidth,
int nPooledHeight,
int nPooledWidth,
int nKernelH,
int nKernelW,
int nStrideH,
int nStrideW,
int nPadH,
int nPadW,
long hBottomDiff,
long hMask,
long hTopMask)
// double-precision path
8842 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_POOL_BWD,
null, m_param.AsLong((
int)method, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask, hTopMask));
// single-precision path
8844 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_POOL_BWD,
null, m_param.AsLong((
int)method, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask, hTopMask));
// unpooling_fwd: launches CUDA_UNPOOL_FWD — unpooling forward (inverse of pooling), using
// hMask to place values; same geometry parameters as pooling_fwd but a single mask.
8867 public void unpooling_fwd(
POOLING_METHOD method,
int nCount,
long hBottomData,
int num,
int nChannels,
int nHeight,
int nWidth,
int nPooledHeight,
int nPooledWidth,
int nKernelH,
int nKernelW,
int nStrideH,
int nStrideW,
int nPadH,
int nPadW,
long hTopData,
long hMask)
// double-precision path
8870 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_UNPOOL_FWD,
null, m_param.AsLong((
int)method, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask));
// single-precision path
8872 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_UNPOOL_FWD,
null, m_param.AsLong((
int)method, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask));
// unpooling_bwd: launches CUDA_UNPOOL_BWD — unpooling backward, routing the top gradient
// back through the mask.
8895 public void unpooling_bwd(
POOLING_METHOD method,
int nCount,
long hTopDiff,
int num,
int nChannels,
int nHeight,
int nWidth,
int nPooledHeight,
int nPooledWidth,
int nKernelH,
int nKernelW,
int nStrideH,
int nStrideW,
int nPadH,
int nPadW,
long hBottomDiff,
long hMask)
// double-precision path
8898 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_UNPOOL_BWD,
null, m_param.AsLong((
int)method, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask));
// single-precision path
8900 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_UNPOOL_BWD,
null, m_param.AsLong((
int)method, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask));
// clip_fwd: launches CUDA_CLIP_FWD — clamps bottom data into [fMin, fMax] writing top data;
// the two scalar bounds travel in the value-parameter array.
8914 public void clip_fwd(
int nCount,
long hBottomData,
long hTopData, T fMin, T fMax)
// double-precision path
8917 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CLIP_FWD, m_param.AsDouble(convertD1(fMin), convertD1(fMax)), m_param.AsLong(nCount, hBottomData, hTopData, 0, 0));
// single-precision path
8919 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CLIP_FWD, m_param.AsFloat(convertF1(fMin), convertF1(fMax)), m_param.AsLong(nCount, hBottomData, hTopData, 0, 0));
// clip_bwd: launches CUDA_CLIP_BWD — backward of clip: presumably passes the gradient only
// where the forward input was inside [fMin, fMax]; TODO confirm.
8931 public void clip_bwd(
int nCount,
long hTopDiff,
long hBottomData,
long hBottomDiff, T fMin, T fMax)
// double-precision path
8934 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CLIP_BWD, m_param.AsDouble(convertD1(fMin), convertD1(fMax)), m_param.AsLong(nCount, hTopDiff, hBottomData, hBottomDiff, 0, 0));
// single-precision path
8936 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CLIP_BWD, m_param.AsFloat(convertF1(fMin), convertF1(fMax)), m_param.AsLong(nCount, hTopDiff, hBottomData, hBottomDiff, 0, 0));
8952 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MATH_FWD,
null, m_param.AsLong(nCount, hBottomData, hTopData, (
int)
function));
8954 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MATH_FWD,
null, m_param.AsLong(nCount, hBottomData, hTopData, (
int)
function));
// math_bwd: launches CUDA_MATH_BWD — backward pass for the generic math layer; the selected
// MATH_FUNCTION is passed as its integer value.
8966 public void math_bwd(
int nCount,
long hTopDiff,
long hTopData,
long hBottomDiff,
long hBottomData,
MATH_FUNCTION function)
// double-precision path
8969 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MATH_BWD,
null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, (
int)
function));
// single-precision path
8971 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MATH_BWD,
null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, (
int)
function));
8994 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MEAN_ERROR_LOSS_BWD,
null, m_param.AsLong(nCount, hPredicted, hTarget, hBottomDiff, (
int)merr));
8996 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MEAN_ERROR_LOSS_BWD,
null, m_param.AsLong(nCount, hPredicted, hTarget, hBottomDiff, (
int)merr));
// mish_fwd: launches CUDA_MISH_FWD — Mish activation forward; dfThreshold is a scalar
// parameter (presumably a softplus stability threshold — TODO confirm).
9011 public void mish_fwd(
int nCount,
long hBottomData,
long hTopData,
double dfThreshold)
// double-precision path
9014 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MISH_FWD, m_param.AsDouble(dfThreshold), m_param.AsLong(nCount, hBottomData, hTopData, 0));
// single-precision path
9016 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MISH_FWD, m_param.AsFloat((
float)dfThreshold), m_param.AsLong(nCount, hBottomData, hTopData, 0));
// mish_bwd: launches CUDA_MISH_BWD — Mish activation backward; nMethod selects a gradient
// computation variant (meaning not visible here — TODO confirm).
9035 public void mish_bwd(
int nCount,
long hTopDiff,
long hTopData,
long hBottomDiff,
long hBottomData,
double dfThreshold,
int nMethod = 0)
// double-precision path
9038 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MISH_BWD, m_param.AsDouble(dfThreshold), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0, nMethod));
// single-precision path
9040 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MISH_BWD, m_param.AsFloat((
float)dfThreshold), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0, nMethod));
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the GELU_FWD kernel: GELU activation forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
/// <param name="bEnableBertVersion">When true, passes 1 to select the BERT variant of the kernel.</param>
public void gelu_fwd(int nCount, long hBottomData, long hTopData, bool bEnableBertVersion)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData, (bEnableBertVersion) ? 1 : 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData, (bEnableBertVersion) ? 1 : 0));
}

/// <summary>
/// Runs the GELU_BWD kernel: GELU activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
/// <param name="bEnableBertVersion">When true, passes 1 to select the BERT variant of the kernel.</param>
public void gelu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, bool bEnableBertVersion)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, (bEnableBertVersion) ? 1 : 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, (bEnableBertVersion) ? 1 : 0));
}

/// <summary>
/// Runs the SILU_FWD kernel: SiLU activation forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void silu_fwd(int nCount, long hBottomData, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
}

/// <summary>
/// Runs the SILU_BWD kernel: SiLU activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
public void silu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData));
}
9164 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SOFTPLUS_FWD,
null, m_param.AsLong(nCount, hBottomData, hTopData));
9166 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SOFTPLUS_FWD,
null, m_param.AsLong(nCount, hBottomData, hTopData));
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the SOFTPLUS_BWD kernel: softplus activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
public void softplus_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTPLUS_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTPLUS_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData));
}

/// <summary>
/// Runs the LECUN_FWD kernel: LeCun activation forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void lecun_fwd(int nCount, long hBottomData, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
}

/// <summary>
/// Runs the LECUN_BWD kernel: LeCun activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
public void lecun_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData));
}

/// <summary>
/// Runs the SERF_FWD kernel: serf activation forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
/// <param name="dfThreshold">Threshold passed to the kernel.</param>
public void serf_fwd(int nCount, long hBottomData, long hTopData, double dfThreshold)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_FWD, m_param.AsDouble(dfThreshold), m_param.AsLong(nCount, hBottomData, hTopData, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_FWD, m_param.AsFloat((float)dfThreshold), m_param.AsLong(nCount, hBottomData, hTopData, 0));
}

/// <summary>
/// Runs the SERF_BWD kernel: serf activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
/// <param name="dfThreshold">Threshold passed to the kernel.</param>
public void serf_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, double dfThreshold)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_BWD, m_param.AsDouble(dfThreshold), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_BWD, m_param.AsFloat((float)dfThreshold), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0));
}

/// <summary>
/// Runs the TANH_FWD kernel: tanh activation forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void tanh_fwd(int nCount, long hBottomData, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
}

/// <summary>
/// Runs the TANH_BWD kernel: tanh activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void tanh_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff));
}
9326 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGMOID_FWD,
null, m_param.AsLong(nCount, hBottomData, hTopData));
9328 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGMOID_FWD,
null, m_param.AsLong(nCount, hBottomData, hTopData));
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the SIGMOID_BWD kernel: sigmoid activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void sigmoid_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGMOID_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGMOID_BWD, null, m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff));
}

/// <summary>
/// Runs the SWISH_BWD kernel: swish activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hSigmoidOutputData">Handle to the cached sigmoid output GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="dfBeta">Beta scaling value passed to the kernel.</param>
public void swish_bwd(int nCount, long hTopDiff, long hTopData, long hSigmoidOutputData, long hBottomDiff, double dfBeta)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SWISH_BWD, m_param.AsDouble(dfBeta), m_param.AsLong(nCount, hTopDiff, hTopData, hSigmoidOutputData, hBottomDiff, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SWISH_BWD, m_param.AsFloat((float)dfBeta), m_param.AsLong(nCount, hTopDiff, hTopData, hSigmoidOutputData, hBottomDiff, 0));
}

/// <summary>
/// Runs the RELU_FWD kernel: (leaky) ReLU forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
/// <param name="fNegativeSlope">Slope applied to negative inputs (0 = standard ReLU).</param>
public void relu_fwd(int nCount, long hBottomData, long hTopData, T fNegativeSlope)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_FWD, m_param.AsDouble(convertD(fNegativeSlope)), m_param.AsLong(nCount, hBottomData, hTopData, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_FWD, m_param.AsFloat(convertF(fNegativeSlope)), m_param.AsLong(nCount, hBottomData, hTopData, 0));
}

/// <summary>
/// Runs the RELU_BWD kernel: (leaky) ReLU backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="fNegativeSlope">Slope applied to negative inputs (0 = standard ReLU).</param>
public void relu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, T fNegativeSlope)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_BWD, m_param.AsDouble(convertD(fNegativeSlope)), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_BWD, m_param.AsFloat(convertF(fNegativeSlope)), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, 0));
}

/// <summary>
/// Runs the ELU_FWD kernel: ELU activation forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
/// <param name="dfAlpha">Alpha value passed to the kernel.</param>
public void elu_fwd(int nCount, long hBottomData, long hTopData, double dfAlpha)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_FWD, m_param.AsDouble(dfAlpha), m_param.AsLong(nCount, hBottomData, hTopData, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_FWD, m_param.AsFloat((float)dfAlpha), m_param.AsLong(nCount, hBottomData, hTopData, 0));
}

/// <summary>
/// Runs the ELU_BWD kernel: ELU activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="dfAlpha">Alpha value passed to the kernel.</param>
public void elu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomData, long hBottomDiff, double dfAlpha)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_BWD, m_param.AsDouble(dfAlpha), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomData, hBottomDiff, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_BWD, m_param.AsFloat((float)dfAlpha), m_param.AsLong(nCount, hTopDiff, hTopData, hBottomData, hBottomDiff, 0));
}

/// <summary>
/// Runs the DROPOUT_FWD kernel: dropout forward pass using a precomputed random mask.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hMask">Handle to the dropout mask GPU memory.</param>
/// <param name="uiThreshold">Mask threshold below which elements are dropped.</param>
/// <param name="fScale">Scale applied to surviving elements.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void dropout_fwd(int nCount, long hBottomData, long hMask, uint uiThreshold, T fScale, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_FWD, m_param.AsDouble(convertD(fScale)), m_param.AsLong(nCount, hBottomData, hMask, uiThreshold, 0, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_FWD, m_param.AsFloat(convertF(fScale)), m_param.AsLong(nCount, hBottomData, hMask, uiThreshold, 0, hTopData));
}

/// <summary>
/// Runs the DROPOUT_BWD kernel: dropout backward pass using the forward mask.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hMask">Handle to the dropout mask GPU memory.</param>
/// <param name="uiThreshold">Mask threshold below which elements were dropped.</param>
/// <param name="fScale">Scale applied to surviving elements.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void dropout_bwd(int nCount, long hTopDiff, long hMask, uint uiThreshold, T fScale, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_BWD, m_param.AsDouble(convertD(fScale)), m_param.AsLong(nCount, hTopDiff, hMask, uiThreshold, 0, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_BWD, m_param.AsFloat(convertF(fScale)), m_param.AsLong(nCount, hTopDiff, hMask, uiThreshold, 0, hBottomDiff));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the BNLL_FWD kernel: BNLL activation forward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void bnll_fwd(int nCount, long hBottomData, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_FWD, null, m_param.AsLong(nCount, hBottomData, hTopData));
}

/// <summary>
/// Runs the BNLL_BWD kernel: BNLL activation backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void bnll_bwd(int nCount, long hTopDiff, long hBottomData, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_BWD, null, m_param.AsLong(nCount, hTopDiff, hBottomData, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_BWD, null, m_param.AsLong(nCount, hTopDiff, hBottomData, hBottomDiff));
}

/// <summary>
/// Runs the PRELU_FWD kernel: PReLU forward pass with per-channel slopes.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="nChannels">Number of channels.</param>
/// <param name="nDim">Spatial dimension per channel.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
/// <param name="hSlopeData">Handle to the slope parameter GPU memory.</param>
/// <param name="nDivFactor">Divisor used for channel-shared slopes.</param>
public void prelu_fwd(int nCount, int nChannels, int nDim, long hBottomData, long hTopData, long hSlopeData, int nDivFactor)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_FWD, null, m_param.AsLong(nCount, nChannels, nDim, hBottomData, hTopData, hSlopeData, nDivFactor));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_FWD, null, m_param.AsLong(nCount, nChannels, nDim, hBottomData, hTopData, hSlopeData, nDivFactor));
}

/// <summary>
/// Runs the PRELU_BWD_PARAM kernel: computes the PReLU slope-parameter gradient.
/// </summary>
/// <param name="nCDim">Channel dimension size.</param>
/// <param name="nNum">Number of items in the batch.</param>
/// <param name="nTopOffset">Offset into the top gradient memory.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
/// <param name="hBackBuffDiff">Handle to the backward buffer GPU memory (output).</param>
public void prelu_bwd_param(int nCDim, int nNum, int nTopOffset, long hTopDiff, long hBottomData, long hBackBuffDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD_PARAM, null, m_param.AsLong(nCDim, nNum, nTopOffset, hTopDiff, hBottomData, hBackBuffDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD_PARAM, null, m_param.AsLong(nCDim, nNum, nTopOffset, hTopDiff, hBottomData, hBackBuffDiff));
}

/// <summary>
/// Runs the PRELU_BWD kernel: PReLU backward pass with respect to the input.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="nChannels">Number of channels.</param>
/// <param name="nDim">Spatial dimension per channel.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="hSlopeData">Handle to the slope parameter GPU memory.</param>
/// <param name="nDivFactor">Divisor used for channel-shared slopes.</param>
public void prelu_bwd(int nCount, int nChannels, int nDim, long hTopDiff, long hBottomData, long hBottomDiff, long hSlopeData, int nDivFactor)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD, null, m_param.AsLong(nCount, nChannels, nDim, hTopDiff, hBottomData, hBottomDiff, hSlopeData, nDivFactor));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD, null, m_param.AsLong(nCount, nChannels, nDim, hTopDiff, hBottomData, hBottomDiff, hSlopeData, nDivFactor));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the SOFTMAXLOSS_FWD kernel: softmax-loss forward pass.
/// The optional ignore label, when present, is appended as an extra kernel argument.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hProbData">Handle to the probability data GPU memory.</param>
/// <param name="hLabel">Handle to the label GPU memory.</param>
/// <param name="hLossData">Handle to the loss output GPU memory.</param>
/// <param name="nOuterNum">Outer count.</param>
/// <param name="nDim">Dimension.</param>
/// <param name="nInnerNum">Inner count.</param>
/// <param name="hCounts">Handle to the counts GPU memory.</param>
/// <param name="nIgnoreLabel">Optional label value to ignore.</param>
public void softmaxloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    // Build the argument list once; the ignore label is optional and appended last.
    List<long> rg = new List<long>() { nCount, hProbData, hLabel, hLossData, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rg.Add(nIgnoreLabel.Value);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_FWD, null, rg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_FWD, null, rg.ToArray());
}

/// <summary>
/// Runs the SOFTMAXLOSS_BWD kernel: softmax-loss backward pass.
/// The optional ignore label, when present, is appended as an extra kernel argument.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hLabel">Handle to the label GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="nOuterNum">Outer count.</param>
/// <param name="nDim">Dimension.</param>
/// <param name="nInnerNum">Inner count.</param>
/// <param name="hCounts">Handle to the counts GPU memory.</param>
/// <param name="nIgnoreLabel">Optional label value to ignore.</param>
public void softmaxloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    List<long> rg = new List<long>() { nCount, hTopData, hLabel, hBottomDiff, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rg.Add(nIgnoreLabel.Value);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_BWD, null, rg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_BWD, null, rg.ToArray());
}

/// <summary>
/// Runs the NLLLOSS_FWD kernel: negative log-likelihood loss forward pass.
/// The optional ignore label, when present, is appended as an extra kernel argument.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hProbData">Handle to the probability data GPU memory.</param>
/// <param name="hLabel">Handle to the label GPU memory.</param>
/// <param name="hLossData">Handle to the loss output GPU memory.</param>
/// <param name="nOuterNum">Outer count.</param>
/// <param name="nDim">Dimension.</param>
/// <param name="nInnerNum">Inner count.</param>
/// <param name="hCounts">Handle to the counts GPU memory.</param>
/// <param name="nIgnoreLabel">Optional label value to ignore.</param>
public void nllloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    List<long> rg = new List<long>() { nCount, hProbData, hLabel, hLossData, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rg.Add(nIgnoreLabel.Value);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_FWD, null, rg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_FWD, null, rg.ToArray());
}

/// <summary>
/// Runs the NLLLOSS_BWD kernel: negative log-likelihood loss backward pass.
/// The optional ignore label, when present, is appended as an extra kernel argument.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hLabel">Handle to the label GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="nOuterNum">Outer count.</param>
/// <param name="nDim">Dimension.</param>
/// <param name="nInnerNum">Inner count.</param>
/// <param name="hCounts">Handle to the counts GPU memory.</param>
/// <param name="nIgnoreLabel">Optional label value to ignore.</param>
public void nllloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    List<long> rg = new List<long>() { nCount, hTopData, hLabel, hBottomDiff, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rg.Add(nIgnoreLabel.Value);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_BWD, null, rg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_BWD, null, rg.ToArray());
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the MAX_FWD kernel: element-wise max of two inputs, recording the winner in a mask.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomDataA">Handle to input A GPU memory.</param>
/// <param name="hBottomDataB">Handle to input B GPU memory.</param>
/// <param name="nIdx">Index of input B within the bottom set.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
/// <param name="hMask">Handle to the winner mask GPU memory.</param>
public void max_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_FWD, null, m_param.AsLong(nCount, hBottomDataA, hBottomDataB, nIdx, hTopData, hMask));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_FWD, null, m_param.AsLong(nCount, hBottomDataA, hBottomDataB, nIdx, hTopData, hMask));
}

/// <summary>
/// Runs the MAX_BWD kernel: routes the gradient to the input selected by the mask.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="nIdx">Index of the input receiving gradient.</param>
/// <param name="hMask">Handle to the winner mask GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void max_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_BWD, null, m_param.AsLong(nCount, hTopDiff, nIdx, hMask, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_BWD, null, m_param.AsLong(nCount, hTopDiff, nIdx, hMask, hBottomDiff));
}

/// <summary>
/// Runs the MIN_FWD kernel: element-wise min of two inputs, recording the winner in a mask.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomDataA">Handle to input A GPU memory.</param>
/// <param name="hBottomDataB">Handle to input B GPU memory.</param>
/// <param name="nIdx">Index of input B within the bottom set.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
/// <param name="hMask">Handle to the winner mask GPU memory.</param>
public void min_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_FWD, null, m_param.AsLong(nCount, hBottomDataA, hBottomDataB, nIdx, hTopData, hMask));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_FWD, null, m_param.AsLong(nCount, hBottomDataA, hBottomDataB, nIdx, hTopData, hMask));
}

/// <summary>
/// Runs the MIN_BWD kernel: routes the gradient to the input selected by the mask.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="nIdx">Index of the input receiving gradient.</param>
/// <param name="hMask">Handle to the winner mask GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void min_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_BWD, null, m_param.AsLong(nCount, hTopDiff, nIdx, hMask, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_BWD, null, m_param.AsLong(nCount, hTopDiff, nIdx, hMask, hBottomDiff));
}

/// <summary>
/// Runs the CROP_FWD kernel: copies a cropped region from bottom to top using stride/offset tables.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="nNumAxes">Number of axes in the stride/offset tables.</param>
/// <param name="hSrcStrides">Handle to the source strides GPU memory.</param>
/// <param name="hDstStrides">Handle to the destination strides GPU memory.</param>
/// <param name="hOffsets">Handle to the per-axis offsets GPU memory.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void crop_fwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomData, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_FWD, null, m_param.AsLong(nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomData, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_FWD, null, m_param.AsLong(nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomData, hTopData));
}

/// <summary>
/// Runs the CROP_BWD kernel: scatters the cropped-region gradient back using stride/offset tables.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="nNumAxes">Number of axes in the stride/offset tables.</param>
/// <param name="hSrcStrides">Handle to the source strides GPU memory.</param>
/// <param name="hDstStrides">Handle to the destination strides GPU memory.</param>
/// <param name="hOffsets">Handle to the per-axis offsets GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
public void crop_bwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomDiff, long hTopDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_BWD, null, m_param.AsLong(nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomDiff, hTopDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_BWD, null, m_param.AsLong(nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomDiff, hTopDiff));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the CONCAT_FWD kernel: copies one bottom blob into its slot of the concatenated top.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="nNumConcats">Number of concatenations.</param>
/// <param name="nConcatInputSize">Size of each concatenated input.</param>
/// <param name="nTopConcatAxis">Top concat-axis size.</param>
/// <param name="nBottomConcatAxis">Bottom concat-axis size.</param>
/// <param name="nOffsetConcatAxis">Offset of this input along the concat axis.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void concat_fwd(int nCount, long hBottomData, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONCAT_FWD, null, m_param.AsLong(nCount, hBottomData, nNumConcats, nConcatInputSize, nTopConcatAxis, nBottomConcatAxis, nOffsetConcatAxis, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONCAT_FWD, null, m_param.AsLong(nCount, hBottomData, nNumConcats, nConcatInputSize, nTopConcatAxis, nBottomConcatAxis, nOffsetConcatAxis, hTopData));
}

/// <summary>
/// Runs the CONCAT_BWD kernel: extracts one bottom blob's gradient from the concatenated top gradient.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="nNumConcats">Number of concatenations.</param>
/// <param name="nConcatInputSize">Size of each concatenated input.</param>
/// <param name="nTopConcatAxis">Top concat-axis size.</param>
/// <param name="nBottomConcatAxis">Bottom concat-axis size.</param>
/// <param name="nOffsetConcatAxis">Offset of this input along the concat axis.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void concat_bwd(int nCount, long hTopDiff, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONCAT_BWD, null, m_param.AsLong(nCount, hTopDiff, nNumConcats, nConcatInputSize, nTopConcatAxis, nBottomConcatAxis, nOffsetConcatAxis, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONCAT_BWD, null, m_param.AsLong(nCount, hTopDiff, nNumConcats, nConcatInputSize, nTopConcatAxis, nBottomConcatAxis, nOffsetConcatAxis, hBottomDiff));
}

/// <summary>
/// Runs the SLICE_FWD kernel: copies one slice of the bottom into a top blob.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="nNumSlices">Number of slices.</param>
/// <param name="nSliceSize">Size of each slice.</param>
/// <param name="nBottomSliceAxis">Bottom slice-axis size.</param>
/// <param name="nTopSliceAxis">Top slice-axis size.</param>
/// <param name="nOffsetSliceAxis">Offset of this slice along the slice axis.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void slice_fwd(int nCount, long hBottomData, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SLICE_FWD, null, m_param.AsLong(nCount, hBottomData, nNumSlices, nSliceSize, nBottomSliceAxis, nTopSliceAxis, nOffsetSliceAxis, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SLICE_FWD, null, m_param.AsLong(nCount, hBottomData, nNumSlices, nSliceSize, nBottomSliceAxis, nTopSliceAxis, nOffsetSliceAxis, hTopData));
}

/// <summary>
/// Runs the SLICE_BWD kernel: copies one top blob's gradient back into its slice of the bottom.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="nNumSlices">Number of slices.</param>
/// <param name="nSliceSize">Size of each slice.</param>
/// <param name="nBottomSliceAxis">Bottom slice-axis size.</param>
/// <param name="nTopSliceAxis">Top slice-axis size.</param>
/// <param name="nOffsetSliceAxis">Offset of this slice along the slice axis.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void slice_bwd(int nCount, long hTopDiff, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SLICE_BWD, null, m_param.AsLong(nCount, hTopDiff, nNumSlices, nSliceSize, nBottomSliceAxis, nTopSliceAxis, nOffsetSliceAxis, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SLICE_BWD, null, m_param.AsLong(nCount, hTopDiff, nNumSlices, nSliceSize, nBottomSliceAxis, nTopSliceAxis, nOffsetSliceAxis, hBottomDiff));
}

/// <summary>
/// Runs the TILE_FWD kernel: tiles the bottom data along an axis into the top.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="nInnerDim">Inner dimension size.</param>
/// <param name="nTiles">Number of tiles.</param>
/// <param name="nBottomTileAxis">Bottom tile-axis size.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void tile_fwd(int nCount, long hBottomData, int nInnerDim, int nTiles, int nBottomTileAxis, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TILE_FWD, null, m_param.AsLong(nCount, hBottomData, nInnerDim, nTiles, nBottomTileAxis, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TILE_FWD, null, m_param.AsLong(nCount, hBottomData, nInnerDim, nTiles, nBottomTileAxis, hTopData));
}

/// <summary>
/// Runs the TILE_BWD kernel: accumulates tile gradients back onto the bottom.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="nTileSize">Size of each tile.</param>
/// <param name="nTiles">Number of tiles.</param>
/// <param name="nBottomTileAxis">Bottom tile-axis size.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void tile_bwd(int nCount, long hTopDiff, int nTileSize, int nTiles, int nBottomTileAxis, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TILE_BWD, null, m_param.AsLong(nCount, hTopDiff, nTileSize, nTiles, nBottomTileAxis, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TILE_BWD, null, m_param.AsLong(nCount, hTopDiff, nTileSize, nTiles, nBottomTileAxis, hBottomDiff));
}

/// <summary>
/// Runs the BIAS_FWD kernel: adds a bias vector across the input.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hBiasData">Handle to the bias GPU memory.</param>
/// <param name="nBiasDim">Bias dimension size.</param>
/// <param name="nInnerDim">Inner dimension size.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void bias_fwd(int nCount, long hBottomData, long hBiasData, int nBiasDim, int nInnerDim, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BIAS_FWD, null, m_param.AsLong(nCount, hBottomData, hBiasData, nBiasDim, nInnerDim, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BIAS_FWD, null, m_param.AsLong(nCount, hBottomData, hBiasData, nBiasDim, nInnerDim, hTopData));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the SCALE_FWD kernel: multiplies the input by a scale vector, with an optional bias.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hX">Handle to the input GPU memory.</param>
/// <param name="hScaleData">Handle to the scale GPU memory.</param>
/// <param name="nScaleDim">Scale dimension size.</param>
/// <param name="nInnerDim">Inner dimension size.</param>
/// <param name="hY">Handle to the output GPU memory.</param>
/// <param name="hBiasData">Optional handle to bias GPU memory (0 = no bias).</param>
public void scale_fwd(int nCount, long hX, long hScaleData, int nScaleDim, int nInnerDim, long hY, long hBiasData = 0)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE_FWD, null, m_param.AsLong(nCount, hX, hScaleData, nScaleDim, nInnerDim, hY, hBiasData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE_FWD, null, m_param.AsLong(nCount, hX, hScaleData, nScaleDim, nInnerDim, hY, hBiasData));
}
10004 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_THRESHOLD_FWD, m_param.AsDouble(dfThreshold), m_param.AsLong(nCount, 0, hX, hY));
10006 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_THRESHOLD_FWD, m_param.AsFloat((
float)dfThreshold), m_param.AsLong(nCount, 0, hX, hY));
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the CLL_BWD kernel: contrastive-loss backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="nChannels">Number of channels.</param>
/// <param name="dfMargin">Contrastive-loss margin.</param>
/// <param name="bLegacyVersion">When true, passes 1 to select the legacy loss formulation.</param>
/// <param name="dfAlpha">Gradient scaling factor.</param>
/// <param name="hY">Handle to the similarity label GPU memory.</param>
/// <param name="hDiff">Handle to the pairwise difference GPU memory.</param>
/// <param name="hDistSq">Handle to the squared distance GPU memory.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void cll_bwd(int nCount, int nChannels, double dfMargin, bool bLegacyVersion, double dfAlpha, long hY, long hDiff, long hDistSq, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CLL_BWD, m_param.AsDouble(dfMargin, dfAlpha), m_param.AsLong(nCount, nChannels, 0, (bLegacyVersion) ? 1 : 0, 0, hY, hDiff, hDistSq, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CLL_BWD, m_param.AsFloat((float)dfMargin, (float)dfAlpha), m_param.AsLong(nCount, nChannels, 0, (bLegacyVersion) ? 1 : 0, 0, hY, hDiff, hDistSq, hBottomDiff));
}
10047 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SMOOTHL1_FWD,
null, m_param.AsLong(nCount, hX, hY));
10049 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SMOOTHL1_FWD,
null, m_param.AsLong( nCount, hX, hY));
10066 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SMOOTHL1_BWD,
null, m_param.AsLong(nCount, hX, hY));
10068 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SMOOTHL1_BWD,
null, m_param.AsLong( nCount, hX, hY));
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the PERMUTE kernel: reorders axes of the bottom into the top (or the reverse).
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottom">Handle to the input GPU memory.</param>
/// <param name="bFwd">True for forward permute; false for the inverse direction.</param>
/// <param name="hPermuteOrder">Handle to the axis-order table GPU memory.</param>
/// <param name="hOldSteps">Handle to the old step sizes GPU memory.</param>
/// <param name="hNewSteps">Handle to the new step sizes GPU memory.</param>
/// <param name="nNumAxes">Number of axes.</param>
/// <param name="hTop">Handle to the output GPU memory.</param>
public void permute(int nCount, long hBottom, bool bFwd, long hPermuteOrder, long hOldSteps, long hNewSteps, int nNumAxes, long hTop)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_PERMUTE, null, m_param.AsLong(nCount, hBottom, (bFwd) ? 1 : 0, hPermuteOrder, hOldSteps, hNewSteps, nNumAxes, hTop));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_PERMUTE, null, m_param.AsLong(nCount, hBottom, (bFwd) ? 1 : 0, hPermuteOrder, hOldSteps, hNewSteps, nNumAxes, hTop));
}

/// <summary>
/// Runs the GATHER_FWD kernel: gathers elements from the bottom into the top using an index table.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottom">Handle to the input GPU memory.</param>
/// <param name="hTop">Handle to the output GPU memory.</param>
/// <param name="nAxis">Gather axis.</param>
/// <param name="nDim">Dimension size.</param>
/// <param name="nDimAtAxis">Dimension size at the gather axis.</param>
/// <param name="nM">M dimension.</param>
/// <param name="nN">N dimension.</param>
/// <param name="hIdx">Handle to the index table GPU memory.</param>
public void gather_fwd(int nCount, long hBottom, long hTop, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GATHER_FWD, null, m_param.AsLong(nCount, hBottom, hTop, nAxis, nDim, nDimAtAxis, nM, nN, hIdx));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GATHER_FWD, null, m_param.AsLong(nCount, hBottom, hTop, nAxis, nDim, nDimAtAxis, nM, nN, hIdx));
}

/// <summary>
/// Runs the GATHER_BWD kernel: scatters the top gradient back to the bottom using an index table.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hTop">Handle to the top gradient GPU memory.</param>
/// <param name="hBottom">Handle to the bottom gradient GPU memory (output).</param>
/// <param name="nAxis">Gather axis.</param>
/// <param name="nDim">Dimension size.</param>
/// <param name="nDimAtAxis">Dimension size at the gather axis.</param>
/// <param name="nM">M dimension.</param>
/// <param name="nN">N dimension.</param>
/// <param name="hIdx">Handle to the index table GPU memory.</param>
public void gather_bwd(int nCount, long hTop, long hBottom, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GATHER_BWD, null, m_param.AsLong(nCount, hTop, hBottom, nAxis, nDim, nDimAtAxis, nM, nN, hIdx));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GATHER_BWD, null, m_param.AsLong(nCount, hTop, hBottom, nAxis, nDim, nDimAtAxis, nM, nN, hIdx));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the LRN_FILLSCALE kernel: computes the local-response-normalization scale buffer.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="nNum">Batch size.</param>
/// <param name="nChannels">Number of channels.</param>
/// <param name="nHeight">Height.</param>
/// <param name="nWidth">Width.</param>
/// <param name="nSize">Local region size.</param>
/// <param name="fAlphaOverSize">Alpha divided by the region size.</param>
/// <param name="fK">K bias constant.</param>
/// <param name="hScaleData">Handle to the scale buffer GPU memory (output).</param>
public void lrn_fillscale(int nCount, long hBottomData, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fAlphaOverSize, T fK, long hScaleData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LRN_FILLSCALE, m_param.AsDouble(convertD(fAlphaOverSize), convertD(fK)), m_param.AsLong(nCount, hBottomData, nNum, nChannels, nHeight, nWidth, nSize, 0, 0, hScaleData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LRN_FILLSCALE, m_param.AsFloat(convertF(fAlphaOverSize), convertF(fK)), m_param.AsLong(nCount, hBottomData, nNum, nChannels, nHeight, nWidth, nSize, 0, 0, hScaleData));
}

/// <summary>
/// Runs the LRN_COMPUTEOUTPUT kernel: computes the LRN output from the scale buffer.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the input GPU memory.</param>
/// <param name="hScaleData">Handle to the scale buffer GPU memory.</param>
/// <param name="fNegativeBeta">Negative beta exponent.</param>
/// <param name="hTopData">Handle to the output GPU memory.</param>
public void lrn_computeoutput(int nCount, long hBottomData, long hScaleData, T fNegativeBeta, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LRN_COMPUTEOUTPUT, m_param.AsDouble(convertD(fNegativeBeta)), m_param.AsLong(nCount, hBottomData, hScaleData, 0, hTopData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LRN_COMPUTEOUTPUT, m_param.AsFloat(convertF(fNegativeBeta)), m_param.AsLong(nCount, hBottomData, hScaleData, 0, hTopData));
}

/// <summary>
/// Runs the LRN_COMPUTEDIFF kernel: computes the LRN backward pass.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="hBottomData">Handle to the bottom data GPU memory.</param>
/// <param name="hTopData">Handle to the top data GPU memory.</param>
/// <param name="hScaleData">Handle to the scale buffer GPU memory.</param>
/// <param name="hTopDiff">Handle to the top gradient GPU memory.</param>
/// <param name="nNum">Batch size.</param>
/// <param name="nChannels">Number of channels.</param>
/// <param name="nHeight">Height.</param>
/// <param name="nWidth">Width.</param>
/// <param name="nSize">Local region size.</param>
/// <param name="fNegativeBeta">Negative beta exponent.</param>
/// <param name="fCacheRatio">Cache ratio constant.</param>
/// <param name="hBottomDiff">Handle to the bottom gradient GPU memory (output).</param>
public void lrn_computediff(int nCount, long hBottomData, long hTopData, long hScaleData, long hTopDiff, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fNegativeBeta, T fCacheRatio, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LRN_COMPUTEDIFF, m_param.AsDouble(convertD(fNegativeBeta), convertD(fCacheRatio)), m_param.AsLong(nCount, hBottomData, hTopData, hScaleData, hTopDiff, nNum, nChannels, nHeight, nWidth, nSize, 0, 0, hBottomDiff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LRN_COMPUTEDIFF, m_param.AsFloat(convertF(fNegativeBeta), convertF(fCacheRatio)), m_param.AsLong(nCount, hBottomData, hTopData, hScaleData, hTopDiff, nNum, nChannels, nHeight, nWidth, nSize, 0, 0, hBottomDiff));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the SGD_UPDATE kernel: momentum SGD parameter update.
/// </summary>
/// <param name="nCount">Number of parameters.</param>
/// <param name="hNetParamsDiff">Handle to the parameter gradient GPU memory.</param>
/// <param name="hHistoryData">Handle to the momentum history GPU memory.</param>
/// <param name="fMomentum">Momentum coefficient.</param>
/// <param name="fLocalRate">Local learning rate.</param>
public void sgd_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SGD_UPDATE, m_param.AsDouble(convertD(fMomentum), convertD(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SGD_UPDATE, m_param.AsFloat(convertF(fMomentum), convertF(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0));
}

/// <summary>
/// Runs the NESTEROV_UPDATE kernel: Nesterov accelerated gradient update.
/// </summary>
/// <param name="nCount">Number of parameters.</param>
/// <param name="hNetParamsDiff">Handle to the parameter gradient GPU memory.</param>
/// <param name="hHistoryData">Handle to the momentum history GPU memory.</param>
/// <param name="fMomentum">Momentum coefficient.</param>
/// <param name="fLocalRate">Local learning rate.</param>
public void nesterov_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_NESTEROV_UPDATE, m_param.AsDouble(convertD(fMomentum), convertD(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_NESTEROV_UPDATE, m_param.AsFloat(convertF(fMomentum), convertF(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0));
}

/// <summary>
/// Runs the ADAGRAD_UPDATE kernel: AdaGrad parameter update.
/// </summary>
/// <param name="nCount">Number of parameters.</param>
/// <param name="hNetParamsDiff">Handle to the parameter gradient GPU memory.</param>
/// <param name="hHistoryData">Handle to the accumulated squared-gradient GPU memory.</param>
/// <param name="fDelta">Numerical stability constant.</param>
/// <param name="fLocalRate">Local learning rate.</param>
public void adagrad_update(int nCount, long hNetParamsDiff, long hHistoryData, T fDelta, T fLocalRate)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADAGRAD_UPDATE, m_param.AsDouble(convertD(fDelta), convertD(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADAGRAD_UPDATE, m_param.AsFloat(convertF(fDelta), convertF(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0));
}

/// <summary>
/// Runs the ADADELTA_UPDATE kernel: AdaDelta parameter update.
/// </summary>
/// <param name="nCount">Number of parameters.</param>
/// <param name="hNetParamsDiff">Handle to the parameter gradient GPU memory.</param>
/// <param name="hHistoryData1">Handle to the first history GPU memory.</param>
/// <param name="hHistoryData2">Handle to the second history GPU memory.</param>
/// <param name="fMomentum">Momentum coefficient.</param>
/// <param name="fDelta">Numerical stability constant.</param>
/// <param name="fLocalRate">Local learning rate.</param>
public void adadelta_update(int nCount, long hNetParamsDiff, long hHistoryData1, long hHistoryData2, T fMomentum, T fDelta, T fLocalRate)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADADELTA_UPDATE, m_param.AsDouble(convertD(fMomentum), convertD(fDelta), convertD(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData1, hHistoryData2, 0, 0, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADADELTA_UPDATE, m_param.AsFloat(convertF(fMomentum), convertF(fDelta), convertF(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData1, hHistoryData2, 0, 0, 0));
}

/// <summary>
/// Runs the ADAM_UPDATE kernel: Adam parameter update.
/// </summary>
/// <param name="nCount">Number of parameters.</param>
/// <param name="hNetParamsDiff">Handle to the parameter gradient GPU memory.</param>
/// <param name="hValM">Handle to the first-moment GPU memory.</param>
/// <param name="hValV">Handle to the second-moment GPU memory.</param>
/// <param name="fBeta1">First-moment decay rate.</param>
/// <param name="fBeta2">Second-moment decay rate.</param>
/// <param name="fEpsHat">Numerical stability constant.</param>
/// <param name="fLearningRate">Learning rate.</param>
/// <param name="fCorrection">Bias-correction factor.</param>
public void adam_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fCorrection)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADAM_UPDATE, m_param.AsDouble(convertD(fBeta1), convertD(fBeta2), convertD(fEpsHat), convertD(fLearningRate), convertD(fCorrection)), m_param.AsLong(nCount, hNetParamsDiff, hValM, hValV, 0, 0, 0, 0, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADAM_UPDATE, m_param.AsFloat(convertF(fBeta1), convertF(fBeta2), convertF(fEpsHat), convertF(fLearningRate), convertF(fCorrection)), m_param.AsLong(nCount, hNetParamsDiff, hValM, hValV, 0, 0, 0, 0, 0));
}

/// <summary>
/// Runs the ADAMW_UPDATE kernel: AdamW parameter update with decoupled weight decay.
/// </summary>
/// <param name="nCount">Number of parameters.</param>
/// <param name="hNetParamsDiff">Handle to the parameter gradient GPU memory.</param>
/// <param name="hValM">Handle to the first-moment GPU memory.</param>
/// <param name="hValV">Handle to the second-moment GPU memory.</param>
/// <param name="fBeta1">First-moment decay rate.</param>
/// <param name="fBeta2">Second-moment decay rate.</param>
/// <param name="fEpsHat">Numerical stability constant.</param>
/// <param name="fLearningRate">Learning rate.</param>
/// <param name="fDecayRate">Decoupled weight-decay rate.</param>
/// <param name="hNetParamsData">Handle to the parameter data GPU memory.</param>
/// <param name="nStep">Current optimization step.</param>
public void adamw_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fDecayRate, long hNetParamsData, int nStep)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADAMW_UPDATE, m_param.AsDouble(convertD(fBeta1), convertD(fBeta2), convertD(fEpsHat), convertD(fLearningRate), convertD(fDecayRate)), m_param.AsLong(nCount, hNetParamsDiff, hValM, hValV, 0, 0, 0, 0, 0, hNetParamsData, nStep));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADAMW_UPDATE, m_param.AsFloat(convertF(fBeta1), convertF(fBeta2), convertF(fEpsHat), convertF(fLearningRate), convertF(fDecayRate)), m_param.AsLong(nCount, hNetParamsDiff, hValM, hValV, 0, 0, 0, 0, 0, hNetParamsData, nStep));
}

/// <summary>
/// Runs the RMSPROP_UPDATE kernel: RMSProp parameter update.
/// </summary>
/// <param name="nCount">Number of parameters.</param>
/// <param name="hNetParamsDiff">Handle to the parameter gradient GPU memory.</param>
/// <param name="hHistoryData">Handle to the history GPU memory.</param>
/// <param name="fRmsDecay">Decay rate for the squared-gradient average.</param>
/// <param name="fDelta">Numerical stability constant.</param>
/// <param name="fLocalRate">Local learning rate.</param>
public void rmsprop_update(int nCount, long hNetParamsDiff, long hHistoryData, T fRmsDecay, T fDelta, T fLocalRate)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RMSPROP_UPDATE, m_param.AsDouble(convertD(fRmsDecay), convertD(fDelta), convertD(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0, 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RMSPROP_UPDATE, m_param.AsFloat(convertF(fRmsDecay), convertF(fDelta), convertF(fLocalRate)), m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0, 0));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the LSTM_FWD kernel: one timestep of the LSTM forward pass.
/// All h* parameters are GPU memory handles; n*Offset parameters are element offsets into
/// their corresponding buffers for timestep <paramref name="t"/>.
/// </summary>
/// <param name="t">Current timestep.</param>
/// <param name="nN">Batch size.</param>
/// <param name="nH">Hidden size.</param>
/// <param name="nI">Input size.</param>
/// <param name="hWeight_h">Hidden-to-hidden weights.</param>
/// <param name="hWeight_i">Input-to-hidden weights.</param>
/// <param name="hClipData">Sequence-continuation (clip) data.</param>
/// <param name="nClipOffset">Offset into the clip data.</param>
/// <param name="hTopData">Output data.</param>
/// <param name="nTopOffset">Offset into the output data.</param>
/// <param name="hCellData">Cell state data.</param>
/// <param name="nCellOffset">Offset into the cell state.</param>
/// <param name="hPreGateData">Pre-gate activations.</param>
/// <param name="nPreGateOffset">Offset into the pre-gate activations.</param>
/// <param name="hGateData">Gate activations.</param>
/// <param name="nGateOffset">Offset into the gate activations.</param>
/// <param name="hHT1Data">Previous hidden state.</param>
/// <param name="nHT1Offset">Offset into the previous hidden state.</param>
/// <param name="hCT1Data">Previous cell state.</param>
/// <param name="nCT1Offset">Offset into the previous cell state.</param>
/// <param name="hHtoGateData">Hidden-to-gate scratch buffer.</param>
/// <param name="hContext">Optional attention context data (0 = none).</param>
/// <param name="hWeight_c">Optional context weights (0 = none).</param>
/// <param name="hCtoGetData">Optional context-to-gate scratch buffer (0 = none).</param>
public void lstm_fwd(int t, int nN, int nH, int nI, long hWeight_h, long hWeight_i, long hClipData, int nClipOffset, long hTopData, int nTopOffset, long hCellData, int nCellOffset, long hPreGateData, int nPreGateOffset, long hGateData, int nGateOffset, long hHT1Data, int nHT1Offset, long hCT1Data, int nCT1Offset, long hHtoGateData, long hContext = 0, long hWeight_c = 0, long hCtoGetData = 0)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_FWD, null, m_param.AsLong(t, nN, nH, nI, hWeight_h, hWeight_i, hClipData, nClipOffset, hTopData, nTopOffset, hCellData, nCellOffset, hPreGateData, nPreGateOffset, hGateData, nGateOffset, hHT1Data, nHT1Offset, hCT1Data, nCT1Offset, hHtoGateData, hContext, hWeight_c, hCtoGetData));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_FWD, null, m_param.AsLong(t, nN, nH, nI, hWeight_h, hWeight_i, hClipData, nClipOffset, hTopData, nTopOffset, hCellData, nCellOffset, hPreGateData, nPreGateOffset, hGateData, nGateOffset, hHT1Data, nHT1Offset, hCT1Data, nCT1Offset, hHtoGateData, hContext, hWeight_c, hCtoGetData));
}

/// <summary>
/// Runs the LSTM_BWD kernel: one timestep of the LSTM backward pass.
/// All h* parameters are GPU memory handles; n*Offset parameters are element offsets into
/// their corresponding buffers for timestep <paramref name="t"/>.
/// </summary>
/// <param name="t">Current timestep.</param>
/// <param name="nN">Batch size.</param>
/// <param name="nH">Hidden size.</param>
/// <param name="nI">Input size.</param>
/// <param name="dfClippingThreshold">Gradient clipping threshold.</param>
/// <param name="hWeight_h">Hidden-to-hidden weights.</param>
/// <param name="hClipData">Sequence-continuation (clip) data.</param>
/// <param name="nClipOffset">Offset into the clip data.</param>
/// <param name="hTopDiff">Top gradient.</param>
/// <param name="nTopOffset">Offset into the top gradient.</param>
/// <param name="hCellData">Cell state data.</param>
/// <param name="hCellDiff">Cell state gradient.</param>
/// <param name="nCellOffset">Offset into the cell state buffers.</param>
/// <param name="hPreGateDiff">Pre-gate gradient.</param>
/// <param name="nPreGateOffset">Offset into the pre-gate gradient.</param>
/// <param name="hGateData">Gate activations.</param>
/// <param name="hGateDiff">Gate gradient.</param>
/// <param name="nGateOffset">Offset into the gate buffers.</param>
/// <param name="hCT1Data">Previous cell state.</param>
/// <param name="nCT1Offset">Offset into the previous cell state.</param>
/// <param name="hDHT1Diff">Previous hidden-state gradient.</param>
/// <param name="nDHT1Offset">Offset into the previous hidden-state gradient.</param>
/// <param name="hDCT1Diff">Previous cell-state gradient.</param>
/// <param name="nDCT1Offset">Offset into the previous cell-state gradient.</param>
/// <param name="hHtoHData">Hidden-to-hidden scratch buffer.</param>
/// <param name="hContextDiff">Optional attention context gradient (0 = none).</param>
/// <param name="hWeight_c">Optional context weights (0 = none).</param>
public void lstm_bwd(int t, int nN, int nH, int nI, double dfClippingThreshold, long hWeight_h, long hClipData, int nClipOffset, long hTopDiff, int nTopOffset, long hCellData, long hCellDiff, int nCellOffset, long hPreGateDiff, int nPreGateOffset, long hGateData, long hGateDiff, int nGateOffset, long hCT1Data, int nCT1Offset, long hDHT1Diff, int nDHT1Offset, long hDCT1Diff, int nDCT1Offset, long hHtoHData, long hContextDiff = 0, long hWeight_c = 0)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_BWD, m_param.AsDouble(dfClippingThreshold), m_param.AsLong(t, nN, nH, nI, 0, hWeight_h, hClipData, nClipOffset, hTopDiff, nTopOffset, hCellData, hCellDiff, nCellOffset, hPreGateDiff, nPreGateOffset, hGateData, hGateDiff, nGateOffset, hCT1Data, nCT1Offset, hDHT1Diff, nDHT1Offset, hDCT1Diff, nDCT1Offset, hHtoHData, hContextDiff, hWeight_c));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_BWD, m_param.AsFloat((float)dfClippingThreshold), m_param.AsLong(t, nN, nH, nI, 0, hWeight_h, hClipData, nClipOffset, hTopDiff, nTopOffset, hCellData, hCellDiff, nCellOffset, hPreGateDiff, nPreGateOffset, hGateData, hGateDiff, nGateOffset, hCT1Data, nCT1Offset, hDHT1Diff, nDHT1Offset, hDCT1Diff, nDCT1Offset, hHtoHData, hContextDiff, hWeight_c));
}
// NOTE(review): structure reconstructed — the extraction dropped braces and the double/float
// dispatch conditionals and fused original line numbers into the code; confirm m_dt guard
// against the full file.

/// <summary>
/// Runs the LSTM_UNIT_FWD kernel: LSTM unit forward pass. All h* parameters are GPU memory handles.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="nHiddenDim">Hidden dimension size.</param>
/// <param name="nXCount">Input element count.</param>
/// <param name="hX">Input data.</param>
/// <param name="hX_acts">Gate activations (output).</param>
/// <param name="hC_prev">Previous cell state.</param>
/// <param name="hCont">Sequence-continuation flags.</param>
/// <param name="hC">New cell state (output).</param>
/// <param name="hH">New hidden state (output).</param>
public void lstm_unit_fwd(int nCount, int nHiddenDim, int nXCount, long hX, long hX_acts, long hC_prev, long hCont, long hC, long hH)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_UNIT_FWD, null, m_param.AsLong(nCount, nHiddenDim, nXCount, hX, hX_acts, hC_prev, hCont, hC, hH));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_UNIT_FWD, null, m_param.AsLong(nCount, nHiddenDim, nXCount, hX, hX_acts, hC_prev, hCont, hC, hH));
}

/// <summary>
/// Runs the LSTM_UNIT_BWD kernel: LSTM unit backward pass. All h* parameters are GPU memory handles.
/// </summary>
/// <param name="nCount">Number of elements.</param>
/// <param name="nHiddenDim">Hidden dimension size.</param>
/// <param name="nXCount">Input element count.</param>
/// <param name="hC_prev">Previous cell state.</param>
/// <param name="hX_acts">Gate activations.</param>
/// <param name="hC">Cell state.</param>
/// <param name="hH">Hidden state.</param>
/// <param name="hCont">Sequence-continuation flags.</param>
/// <param name="hC_diff">Cell state gradient.</param>
/// <param name="hH_diff">Hidden state gradient.</param>
/// <param name="hC_prev_diff">Previous cell-state gradient (output).</param>
/// <param name="hX_acts_diff">Gate-activation gradient (output).</param>
/// <param name="hX_diff">Input gradient (output).</param>
public void lstm_unit_bwd(int nCount, int nHiddenDim, int nXCount, long hC_prev, long hX_acts, long hC, long hH, long hCont, long hC_diff, long hH_diff, long hC_prev_diff, long hX_acts_diff, long hX_diff)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_UNIT_BWD, null, m_param.AsLong(nCount, nHiddenDim, nXCount, hC_prev, hX_acts, hC, hH, hCont, hC_diff, hH_diff, hC_prev_diff, hX_acts_diff, hX_diff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LSTM_UNIT_BWD, null, m_param.AsLong(nCount, nHiddenDim, nXCount, hC_prev, hX_acts, hC, hH, hCont, hC_diff, hH_diff, hC_prev_diff, hX_acts_diff, hX_diff));
}
// Coefficient-sum forward (CUDA_COEFF_SUM_FWD): scales hBottom by dfCoeff (and optionally the
// per-item coefficients in hCoeffData, offset by nNumOffset) and accumulates into hTop.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10481 public void coeff_sum_fwd(
int nCount,
int nDim,
int nNumOffset,
double dfCoeff,
long hCoeffData,
long hBottom,
long hTop)
10484 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUM_FWD, m_param.AsDouble(dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop));
10486 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUM_FWD, m_param.AsFloat((
float)dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop));
// Coefficient-sum backward (CUDA_COEFF_SUM_BWD): distributes hTopDiff into hBottomDiff scaled
// by dfCoeff / per-item coefficients — the gradient counterpart of coeff_sum_fwd.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10500 public void coeff_sum_bwd(
int nCount,
int nDim,
int nNumOffset,
double dfCoeff,
long hCoeffData,
long hTopDiff,
long hBottomDiff)
10503 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUM_BWD, m_param.AsDouble(dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff));
10505 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUM_BWD, m_param.AsFloat((
float)dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff));
// Coefficient-sub forward (CUDA_COEFF_SUB_FWD): same argument layout as coeff_sum_fwd but the
// kernel subtracts the scaled bottom values from hTop instead of adding them.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10518 public void coeff_sub_fwd(
int nCount,
int nDim,
int nNumOffset,
double dfCoeff,
long hCoeffData,
long hBottom,
long hTop)
10521 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUB_FWD, m_param.AsDouble(dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop));
10523 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUB_FWD, m_param.AsFloat((
float)dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop));
// Coefficient-sub backward (CUDA_COEFF_SUB_BWD): gradient counterpart of coeff_sub_fwd,
// mapping hTopDiff into hBottomDiff with the dfCoeff / hCoeffData scaling.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10537 public void coeff_sub_bwd(
int nCount,
int nDim,
int nNumOffset,
double dfCoeff,
long hCoeffData,
long hTopDiff,
long hBottomDiff)
10540 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUB_BWD, m_param.AsDouble(dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff));
10542 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_COEFF_SUB_BWD, m_param.AsFloat((
float)dfCoeff), m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff));
// Sigmoid cross-entropy loss forward (CUDA_SIGMOID_CROSS_ENTROPY_FWD): writes per-element loss
// into hLoss and valid-sample counts into hCountData; the bool ignore-label flag is encoded
// as 1/0 in the long argument list.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10556 public void sigmoid_cross_entropy_fwd(
int nCount,
long hInput,
long hTarget,
long hLoss,
bool bHasIgnoreLabel,
int nIgnoreLabel,
long hCountData)
10559 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGMOID_CROSS_ENTROPY_FWD,
null, m_param.AsLong(nCount, hInput, hTarget, hLoss, (bHasIgnoreLabel) ? 1 : 0, nIgnoreLabel, hCountData));
10561 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGMOID_CROSS_ENTROPY_FWD,
null, m_param.AsLong(nCount, hInput, hTarget, hLoss, (bHasIgnoreLabel) ? 1 : 0, nIgnoreLabel, hCountData));
10574 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGMOID_CROSS_ENTROPY_BWD,
null, m_param.AsLong(nCount, nIgnoreLabel, hTarget, hBottomDiff));
10576 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SIGMOID_CROSS_ENTROPY_BWD,
null, m_param.AsLong(nCount, nIgnoreLabel, hTarget, hBottomDiff));
// Softmax cross-entropy loss forward (CUDA_SOFTMAX_CROSS_ENTROPY_FWD). The optional ignore
// label is appended to the argument list only when present, so the kernel distinguishes the
// two arities by argument count. The double and float paths build the list independently.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10595 public void softmax_cross_entropy_fwd(
int nCount,
long hProbData,
long hLabel,
long hLossDiff,
long hLossData,
int nOuterNum,
int nDim,
int nInnerNum,
long hCounts,
int? nIgnoreLabel)
10599 List<long> rg =
new List<long>() { nCount, hProbData, hLabel, hLossDiff, hLossData, nOuterNum, nDim, nInnerNum, hCounts };
// optional trailing argument: only sent when an ignore label is specified
10601 if (nIgnoreLabel.HasValue)
10602 rg.Add(nIgnoreLabel.Value);
10604 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SOFTMAX_CROSS_ENTROPY_FWD,
null, rg.ToArray());
10608 List<long> rg =
new List<long>() { nCount, hProbData, hLabel, hLossDiff, hLossData, nOuterNum, nDim, nInnerNum, hCounts };
10610 if (nIgnoreLabel.HasValue)
10611 rg.Add(nIgnoreLabel.Value);
10613 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SOFTMAX_CROSS_ENTROPY_FWD,
null, rg.ToArray());
10627 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SOFTMAX_CROSS_ENTROPY_BWD,
null, m_param.AsLong(nCount, nIgnoreLabel, hTarget, hBottomDiff));
10629 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_SOFTMAX_CROSS_ENTROPY_BWD,
null, m_param.AsLong(nCount, nIgnoreLabel, hTarget, hBottomDiff));
10632#pragma warning disable 1591
10640 m_cuda.RunDouble((
int)m_hKernel, (
int)CUDAFN.CUDA_DEBUG,
null);
10642 m_cuda.RunFloat((
int)m_hKernel, (
int)CUDAFN.CUDA_DEBUG,
null);
// Sets every diagonal element of the nRows-row matrix in hData to the constant dfVal
// (CUDA_MTX_SET_DIAGONAL).
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10645 public void matrix_set_diagonal(
int nCount,
int nRows,
double dfVal,
long hData)
10648 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_SET_DIAGONAL, m_param.AsDouble(dfVal), m_param.AsLong(nCount, nRows, 0, hData));
10650 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_SET_DIAGONAL, m_param.AsFloat((
float)dfVal), m_param.AsLong(nCount, nRows, 0, hData));
// Overload: fills the diagonal of hData from the vector hDiagonal, with two scale factors
// (dfScaleA, dfScaleB) passed to the CUDA_MTX_SET_DIAGONAL2 kernel.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10653 public void matrix_set_diagonal(
int nCount,
int nRows,
long hDiagonal,
double dfScaleA,
double dfScaleB,
long hData)
10656 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_SET_DIAGONAL2, m_param.AsDouble(dfScaleA, dfScaleB), m_param.AsLong(nCount, nRows, hDiagonal, 0, 0, hData));
10658 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_SET_DIAGONAL2, m_param.AsFloat((
float)dfScaleA, (
float)dfScaleB), m_param.AsLong(nCount, nRows, hDiagonal, 0, 0, hData));
// Adds the vector hB (scaled by dfScale) to each row or column of matrix hA — which axis is
// chosen by the ORIENTATION enum — writing the result to hY (CUDA_MTX_ADD_VECTOR).
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10661 public void matrix_add_vector(
ORIENTATION orientation,
int nWidth,
int nHeight,
double dfScale,
long hA,
long hB,
long hY)
10664 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_ADD_VECTOR, m_param.AsDouble(dfScale), m_param.AsLong((
int)orientation, nWidth, nHeight, 0, hA, hB, hY));
10666 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_ADD_VECTOR, m_param.AsFloat((
float)dfScale), m_param.AsLong((
int)orientation, nWidth, nHeight, 0, hA, hB, hY));
// Runs a transpose-combine operation (CUDA_MTX_TRANSPOSE_OPERATION): combines hA and hB into
// hY under the TRANSPOSE_OPERATION op, with per-operand scale factors defaulting to 1.0.
// matrix_transpose_add below delegates here with op = ADD.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10669 public void matrix_transpose_operation(
TRANSPOSE_OPERATION op,
int nWidth,
int nHeight,
long hA,
long hB,
long hY,
double dfScaleA = 1.0,
double dfScaleB = 1.0)
10672 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_TRANSPOSE_OPERATION, m_param.AsDouble(dfScaleA, dfScaleB), m_param.AsLong((
int)op, nWidth, nHeight, hA, hB, hY, 0, 0));
10674 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_TRANSPOSE_OPERATION, m_param.AsFloat((
float)dfScaleA, (
float)dfScaleB), m_param.AsLong((
int)op, nWidth, nHeight, hA, hB, hY, 0, 0));
// Convenience wrapper: transpose-add of hA and hB into hY, i.e. matrix_transpose_operation
// with TRANSPOSE_OPERATION.ADD and explicit scale factors.
10677 public void matrix_transpose_add(
int nWidth,
int nHeight,
double dfScaleA,
double dfScaleB,
long hA,
long hB,
long hY)
10679 matrix_transpose_operation(
TRANSPOSE_OPERATION.ADD, nWidth, nHeight, hA, hB, hY, dfScaleA, dfScaleB);
// Transpose-multiply of hA and hB into hY.
// NOTE(review): the body is not visible here (extraction dropped it); presumably it delegates
// to matrix_transpose_operation with TRANSPOSE_OPERATION.MUL — confirm against the original.
10682 public void matrix_transpose_mul(
int nWidth,
int nHeight,
long hA,
long hB,
long hY)
// Transpose-divide of hA and hB into hY.
// NOTE(review): the body is not visible here (extraction dropped it); presumably it delegates
// to matrix_transpose_operation with TRANSPOSE_OPERATION.DIV — confirm against the original.
10687 public void matrix_transpose_div(
int nWidth,
int nHeight,
long hA,
long hB,
long hY)
// Aggregates each column of matrix hA into hY using the AGGREGATIONS op
// (CUDA_MTX_AGGREGATE_COLS). No scalar arguments; all parameters travel as longs.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10692 public void matrix_aggregate_cols(
AGGREGATIONS op,
int nWidth,
int nHeight,
long hA,
long hY)
10695 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_AGGREGATE_COLS,
null, m_param.AsLong((
int)op, nWidth, nHeight, hA, hY));
10697 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_AGGREGATE_COLS,
null, m_param.AsLong((
int)op, nWidth, nHeight, hA, hY));
// Aggregates each row of matrix hA into hY using the AGGREGATIONS op
// (CUDA_MTX_AGGREGATE_ROWS); hOnes supplies a ones-vector workspace used by the kernel.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10700 public void matrix_aggregate_rows(
AGGREGATIONS op,
int nWidth,
int nHeight,
long hA,
long hOnes,
long hY)
10703 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_AGGREGATE_ROWS,
null, m_param.AsLong((
int)op, nWidth, nHeight, hA, hOnes, hY));
10705 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_AGGREGATE_ROWS,
null, m_param.AsLong((
int)op, nWidth, nHeight, hA, hOnes, hY));
// Transposes the nHeight x nWidth matrix hA into hY (CUDA_MTX_TRANSPOSE).
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10708 public void matrix_transpose(
int nWidth,
int nHeight,
long hA,
long hY)
10711 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_TRANSPOSE,
null, m_param.AsLong(nWidth, nHeight, hA, hY));
10713 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_TRANSPOSE,
null, m_param.AsLong(nWidth, nHeight, hA, hY));
10728 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_MEANCENTER_BY_COL,
null, m_param.AsLong(nWidth, nHeight, hA, hB, hY, (bNormalize) ? 1 : 0));
10730 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_MEANCENTER_BY_COL,
null, m_param.AsLong(nWidth, nHeight, hA, hB, hY, (bNormalize) ? 1 : 0));
// Computes Euclidean distances between rows of hX and hY into hOut
// (CUDA_MTX_EUCLIDEAN_DIST) for n items of dimension d, restricted to rows [nStart, nEnd).
// NOTE(review): double/float dispatch conditional, braces, and the exact interval semantics
// of nStart/nEnd are not visible here — confirm against the original file.
10733 public void matrix_euclidean_distance(
long hX,
long hY,
long hOut,
int n,
int d,
int nStart,
int nEnd)
10736 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_EUCLIDEAN_DIST,
null, m_param.AsLong(hX, hY, hOut, n, d, nStart, nEnd));
10738 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_EUCLIDEAN_DIST,
null, m_param.AsLong(hX, hY, hOut, n, d, nStart, nEnd));
// Matrix product C = A x B with dimensions (m, n, k) via the CUDA_MTX_DOT kernel.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10741 public void matrix_dot(
int m,
int n,
int k,
long hA,
long hB,
long hC)
10744 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_DOT,
null, m_param.AsLong(m, n, k, hA, hB, hC));
10746 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_DOT,
null, m_param.AsLong(m, n, k, hA, hB, hC));
// Row-mean of matrix hA into hY (CUDA_MTX_MEAN), scaled by dfAlpha; hOnes is a ones-vector
// workspace used by the kernel.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10749 public void matrix_mean_rows(
int nWidth,
int nHeight,
long hA,
long hOnes,
double dfAlpha,
long hY)
10752 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_MEAN, m_param.AsDouble(dfAlpha), m_param.AsLong(nWidth, nHeight, hA, hOnes, 0, hY));
10754 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_MEAN, m_param.AsFloat((
float)dfAlpha), m_param.AsLong(nWidth, nHeight, hA, hOnes, 0, hY));
// Row standard deviation of matrix hA into hY (CUDA_MTX_STDEV), using the precomputed row
// means in hMean; hOnes and hWork are kernel workspaces.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10757 public void matrix_stdev_rows(
int nWidth,
int nHeight,
long hA,
long hOnes,
long hMean,
long hWork,
long hY)
10760 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_STDEV,
null, m_param.AsLong(nWidth, nHeight, hA, hOnes, hMean, hWork, hY));
10762 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_STDEV,
null, m_param.AsLong(nWidth, nHeight, hA, hOnes, hMean, hWork, hY));
// Correlation matrix of hA into hY (CUDA_MTX_CORRELATIONS), using the precomputed row means
// (hMean) and standard deviations (hStdev); hOnes and hWork are kernel workspaces.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10765 public void matrix_correlations(
int nWidth,
int nHeight,
long hA,
long hOnes,
long hMean,
long hStdev,
long hWork,
long hY)
10768 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_CORRELATIONS,
null, m_param.AsLong(nWidth, nHeight, hA, hOnes, hMean, hStdev, hWork, hY));
10770 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_MTX_CORRELATIONS,
null, m_param.AsLong(nWidth, nHeight, hA, hOnes, hMean, hStdev, hWork, hY));
10773#pragma warning restore 1591
10777 #region T-SNE Methods
10779#pragma warning disable 1591
// t-SNE embedding update step (CUDA_TSNE_UPDATE): applies momentum/learning-rate update with
// adaptive gains (hGains, adjusted by fGainFactor1/2) from gradient hdY and velocity huY
// onto the embedding hY.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10781 public void tsne_update(
int n,
double dfMomentum,
double dfLearningRate,
long hdY,
long huY,
long hGains,
long hY,
double fGainFactor1 = 0.2,
double fGainFactor2 = 0.8)
10784 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_UPDATE, m_param.AsDouble(dfMomentum, dfLearningRate, fGainFactor1, fGainFactor2), m_param.AsLong(n, 0, 0, hdY, huY, hGains, hY, 0, 0));
10786 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_UPDATE, m_param.AsFloat((
float)dfMomentum, (
float)dfLearningRate, (
float)fGainFactor1, (
float)fGainFactor2), m_param.AsLong(n, 0, 0, hdY, huY, hGains, hY, 0, 0));
// t-SNE gradient combine (CUDA_TSNE_UPDATE_GRAD): merges positive (hPosF) and negative
// (hNegF) forces, normalized by dfSumQ, into the gradient buffer hdC.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10789 public void tsne_update_grad(
int n,
long hPosF,
long hNegF,
double dfSumQ,
long hdC)
10792 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_UPDATE_GRAD, m_param.AsDouble(dfSumQ), m_param.AsLong(n, hPosF, hNegF, 0, hdC));
10794 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_UPDATE_GRAD, m_param.AsFloat((
float)dfSumQ), m_param.AsLong(n, hPosF, hNegF, 0, hdC));
// Exact (non Barnes-Hut) t-SNE error computation (CUDA_TSNE_COMPUTE_EXACT_ERROR) between
// the P (hP) and Q (hQ) distributions, writing into hY.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10797 public void tsne_compute_exact_error(
int n,
long hP,
long hQ,
long hY)
10800 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_EXACT_ERROR,
null, m_param.AsLong(n, hP, hQ, hY));
10802 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_EXACT_ERROR,
null, m_param.AsLong(n, hP, hQ, hY));
// Pairwise squared Euclidean distances between the n points (dimension d) in hX, written to
// hDD_on_host (a host-memory buffer handle, per the parameter name); hWork is GPU scratch.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10805 public void tsne_compute_squared_euclidean_distance(
int n,
int d,
long hWork,
long hX,
long hDD_on_host)
10808 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_SQUARED_EUCLIDEAN_DISTANCE,
null, m_param.AsLong(n, d, hWork, hX, hDD_on_host));
10810 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_SQUARED_EUCLIDEAN_DISTANCE,
null, m_param.AsLong(n, d, hWork, hX, hDD_on_host));
// Computes the t-SNE Q matrix from distances hDD_on_host into hQ (optionally host memory,
// flagged by bQisHostMem) and returns a scalar taken from the kernel's result array rg.
// NOTE(review): dispatch conditional, braces and the `return rg[0]`-style statements are not
// visible here (extraction artifact) — confirm what element of rg is returned.
10813 public double tsne_compute_q_matrix(
int n,
long hDD_on_host,
long hQ,
bool bQisHostMem)
10817 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_Q_MATRIX,
null, m_param.AsLong(n, hDD_on_host, hQ, (bQisHostMem) ? 1 : 0));
10822 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_Q_MATRIX,
null, m_param.AsLong(n, hDD_on_host, hQ, (bQisHostMem) ? 1 : 0));
// Exact t-SNE gradient (CUDA_TSNE_COMPUTE_EXACT_GRADIENT) from embedding hY and the P/Q
// distributions (Q optionally on the host, flagged by bQonHost), normalized by dfSumQ,
// written into hdC.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10827 public void tsne_compute_exact_gradient(
int n,
int d,
long hY,
long hP,
long hQ,
bool bQonHost,
long hdC,
double dfSumQ)
10830 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_EXACT_GRADIENT, m_param.AsDouble(dfSumQ), m_param.AsLong(n, d, hY, hP, hQ, (bQonHost) ? 1 : 0, hdC, 0));
10832 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_EXACT_GRADIENT, m_param.AsFloat((
float)dfSumQ), m_param.AsLong(n, d, hY, hP, hQ, (bQonHost) ? 1 : 0, hdC, 0));
// Symmetrizes the sparse P matrix (CSR-style hRowP/hColP/hValP handles) used by t-SNE
// (CUDA_TSNE_SYMMETRIZE_MATRIX) and returns the first element of the kernel result as long.
// NOTE(review): dispatch conditional and braces not visible — extraction artifact.
10835 public long tsne_symmetrize_matrix(
int n,
long hRowP,
long hColP,
long hValP)
10839 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_SYMMETRIZE_MATRIX,
null, m_param.AsLong(n, hRowP, hColP, hValP));
10840 return (
long)rg[0];
10844 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_SYMMETRIZE_MATRIX,
null, m_param.AsLong(n, hRowP, hColP, hValP));
10845 return (
long)rg[0];
// Computes 2D KNN bounding-box values (min/max X/Y out-parameters) for the embedded data in
// hData (CUDA_TSNE_COMPUTE_KNN_BOUNDS); dfCirclePct selects the inner-circle percentage.
// NOTE(review): the statements assigning the four out-parameters from rg are not visible here
// (extraction artifact) — confirm the rg index -> out-parameter mapping in the original file.
10849 public void tsne_compute_knn_bounds(
int n,
long hData,
double dfCirclePct, out
double dfMinX, out
double dfMinY, out
double dfMaxX, out
double dfMaxY)
10853 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_KNN_BOUNDS, m_param.AsDouble(dfCirclePct), m_param.AsLong(n, hData, 0));
10861 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_KNN_BOUNDS, m_param.AsFloat((
float)dfCirclePct), m_param.AsLong(n, hData, 0));
// Creates the GPU-side Gaussian-perplexity search state for t-SNE
// (CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY) and returns its handle (rg[0] cast to long).
// Freed later via FreeTsneGaussianPerplexity.
// NOTE(review): dispatch conditional and braces not visible — extraction artifact.
10869 public long CreateTsneGaussianPerplexity(
int n,
int d,
int k,
long hX,
long hCurP,
long hValP,
long hRowPonHost,
long hColPonHost,
double fPerplexity)
10873 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY, m_param.AsDouble(fPerplexity), m_param.AsLong(n, d, k, hX, hCurP, hValP, hRowPonHost, hColPonHost, 0));
10874 return (
long)rg[0];
10878 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY, m_param.AsFloat((
float)fPerplexity), m_param.AsLong(n, d, k, hX, hCurP, hValP, hRowPonHost, hColPonHost, 0));
10879 return (
long)rg[0];
// Runs one step of the iterative Gaussian-perplexity search
// (CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY). The kernel returns [done-flag, current-iteration,
// max-iteration]; bDone is true when rg[0] == 1 and the iteration counters are reported via
// the two out-parameters.
// NOTE(review): the final `return bDone;` and the dispatch conditional are not visible here
// (extraction artifact) — confirm against the original file.
10883 public bool FindTsneGaussianPerplexity(
long hTsnePerplexity, out
int nCurrentIteration, out
int nMaxIteration)
10885 bool bDone =
false;
10889 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY,
null, m_param.AsLong(hTsnePerplexity));
10890 bDone = (rg[0] == 1.0) ? true :
false;
10891 nCurrentIteration = (int)rg[1];
10892 nMaxIteration = (int)rg[2];
10896 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY,
null, m_param.AsLong(hTsnePerplexity));
10897 bDone = (rg[0] == 1.0) ? true :
false;
10898 nCurrentIteration = (int)rg[1];
10899 nMaxIteration = (int)rg[2];
// Releases the Gaussian-perplexity state created by CreateTsneGaussianPerplexity
// (CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY).
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10905 public void FreeTsneGaussianPerplexity(
long hTsnePerplexity)
10908 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY,
null, m_param.AsLong(hTsnePerplexity));
10910 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY,
null, m_param.AsLong(hTsnePerplexity));
// Creates the GPU-side (Barnes-Hut) t-SNE state (CUDA_TSNE_CREATE) with approximation
// parameter fTheta and returns its handle (rg[0] cast to long). Freed later via FreeTsne.
// NOTE(review): dispatch conditional and braces not visible — extraction artifact.
10913 public long CreateTsne(
int n,
int d,
long hY,
long hValP,
long hRowP,
long hColP,
long hdC,
double fTheta)
10917 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_CREATE, m_param.AsDouble(fTheta), m_param.AsLong(n, d, hY, hValP, hRowP, hColP, hdC, 0));
10918 return (
long)rg[0];
10922 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_CREATE, m_param.AsFloat((
float)fTheta), m_param.AsLong(n, d, hY, hValP, hRowP, hColP, hdC, 0));
10923 return (
long)rg[0];
// Computes the t-SNE gradient for the state hTsne (CUDA_TSNE_COMPUTE_GRADIENT1);
// bValPUpdated (encoded as 1/0) tells the kernel whether hValP changed since the last call.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10927 public void ComputeTsneGradient(
long hTsne,
bool bValPUpdated)
10930 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_GRADIENT1,
null, m_param.AsLong(hTsne, (bValPUpdated) ? 1 : 0));
10932 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_GRADIENT1,
null, m_param.AsLong(hTsne, (bValPUpdated) ? 1 : 0));
// Evaluates the current t-SNE error for state hTsne (CUDA_TSNE_COMPUTE_ERROR1) and returns a
// scalar taken from the kernel's result array rg.
// NOTE(review): the return statements and dispatch conditional are not visible here
// (extraction artifact) — confirm which element of rg is returned.
10935 public double EvaluateTsneError(
long hTsne)
10939 double[] rg = m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_ERROR1,
null, m_param.AsLong(hTsne));
10944 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_COMPUTE_ERROR1,
null, m_param.AsLong(hTsne));
// Releases the t-SNE state created by CreateTsne (CUDA_TSNE_FREE).
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10949 public void FreeTsne(
long hTsne)
10952 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_FREE,
null, m_param.AsLong(hTsne));
10954 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_TSNE_FREE,
null, m_param.AsLong(hTsne));
10957#pragma warning restore 1591
10961 #region Image Processing And Misc
// Gaussian blur with std-dev dfSigma over the NCHW image data in hX, result in hY.
// (The CUDAFN enum member is spelled CUDA_GUASSIAN_BLUR in the native interface — do not
// "fix" the spelling here without changing the enum.)
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
10980 public void gaussian_blur(
int n,
int nChannels,
int nHeight,
int nWidth,
double dfSigma,
long hX,
long hY)
10983 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_GUASSIAN_BLUR, m_param.AsDouble(dfSigma), m_param.AsLong(n, nChannels, nHeight, nWidth, 0, hX, hY));
10985 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_GUASSIAN_BLUR, m_param.AsFloat((
float)dfSigma), m_param.AsLong(n, nChannels, nHeight, nWidth, 0, hX, hY));
// Hamming distance between hA and hB (values binarized against dfThreshold by the
// CUDA_HAMMING_DIFF kernel): per-element differences land in hY and the method returns their
// absolute sum via asum_double. Offsets allow operating on sub-ranges of each buffer.
// NOTE(review): double/float dispatch conditional and braces not visible — extraction artifact.
11005 public double hamming_distance(
int n,
double dfThreshold,
long hA,
long hB,
long hY,
int nOffA = 0,
int nOffB = 0,
int nOffY = 0)
11008 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_HAMMING_DIFF, m_param.AsDouble(dfThreshold), m_param.AsLong(n, 0, hA, hB, hY, nOffA, nOffB, nOffY));
11010 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_HAMMING_DIFF, m_param.AsFloat((
float)dfThreshold), m_param.AsLong(n, 0, hA, hB, hY, nOffA, nOffB, nOffY));
// the distance is the sum of the per-element difference flags written to hY
11012 return asum_double(n, hY);
11030 m_cuda.RunDoubleEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CALC_DFT,
null, m_param.AsLong(n, hX, m, hY));
11032 m_cuda.RunFloatEx2((
int)m_hKernel, (
int)CUDAFN.CUDA_CALC_DFT,
null, m_param.AsLong(n, hX, m, hY));
11050 List<long> rgArg =
new List<long> { (int)distMethod, 0, nItemDim, hSrc, hTargets, hWork };
11051 int nDim0 = rgOffsets.GetLength(0);
11052 int nDim1 = rgOffsets.GetLength(1);
11057 for (
int i = 0; i < nDim0; i++)
11059 for (
int j = 0; j < nDim1; j++)
11061 rgArg.Add(rgOffsets[i, j]);
11065 return m_cuda.RunDoubleEx2((
int)m_hKernel, (int)CUDAFN.CUDA_CALC_BATCH_DIST, m_param.AsDouble(dfThreshold), rgArg.ToArray());
11069 List<long> rgArg =
new List<long> { (int)distMethod, 0, nItemDim, hSrc, hTargets, hWork };
11070 int nDim0 = rgOffsets.GetLength(0);
11071 int nDim1 = rgOffsets.GetLength(1);
11076 for (
int i = 0; i < nDim0; i++)
11078 for (
int j = 0; j < nDim1; j++)
11080 rgArg.Add(rgOffsets[i, j]);
11084 float[] rg = m_cuda.RunFloatEx2((
int)m_hKernel, (int)CUDAFN.CUDA_CALC_BATCH_DIST, m_param.AsFloat((
float)dfThreshold), rgArg.ToArray());
11085 double[] rgD =
new double[rg.Length];
11087 for (
int i = 0; i < rg.Length; i++)
11101 #region Conversion Methods
// Converts a double[] into the generic T[] (T is double or float): a direct ChangeType cast
// when T == double, otherwise element-wise down-conversion to float copied into a new T[].
// NOTE(review): the trailing `return rgt;` is not visible here — extraction artifact.
11103 private T[] convert(
double[] rg)
11108 if (typeof(T) == typeof(
double))
11109 return (T[])Convert.ChangeType(rg, typeof(T[]));
11111 T[] rgt =
new T[rg.Length];
11112 Array.Copy(Array.ConvertAll(rg, p => Convert.ToSingle(p)), rgt, rg.Length);
// Converts a float[] into the generic T[]: a direct ChangeType cast when T == float,
// otherwise an Array.Copy into a new T[] (widening float -> double).
// NOTE(review): the trailing `return rgt;` is not visible here — extraction artifact.
11117 private T[] convert(
float[] rg)
11122 if (typeof(T) == typeof(
float))
11123 return (T[])Convert.ChangeType(rg, typeof(T[]));
11125 T[] rgt =
new T[rg.Length];
11126 Array.Copy(rg, rgt, rg.Length);
// Converts a single generic value T to float via Convert.ChangeType.
11131 private float convertF1(T f)
11133 return (
float)Convert.ChangeType(f, typeof(
float));
// Converts a single float back to the generic type T via Convert.ChangeType.
11136 private T convertF1(
float f)
11138 return (T)Convert.ChangeType(f, typeof(T));
// Converts a generic T[] to float[]; nCount defaults to the full array length. When
// T == float the array is cast directly, otherwise each element is converted to float.
// NOTE(review): the nCount guard/assignment context and final `return rgf;` are not fully
// visible here — extraction artifact; confirm against the original file.
11141 private float[] convertF(T[] rg,
int nCount = -1)
11147 nCount = rg.Length;
11149 if (typeof(T) == typeof(
float))
11150 return (
float[])Convert.ChangeType(rg, typeof(
float[]));
11152 float[] rgf =
new float[rg.Length];
11153 Array.Copy(Array.ConvertAll(rg, p => Convert.ToSingle(p)), rgf, rg.Length);
// Converts T[] into a caller-supplied float[] destination at nOffset, copying nCount items
// (defaulting to the full source length). When T == float the source is cast and copied
// directly; otherwise it is Array.Copy'd with element conversion.
// NOTE(review): the `return rgDst;` statements are not visible here — extraction artifact.
11158 private float[] convertF(T[] rg,
float[] rgDst,
int nOffset = 0,
int nCount = -1)
11164 nCount = rg.Length;
11166 if (typeof(T) == typeof(
float))
11168 float[] rgConv = (
float[])Convert.ChangeType(rg, typeof(
float[]));
11169 Array.Copy(rgConv, 0, rgDst, nOffset, nCount);
11173 Array.Copy(rg, 0, rgDst, nOffset, nCount);
// Converts a single generic value T to double via Convert.ChangeType.
11179 private double convertD1(T df)
11181 return (
double)Convert.ChangeType(df, typeof(
double));
// Converts a single double back to the generic type T via Convert.ChangeType.
11184 private T convertD1(
double df)
11186 return (T)Convert.ChangeType(df, typeof(T));
// Converts a generic T[] to double[]; nCount defaults to the full array length. When
// T == double the array is cast directly, otherwise Array.Copy performs widening conversion.
// NOTE(review): the nCount guard context and final `return rgdf;` are not fully visible here
// — extraction artifact; confirm against the original file.
11189 private double[] convertD(T[] rg,
int nCount = -1)
11195 nCount = rg.Length;
11197 if (typeof(T) == typeof(
double))
11198 return (
double[])Convert.ChangeType(rg, typeof(
double[]));
11200 double[] rgdf =
new double[rg.Length];
11201 Array.Copy(rg, rgdf, rg.Length);
// Converts T[] into a caller-supplied double[] destination at nOffset, copying nCount items
// (defaulting to the full source length). When T == double the source is cast and copied
// directly; otherwise Array.Copy performs the element conversion.
// NOTE(review): the `return rgDst;` statements are not visible here — extraction artifact.
11206 private double[] convertD(T[] rg,
double[] rgDst,
int nOffset = 0,
int nCount = -1)
11212 nCount = rg.Length;
11214 if (typeof(T) == typeof(
double))
11216 double[] rgConv = (
double[])Convert.ChangeType(rg, typeof(
double[]));
11217 Array.Copy(rgConv, 0, rgDst, nOffset, nCount);
11221 Array.Copy(rg, 0, rgDst, nOffset, nCount);
11229 #region Debugging Methods
11240 bool bCudaCallUsed;
11241 int nGpuID = GetDeviceID();
11242 double dfMem = GetDeviceMemory(out dfFree, out dfUsed, out bCudaCallUsed);
11243 log.
WriteLine(strLocation +
" Memory (GPU " + nGpuID.ToString() +
"): " + dfMem.ToString(
"N2") +
" GB total; " + dfFree.ToString(
"N2") +
" GB free; " + dfUsed.ToString(
"N2") +
" GB used.",
true);
11249#pragma warning disable 1591
11257 public long[] AsLong(params
long[] rg)
11262 public double[] AsDouble(params
double[] rg)
11267 public float[] AsFloat(params
float[] rg)
11273#pragma warning restore 1591
The CryptoRandom is a random number generator that can use either the standard .Net Random object or t...
double NextDouble()
Returns a random double within the range .
The Log class provides general output in text form.
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
The Utility class provides general utility functions.
static List< int > Create(int nCount, int nStart, int nInc)
Create a new List and fill it with values starting with start and incrementing by inc.
static double[] ConvertVec(float[] rgf)
Convert an array of float to an array of generics.
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
void channel_compare(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Compares the values of the channels from X and places the result in Y where 1 is set if the values ar...
void relu_fwd(int nCount, long hBottomData, long hTopData, T fNegativeSlope)
Performs a Rectifier Linear Unit (ReLU) forward pass in Cuda.
long CreateTensorDesc()
Create a new instance of a tensor descriptor for use with NVIDIA's cuDnn.
long CreateConvolutionDesc()
Create a new instance of a convolution descriptor for use with NVIDIA's cuDnn.
void coeff_sub_bwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hTopDiff, long hBottomDiff)
Performs a coefficient sub backward pass in Cuda.
CudaDnn(int nDeviceID, DEVINIT flags=(DEVINIT.CUBLAS|DEVINIT.CURAND), long? lSeed=null, string strPath="", bool bResetFirst=false, bool bEnableMemoryTrace=false)
The CudaDnn constructor.
T[] GetMemory(long hMem, long lCount=-1)
Retrieves the GPU memory as an array of type 'T'
void SetTensorDesc(long hHandle, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, bool bHalf=false)
Sets the values of a tensor descriptor.
void SynchronizeStream(long h=0)
Synchronize a stream on the current GPU, waiting for its operations to complete.
void log(int n, long hA, long hY, double dfBeta, double dfAlpha=0)
Calculates the log value of (A * beta) + alpha, and places the result in Y.
void channel_mul(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod=1)
Multiplies the values of the channels from X and places the result in Y.
int GetDeviceID()
Returns the current device id set within Cuda.
void SetRnnDesc(long hCuDnn, long hRnnDesc, int nHiddenCount, int nNumLayers, long hDropoutDesc, RNN_MODE mode, bool bUseTensorCores, RNN_DIRECTION direction=RNN_DIRECTION.RNN_UNIDIRECTIONAL)
Sets the RNN Descriptor values.
void SetHostMemory(long hMem, T[] rgSrc)
Copies an array of type 'T' into a block of already allocated host memory.
void channel_op_fwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, long hA, long hB, long hY)
Performs a channel operation forward on the data.
void DeviceDisablePeerAccess(int nPeerDeviceID)
Disables peer-to-peer access between the current device used by the CudaDnn instance and a peer devic...
void CopyHostToDevice(long lCount, long hHostSrc, long hGpuDst)
Copy from Host memory to GPU memory.
void ResetDevice()
Reset the current device.
void sort(int nCount, long hY)
Sort the data in the GPU memory specified.
float erf(float fVal)
Calculates the erf() function.
void math_fwd(int nCount, long hBottomData, long hTopData, MATH_FUNCTION function)
Performs a Math function forward pass in Cuda.
long CreateRnnDesc()
Create the RNN Descriptor.
void clip_bwd(int nCount, long hTopDiff, long hBottomData, long hBottomDiff, T fMin, T fMax)
Performs a Clip backward pass in Cuda.
void adadelta_update(int nCount, long hNetParamsDiff, long hHistoryData1, long hHistoryData2, T fMomentum, T fDelta, T fLocalRate)
Perform the AdaDelta update
void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY)
Perform a matrix-vector multiplication operation: y = alpha transA (A) x + beta y (where x and y are ...
void copy_batch(int nCount, int nNum, int nDim, long hSrcData, long hSrcLbl, int nDstCount, long hDstCache, long hWorkDevData, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, long hWorkDataHost)
Copy a batch of labeled items into a cache organized by label where older data is removed and replace...
void crop_bwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomDiff, long hTopDiff)
Performs the crop backward operation.
void matmul(uint nOuterCount, int m, int n, int k, long hA, long hB, long hC, double dfScale=1.0, bool bTransA=false, bool bTransB=false)
Perform matmul operation hC = matmul(hA, hB), where hA, hB and hC are all in row-major format.
bool IsRnn8Supported()
Returns whether or not RNN8 is supported.
void dropout_bwd(int nCount, long hTopDiff, long hMask, uint uiThreshold, T fScale, long hBottomDiff)
Performs a dropout backward pass in Cuda.
void col2im_nd(long hDataCol, int nDataColOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataIm, int nDataImOffset)
Rearranges the columns into image blocks.
void rng_uniform(int n, double fMin, double fMax, long hY)
Fill Y with random numbers using a uniform random distribution.
void ResetGhostMemory()
Resets the ghost memory by enabling it if this instance was configured to use ghost memory.
void channel_op_bwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, int nCy, int nSDy, long hA, long hB, long hY, long hAd, long hBd, long hYd, long hWork)
Performs a channel operation backward on the data.
void mul_scalar(int n, float fAlpha, long hY)
Mutlipy each element of Y by a scalar.
double sumsq(int n, long hW, long hA, int nAOff=0)
Calculates the sum of squares of A.
void copy(int nCount, long hSrc, long hDst, int nSrcOffset=0, int nDstOffset=0, long hStream=-1, bool? bSrcHalfSizeOverride=null, bool? bDstHalfSizeOverride=null)
Copy data from one block of GPU memory to another.
void add_scalar(int n, T fAlpha, long hY, int nYOff=0)
Adds a scalar value to each element of Y.
void slice_fwd(int nCount, long hBottomData, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hTopData)
Performs a slice forward pass in Cuda.
void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hY)
Subtracts the values across the channels of X from A and places the result in Y.
void mish_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, double dfThreshold, int nMethod=0)
Performs a Mish backward pass in Cuda.
void cll_bwd(int nCount, int nChannels, double dfMargin, bool bLegacyVersion, double dfAlpha, long hY, long hDiff, long hDistSq, long hBottomDiff)
Performs a contrastive loss layer backward pass in Cuda.
void powx(int n, long hA, double fAlpha, long hY, int nAOff=0, int nYOff=0)
Calculates the A raised to the power alpha and places the result in Y.
void mul_scalar(int n, double fAlpha, long hY)
Mutlipy each element of Y by a scalar.
void exp(int n, long hA, long hY, int nAOff, int nYOff, double dfBeta)
Calculates the exponent value of A * beta and places the result in Y.
long CreateDropoutDesc()
Create a new instance of a dropout descriptor for use with NVIDIA's cuDnn.
void nllloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs NLL Loss backward pass in Cuda.
static ulong ConvertByteSizeToCount(ulong ulSizeInBytes)
Converts the byte size into the number of items in the base data type of float or double.
void FreeNCCL(long hNccl)
Free an instance of NCCL.
void FreeLayerNorm(long hLayerNorm)
Free the instance of LayerNorm GPU support.
void ger(int m, int n, float fAlpha, long hX, long hY, long hA)
Perform a vector-vector multiplication operation: A = x * (fAlpha * y) (where x and y are vectors and...
void add(int n, long hA, long hB, long hY, float fAlpha)
Adds A to (B times scalar) and places the result in Y.
void relu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, T fNegativeSlope)
Performs a Rectifier Linear Unit (ReLU) backward pass in Cuda.
void adamw_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fDecayRate, long hNetParamsData, int nStep)
Perform the AdamW update
void lstm_unit_bwd(int nCount, int nHiddenDim, int nXCount, long hC_prev, long hX_acts, long hC, long hH, long hCont, long hC_diff, long hH_diff, long hC_prev_diff, long hX_acts_diff, long hX_diff)
Performs the simple LSTM backward pass in Cuda for a given LSTM unit.
void scale(int n, float fAlpha, long hX, long hY)
Scales the values in X and places them in Y.
void LRNCrossChannelBackward(long hCuDnn, long hNormDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform LRN cross channel backward pass.
void Dispose()
Disposes this instance freeing up all of its host and GPU memory.
string GetRequiredCompute(out int nMinMajor, out int nMinMinor)
The GetRequiredCompute function returns the Major and Minor compute values required by the current Cu...
void AddTensor(long hCuDnn, T fAlpha, long hSrcDesc, long hSrc, int nSrcOffset, T fBeta, long hDstDesc, long hDst, int nDstOffset)
Add two tensors together.
void width(int n, long hMean, long hMin, long hMax, double dfAlpha, long hWidth)
Calculates the width values.
void scal(int n, double fAlpha, long hX, int nXOff=0)
Scales the data in X by a scaling factor.
void max(int n, long hA, long hB, long hY)
Calculates the max of A and B and places the result in Y. This max is only computed on a per item bas...
void ReLUBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a ReLU backward pass.
void channel_add(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
Add data along channels similar to numpy split function but where the data is added instead of copied...
long CreateRnnDataDesc()
Create the RNN Data Descriptor.
void FreePCA(long hPCA)
Free the PCA instance associated with handle.
void FreeMemory(long hMem)
Free previously allocated GPU memory.
double[] calculate_batch_distances(DistanceMethod distMethod, double dfThreshold, int nItemDim, long hSrc, long hTargets, long hWork, int[,] rgOffsets)
The calculate_batch_distances method calculates a set of distances based on the DistanceMethod specif...
void SetPoolingDesc(long hHandle, PoolingMethod method, int h, int w, int hPad, int wPad, int hStride, int wStride)
Set the values of a pooling descriptor.
void PoolingForward(long hCuDnn, long hPoolingDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
Perform a pooling forward pass.
void gather_fwd(int nCount, long hBottom, long hTop, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
Performs a gather forward pass where data at specifies indexes along a given axis are copied to the o...
void rng_setseed(long lSeed)
Sets the random number generator seed used by random number operations.
void tile_bwd(int nCount, long hTopDiff, int nTileSize, int nTiles, int nBottomTileAxis, long hBottomDiff)
Performs a tile backward pass in Cuda.
double dot_double(int n, long hX, long hY)
Computes the dot product of X and Y.
void rng_gaussian(int n, double fMu, double fSigma, long hY)
Fill Y with random numbers using a gaussian random distribution.
void add_scalar(int n, double fAlpha, long hY)
Adds a scalar value to each element of Y.
void unpooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask)
Performs the forward pass for unpooling using Cuda
void matrix_meancenter_by_column(int nWidth, int nHeight, long hA, long hB, long hY, bool bNormalize=false)
Mean center the data by columns, where each column is summed and then subtracted from each column val...
void adagrad_update(int nCount, long hNetParamsDiff, long hHistoryData, T fDelta, T fLocalRate)
Perform the AdaGrad update
void SynchronizeDevice()
Synchronize the operations on the current device.
void sigmoid_cross_entropy_bwd(int nCount, int nIgnoreLabel, long hTarget, long hBottomDiff)
Performs a sigmoid cross entropy backward pass in Cuda when an ignore label is specified.
void smoothl1_bwd(int nCount, long hX, long hY)
Performs the backward operation for the SmoothL1 loss.
void scale_to_range(int n, long hX, long hY, double fMin, double fMax)
Scales the values in X and places the result in Y (can also run inline where X = Y).
void ConvolutionBackwardFilter(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream=true)
Perform a convolution backward pass on the filter.
void SigmoidForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Sigmoid forward pass.
int SsdMultiBoxLossForward(long hSSD, int nLocDataCount, long hLocGpuData, int nConfDataCount, long hConfGpuData, int nPriorDataCount, long hPriorGpuData, int nGtDataCount, long hGtGpuData, out List< DictionaryMap< List< int > > > rgAllMatchIndices, out List< List< int > > rgrgAllNegIndices, out int nNumNegs)
Performs the SSD MultiBoxLoss forward operation.
void TanhBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a Tanh backward pass.
void pooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask, long hTopMask)
Performs the backward pass for pooling using Cuda
void gemm(bool bTransA, bool bTransB, int m, int n, int k, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset=0, int nBOffset=0, int nCOffset=0, int nGroups=1, int nGroupOffsetA=0, int nGroupOffsetB=0, int nGroupOffsetC=0)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
void permute(int nCount, long hBottom, bool bFwd, long hPermuteOrder, long hOldSteps, long hNewSteps, int nNumAxes, long hTop)
Performs data permutation on the input and reorders the data which is placed in the output.
void bnll_fwd(int nCount, long hBottomData, long hTopData)
Performs a binomial normal log likelihood (BNLL) forward pass in Cuda.
void silu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
Performs the Sigmoid-weighted Linear Unit (SiLU) activation backward pass in Cuda.
void channel_min(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx=false)
Calculates the minimum value within each channel of X and places the result in Y.
void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC)
Perform a matrix-matrix addition/transposition operation: C = alpha transA (A) + beta transB (B)
void nllloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs NLL Loss forward pass in Cuda.
void NcclBroadcast(long hNccl, long hStream, long hX, int nCount)
Broadcasts a block of GPU data to all NCCL instances.
void FreeCuDNN(long h)
Free an instance of cuDnn.
bool DeviceCanAccessPeer(int nSrcDeviceID, int nPeerDeviceID)
Query whether or not two devices can access each other via peer-to-peer memory copies.
long CreateMemoryPointer(long hData, long lOffset, long lCount)
Creates a memory pointer into an already existing block of GPU memory.
void lstm_bwd(int t, int nN, int nH, int nI, double dfClippingThreshold, long hWeight_h, long hClipData, int nClipOffset, long hTopDiff, int nTopOffset, long hCellData, long hCellDiff, int nCellOffset, long hPreGateDiff, int nPreGateOffset, long hGateData, long hGateDiff, int nGateOffset, long hCT1Data, int nCT1Offset, long hDHT1Diff, int nDHT1Offset, long hDCT1Diff, int nDCT1Offset, long hHtoHData, long hContextDiff=0, long hWeight_c=0)
Performs the simple LSTM backward pass in Cuda.
void denan(int n, long hX, double dfReplacement)
Replaces all NAN values within X with a replacement value.
void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
Masks the batch of data in the source with the mask by replacing all values 'fSearch' found i...
void rng_uniform(int n, float fMin, float fMax, long hY)
Fill Y with random numbers using a uniform random distribution.
void nesterov_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
Perform the Nesterov update
void ConvolutionBackwardData(long hCuDnn, T fAlpha, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream=true)
Perform a convolution backward pass on the data.
void ger(int m, int n, T fAlpha, long hX, long hY, long hA)
Perform a vector-vector multiplication operation: A = x * (fAlpha * y) (where x and y are vectors and...
void coeff_sum_bwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hTopDiff, long hBottomDiff)
Performs a coefficient sum backward pass in Cuda.
void channel_fill(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, int nLabelDim, long hLabels, long hY)
Fills each channel with the channel item of Y with the data of X matching the label index specified b...
void SetMemory(long hMem, List< double > rg)
Copies a list of doubles into a block of already allocated GPU memory.
void FreePoolingDesc(long h)
Free a pooling descriptor instance.
void embed_bwd(int nCount, long hBottomData, long hTopDiff, int nM, int nN, int nK, long hWeightDiff)
Performs the backward pass for embed
void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
Masks the data in the source with the mask by replacing all values 'fSearch' found in the mas...
void threshold_fwd(int nCount, double dfThreshold, long hX, long hY)
Performs a threshold pass in Cuda.
void channel_mean(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Calculates the mean value of each channel of X and places the result in Y.
static string GetCudaDnnDllPath()
Returns the path to the CudaDnnDll module to use for low level CUDA processing.
long AllocMemory(List< float > rg)
Allocate a block of GPU memory and copy a list of floats to it.
long CreateSSD(int nNumClasses, bool bShareLocation, int nLocClasses, int nBackgroundLabelId, bool bUseDiffcultGt, SSD_MINING_TYPE miningType, SSD_MATCH_TYPE matchType, float fOverlapThreshold, bool bUsePriorForMatching, SSD_CODE_TYPE codeType, bool bEncodeVariantInTgt, bool bBpInside, bool bIgnoreCrossBoundaryBbox, bool bUsePriorForNms, SSD_CONF_LOSS_TYPE confLossType, SSD_LOC_LOSS_TYPE locLossType, float fNegPosRatio, float fNegOverlap, int nSampleSize, bool bMapObjectToAgnostic, bool bNmsParam, float? fNmsThreshold=null, int? nNmsTopK=null, float? fNmsEta=null)
Create an instance of the SSD GPU support.
void embed_fwd(int nCount, long hBottomData, long hWeight, int nM, int nN, int nK, long hTopData)
Performs the forward pass for embed
long AllocMemory(List< double > rg)
Allocate a block of GPU memory and copy a list of doubles to it.
void copy_expand(int n, int nNum, int nDim, long hX, long hA)
Expand a vector of length 'nNum' into a matrix of size 'nNum' x 'nDim' by copying each value of the v...
void FreeConvolutionDesc(long h)
Free a convolution descriptor instance.
void SoftmaxBackward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a Softmax backward pass.
void mask_batch(int n, int nBatch, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
Masks the batch of data in the source with the mask by replacing all values 'fSearch' found i...
double[] get_double(int nCount, long hHandle, int nIdx=-1)
Queries the GPU memory by copying it into an array of doubles.
void SetMemory(long hMem, List< float > rg)
Copies a list of float into a block of already allocated GPU memory.
void mul_scalar(int n, T fAlpha, long hY)
Multiply each element of Y by a scalar.
bool RunPCA(long hPCA, int nSteps, out int nCurrentK, out int nCurrentIteration)
Runs a number of steps of the iterative PCA algorithm.
double hamming_distance(int n, double dfThreshold, long hA, long hB, long hY, int nOffA=0, int nOffB=0, int nOffY=0)
The hamming_distance calculates the Hamming Distance between X and Y both of length n.
void min_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
Performs a min backward pass in Cuda.
void sub(int n, long hA, long hB, long hY, int nAOff=0, int nBOff=0, int nYOff=0, int nB=0)
Subtracts B from A and places the result in Y.
void elu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomData, long hBottomDiff, double dfAlpha)
Performs a Exponential Linear Unit (ELU) backward pass in Cuda.
void scale(int n, T fAlpha, long hX, long hY, int nXOff=0, int nYOff=0)
Scales the values in X and places them in Y.
void SetMemoryAt(long hMem, double[] rgSrc, int nOffset)
Copies an array of double into a block of already allocated GPU memory starting at a specific offset.
void slice_bwd(int nCount, long hTopDiff, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hBottomDiff)
Performs a slice backward pass in Cuda.
void Rnn8Backward(long hCuDnn, long hRnn, long hY, long hdY, long hX, long hdX, long hhX, long hdhY, long hdhX, long hcX, long hdcY, long hdcX, long hWt, long hdWt, long hWork, long hReserved)
Calculate the backward pass through the RNN8 for both data and weights.
void SetupSSD(long hSSD, int nNum, int nNumPriors, int nNumGt)
Setup the SSD GPU support.
void SetMemory(long hMem, float[] rgSrc, long hStream=0)
Copies an array of float into a block of already allocated GPU memory.
void FreeRnnDataDesc(long h)
Free an existing RNN Data descriptor.
void SetDeviceID(int nDeviceID=-1, DEVINIT flags=DEVINIT.NONE, long? lSeed=null)
Set the device ID used by the current instance of CudaDnn.
void DeviceEnablePeerAccess(int nPeerDeviceID)
Enables peer-to-peer access between the current device used by the CudaDnn instance and a peer device...
long AllocPCAScores(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA scores.
void FreeStream(long h)
Free a stream.
void add(int n, long hA, long hB, long hY)
Adds A to B and places the result in Y.
void tanh_fwd(int nCount, long hBottomData, long hTopData)
Performs a TanH forward pass in Cuda.
void channel_dot(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
Calculates the dot product of the values within each channel of X and places the result in Y.
void DivisiveNormalizationForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTemp1, long hTemp2, T fBeta, long hTopDataDesc, long hTopData)
Performs a Divisive Normalization forward pass.
void elu_fwd(int nCount, long hBottomData, long hTopData, double dfAlpha)
Performs a Exponential Linear Unit (ELU) forward pass in Cuda.
long AllocMemory(float[] rgSrc, long hStream=0)
Allocate a block of GPU memory and copy an array of float to it, optionally using a stream for the co...
void bias_fwd(int nCount, long hBottomData, long hBiasData, int nBiasDim, int nInnerDim, long hTopData)
Performs a bias forward pass in Cuda.
void smoothl1_fwd(int nCount, long hX, long hY)
Performs the forward operation for the SmoothL1 loss.
void rng_gaussian(int n, T fMu, T fSigma, long hY)
Fill Y with random numbers using a gaussian random distribution.
void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst)
Fill data from the source data 'n' times in the destination.
void rng_bernoulli(int n, T fNonZeroProb, long hY)
Fill Y with random numbers using a bernoulli random distribution.
void softmax_cross_entropy_bwd(int nCount, int nIgnoreLabel, long hTarget, long hBottomDiff)
Performs a softmax cross entropy backward pass in Cuda when an ignore label is specified.
void Rnn8Forward(long hCuDnn, long hRnn, long hX, long hY, long hhX, long hhY, long hcX, long hcY, long hWts, long hWork, long hReserved)
Calculate the forward pass through the RNN8.
void lstm_fwd(int t, int nN, int nH, int nI, long hWeight_h, long hWeight_i, long hClipData, int nClipOffset, long hTopData, int nTopOffset, long hCellData, int nCellOffset, long hPreGateData, int nPreGateOffset, long hGateData, int nGateOffset, long hHT1Data, int nHT1Offset, long hCT1Data, int nCT1Offset, long hHtoGateData, long hContext=0, long hWeight_c=0, long hCtoGetData=0)
Performs the simple LSTM forward pass in Cuda.
T[] SetPixel(long hMem, int nCount, bool bReturnOriginal, int nOffset, params Tuple< int, T >[] rgPixel)
Set a pixel value where each pixel is defined as an (index, value) tuple.
long CreateCuDNN(long hStream=0)
Create a new instance of NVIDIA's cuDnn.
void rng_gaussian(int n, float fMu, float fSigma, long hY)
Fill Y with random numbers using a gaussian random distribution.
void lecun_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
Performs LeCun's Tanh function backward pass.
void add(int n, long hA, long hB, long hC, long hY)
Adds A, B and C and places the result in Y.
void col2im(long hDataCol, int nDataColOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataIm, int nDataImOffset)
Rearranges the columns into image blocks.
void axpby(int n, double fAlpha, long hX, double fBeta, long hY)
Scale the vector X by Alpha and scale the vector Y by Beta, then add both together.
long CreateFilterDesc()
Create a new instance of a filter descriptor for use with NVIDIA's cuDnn.
void KernelCopy(int nCount, long hSrc, int nSrcOffset, long hDstKernel, long hDst, int nDstOffset, long hHostBuffer, long hHostKernel=-1, long hStream=-1, long hSrcKernel=-1)
Copy memory from the look-up tables in one kernel to another.
void sign(int n, long hX, long hY, int nXOff=0, int nYOff=0)
Computes the sign of each element of X and places the result in Y.
void FreeMemoryTest(long h)
Free a memory test, freeing up all GPU memory used.
void axpby(int n, T fAlpha, long hX, T fBeta, long hY)
Scale the vector x by Alpha and scale vector y by Beta and then add both together.
float[] get_float(int nCount, long hHandle, int nIdx=-1)
Queries the GPU memory by copying it into an array of floats.
void ConvolutionForward(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream=true)
Perform a convolution forward pass.
void lecun_fwd(int nCount, long hBottomData, long hTopData)
Performs the LeCun's Tanh function forward
void div(int n, long hA, long hB, long hY)
Divides each element of A by each element of B and places the result in Y.
void RnnForward(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hWtDesc, long hWtData, long hYDesc, long hYData, long hHyDesc, long hHyData, long hCyDesc, long hCyData, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount, bool bTraining)
Run the RNN through a forward pass.
void channel_copyall(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Copy all data from X (shape 1,c,sd) to each num in Y (shape n,c,sd).
void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc, uint stridea, uint strideb, uint stridec, uint batch_count)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
static ulong basetype_size(bool bUseHalfSize)
Returns the base type size in bytes.
void log(int n, long hA, long hY)
Calculates the log value of A and places the result in Y.
void channel_fillfrom(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, DIR dir)
Fills each channel with the values stored in Src data where the X data contains nOuterNum x nCh...
void GetRnn8MemorySizes(long hCuDnn, long hRnn, out ulong szWtCount, out ulong szWorkSize, out ulong szReservedSize)
Returns the memory sizes required for the RNN8.
long CreateNCCL(int nDeviceId, int nCount, int nRank, Guid guid)
Create an instance of NVIDIA's NCCL 'Nickel'
void silu_fwd(int nCount, long hBottomData, long hTopData)
Performs the Sigmoid-weighted Linear Unit (SiLU) activation forward pass in Cuda.
void axpy(int n, double fAlpha, long hX, long hY)
Multiply the vector X by a scalar and add the result to the vector Y.
void adam_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fCorrection)
Perform the Adam update
void sum(int nCount, int nOuterNum, int nInnerNum, long hX, long hY)
Calculates the sum of inner values of X and places the result in Y.
void SsdEncodeConfPrediction(long hSSD, int nConfPredCount, long hConfPred, int nConfGtCount, long hConfGt)
Encodes the SSD data into the confidence prediction and confidence ground truths.
void rng_bernoulli(int n, float fNonZeroProb, long hY)
Fill Y with random numbers using a bernoulli random distribution.
void transposeHW(int n, int c, int h, int w, long hSrc, long hDst)
Transpose a n*c number of matrices along the height and width dimensions. All matrices are in row-maj...
void prelu_fwd(int nCount, int nChannels, int nDim, long hBottomData, long hTopData, long hSlopeData, int nDivFactor)
Performs Parameterized Rectifier Linear Unit (ReLU) forward pass in Cuda.
void ConvolutionBackwardFilter(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream)
Perform a convolution backward pass on the filter.
long CreateLRNDesc()
Create a new instance of a LRN descriptor for use with NVIDIA's cuDnn.
void ReportMemory(Log log, string strLocation)
Report the memory use on the current GPU managed by the CudaDnn object.
void mean_error_loss_bwd(int nCount, long hPredicted, long hTarget, long hBottomDiff, MEAN_ERROR merr)
Performs a Mean Error Loss backward pass in Cuda.
void RnnBackwardWeights(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hYDesc, long hYData, long hWorkspace, ulong nWsCount, long hWtDesc, long hWtDiff, long hReserved, ulong nResCount)
Run the RNN backward pass on the weights.
void channel_max(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx=false)
Calculates the maximum value within each channel of X and places the result in Y.
void scal(int n, float fAlpha, long hX, int nXOff=0)
Scales the data in X by a scaling factor.
void serf_fwd(int nCount, long hBottomData, long hTopData, double dfThreshold)
Performs a Serf forward pass in Cuda.
void concat_bwd(int nCount, long hTopDiff, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hBottomDiff)
Performs a concat backward pass in Cuda.
void sigmoid_fwd(int nCount, long hBottomData, long hTopData)
Performs a Sigmoid forward pass in Cuda.
int GetMultiGpuBoardGroupID(int nDeviceID)
Query the multi-gpu board group id for a device.
bool contains_point(int n, long hMean, long hWidth, long hX, long hWork, int nXOff=0)
Returns true if the point is contained within the bounds.
long CreateRnn8()
Create the RNN8.
void lrn_computediff(int nCount, long hBottomData, long hTopData, long hScaleData, long hTopDiff, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fNegativeBeta, T fCacheRatio, long hBottomDiff)
Computes the diff used to calculate the LRN cross channel backward pass in Cuda.
void SigmoidBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a Sigmoid backward pass.
long CreatePCA(int nMaxIterations, int nM, int nN, int nK, long hData, long hScoresResult, long hLoadsResult, long hResiduals=0, long hEigenvalues=0)
Creates a new PCA instance and returns the handle to it.
void rmsprop_update(int nCount, long hNetParamsDiff, long hHistoryData, T fRmsDecay, T fDelta, T fLocalRate)
Perform the RMSProp update
void SetMemory(long hMem, T[] rgSrc, long hStream=0, int nCount=-1)
Copies an array of type 'T' into a block of already allocated GPU memory.
void FreeFilterDesc(long h)
Free a filter descriptor instance.
void EluBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a Elu backward pass.
void im2col(long hDataIm, int nDataImOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataCol, int nDataColOffset)
Rearranges image blocks into columns.
void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
void SetConvolutionDesc(long hHandle, int hPad, int wPad, int hStride, int wStride, int hDilation, int wDilation, bool bUseTensorCores, bool bHalf=false)
Set the values of a convolution descriptor.
void tile_fwd(int nCount, long hBottomData, int nInnerDim, int nTiles, int nBottomTileAxis, long hTopData)
Performs a tile forward pass in Cuda.
void FreeSSD(long hSSD)
Free the instance of SSD GPU support.
void SetFilterNdDesc(long hHandle, int[] rgDim, bool bHalf=false)
Sets the values of a filter descriptor.
void ConvolutionBackwardBias(long hCuDnn, T fAlpha, long hTopDesc, long hTopDiff, int nTopOffset, T fBeta, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream=true)
Perform a convolution backward pass on the bias.
long CreatePoolingDesc()
Create a new instance of a pooling descriptor for use with NVIDIA's cuDnn.
void scale(int n, double fAlpha, long hX, long hY)
Scales the values in X and places them in Y.
void FreeRnn8(long h)
Free an existing RNN8.
Tuple< double, double, double, double > minmax(int n, long hA, long hWork1, long hWork2, bool bDetectNans=false, int nAOff=0)
Finds the minimum and maximum values within A.
void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
Masks the batch of data in the source with the mask by replacing all values 'fSearch' found i...
long AllocMemory(long lCapacity, bool bHalfSize=false)
Allocate a block of GPU memory with a specified capacity.
long AllocMemory(T[] rgSrc, long hStream=0, bool bHalfSize=false)
Allocate a block of GPU memory and copy an array of type 'T' to it, optionally using a stream for the...
void lrn_fillscale(int nCount, long hBottomData, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fAlphaOverSize, T fK, long hScaleData)
Performs the fill scale operation used to calculate the LRN cross channel forward pass in Cuda.
void LRNCrossChannelForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
Perform LRN cross channel forward pass.
float[] GetHostMemoryFloat(long hMem)
Retrieves the host memory as an array of floats.
void transpose(int n, long hX, long hY, long hXCounts, long hYCounts, long hMapping, int nNumAxes, long hBuffer)
Perform a transpose on X producing Y, similar to the numpy.transpose operation.
int GetRnnParamCount(long hCuDnn, long hRnnDesc, long hXDesc)
Returns the RNN parameter count.
void gaussian_blur(int n, int nChannels, int nHeight, int nWidth, double dfSigma, long hX, long hY)
The gaussian_blur runs a Gaussian blurring operation over each channel of the data using the sigma.
void gelu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, bool bEnableBertVersion)
Performs a GELU backward pass in Cuda.
void max_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
Performs a max backward pass in Cuda.
double max(int n, long hA, out long lPos, int nAOff=0, long hWork=0)
Finds the maximum value of A.
double GetDeviceMemory(out double dfFree, out double dfUsed, out bool bCudaCallUsed, int nDeviceID=-1)
Queries the amount of total, free and used memory on a given GPU.
void PoolingBackward(long hCuDnn, long hPoolingDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a pooling backward pass.
void SsdEncodeLocPrediction(long hSSD, int nLocPredCount, long hLocPred, int nLocGtCount, long hLocGt)
Encodes the SSD data into the location prediction and location ground truths.
void exp(int n, long hA, long hY)
Calculates the exponent value of A and places the result in Y.
string GetDeviceInfo(int nDeviceID, bool bVerbose=false)
Query the device information of a device.
void BatchNormBackward(long hCuDnn, BATCHNORM_MODE mode, T fAlphaDiff, T fBetaDiff, T fAlphaParamDiff, T fBetaParamDiff, long hBwdBottomDesc, long hBottomData, long hTopDiffDesc, long hTopDiff, long hBottomDiffDesc, long hBottomDiff, long hBwdScaleBiasMeanVarDesc, long hScaleData, long hScaleDiff, long hBiasDiff, double dfEps, long hSaveMean, long hSaveInvVar)
Run the batch norm backward pass.
void sqrt_scale(int nCount, long hX, long hY)
Scale the data by the sqrt of the data. y = sqrt(abs(x)) * sign(x)
void channel_mulv(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hC)
Multiplies the values in vector X by each channel in matrix A and places the result in matrix C.
void softplus_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
Performs the Softplus function backward, a smooth approximation of the ReLU function
void SetFilterDesc(long hHandle, int n, int c, int h, int w, bool bHalf=false)
Sets the values of a filter descriptor.
void channel_sum(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bSumAcrossChannels=true, DIR dir=DIR.FWD, int nChannelsY=-1)
Calculates the sum of the values either across or within each channel (depending on bSumAcrossChanne...
void lrn_computeoutput(int nCount, long hBottomData, long hScaleData, T fNegativeBeta, long hTopData)
Computes the output used to calculate the LRN cross channel forward pass in Cuda.
long AllocHostBuffer(long lCapacity)
Allocate a block of host memory with a specified capacity.
void channel_copy(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
Copy data along channels similar to numpy split function.
float dot_float(int n, long hX, long hY)
Computes the dot product of X and Y.
void add(int n, long hA, long hB, long hY, double dfAlphaA, double dfAlphaB, int nAOff=0, int nBOff=0, int nYOff=0)
Adds A to (B times scalar) and places the result in Y.
void batchreidx_fwd(int nCount, int nInnerDim, long hBottomData, long hPermutData, long hTopData)
Performs the forward pass for batch re-index
void mul(int n, long hA, long hB, long hY, int nAOff=0, int nBOff=0, int nYOff=0)
Multiplies each element of A with each element of B and places the result in Y.
void channel_duplicate(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Duplicates each channel 'nInnerNum' times in the destination.
long GetHostBufferCapacity(long hMem)
Returns the host memory capacity.
void im2col_nd(long hDataIm, int nDataImOffset, int nNumSpatialAxes, int nImCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataCol, int nDataColOffset)
Rearranges image blocks into columns.
void GetRnnLinLayerParams(long hCuDnn, long hRnnDesc, int nLayer, long hXDesc, long hWtDesc, long hWtData, int nLinLayer, out int nWtCount, out long hWt, out int nBiasCount, out long hBias)
Returns the linear layer parameters (weights).
void BatchNormForward(long hCuDnn, BATCHNORM_MODE mode, T fAlpha, T fBeta, long hFwdBottomDesc, long hBottomData, long hFwdTopDesc, long hTopData, long hFwdScaleBiasMeanVarDesc, long hScaleData, long hBiasData, double dfFactor, long hGlobalMean, long hGlobalVar, double dfEps, long hSaveMean, long hSaveInvVar, bool bTraining)
Run the batch norm forward pass.
void unpooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask)
Performs the backward pass for unpooling using Cuda
void gelu_fwd(int nCount, long hBottomData, long hTopData, bool bEnableBertVersion)
Performs a GELU forward pass in Cuda.
void FreeDropoutDesc(long h)
Free a dropout descriptor instance.
void FreeExtension(long hExtension)
Free an instance of an Extension.
void GetConvolutionInfo(long hCuDnn, long hBottomDesc, long hFilterDesc, long hConvDesc, long hTopDesc, ulong lWorkspaceSizeLimitInBytes, bool bUseTensorCores, out CONV_FWD_ALGO algoFwd, out ulong lWsSizeFwd, out CONV_BWD_FILTER_ALGO algoBwdFilter, out ulong lWsSizeBwdFilter, out CONV_BWD_DATA_ALGO algoBwdData, out ulong lWsSizeBwdData, CONV_FWD_ALGO preferredFwdAlgo=CONV_FWD_ALGO.NONE)
Queries the algorithms and workspace sizes used for a given convolution descriptor.
long CreateLayerNorm(int nGpuID, int nCount, int nOuterNum, int nChannels, int nInnerNum, float fEps=1e-10f)
Create the Cuda version of LayerNorm
void SoftmaxForward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Softmax forward pass.
void debug()
The debug function is used only when debugging the debug version of the low-level DLL.
void SetTensorNdDesc(long hHandle, int[] rgDim, int[] rgStride, bool bHalf=false)
Sets the values of a tensor descriptor.
void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
void channel_div(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod=1)
Divides the values of the channels from X and places the result in Y.
long KernelCopyNccl(long hSrcKernel, long hSrcNccl)
Copies an Nccl handle from one kernel to the current kernel of the current CudaDnn instance.
void calc_dft_coefficients(int n, long hX, int m, long hY)
Calculates the discrete Fourier Transform (DFT) coefficients across the frequencies 1....
void softmax_cross_entropy_fwd(int nCount, long hProbData, long hLabel, long hLossDiff, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs a softmax cross entropy forward pass in Cuda.
void softmaxloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs Softmax Loss backward pass in Cuda.
void SetMemoryAt(long hMem, float[] rgSrc, int nOffset)
Copies an array of float into a block of already allocated GPU memory starting at a specific offset.
void min_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
Performs a min forward pass in Cuda.
long AllocMemory(double[] rgSrc, long hStream=0)
Allocate a block of GPU memory and copy an array of doubles to it, optionally using a stream for the ...
double[] GetMemoryDouble(long hMem, long lCount=-1)
Retrieves the GPU memory as an array of doubles.
void pooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask, long hTopMask)
Performs the forward pass for pooling using Cuda
double sumsqdiff(int n, long hW, long hA, long hB, int nAOff=0, int nBOff=0)
Calculates the sum of squares of differences between A and B
void SynchronizeThread()
Synchronize all kernel threads on the current GPU.
void SetRnn8(long hCuDnn, long hRnn, bool bTraining, RNN_DATALAYOUT layout, RNN_MODE cellMode, RNN_BIAS_MODE biasMode, int nSequenceLen, int nBatchSize, int nInputs, int nHidden, int nOutputs, int nProjection, int nNumLayers, float fDropout, ulong lSeed, bool bBidirectional=false)
Set the RNN8 parameters.
void add_scalar(int n, float fAlpha, long hY)
Adds a scalar value to each element of Y.
void sigmoid_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
Performs a Sigmoid backward pass in Cuda.
void prelu_bwd_param(int nCDim, int nNum, int nTopOffset, long hTopDiff, long hBottomData, long hBackBuffDiff)
Performs Parameterized Rectifier Linear Unit (ReLU) backward param pass in Cuda.
void FreeRnnDesc(long h)
Free an existing RNN descriptor.
void mish_fwd(int nCount, long hBottomData, long hTopData, double dfThreshold)
Performs a Mish forward pass in Cuda.
void channel_percentile(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, double dfPercentile)
Calculates the percentile along axis = 0.
void divbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
Divide a matrix by a vector.
void FreeLRNDesc(long h)
Free a LRN descriptor instance.
void FreeHostBuffer(long hMem)
Free previously allocated host memory.
void sigmoid_cross_entropy_fwd(int nCount, long hInput, long hTarget, long hLoss, bool bHasIgnoreLabel, int nIgnoreLabel, long hCountData)
Performs a sigmoid cross entropy forward pass in Cuda.
void softmaxloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs Softmax Loss forward pass in Cuda.
void rng_uniform(int n, T fMin, T fMax, long hY)
Fill Y with random numbers using a uniform random distribution.
CudaDnn(CudaDnn< T > cuda, bool bEnableGhostMemory)
Alternate CudaDnn constructor.
void EluForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Elu forward pass.
void DropoutForward(long hCuDnn, long hDropoutDesc, long hBottomDesc, long hBottomData, long hTopDesc, long hTopData, long hReserved)
Performs a dropout forward pass.
void CopyDeviceToHost(long lCount, long hGpuSrc, long hHostDst)
Copy from GPU memory to Host memory.
void sqrt(int n, long hX, long hY)
Computes the square root of each element of X and places the result in Y.
void clip_fwd(int nCount, long hBottomData, long hTopData, T fMin, T fMax)
Performs a Clip forward pass in Cuda.
void DisableGhostMemory()
Disables the ghost memory, if enabled.
long AllocPCAData(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA Data.
double asum_double(int n, long hX, int nXOff=0)
Computes the sum of absolute values in X.
T asum(int n, long hX, int nXOff=0)
Computes the sum of absolute values in X.
T erf(T fVal)
Calculates the erf() function.
void add(int n, long hA, long hB, long hY, double dfAlpha)
Adds A to (B times scalar) and places the result in Y.
void lstm_unit_fwd(int nCount, int nHiddenDim, int nXCount, long hX, long hX_acts, long hC_prev, long hCont, long hC, long hH)
Performs the simple LSTM forward pass in Cuda for a given LSTM unit.
void powx(int n, long hA, T fAlpha, long hY, int nAOff=0, int nYOff=0)
Calculates the A raised to the power alpha and places the result in Y.
void powx(int n, long hA, float fAlpha, long hY, int nAOff=0, int nYOff=0)
Calculates the A raised to the power alpha and places the result in Y.
void DivisiveNormalizationBackward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTopDiff, long hTemp1, long hTemp2, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Performs a Divisive Normalization backward pass.
void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC)
Perform a matrix-matrix addition/transposition operation: C = alpha transA (A) + beta transB (B)
void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY)
Perform a matrix-vector multiplication operation: y = alpha transA (A) x + beta y (where x and y are ...
void GetDropoutInfo(long hCuDnn, long hBottomDesc, out ulong ulStateCount, out ulong ulReservedCount)
Query the dropout state and reserved counts.
void SetMemory(long hMem, double[] rgSrc, long hStream=0)
Copies an array of double into a block of already allocated GPU memory.
long CreateImageOp(int nNum, double dfBrightnessProb, double dfBrightnessDelta, double dfContrastProb, double dfContrastLower, double dfContrastUpper, double dfSaturationProb, double dfSaturationLower, double dfSaturationUpper, long lRandomSeed=0)
Create a new ImageOp used to perform image operations on the GPU.
int GetDeviceCount()
Query the number of devices (gpu's) installed.
void SetRandomSeed(long lSeed)
Set the random number generator seed.
string GetDeviceP2PInfo(int nDeviceID)
Query the peer-to-peer information of a device.
void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Subtracts the values across the channels from X and places the result in Y.
void ConvolutionForward(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream=true)
Perform a convolution forward pass.
void ger(int m, int n, double fAlpha, long hX, long hY, long hA)
Perform a vector-vector multiplication operation: A = x * (fAlpha * y) (where x and y are vectors and...
void DistortImage(long h, int nCount, int nNum, int nDim, long hX, long hY)
Distort an image using the ImageOp handle provided.
void geam(bool bTransA, bool bTransB, int m, int n, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset=0, int nBOffset=0, int nCOffset=0)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
void scal(int n, T fAlpha, long hX, int nXOff=0)
Scales the data in X by a scaling factor.
void rng_bernoulli(int n, double fNonZeroProb, long hY)
Fill Y with random numbers using a bernoulli random distribution.
long AllocPCAEigenvalues(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA eigenvalues.
void max_bwd(int n, long hAdata, long hBdata, long hYdiff, long hAdiff, long hBdiff)
Propagates the Y diff back to the max of A or B and places the result in A if its data has the max,...
void accuracy_fwd(int nCount, int nOuterNum, int nInnerNum, long hBottomData, long hBottomLabel, long hAccData, long hAccTotals, int? nIgnoreLabel, bool bLastElementOnly, int nBatch)
Performs the forward pass for the accuracy layer
string GetDeviceName(int nDeviceID)
Query the name of a device.
void interp2(int nChannels, long hData1, int nX1, int nY1, int nHeight1, int nWidth1, int nHeight1A, int nWidth1A, long hData2, int nX2, int nY2, int nHeight2, int nWidth2, int nHeight2A, int nWidth2A, bool bBwd=false)
Interpolates between two sizes within the spatial dimensions.
void swish_bwd(int nCount, long hTopDiff, long hTopData, long hSigmoidOutputData, long hBottomDiff, double dfBeta)
Performs a Swish backward pass in Cuda.
static void SetDefaultCudaPath(string strPath)
Used to optionally set the default path to the Low-Level Cuda Dnn DLL file.
void max_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
Performs a max forward pass in Cuda.
void dropout_fwd(int nCount, long hBottomData, long hMask, uint uiThreshold, T fScale, long hTopData)
Performs a dropout forward pass in Cuda.
double min(int n, long hA, out long lPos, int nAOff=0, long hWork=0)
Finds the minimum value of A.
void bnll_bwd(int nCount, long hTopDiff, long hBottomData, long hBottomDiff)
Performs a binomial normal log likelihood (BNLL) backward pass in Cuda.
T[] GetHostMemory(long hMem)
Retrieves the host memory as an array of type 'T'
T[] RunMemoryTest(long h, MEMTEST_TYPE type, ulong ulBlockStartOffset, ulong ulBlockCount, bool bVerbose, bool bWrite, bool bReadWrite, bool bRead)
The RunMemoryTest method runs the memory test from the block start offset through the block count on ...
void AddTensor(long hCuDnn, long hSrcDesc, long hSrc, int nSrcOffset, long hDstDesc, long hDst, int nDstOffset)
Add two tensors together.
void coeff_sub_fwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hBottom, long hTop)
Performs a coefficient sub foward pass in Cuda.
void sub_and_dot(int n, int nN, int nInnerNum, long hA, long hB, long hY, int nAOff, int nBOff, int nYOff)
Subtracts every nInnerNum element of B from A and performs a dot product on the result.
void NcclInitializeMultiProcess(long hNccl)
Initializes a set of NCCL instances for use in different processes.
T[] RunExtension(long hExtension, long lfnIdx, T[] rgParam)
Run a function on the extension specified.
void gather_bwd(int nCount, long hTop, long hBottom, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
Performs a gather backward pass where data at specifies indexes along a given axis are copied to the ...
void gemv(bool bTransA, int m, int n, T fAlpha, long hA, long hX, T fBeta, long hY, int nAOffset=0, int nXOffset=0, int nYOffset=0)
Perform a matrix-vector multiplication operation: y = alpha transA (A) x + beta y (where x and y are ...
void prelu_bwd(int nCount, int nChannels, int nDim, long hTopDiff, long hBottomData, long hBottomDiff, long hSlopeData, int nDivFactor)
Performs Parameterized Rectifier Linear Unit (ReLU) backward pass in Cuda.
void KernelAdd(int nCount, long hA, long hDstKernel, long hB, long hC)
Add memory from one kernel to memory residing on another kernel.
void axpby(int n, float fAlpha, long hX, float fBeta, long hY)
Scale the vector x and then multiply the vector X by a scalar and add the result to the vector Y.
void SetTensorDesc(long hHandle, int n, int c, int h, int w, bool bHalf=false)
Sets the values of a tensor descriptor.
void SetMemoryAt(long hMem, T[] rgSrc, int nOffset)
Copies an array of type 'T' into a block of already allocated GPU memory starting at a specific offse...
void InitializeRnn8Weights(long hCuDnn, long hRnn, long hWt, RNN_FILLER_TYPE wtFt, double fWtVal, double fWtVal2, RNN_FILLER_TYPE biasFt, double fBiasVal, double fBiasVal2)
Initialize the RNN8 weights
void LayerNormBackward(long hLayerNorm, long hYdata, long hYdiff, long hXdiff)
Run the LayerNorm backward pass.
void axpy(int n, float fAlpha, long hX, long hY)
Multiply the vector X by a scalar and add the result to the vector Y.
void DeriveBatchNormDesc(long hFwdScaleBiasMeanVarDesc, long hFwdBottomDesc, long hBwdScaleBiasMeanVarDesc, long hBwdBottomDesc, BATCHNORM_MODE mode)
Derive the batch norm descriptors for both the forward and backward passes.
void sgd_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
Perform the Stochastic Gradient Descent (SGD) update
void minmax(int n, long hA, long hWork1, long hWork2, int nK, long hMin, long hMax, bool bNonZeroOnly)
Finds up to 'nK' minimum and maximum values within A.
void LayerNormForward(long hLayerNorm, long hXdata, long hYdata)
Run the LayerNorm forward pass.
double erf(double dfVal)
Calculates the erf() function.
bool CheckMemoryAttributes(long hSrc, int nSrcDeviceID, long hDst, int nDstDeviceID)
Check the memory attributes of two memory blocks on different devices to see if they are compatible f...
void FreeImageOp(long h)
Free an image op, freeing up all GPU memory used.
void copy_sequence(int nK, int nNum, int nDim, long hSrcData, long hSrcLbl, int nSrcCacheCount, long hSrcCache, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, bool bOutputLabels, List< long > rghTop, List< int > rgnTopCount, long hWorkDataHost, bool bCombinePositiveAndNegative=false, int nSeed=0)
Copy a sequence of cached items, organized by label, into an anchor, positive (if nK > 0),...
void math_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, MATH_FUNCTION function)
Performs a Math function backward pass in Cuda.
void ConvolutionBackwardData(long hCuDnn, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream=true)
Perform a convolution backward pass on the data.
void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
Masks the data in the source with the mask by replacing all values 'fSearch' found in the mas...
void axpy(int n, T fAlpha, long hX, long hY, int nXOff=0, int nYOff=0)
Multiply the vector X by a scalar and add the result to the vector Y.
void TanhForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Tanh forward pass.
void FreeTensorDesc(long h)
Free a tensor descriptor instance.
void mulbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
Multiply a matrix with a vector.
void scale_fwd(int nCount, long hX, long hScaleData, int nScaleDim, int nInnerDim, long hY, long hBiasData=0)
Performs a scale forward pass in Cuda.
long CreateStream(bool bNonBlocking=false, int nIndex=-1)
Create a new stream on the current GPU.
void tanh_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
Performs a TanH backward pass in Cuda.
void ReLUForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a ReLU forward pass.
ulong GetRnnWorkspaceCount(long hCuDnn, long hRnnDesc, long hXDesc, out ulong nReservedCount)
Returns the workspace and reserved counts.
long CreateExtension(string strExtensionDllPath)
Create an instance of an Extension DLL.
void serf_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, double dfThreshold)
Performs a Serf backward pass in Cuda.
void concat_fwd(int nCount, long hBottomData, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hTopData)
Performs a concat forward pass in Cuda.
void crop_fwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomData, long hTopData)
Performs the crop forward operation.
void softplus_fwd(int nCount, long hBottomData, long hTopData)
Performs the Softplus function forward, a smooth approximation of the ReLU function
void RnnBackwardData(long hCuDnn, long hRnnDesc, long hYDesc, long hYData, long hYDiff, long hHyDesc, long hHyDiff, long hCyDesc, long hCyDiff, long hWtDesc, long hWtData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hXDesc, long hXDiff, long hdHxDesc, long hHxDiff, long hdCxDesc, long hCxDiff, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount)
Run the RNN backward pass through the data.
void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert=false)
Copy similar items of length 'nDim' from hSrc1 (where hSimilar(i) = 1) and dissimilar items of length...
void DropoutBackward(long hCuDnn, long hDropoutDesc, long hTopDesc, long hTop, long hBottomDesc, long hBottom, long hReserved)
Performs a dropout backward pass.
void NcclAllReduce(long hNccl, long hStream, long hX, int nCount, NCCL_REDUCTION_OP op, double dfScale=1.0)
Performs a reduction on all NCCL instances as specified by the reduction operation.
void FreeMemoryPointer(long hData)
Frees a memory pointer.
void SetRnnDataDesc(long hRnnDataDesc, RNN_DATALAYOUT layout, int nMaxSeqLen, int nBatchSize, int nVectorSize, bool bBidirectional=false, int[] rgSeqLen=null)
Sets the RNN Data Descriptor values.
float asum_float(int n, long hX, int nXOff=0)
Computes the sum of absolute values in X.
void min(int n, long hA, long hB, long hY)
Calculates the min of A and B and places the result in Y. This min is only computed on a per item bas...
long CreateMemoryTest(out ulong ulTotalNumBlocks, out double dfMemAllocatedInGB, out ulong ulMemStartAddr, out ulong ulBlockSize, double dfPctToAllocate=1.0)
Creates a new memory test on the current GPU.
void channel_scale(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
Multiplies the values of the channels from X with the scalar values in A and places the result in Y.
void mask(int n, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
Masks the data in the source with the mask by replacing all values 'fSearch' found in the mas...
void SetLRNDesc(long hHandle, uint nSize, double fAlpha, double fBeta, double fK)
Set the LRN descriptor values.
void batchreidx_bwd(int nCount, int nInnerDim, long hTopDiff, long hTopIdx, long hBegins, long hCounts, long hBottomDiff)
Performs the backward pass for batch re-index
void set_bounds(int n, double dfMin, double dfMax, long hX)
Set the bounds of all items within the data to a set range of values.
float[] GetMemoryFloat(long hMem, long lCount=-1)
Retrieves the GPU memory as an array of float.
void SetDropoutDesc(long hCuDnn, long hDropoutDesc, double dfDropout, long hStates, long lSeed)
Set the dropout descriptor values.
virtual void Dispose(bool bDisposing)
Disposes this instance freeing up all of its host and GPU memory.
void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
void NcclInitializeSingleProcess(params long[] rghNccl)
Initializes a set of NCCL instances for use in a single process.
void abs(int n, long hA, long hY)
Calculates the absolute value of A and places the result in Y.
double[] GetHostMemoryDouble(long hMem)
Retrieves the host memory as an array of doubles.
void compare_signs(int n, long hA, long hB, long hY)
Compares the signs of each value in A and B and places the result in Y.
long AllocPCALoads(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA loads.
T dot(int n, long hX, long hY, int nXOff=0, int nYOff=0)
Computes the dot product of X and Y.
void coeff_sum_fwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hBottom, long hTop)
Performs a coefficient sum forward pass in Cuda.
void ConvolutionBackwardBias(long hCuDnn, long hTopDesc, long hTopDiff, int nTopOffset, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream=true)
Perform a convolution backward pass on the bias.
void copy_sequence(int n, long hSrc, int nSrcStep, int nSrcStartIdx, int nCopyCount, int nCopyDim, long hDst, int nDstStep, int nDstStartIdx, int nSrcSpatialDim, int nDstSpatialDim, int nSrcSpatialDimStartIdx=0, int nDstSpatialDimStartIdx=0, int nSpatialDimCount=-1)
Copy a sequence from a source to a destination and allow for skip steps.
The CudaDnnMemoryTracker is used for diagnostics in that it helps estimate the amount of memory that ...
void FreeMemory(long hKernel, int nDeviceID, long hMemory)
Simulate a memory free.
string TotalMemoryUsedText
Returns a text string describing the total amount of memory used (in bytes).
ulong TotalMemoryUsed
Returns the total amount of memory used (in bytes).
long AllocMemory(long hKernel, int nDeviceID, long hMemory, ulong lSize, bool bHalf)
Simulate a memory allocation.
The Params contains the base parameters used in multi-GPU training.
Specifies the parameters for the ReshapeTemporalLayer.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
@ NONE
No training category specified.
@ DEFAULT
Specifies to use the default data type of the gym used.
The MyCaffe.common namespace contains common MyCaffe classes.
OP
Defines the operations performed by the channel_op function.
@ SUB
Specifies to perform a subtraction operation.
@ DIV
Specifies to perform a division operation.
@ MUL
Specifies to perform a multiplication operation.
@ ADD
Specifies to perform an addition operation.
AGGREGATIONS
Specifies different aggregation operations.
MEMTEST_TYPE
Specifies the memory test to perform.
@ MOV_INV_8
Specifies the mov-inv-8 test.
DEVINIT
Specifies the initialization flags used when initializing CUDA.
@ CURAND
Initialize cuRand. This should be initialized as cuRand is used for most of the random operations.
@ SETSEED
Set the cuRand random number generator seed - typically only used when testing to ensure that random ...
@ CUBLAS
Initialize cuBlas. This should be initialized as cuBlas is used for many of the math operations.
SSD_CONF_LOSS_TYPE
Defines the confidence loss types used during SSD cuda training.
@ SOFTMAX
Specifies to use softmax.
@ LOGISTIC
Specifies to use logistic.
CONV_BWD_FILTER_ALGO
Specifies the cuDnn convolution backward filter algorithm to use.
@ ALGO_3
Specifies to use algorithm 0 with a workspace - which is non-deterministic.
@ ALGO_1
Specifies to use algorithm 1.
@ ALGO_0
Specifies to use algorithm 0 - which is non-deterministic.
RNN_DATALAYOUT
Specifies the RNN data layout of the data input.
@ RNN_BATCH_MAJOR_UNPACKED
Specifies ordering with batch major ordering, padded, outer stride from one batch to the next.
@ RNN_SEQ_MAJOR_PACKED
Specifies ordering with sequence major ordering, and sequence length sorted and packed.
@ RNN_SEQ_MAJOR_UNPACKED
Specifies ordering with sequence major ordering, and padded outer stride from one time-step to the ne...
DistanceMethod
Specifies the distance method used when calculating batch distances.
@ HAMMING
Specifies to calculate the hamming distance.
@ EUCLIDEAN
Specifies to calculate the euclidean distance.
MEAN_ERROR
Defines the type of Mean Error to use.
@ MSE
Mean Squared Error (MSE), where the error is computed from the predicted value.
SSD_MATCH_TYPE
Defines the matching method used during SSD cuda training.
@ BIPARTITE
Specifies to use Bi-Partite.
@ PER_PREDICTION
Specifies to use per-prediction matching.
MATH_FUNCTION
Defines the mathematical function to run.
@ TANH
Specifies to run the tanh function.
@ ASINH
Specifies to run the asinh function.
@ NOP
Specifies to run a no operation.
@ ACOS
Specifies to run the acos function.
@ SQRT
Specifies to run the sqrt function.
@ ACOSH
Specifies to run the acosh function.
@ FLOOR
Specifies to run the floor function.
@ SIN
Specifies to run the sin function.
@ CEIL
Specifies to run the ceil function.
@ NEG
Specifies to flip the sign of the inputs.
@ SIGN
Specifies to run the sign function.
@ TAN
Specifies to run the tan function.
@ ATANH
Specifies to run the atanh function.
@ ASIN
Specifies to run the asin function.
@ SINH
Specifies to run the sinh function.
@ ATAN
Specifies to run the atan function.
@ COSH
Specifies to run the cosh function.
@ COS
Specifies to run the cos function.
PoolingMethod
Specifies the pooling method used by the cuDnn function SetPoolingDesc.
DataType
Specifies the base datatype corresponding to the template type 'T'. Currently, only
@ FLOAT
Specifies the single type.
@ DOUBLE
Specifies the double type.
DIR
Defines the direction of data flow.
@ FWD
Specifies data is moving forward.
@ BWD
Specifies data is moving backward.
NCCL_REDUCTION_OP
Specifies the reduction operation to use with 'Nickel' NCCL.
@ PROD
Multiply the values.
@ MIN
Return the minimum value.
SSD_CODE_TYPE
Defines the encode/decode type used during SSD cuda training.
@ CENTER_SIZE
Encode the center size.
@ CORNER
Encode the corner.
@ CORNER_SIZE
Encode the corner size.
CONV_FWD_ALGO
Specifies the cuDnn convolution forward algorithm to use.
@ ALGO_FFT_TILING
Specifies to use the fft tiling algorithm.
@ ALGO_FFT
Specifies to use the fft algorithm.
@ ALGO_GEMM
Specifies to use the gemm algorithm.
@ IMPLICIT_PRECOMP_GEMM
Specifies to use the implicit pre-computation gemm algorithm.
@ ALGO_DIRECT
Specifies to use the direct algorithm.
@ ALGO_WINOGRAD_NONFUSED
Specifies to use the non-fused winograd algorithm.
@ IMPLICIT_GEMM
Specifies to use the implicit gemm algorithm.
@ ALGO_WINOGRAD
Specifies to use the winograd algorithm.
RNN_MODE
Specifies the RNN mode to use with the Recurrent Layer when using the cuDNN engine.
@ RNN_TANH
Specifies to use a single TanH gate Recurrent Learning unit.
@ GRU
Specifies to use the GRU RNN.
@ RNN_RELU
Specifies to use a single RelU gate Recurrent Learning unit.
@ LSTM
Specifies to use a 4 gate LSTM Recurrent Learning unit.
BATCHNORM_MODE
Specifies the cuDnn batch norm mode to use.
@ PER_ACTIVATION
Specifies to use the per-activation batch normalization mode.
@ SPATIAL
Specifies to use the spatial batch normalization mode.
@ SPATIAL_PERSISTENT
Specifies to use the spatial persistent batch normalization mode.
ORIENTATION
Specifies the orientation of a matrix.
@ ROW
Specifies to add the vector to each row.
@ COL
Specifies to add the vector to each column.
RNN_BIAS_MODE
Specifies the RNN bias mode to use with the Recurrent Layer when using the cuDNN engine.
@ RNN_DOUBLE_BIAS
Specifies to use two bias in the input Gemm and recurrent Gemm of the rnn cell (default).
@ RNN_NO_BIAS
Specifies to use no bias in the RNN cells.
@ RNN_SINGLE_INP_BIAS
Specifies to use one bias in the input Gemm of the rnn cell.
@ RNN_SINGLE_REC_BIAS
Specifies to use one recurrent bias in the recurrent Gemm of the rnn cell.
SSD_LOC_LOSS_TYPE
Defines the location loss types used during SSD cuda training.
@ SMOOTH_L1
Specifies to use smooth L1 loss.
@ L2
Specifies to use L2 loss.
DEVPROP
Specifies certain device properties to query from Cuda.
@ MULTIGPUBOARDGROUPID
Query a GPU board group ID.
@ DEVICECOUNT
Query the number of devices (gpu's) installed.
@ NAME
Query the name of a given GPU.
RNN_DIRECTION
Specifies the RNN directional used.
@ RNN_UNIDIRECTIONAL
Specifies a single direction RNN (default)
@ RNN_BIDIRECTIONAL
Specifies a bi-direction RNN where the output is concatenated at each layer.
TRANSPOSE_OPERATION
Specifies the type of operation to perform along with a matrix transposition.
SOFTMAX_MODE
Specifies the SOFTMAX mode to use.
@ INSTANCE
Specifies to run the softmax separately for each N, across CHW dimensions.
@ CHANNEL
Specifies to run the softmax separately for each N*C, across HW dimensions.
SSD_MINING_TYPE
Defines the mining type used during SSD cuda training.
@ MAX_NEGATIVE
Select negatives based on the score.
@ HARD_EXAMPLE
Select hard examples based on the Shrivastava et al. method.
SOFTMAX_ALGORITHM
Specifies the SOFTMAX algorithm to use.
@ ACCURATE
Specifies to use the accurate algorithm.
@ LOG
Specifies to use the log algorithm.
@ FAST
Specifies to use the fast algorithm.
POOLING_METHOD
Specifies the pooling method to use when using the Caffe pooling (instead of the pooling from NVIDIA'...
@ STO_TRAIN
Select the stochastic value in the kernel - used during a training pass.
@ STO_TEST
Select the stochastic value in the kernel - used during a testing pass.
RNN_FILLER_TYPE
Defines the filler types used to fill the RNN8 weights.
@ RNN_GAUSSIAN_FILLER
Specifies to fill with a gaussian distribution.
@ RNN_XAVIER_FILLER
Specifies to fill with a uniform distribution.
@ RNN_CONSTANT_FILLER
Specifies to fill with a constant value.
CONV_BWD_DATA_ALGO
Specifies the cuDnn convolution backward data algorithm to use.
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...