Deep learning software for Windows C# programmers.
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
5using MyCaffe.basecode;
6using MyCaffe.common;
7using MyCaffe.param;
8using MyCaffe.fillers;
9using System.IO;
10using MyCaffe.db.image;
11using MyCaffe.param.gpt;
12using System.Net;
13using System.Globalization;
14using System.Diagnostics;
15using System.Xml.Linq;
16using System.Security.Cryptography;
17using System.Threading;
18using System.IO.Compression;
19using System.Reflection;
21namespace MyCaffe.layers.gpt
27 public class TokenizedDataPairsLayer<T> : Layer<T>
28 {
29 CancelEvent m_evtCancel;
30 InputData m_encoderData = null;
31 InputData m_decoderData = null;
32 Blob<T> m_blobX = null;
33 Blob<T> m_blobY = null;
34 Blob<T> m_blobTriangle = null;
35 Random m_random = new Random();
36 Blob<T> m_blobEncIn = null;
37 Blob<T> m_blobDecIn = null;
38 Layer<T> m_softmax = null;
39 Layer<T> m_argmax = null;
40 Stopwatch m_swUpdateTimer = new Stopwatch();
41 double m_dfLastProgress = 0;
42 AutoResetEvent m_evtDownloadDone = new AutoResetEvent(false);
47 public enum VOCABULARY
48 {
57 }
83 : base(cuda, log, p)
84 {
85 m_evtCancel = evtCancel;
86 m_type = LayerParameter.LayerType.TOKENIZED_DATA_PAIRS;
88 m_blobTriangle = new Blob<T>(m_cuda, m_log, false);
89 m_blobTriangle.Name = "triangle";
90 }
/// <summary>
/// Releases the blobs and helper layers owned by this layer, then lets the base class release its own resources.
/// </summary>
protected override void dispose()
{
    // Scratch blobs used by PostProcessLogitsOutput.
    dispose(ref m_blobY);
    dispose(ref m_blobX);

    // Mask blob created in the constructor.
    dispose(ref m_blobTriangle);

    // Input blobs created by PreProcessInput.
    dispose(ref m_blobEncIn);
    dispose(ref m_blobDecIn);

    // Helper layers created by PostProcessLogitsOutput.
    dispose(ref m_softmax);
    dispose(ref m_argmax);

    base.dispose();
}
/// <summary>
/// Adds the internal blobs of this layer to the given collection.
/// </summary>
/// <param name="col">Collection that receives the internal blobs.</param>
protected override void setup_internal_blobs(BlobCollection<T> col)
{
    // The triangular mask is the only internal blob registered here.
    Blob<T> blobInternal = m_blobTriangle;
    col.Add(blobInternal);
}
/// <summary>
/// Returns the exact number of bottom blobs: 2 in the RUN phase, otherwise 0.
/// </summary>
public override int ExactNumBottomBlobs
{
    get
    {
        if (m_phase == Phase.RUN)
            return 2;

        return 0;
    }
}
/// <summary>
/// Returns the exact number of top blobs, which is always 5 (the count verified by Reshape).
/// </summary>
public override int ExactNumTopBlobs
{
    get
    {
        const int nTopCount = 5;
        return nTopCount;
    }
}
// Sets up the layer according to the configured input type.
// NOTE(review): this listing is missing several lines of the method (the switch header, the statements
// that create m_encoderData/m_decoderData and the body of the second case), so the comments below only
// describe what is visible here.
138 public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
139 {
141 {
142 case TokenizedDataParameter.INPUT_TYPE.TEXT_FILE:
// Fetch and unpack the vocabulary archive when a download URL is configured.
143 download_vocab_data();
// Report the vocabulary sizes of both data sources.
146 m_log.WriteLine("Encoder Vocabulary: " + m_encoderData.VocabularySize.ToString());
147 m_log.WriteLine("Decoder Vocabulary: " + m_decoderData.VocabularySize.ToString());
148 break;
153 break;
155 default:
// Any other input type is rejected.
156 throw new Exception("Unknown input type '" + m_param.tokenized_data_pairs_param.input_type.ToString() + "'");
157 }
158 }
/// <summary>
/// Downloads the vocabulary archive named by 'vocab_data_url' (when one is configured) and extracts it
/// next to the destination file unless the vocabulary source file already exists.
/// </summary>
private void download_vocab_data()
{
    string strUrl = m_param.tokenized_data_pairs_param.vocab_data_url;
    if (string.IsNullOrEmpty(strUrl))
        return;

    // Expand the $ProgramData$ macro in both configured paths.
    string strProgData = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData);
    string strDataFile = Utility.ReplaceMacro(m_param.tokenized_data_pairs_param.vocab_data_dst_file, "$ProgramData$", strProgData);
    string strVocabFile = Utility.ReplaceMacro(m_param.tokenized_data_pairs_param.source, "$ProgramData$", strProgData);

    if (string.IsNullOrEmpty(strDataFile))
        m_log.FAIL("You must specify a 'vocab_data_dst_file' when using 'vocab_data_url'.");

    string strDstDir = Path.GetDirectoryName(strDataFile);
    if (!Directory.Exists(strDstDir))
        Directory.CreateDirectory(strDstDir);

    string strArchive = downloadData(strUrl, strDstDir, Path.GetFileName(strDataFile));

    // Only unpack when the archive is present and the vocabulary file has not been extracted yet.
    bool bExtract = File.Exists(strArchive) && !File.Exists(strVocabFile);
    if (bExtract)
        ZipFile.ExtractToDirectory(strArchive, strDstDir);
}
/// <summary>
/// Failure (if any) reported by the most recent download; written by the completion handler and read by
/// downloadData once the wait on m_evtDownloadDone returns.
/// </summary>
Exception m_excptDownload = null;

/// <summary>
/// Downloads a file into the given directory unless it is already present, blocking until the transfer ends.
/// </summary>
/// <param name="strUrl">Specifies the URL to download from.</param>
/// <param name="strPath">Specifies the destination directory (created when missing).</param>
/// <param name="strFileName">Specifies the destination file name within <paramref name="strPath"/>.</param>
/// <returns>The full destination path. When the download fails, the incomplete file is removed, so callers
/// can detect the failure with File.Exists.</returns>
private string downloadData(string strUrl, string strPath, string strFileName)
{
    if (!Directory.Exists(strPath))
        Directory.CreateDirectory(strPath);

    string strDataFile = Path.Combine(strPath, strFileName);
    if (File.Exists(strDataFile))
        return strDataFile;

    m_excptDownload = null;
    m_dfLastProgress = 0;
    m_swUpdateTimer.Restart();

    // NOTE(review): the wait below blocks the calling thread until the completion event fires; if this is
    // ever invoked on a thread that owns a SynchronizationContext the callbacks may not be delivered - confirm callers.
    using (WebClient webClient = new WebClient())
    {
        webClient.DownloadProgressChanged += WebClient_DownloadProgressChanged;
        webClient.DownloadFileCompleted += WebClient_DownloadFileCompleted;
        // The file name is passed as the user state so the progress handler can report it.
        webClient.DownloadFileAsync(new Uri(strUrl), strDataFile, strFileName);

        m_evtDownloadDone.WaitOne();
    }

    m_swUpdateTimer.Stop();

    if (m_excptDownload != null)
    {
        m_log.WriteLine("WARNING: Downloading '" + strFileName + "' from '" + strUrl + "' failed: " + m_excptDownload.Message, true);

        // Remove the partial file so that it is not mistaken for a complete download on the next run.
        try
        {
            if (File.Exists(strDataFile))
                File.Delete(strDataFile);
        }
        catch (Exception excpt)
        {
            m_log.WriteLine("WARNING: Could not remove the incomplete file '" + strDataFile + "': " + excpt.Message, true);
        }
    }

    return strDataFile;
}

/// <summary>
/// Records the outcome of the download and releases the thread waiting in downloadData.
/// </summary>
/// <param name="sender">Specifies the event sender.</param>
/// <param name="e">Specifies the completion arguments, including any error or cancellation.</param>
private void WebClient_DownloadFileCompleted(object sender, System.ComponentModel.AsyncCompletedEventArgs e)
{
    try
    {
        if (e.Cancelled)
            m_excptDownload = new OperationCanceledException("The download was cancelled.");
        else
            m_excptDownload = e.Error;

        if (m_excptDownload == null)
            m_log.WriteLine("Downloading done.", true);
    }
    finally
    {
        // Always release the waiter, even if logging throws.
        m_evtDownloadDone.Set();
    }
}

/// <summary>
/// Logs the download progress at most once per second, and only when the percentage has changed.
/// </summary>
/// <param name="sender">Specifies the event sender.</param>
/// <param name="e">Specifies the progress arguments; the user state carries the file name.</param>
private void WebClient_DownloadProgressChanged(object sender, DownloadProgressChangedEventArgs e)
{
    if (m_swUpdateTimer.Elapsed.TotalMilliseconds < 1000)
        return;

    if (m_dfLastProgress != e.ProgressPercentage)
    {
        m_dfLastProgress = e.ProgressPercentage;
        string strFile = (e.UserState != null) ? e.UserState.ToString() : "";

        double dfPct = e.ProgressPercentage / 100.0;
        m_log.WriteLine("Downloading '" + strFile + "' at " + dfPct.ToString("P") + "...", true);
    }

    m_swUpdateTimer.Restart();
}
/// <summary>
/// Shapes the five top blobs (encoder input, decoder input, decoder output, encoder mask, decoder mask)
/// and rebuilds the internal triangular mask whenever the decoder mask shape changes.
/// </summary>
/// <param name="colBottom">Specifies the bottom blobs; must be empty unless running in the RUN phase.</param>
/// <param name="colTop">Specifies the five top blobs.</param>
public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    int nBatchSize = (int)m_param.tokenized_data_pairs_param.batch_size;
    int nBlockSize = (int)m_param.tokenized_data_pairs_param.block_size;
    int nTokenSize = (int)m_encoderData.TokenSize;
    bool bRunPhase = (m_phase == Phase.RUN);

    // The RUN phase always uses a batch of one; all other phases must not be given inputs.
    if (bRunPhase)
        nBatchSize = 1;
    else
        m_log.CHECK_EQ(colBottom.Count, 0, "Data Layer takes no input blobs.");

    m_log.CHECK_EQ(colTop.Count, 5, "The TokenizedDataPairsLayer requires 5 top blobs.");

    Blob<T> blobEncIn = colTop[0];
    Blob<T> blobDecIn = colTop[1];
    Blob<T> blobDecOut = colTop[2];
    Blob<T> blobEncMask = colTop[3];
    Blob<T> blobDecMask = colTop[4];

    blobEncIn.SetParameter("vocab_size", m_encoderData.VocabularySize);
    blobDecIn.SetParameter("vocab_size", m_decoderData.VocabularySize);

    // A token size of one yields a 2-axis shape (each token is an index into the vocab vector);
    // otherwise the token size becomes a third axis.
    int[] rgShape = (nTokenSize == 1)
        ? new int[] { nBatchSize, nBlockSize }
        : new int[] { nBatchSize, nBlockSize, nTokenSize };

    blobEncIn.Reshape(rgShape);
    blobDecIn.Reshape(rgShape);
    blobDecOut.Reshape(rgShape);
    blobEncMask.Reshape(nBatchSize, nBlockSize, 1, 1);
    blobDecMask.Reshape(nBatchSize, nBlockSize, nBlockSize, 1);

    // Nothing more to do when the triangular mask already matches the decoder mask.
    if (m_blobTriangle.CompareShape(blobDecMask.shape()))
        return;

    m_blobTriangle.ReshapeLike(blobDecMask);

    T[] rgMask = new T[m_blobTriangle.count()];
    int nNum = m_blobTriangle.num;
    int nRows = m_blobTriangle.channels;
    int nCols = m_blobTriangle.height;

    for (int n = 0; n < nNum; n++)
    {
        for (int nRow = 0; nRow < nRows; nRow++)
        {
            int nRowOffset = n * nBlockSize * nBlockSize + nRow * nBlockSize;

            // Ones on and below the diagonal, zeros above it.
            for (int nCol = 0; nCol < nCols; nCol++)
            {
                rgMask[nRowOffset + nCol] = (nCol <= nRow) ? m_tOne : m_tZero;
            }
        }
    }

    m_blobTriangle.mutable_cpu_data = rgMask;
}
/// <summary>
/// Fills the top blobs: in the RUN phase the encoder/decoder inputs are copied (and padded) from the bottom blobs,
/// otherwise a batch is loaded from the encoder and decoder data sources. The encoder and decoder masks are
/// then derived from the inputs.
/// </summary>
/// <param name="colBottom">Specifies the bottom blobs (encoder and decoder input when running).</param>
/// <param name="colTop">Specifies the top blobs: enc in, dec in, dec out, enc mask, dec mask.</param>
protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    if (m_phase != Phase.RUN)
    {
        int nBatch = (int)m_param.tokenized_data_pairs_param.batch_size;
        int nBlock = (int)m_param.tokenized_data_pairs_param.block_size;

        // The encoder selects the item indexes; the decoder then loads the items at those same indexes.
        int[] rgnSelected;
        Tuple<float[], float[]> encBatch = m_encoderData.GetData(nBatch, nBlock, m_decoderData, out rgnSelected);
        Tuple<float[], float[]> decBatch = m_decoderData.GetDataAt(nBatch, nBlock, rgnSelected);

        colTop[0].mutable_cpu_data = convert(encBatch.Item1);
        colTop[1].mutable_cpu_data = convert(decBatch.Item1);
        colTop[2].mutable_cpu_data = convert(decBatch.Item2);
    }
    else
    {
        colTop[0].CopyFromAndPad(colBottom[0]); // enc data
        colTop[1].CopyFromAndPad(colBottom[1]); // dec data (with BOS token, initially)
        // colTop[2] is left untouched: there is no decoder target data when running.
    }

    Blob<T> blobEncIn = colTop[0];
    Blob<T> blobDecIn = colTop[1];
    Blob<T> blobEncMask = colTop[3];
    Blob<T> blobDecMask = colTop[4];

    // Fill encoder mask based on encoder input.
    m_cuda.sign(blobEncIn.count(), blobEncIn.gpu_data, blobEncMask.mutable_gpu_data);

    // Fill decoder mask based on decoder input.
    m_cuda.channel_duplicate(blobDecMask.count(), blobDecIn.num, blobDecIn.channels, blobDecMask.count(2), blobDecIn.gpu_data, blobDecMask.mutable_gpu_data);
    m_cuda.sign(blobDecMask.count(), blobDecMask.gpu_data, blobDecMask.mutable_gpu_data);

    // Overlay triangular matrix on decoder mask.
    m_cuda.mul(blobDecMask.count(), blobDecMask.gpu_data, m_blobTriangle.gpu_data, blobDecMask.mutable_gpu_data);
}
/// <summary>
/// Intentionally empty: this layer performs no work on the backward pass.
/// </summary>
/// <param name="colTop">Not used.</param>
/// <param name="rgbPropagateDown">Not used.</param>
/// <param name="colBottom">Not used.</param>
protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
{
    // Nothing to do.
}
/// <summary>
/// Always returns true; see the PreProcessInput overloads.
/// </summary>
public override bool SupportsPreProcessing
{
    get
    {
        return true;
    }
}
/// <summary>
/// Always returns true; see PostProcessLogitsOutput.
/// </summary>
public override bool SupportsPostProcessingLogits
{
    get
    {
        return true;
    }
}
/// <summary>
/// Tokenizes a string with the selected vocabulary, without adding BOS or EOS tokens.
/// </summary>
/// <param name="str">Specifies the text to tokenize.</param>
/// <param name="vocab">Specifies whether to use the encoder vocabulary; any other value uses the decoder vocabulary.</param>
/// <returns>The list of token indexes.</returns>
public List<int> Tokenize(string str, VOCABULARY vocab)
{
    InputData inputData = (vocab == VOCABULARY.ENCODER) ? m_encoderData : m_decoderData;
    return inputData.Tokenize(str, false, false);
}
/// <summary>
/// Converts a range of token indexes back into text using the selected vocabulary.
/// </summary>
/// <param name="rg">Specifies the token indexes.</param>
/// <param name="nStartIdx">Specifies the index of the first token to convert.</param>
/// <param name="nCount">Specifies the number of tokens to convert.</param>
/// <param name="vocab">Specifies whether to use the encoder vocabulary; any other value uses the decoder vocabulary.</param>
/// <returns>The detokenized string.</returns>
public string Detokenize(float[] rg, int nStartIdx, int nCount, VOCABULARY vocab)
{
    if (vocab == VOCABULARY.ENCODER)
        return m_encoderData.Detokenize(rg, nStartIdx, nCount, false, false);

    return m_decoderData.Detokenize(rg, nStartIdx, nCount, false, false);
}
/// <summary>
/// Returns the size of the selected vocabulary.
/// </summary>
/// <param name="src">Specifies whether to query the encoder vocabulary; any other value queries the decoder vocabulary.</param>
/// <returns>The vocabulary size.</returns>
public uint GetVocabuarySize(VOCABULARY src)
{
    if (src == VOCABULARY.ENCODER)
        return m_encoderData.VocabularySize;

    return m_decoderData.VocabularySize;
}
// Tokenizes the 'InputData' property of the custom input and returns two blobs: the encoder input holding
// the tokens and a single-element decoder input set to the BOS token.
// NOTE(review): listing lines 452-453 are missing from this view and no assignment to the out parameter
// 'nSeqLen' is visible here - confirm against the full source.
450 public override BlobCollection<T> PreProcessInput(PropertySet customInput, out int nSeqLen, BlobCollection<T> colBottom = null)
451 {
// Lazily create the blobs that are returned to the caller (released in dispose).
454 if (m_blobEncIn == null)
455 m_blobEncIn = new Blob<T>(m_cuda, m_log);
457 if (m_blobDecIn == null)
458 m_blobDecIn = new Blob<T>(m_cuda, m_log);
460 string strInput = customInput.GetProperty("InputData");
461 if (string.IsNullOrEmpty(strInput))
462 throw new Exception("Could not find 'InputData' property!");
// Tokenize with both flags set (bAddBos, bAddEos) and copy the indexes into a float array.
464 List<int> rgTokens = m_encoderData.Tokenize(strInput, true, true);
465 float[] rgInput = new float[rgTokens.Count];
467 for (int i = 0; i < rgTokens.Count; i++)
468 {
469 rgInput[i] = rgTokens[i];
470 }
// Encoder input shape is (1, token count); decoder input shape is (1, 1).
472 int[] rgShape = new int[2];
473 rgShape[0] = 1;
474 rgShape[1] = rgInput.Length;
476 m_blobEncIn.Reshape(rgShape);
478 rgShape[1] = 1;
479 m_blobDecIn.Reshape(rgShape);
481 m_blobEncIn.mutable_cpu_data = convert(rgInput);
// The decoder input starts with just the BOS token.
482 m_blobDecIn.SetData((int)SPECIAL_TOKENS.BOS);
484 return new BlobCollection<T>() { m_blobEncIn, m_blobDecIn };
485 }
/// <summary>
/// Feeds a newly produced token back into the decoder input: the token is appended and the first
/// existing token is removed, so the sequence length is unchanged.
/// </summary>
/// <param name="str">Not used by this implementation.</param>
/// <param name="nTokIdx">Specifies the token index to append; an EOS token ends processing.</param>
/// <param name="colBottom">Specifies the bottom blobs; the second blob is updated when present, otherwise the first.</param>
/// <returns>False when the token is EOS (nothing is changed), true otherwise.</returns>
public override bool PreProcessInput(string str, int? nTokIdx, BlobCollection<T> colBottom = null)
{
    bool bEndOfSequence = nTokIdx.HasValue && nTokIdx.Value == (int)SPECIAL_TOKENS.EOS;
    if (bEndOfSequence)
        return false;

    Blob<T> blobTarget;
    if (colBottom.Count > 1)
        blobTarget = colBottom[1];
    else
        blobTarget = colBottom[0];

    List<float> rgTokens = new List<float>(convertF(blobTarget.mutable_cpu_data));
    rgTokens.Add(nTokIdx.Value);
    rgTokens.RemoveAt(0);

    // Re-apply the shape with the current token count on axis 1, then write the tokens back.
    List<int> rgShape = Utility.Clone<int>(blobTarget.shape());
    rgShape[1] = rgTokens.Count;
    blobTarget.Reshape(rgShape);
    blobTarget.mutable_cpu_data = convert(rgTokens.ToArray());

    return true;
}
// Converts the logits into a single detokenized result: softmax is run over the logits, argmax over the
// softmax output, and the token at nCurIdx is detokenized with the decoder vocabulary.
// The returned list always holds exactly one tuple (text followed by a space, token index, 0); nK is not
// used in the code below.
528 public override List<Tuple<string, int, double>> PostProcessLogitsOutput(int nCurIdx, Blob<T> blobLogits, Layer<T> softmax, int nAxis, int nK = 1)
529 {
// NOTE(review): rgData, rgLogits and rgTopK are never read below; the mutable_cpu_data access may still
// have transfer side effects, so they are left untouched - confirm before removing.
530 float[] rgData = convertF(blobLogits.mutable_cpu_data);
531 int nVocabCount = blobLogits.count(nAxis);
532 float[] rgLogits = new float[nVocabCount];
533 Dictionary<int, float> rgTopK = new Dictionary<int, float>();
// Lazily create the scratch blobs (released in dispose).
535 if (m_blobX == null)
536 m_blobX = new Blob<T>(m_cuda, m_log);
537 if (m_blobY == null)
538 m_blobY = new Blob<T>(m_cuda, m_log);
540 BlobCollection<T> colBottom = new BlobCollection<T>() { blobLogits };
541 BlobCollection<T> colTop = new BlobCollection<T>() { m_blobY };
// When the caller supplies no softmax layer, create one on first use and reuse it afterwards.
542 if (softmax == null)
543 {
544 if (m_softmax == null)
545 {
546 LayerParameter softmax_param = new LayerParameter(LayerParameter.LayerType.SOFTMAX);
547 softmax_param.softmax_param.axis = nAxis;
548 m_softmax = Layer<T>.Create(m_cuda, m_log, softmax_param, null);
549 m_softmax.Setup(colBottom, colTop);
550 }
552 softmax = m_softmax;
553 }
// Create the argmax layer on first use; it is set up with m_blobX shaped like the softmax output.
555 if (m_argmax == null)
556 {
557 LayerParameter argmax_param = new LayerParameter(LayerParameter.LayerType.ARGMAX);
558 argmax_param.argmax_param.out_max_val = false;
559 argmax_param.argmax_param.enable_cuda_impl = true;
560 argmax_param.argmax_param.axis = nAxis;
561 m_argmax = Layer<T>.Create(m_cuda, m_log, argmax_param, null);
562 softmax.Reshape(colBottom, colTop);
563 m_blobX.ReshapeLike(colTop[0]);
564 colBottom[0] = m_blobX;
565 m_argmax.Setup(colBottom, colTop);
566 }
// Run softmax on the logits, copy its output into m_blobX, then run argmax on that copy (both write to colTop[0]).
568 colBottom[0] = blobLogits;
569 softmax.Forward(colBottom, colTop);
570 m_blobX.CopyFrom(colTop[0]);
571 colBottom[0] = m_blobX;
572 m_argmax.Forward(colBottom, colTop);
// Pick the argmax result at the current position and detokenize it.
574 float[] rgArgMax = convertF(colTop[0].mutable_cpu_data);
575 int nCharIdx = (int)rgArgMax[nCurIdx];
577 string str = m_decoderData.Detokenize(nCharIdx, true, true);
578 str += " ";
580 return new List<Tuple<string, int, double>>() { new Tuple<string, int, double>(str, nCharIdx, 0) };
581 }
/// <summary>
/// Detokenizes the values of the given blob with the decoder vocabulary, stopping at the first zero value.
/// </summary>
/// <param name="blobSoftmax">Specifies the blob whose values are read as token indexes.</param>
/// <returns>The concatenated detokenized text.</returns>
public override string PostProcessFullOutput(Blob<T> blobSoftmax)
{
    float[] rgTokens = convertF(blobSoftmax.mutable_cpu_data);
    StringBuilder sbOut = new StringBuilder();

    for (int i = 0; i < rgTokens.Length; i++)
    {
        float fTok = rgTokens[i];

        // A zero value terminates the output.
        if (fTok == 0)
            break;

        sbOut.Append(m_decoderData.Detokenize((int)fTok, true, true));
    }

    return sbOut.ToString();
}
603 }
608 public class TextListData : InputData
609 {
610 List<string> m_rgstrData = new List<string>();
611 List<Tuple<int[], int[]>> m_rgnData = new List<Tuple<int[], int[]>>();
612 IVocabulary m_vocab;
613 float[] m_rgData = null;
614 float[] m_rgTgt = null;
615 Phase m_phase;
616 Log m_log;
621 public enum VOCABUARY_TYPE
622 {
630 WORD
631 }
/// <summary>
/// Loads a text file line by line, builds the vocabulary from those lines and tokenizes every line.
/// </summary>
/// <param name="log">Specifies the output log.</param>
/// <param name="strSrcFile">Specifies the text file to load; the $ProgramData$ macro is expanded.</param>
/// <param name="strVocabFile">Specifies the vocabulary file passed to the SentencePiece vocabulary.</param>
/// <param name="bIncludeTarget">When true a target is created for each line and the line is tokenized with
/// (true, false) flags; when false no target is created (it is stored as null) and the flags are (false, true).</param>
/// <param name="vocabType">Specifies the vocabulary type; values other than WORD and SENTENCEPIECE use the character vocabulary.</param>
/// <param name="nRandomSeed">Optionally, specifies the seed passed to the base class.</param>
/// <param name="phase">Specifies the phase stored by this instance.</param>
public TextListData(Log log, string strSrcFile, string strVocabFile, bool bIncludeTarget, TokenizedDataParameter.VOCABULARY_TYPE vocabType, int? nRandomSeed = null, Phase phase = Phase.NONE) : base(nRandomSeed)
{
    m_log = log;
    m_phase = phase;

    switch (vocabType)
    {
        case TokenizedDataParameter.VOCABULARY_TYPE.WORD:
            m_vocab = new VocabularyWord(m_random, true, true);
            break;

        case TokenizedDataParameter.VOCABULARY_TYPE.SENTENCEPIECE:
            m_vocab = new VocabularySentencePiece(m_random, true, true, strVocabFile);
            break;

        default:
            m_vocab = new VocabularyCharacter(m_random, true, true, true);
            break;
    }

    string strProgData = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData);
    strSrcFile = Utility.ReplaceMacro(strSrcFile, "$ProgramData$", strProgData);

    string[] rgstrLines = File.ReadAllLines(strSrcFile);
    int nLineCount = rgstrLines.Length;

    // One stopwatch throttles the progress output of both passes to about once per second.
    Stopwatch sw = Stopwatch.StartNew();

    // Pass 1: keep the raw lines and feed them to the vocabulary.
    int nLine = 0;
    foreach (string strLine in rgstrLines)
    {
        m_rgstrData.Add(strLine);
        m_vocab.Add(strLine);

        if (sw.Elapsed.TotalMilliseconds > 1000)
        {
            sw.Restart();
            double dfPct = (double)nLine / (double)nLineCount;
            m_log.Progress = dfPct;
            m_log.WriteLine("Loading vocabulary " + nLine.ToString("N0") + " of " + nLineCount.ToString("N0") + " (" + dfPct.ToString("P") + ") ...", true);
        }

        nLine++;
    }

    m_vocab.Build();

    // Pass 2: tokenize every line now that the vocabulary is complete.
    for (int nItem = 0; nItem < m_rgstrData.Count; nItem++)
    {
        int[] rgnSrc = m_vocab.Tokenize(m_rgstrData[nItem], bIncludeTarget, !bIncludeTarget);
        int[] rgnTrg = bIncludeTarget ? m_vocab.CreateTarget(rgnSrc) : null;

        m_rgnData.Add(new Tuple<int[], int[]>(rgnSrc, rgnTrg));

        if (sw.Elapsed.TotalMilliseconds > 1000)
        {
            sw.Restart();
            double dfPct = (double)nItem / (double)m_rgstrData.Count;
            m_log.Progress = dfPct;
            m_log.WriteLine("Tokenizing data " + nItem.ToString("N0") + " of " + m_rgstrData.Count.ToString("N0") + " (" + dfPct.ToString("P") + ") ...", true);
        }
    }

    m_log.WriteLine("'" + strSrcFile + "' vocabulary size = " + m_vocab.Count.ToString("N0"));
}
/// <summary>
/// Returns the raw text lines loaded by the constructor.
/// </summary>
public override List<string> RawData
{
    get
    {
        return m_rgstrData;
    }
}
/// <summary>
/// Returns the token size, which is always 1 for this data source.
/// </summary>
public override uint TokenSize
{
    get
    {
        const uint nTokenSize = 1;
        return nTokenSize;
    }
}
/// <summary>
/// Returns the number of entries in the vocabulary.
/// </summary>
public override uint VocabularySize
{
    get
    {
        var nCount = m_vocab.Count;
        return (uint)nCount;
    }
}
/// <summary>
/// Reports whether the item at the given index has non-empty source and/or target tokens.
/// </summary>
/// <param name="nIdx">Specifies the index of the item to check.</param>
/// <param name="bIncludeSrc">When true, the source tokens must be present and non-empty.</param>
/// <param name="bIncludeTrg">When true, the target tokens must be present and non-empty.</param>
/// <returns>True when all requested data is available; false otherwise, including when the index is out of
/// range or the requested array was never created.</returns>
public override bool GetDataAvailabilityAt(int nIdx, bool bIncludeSrc, bool bIncludeTrg)
{
    // An index outside this data set has no data available (e.g. when paired sets differ in length).
    if (nIdx < 0 || nIdx >= m_rgnData.Count)
        return false;

    int[] rgSrc = m_rgnData[nIdx].Item1;
    int[] rgTrg = m_rgnData[nIdx].Item2;

    if (bIncludeSrc && (rgSrc == null || rgSrc.Length == 0))
        return false;

    // The target is stored as null when the data was loaded without targets.
    if (bIncludeTrg && (rgTrg == null || rgTrg.Length == 0))
        return false;

    return true;
}
/// <summary>
/// Fills a batch with randomly selected items, zero-padding or truncating each item to the block size.
/// </summary>
/// <param name="nBatchSize">Specifies the number of items in the batch.</param>
/// <param name="nBlockSize">Specifies the number of token slots per item.</param>
/// <param name="trgData">Specifies the paired data source; an item is only selected when that source reports
/// both source and target data available at the same index.</param>
/// <param name="rgnIdx">Returns the index selected for each batch item.</param>
/// <returns>A tuple of the source buffer and the target buffer. The buffers are owned by this instance and are
/// reused (and overwritten) by subsequent GetData/GetDataAt calls.</returns>
/// <exception cref="Exception">Thrown when no usable item is found after more than 20 retries.</exception>
public override Tuple<float[], float[]> GetData(int nBatchSize, int nBlockSize, InputData trgData, out int[] rgnIdx)
{
    int nSize = nBatchSize * nBlockSize;

    // Reuse the buffers when the size is unchanged; zero is the padding value.
    if (m_rgData == null || m_rgData.Length != nSize)
        m_rgData = new float[nSize];
    else
        Array.Clear(m_rgData, 0, m_rgData.Length);

    if (m_rgTgt == null || m_rgTgt.Length != nSize)
        m_rgTgt = new float[nSize];
    else
        Array.Clear(m_rgTgt, 0, m_rgTgt.Length);

    rgnIdx = new int[nBatchSize];

    for (int i = 0; i < nBatchSize; i++)
    {
        int nDataIdx = m_random.Next(m_rgnData.Count);
        int[] rgSrc = m_rgnData[nDataIdx].Item1;
        int nRetryCount = 0;

        // Re-draw until the item has source tokens and the paired data has data at the same index.
        while (rgSrc.Length == 0 || !trgData.GetDataAvailabilityAt(nDataIdx, true, true))
        {
            nDataIdx = m_random.Next(m_rgnData.Count);
            rgSrc = m_rgnData[nDataIdx].Item1;

            nRetryCount++;
            if (nRetryCount > 20 && (rgSrc.Length == 0 || !trgData.GetDataAvailabilityAt(nDataIdx, true, true)))
                throw new Exception("Could not find a non-empty source data item!");
        }

        int[] rgTrg = m_rgnData[nDataIdx].Item2;
        int nDstIdx = i * nBlockSize;
        int nLastIdx = nDstIdx + nBlockSize - 1;

        rgnIdx[i] = nDataIdx;

        for (int j = 0; j < nBlockSize; j++)
        {
            if (j < rgSrc.Length)
                m_rgData[nDstIdx + j] = rgSrc[j];

            if (rgTrg != null && j < rgTrg.Length)
                m_rgTgt[nDstIdx + j] = rgTrg[j];
        }

        // When a sequence ending in EOS was truncated, force the last slot to EOS.
        // The Length check avoids indexing an empty target array.
        if (rgTrg != null &&
            rgTrg.Length > 0 &&
            rgTrg[rgTrg.Length - 1] == EOS &&
            m_rgTgt[nLastIdx] != 0 &&
            m_rgTgt[nLastIdx] != EOS)
            m_rgTgt[nLastIdx] = EOS;

        if (rgSrc[rgSrc.Length - 1] == EOS &&
            m_rgData[nLastIdx] != 0 &&
            m_rgData[nLastIdx] != EOS)
            m_rgData[nLastIdx] = EOS;
    }

    return new Tuple<float[], float[]>(m_rgData, m_rgTgt);
}
/// <summary>
/// Fills a batch from the items at the given indexes, zero-padding or truncating each item to the block size.
/// </summary>
/// <param name="nBatchSize">Specifies the number of items in the batch (used to size the buffers).</param>
/// <param name="nBlockSize">Specifies the number of token slots per item.</param>
/// <param name="rgnIdx">Specifies the index of the item to load for each batch slot.</param>
/// <returns>A tuple of the source buffer and the target buffer. The buffers are owned by this instance and are
/// reused (and overwritten) by subsequent GetData/GetDataAt calls.</returns>
public override Tuple<float[], float[]> GetDataAt(int nBatchSize, int nBlockSize, int[] rgnIdx)
{
    int nSize = nBatchSize * nBlockSize;

    // Reuse the buffers when the size is unchanged; zero is the padding value.
    if (m_rgData == null || m_rgData.Length != nSize)
        m_rgData = new float[nSize];
    else
        Array.Clear(m_rgData, 0, m_rgData.Length);

    if (m_rgTgt == null || m_rgTgt.Length != nSize)
        m_rgTgt = new float[nSize];
    else
        Array.Clear(m_rgTgt, 0, m_rgTgt.Length);

    for (int i = 0; i < rgnIdx.Length; i++)
    {
        int nDataIdx = rgnIdx[i];
        int nDstIdx = i * nBlockSize;
        int nLastIdx = nDstIdx + nBlockSize - 1;

        int[] rgSrc = m_rgnData[nDataIdx].Item1;
        int[] rgTrg = m_rgnData[nDataIdx].Item2;

        for (int j = 0; j < nBlockSize; j++)
        {
            if (j < rgSrc.Length)
                m_rgData[nDstIdx + j] = rgSrc[j];

            // The null test must come first: the target is null when the data was loaded without targets.
            if (rgTrg != null && j < rgTrg.Length)
                m_rgTgt[nDstIdx + j] = rgTrg[j];
        }

        // When a sequence ending in EOS was truncated, force the last slot to EOS.
        // The Length checks avoid indexing empty arrays.
        if (rgTrg != null &&
            rgTrg.Length > 0 &&
            rgTrg[rgTrg.Length - 1] == EOS &&
            m_rgTgt[nLastIdx] != 0 &&
            m_rgTgt[nLastIdx] != EOS)
            m_rgTgt[nLastIdx] = EOS;

        if (rgSrc.Length > 0 &&
            rgSrc[rgSrc.Length - 1] == EOS &&
            m_rgData[nLastIdx] != 0 &&
            m_rgData[nLastIdx] != EOS)
            m_rgData[nLastIdx] = EOS;
    }

    return new Tuple<float[], float[]>(m_rgData, m_rgTgt);
}
/// <summary>
/// Tokenizes a string with the internal vocabulary.
/// </summary>
/// <param name="str">Specifies the text to tokenize.</param>
/// <param name="bAddBos">Passed through to the vocabulary's Tokenize.</param>
/// <param name="bAddEos">Passed through to the vocabulary's Tokenize.</param>
/// <returns>A new list holding the token indexes.</returns>
public override List<int> Tokenize(string str, bool bAddBos, bool bAddEos)
{
    int[] rgnTokens = m_vocab.Tokenize(str, bAddBos, bAddEos);
    return new List<int>(rgnTokens);
}
/// <summary>
/// Detokenizes a range of token indexes into a space-separated string, stopping at the first token that
/// detokenizes to a null or empty string.
/// </summary>
/// <param name="rgfTokIdx">Specifies the token indexes.</param>
/// <param name="nStartIdx">Specifies the index of the first token to convert.</param>
/// <param name="nCount">Specifies the number of tokens to convert.</param>
/// <param name="bIgnoreBos">Passed through to the vocabulary's Detokenize.</param>
/// <param name="bIgnoreEos">Passed through to the vocabulary's Detokenize.</param>
/// <returns>The detokenized text with trailing spaces removed.</returns>
public override string Detokenize(float[] rgfTokIdx, int nStartIdx, int nCount, bool bIgnoreBos, bool bIgnoreEos)
{
    StringBuilder sb = new StringBuilder();
    int nEndIdx = nStartIdx + nCount;

    for (int nTok = nStartIdx; nTok < nEndIdx; nTok++)
    {
        string strItem = m_vocab.Detokenize((int)rgfTokIdx[nTok], bIgnoreBos, bIgnoreEos);
        if (string.IsNullOrEmpty(strItem))
            break;

        sb.Append(strItem);
        sb.Append(' ');
    }

    return sb.ToString().TrimEnd(' ');
}
/// <summary>
/// Detokenizes a single token index with the internal vocabulary.
/// </summary>
/// <param name="nTokIdx">Specifies the token index.</param>
/// <param name="bIgnoreBos">Passed through to the vocabulary's Detokenize.</param>
/// <param name="bIgnoreEos">Passed through to the vocabulary's Detokenize.</param>
/// <returns>The detokenized text.</returns>
public override string Detokenize(int nTokIdx, bool bIgnoreBos, bool bIgnoreEos)
{
    string strToken = m_vocab.Detokenize(nTokIdx, bIgnoreBos, bIgnoreEos);
    return strToken;
}
/// <summary>
/// Returns the begin-of-sequence value reported by the vocabulary.
/// </summary>
public override char BOS
{
    get
    {
        return m_vocab.BOS;
    }
}
/// <summary>
/// Returns the end-of-sequence value reported by the vocabulary.
/// </summary>
public override char EOS
{
    get
    {
        return m_vocab.EOS;
    }
}
932 }
938 {
939 List<Tuple<DateTime, int[], int[]>> m_rgnData = new List<Tuple<DateTime, int[], int[]>>();
940 List<int> m_rgVocabulary = new List<int>();
941 ICustomTokenInput m_iTokenInput;
942 string m_strVocabInfo;
943 float[] m_rgData = null;
944 float[] m_rgTgt = null;
945 Phase m_phase;
946 Log m_log;
947 int m_nVocabularySize;
/// <summary>
/// Creates the custom data source. Unless running in the RUN phase, the custom DLL is loaded and all
/// encoder ("ENC") or decoder tokens are read from it.
/// </summary>
/// <param name="evtCancel">Specifies the cancel event passed to the custom token loader.</param>
/// <param name="log">Specifies the output log.</param>
/// <param name="strCustomDllFile">Specifies the custom DLL file; the $ProgramData$ macro is expanded.</param>
/// <param name="strVocabInfo">Specifies "ENC" to load encoder tokens; any other value loads decoder tokens.</param>
/// <param name="nBlockSizeSrc">Specifies the source block size; at least twice this many items must be loaded.</param>
/// <param name="nRandomSeed">Optionally, specifies the seed passed to the base class.</param>
/// <param name="phase">Specifies the phase; nothing is loaded when this is Phase.RUN.</param>
/// <exception cref="Exception">Thrown when the DLL exposes no public ICustomTokenInput implementation or
/// too few items were loaded.</exception>
public CustomListData(CancelEvent evtCancel, Log log, string strCustomDllFile, string strVocabInfo, int nBlockSizeSrc, int? nRandomSeed = null, Phase phase = Phase.NONE) : base(nRandomSeed)
{
    m_log = log;
    m_phase = phase;
    m_strVocabInfo = strVocabInfo;

    if (phase == Phase.RUN)
        return;

    string strProgData = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData);
    strCustomDllFile = Utility.ReplaceMacro(strCustomDllFile, "$ProgramData$", strProgData);

    m_iTokenInput = loadCustomInput(strCustomDllFile);

    // loadCustomInput returns null when no matching type is found; fail with a clear message
    // instead of a NullReferenceException on the calls below.
    if (m_iTokenInput == null)
        throw new Exception("Could not find a public type implementing 'ICustomTokenInput' in '" + strCustomDllFile + "'.");

    m_rgnData = (strVocabInfo == "ENC") ? m_iTokenInput.LoadAllEncoderTokens(evtCancel, m_log, phase, out m_nVocabularySize) : m_iTokenInput.LoadAllDecoderTokens(evtCancel, m_log, phase, out m_nVocabularySize);

    int nMinCount = nBlockSizeSrc + nBlockSizeSrc;
    if (m_rgnData == null || m_rgnData.Count < nMinCount)
        throw new Exception("Insufficient number of tokens, must have at least " + nMinCount.ToString() + " tokens.");

    log.WriteLine(strVocabInfo + " vocabulary size = " + m_nVocabularySize.ToString());
}
/// <summary>
/// Loads the given assembly and creates an instance of the first public type that implements an interface
/// named 'ICustomTokenInput'.
/// </summary>
/// <param name="strCustomDllFile">Specifies the path of the assembly to load.</param>
/// <returns>The created instance, or null when the assembly has no such type.</returns>
/// <remarks>
/// Exceptions raised while loading the assembly or creating the instance propagate to the caller
/// unchanged, with their original stack trace.
/// </remarks>
private ICustomTokenInput loadCustomInput(string strCustomDllFile)
{
    Assembly a = Assembly.LoadFile(strCustomDllFile);

    foreach (Type t in a.GetTypes())
    {
        if (!t.IsPublic)
            continue;

        // The interface is matched by name.
        Type iface = t.GetInterface("ICustomTokenInput");
        if (iface == null)
            continue;

        object obj = Activator.CreateInstance(t);
        return (ICustomTokenInput)obj;
    }

    return null;
}
/// <summary>
/// Not supported by the custom input; reading this property always throws.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override List<string> RawData
{
    get
    {
        throw new NotImplementedException("Raw data not supported by Custom Input");
    }
}
/// <summary>
/// Returns the token size, which is always 1 for this data source.
/// </summary>
public override uint TokenSize
{
    get
    {
        const uint nTokenSize = 1;
        return nTokenSize;
    }
}
/// <summary>
/// Returns the vocabulary size reported by the custom token loader.
/// </summary>
public override uint VocabularySize
{
    get
    {
        int nSize = m_nVocabularySize;
        return (uint)nSize;
    }
}
/// <summary>
/// Reports whether the item at the given index has non-empty source and/or target tokens.
/// </summary>
/// <param name="nIdx">Specifies the index of the item to check.</param>
/// <param name="bIncludeSrc">When true, the source tokens must be present and non-empty.</param>
/// <param name="bIncludeTrg">When true, the target tokens must be present and non-empty.</param>
/// <returns>True when all requested data is available; false otherwise, including when the index is out of
/// range or the requested array is null.</returns>
public override bool GetDataAvailabilityAt(int nIdx, bool bIncludeSrc, bool bIncludeTrg)
{
    // An index outside this data set has no data available.
    if (nIdx < 0 || nIdx >= m_rgnData.Count)
        return false;

    int[] rgSrc = m_rgnData[nIdx].Item2;
    int[] rgTrg = m_rgnData[nIdx].Item3;

    if (bIncludeSrc && (rgSrc == null || rgSrc.Length == 0))
        return false;

    if (bIncludeTrg && (rgTrg == null || rgTrg.Length == 0))
        return false;

    return true;
}
/// <summary>
/// Fills a batch with randomly selected items, zero-padding or truncating each item to the block size.
/// </summary>
/// <param name="nBatchSize">Specifies the number of items in the batch.</param>
/// <param name="nBlockSize">Specifies the number of token slots per item.</param>
/// <param name="trgData">Specifies the paired data source; an item is only selected when that source reports
/// both source and target data available at the same index.</param>
/// <param name="rgnIdx">Returns the index selected for each batch item.</param>
/// <returns>A tuple of the source buffer and the target buffer. The buffers are owned by this instance and are
/// reused (and overwritten) by subsequent GetData/GetDataAt calls.</returns>
/// <exception cref="Exception">Thrown when no usable item is found after more than 20 retries.</exception>
public override Tuple<float[], float[]> GetData(int nBatchSize, int nBlockSize, InputData trgData, out int[] rgnIdx)
{
    int nSize = nBatchSize * nBlockSize;

    // Reuse the buffers when the size is unchanged; zero is the padding value.
    if (m_rgData == null || m_rgData.Length != nSize)
        m_rgData = new float[nSize];
    else
        Array.Clear(m_rgData, 0, m_rgData.Length);

    if (m_rgTgt == null || m_rgTgt.Length != nSize)
        m_rgTgt = new float[nSize];
    else
        Array.Clear(m_rgTgt, 0, m_rgTgt.Length);

    rgnIdx = new int[nBatchSize];

    for (int i = 0; i < nBatchSize; i++)
    {
        int nDataIdx = m_random.Next(m_rgnData.Count);
        int[] rgSrc = m_rgnData[nDataIdx].Item2;
        int nRetryCount = 0;

        // Re-draw until the item has source tokens and the paired data has data at the same index.
        while (rgSrc.Length == 0 || !trgData.GetDataAvailabilityAt(nDataIdx, true, true))
        {
            nDataIdx = m_random.Next(m_rgnData.Count);
            rgSrc = m_rgnData[nDataIdx].Item2;

            nRetryCount++;
            if (nRetryCount > 20 && (rgSrc.Length == 0 || !trgData.GetDataAvailabilityAt(nDataIdx, true, true)))
                throw new Exception("Could not find a non-empty source data item!");
        }

        int[] rgTrg = m_rgnData[nDataIdx].Item3;
        int nDstIdx = i * nBlockSize;
        int nLastIdx = nDstIdx + nBlockSize - 1;

        rgnIdx[i] = nDataIdx;

        for (int j = 0; j < nBlockSize; j++)
        {
            if (j < rgSrc.Length)
                m_rgData[nDstIdx + j] = rgSrc[j];

            if (rgTrg != null && j < rgTrg.Length)
                m_rgTgt[nDstIdx + j] = rgTrg[j];
        }

        // When a sequence ending in EOS was truncated, force the last slot to EOS.
        // The Length check avoids indexing an empty target array.
        if (rgTrg != null &&
            rgTrg.Length > 0 &&
            rgTrg[rgTrg.Length - 1] == EOS &&
            m_rgTgt[nLastIdx] != 0 &&
            m_rgTgt[nLastIdx] != EOS)
            m_rgTgt[nLastIdx] = EOS;

        if (rgSrc[rgSrc.Length - 1] == EOS &&
            m_rgData[nLastIdx] != 0 &&
            m_rgData[nLastIdx] != EOS)
            m_rgData[nLastIdx] = EOS;
    }

    return new Tuple<float[], float[]>(m_rgData, m_rgTgt);
}
/// <summary>
/// Fills a batch from the items at the given indexes, zero-padding or truncating each item to the block size.
/// </summary>
/// <param name="nBatchSize">Specifies the number of items in the batch (used to size the buffers).</param>
/// <param name="nBlockSize">Specifies the number of token slots per item.</param>
/// <param name="rgnIdx">Specifies the index of the item to load for each batch slot.</param>
/// <returns>A tuple of the source buffer and the target buffer. The buffers are owned by this instance and are
/// reused (and overwritten) by subsequent GetData/GetDataAt calls.</returns>
public override Tuple<float[], float[]> GetDataAt(int nBatchSize, int nBlockSize, int[] rgnIdx)
{
    int nSize = nBatchSize * nBlockSize;

    // Reuse the buffers when the size is unchanged; zero is the padding value.
    if (m_rgData == null || m_rgData.Length != nSize)
        m_rgData = new float[nSize];
    else
        Array.Clear(m_rgData, 0, m_rgData.Length);

    if (m_rgTgt == null || m_rgTgt.Length != nSize)
        m_rgTgt = new float[nSize];
    else
        Array.Clear(m_rgTgt, 0, m_rgTgt.Length);

    for (int i = 0; i < rgnIdx.Length; i++)
    {
        int nDataIdx = rgnIdx[i];
        int nDstIdx = i * nBlockSize;
        int nLastIdx = nDstIdx + nBlockSize - 1;

        int[] rgSrc = m_rgnData[nDataIdx].Item2;
        int[] rgTrg = m_rgnData[nDataIdx].Item3;

        for (int j = 0; j < nBlockSize; j++)
        {
            if (j < rgSrc.Length)
                m_rgData[nDstIdx + j] = rgSrc[j];

            // The null test must come before the Length access.
            if (rgTrg != null && j < rgTrg.Length)
                m_rgTgt[nDstIdx + j] = rgTrg[j];
        }

        // When a sequence ending in EOS was truncated, force the last slot to EOS.
        // The Length checks avoid indexing empty arrays.
        if (rgTrg != null &&
            rgTrg.Length > 0 &&
            rgTrg[rgTrg.Length - 1] == EOS &&
            m_rgTgt[nLastIdx] != 0 &&
            m_rgTgt[nLastIdx] != EOS)
            m_rgTgt[nLastIdx] = EOS;

        if (rgSrc.Length > 0 &&
            rgSrc[rgSrc.Length - 1] == EOS &&
            m_rgData[nLastIdx] != 0 &&
            m_rgData[nLastIdx] != EOS)
            m_rgData[nLastIdx] = EOS;
    }

    return new Tuple<float[], float[]>(m_rgData, m_rgTgt);
}
/// <summary>
/// Not supported by the custom input; always throws.
/// </summary>
/// <param name="str">Not used.</param>
/// <param name="bAddBos">Not used.</param>
/// <param name="bAddEos">Not used.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override List<int> Tokenize(string str, bool bAddBos, bool bAddEos)
{
    string strMessage = "Tokenize not supported by Custom Input.";
    throw new NotImplementedException(strMessage);
}
/// <summary>
/// Not supported by the custom input; always throws.
/// </summary>
/// <param name="rgfTokIdx">Not used.</param>
/// <param name="nStartIdx">Not used.</param>
/// <param name="nCount">Not used.</param>
/// <param name="bIgnoreBos">Not used.</param>
/// <param name="bIgnoreEos">Not used.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override string Detokenize(float[] rgfTokIdx, int nStartIdx, int nCount, bool bIgnoreBos, bool bIgnoreEos)
{
    string strMessage = "Detokenize not supported by Custom Input.";
    throw new NotImplementedException(strMessage);
}
/// <summary>
/// Not supported by the custom input; always throws.
/// </summary>
/// <param name="nTokIdx">Not used.</param>
/// <param name="bIgnoreBos">Not used.</param>
/// <param name="bIgnoreEos">Not used.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override string Detokenize(int nTokIdx, bool bIgnoreBos, bool bIgnoreEos)
{
    string strMessage = "Detokenize not supported by Custom Input.";
    throw new NotImplementedException(strMessage);
}
/// <summary>
/// Returns the begin-of-sequence value, taken from SPECIAL_TOKENS.BOS.
/// </summary>
public override char BOS
{
    get
    {
        SPECIAL_TOKENS token = SPECIAL_TOKENS.BOS;
        return (char)token;
    }
}
/// <summary>
/// Returns the end-of-sequence value, taken from SPECIAL_TOKENS.EOS.
/// </summary>
public override char EOS
{
    get
    {
        SPECIAL_TOKENS token = SPECIAL_TOKENS.EOS;
        return (char)token;
    }
}
1232 }
The CancelEvent provides an extension to the manual cancel event that allows for overriding the manual cancel event.
Definition: CancelEvent.cs:17
The Log class provides general output in text form.
Definition: Log.cs:13
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
Definition: Log.cs:80
void FAIL(string str)
Causes a failure which throws an exception with the descriptive text.
Definition: Log.cs:394
double Progress
Get/set the progress associated with the Log.
Definition: Log.cs:147
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
Definition: Log.cs:239
Specifies a key-value pair of properties.
Definition: PropertySet.cs:16
string GetProperty(string strName, bool bThrowExceptions=true)
Returns a property as a string value.
Definition: PropertySet.cs:146
The Utility class provides general utility functions.
Definition: Utility.cs:35
static string ReplaceMacro(string strRaw, string strMacroName, string strReplacement)
The ReplaceMacro method is used to replace a macro in a given string.
Definition: Utility.cs:947
static int Count(List< int > rgShape, int nStartIdx=0, int nEndIdx=-1)
Return the count of items given the shape.
Definition: Utility.cs:83
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
The Blob is the main holder of data that moves through the Layers of the Net.
Definition: Blob.cs:25
int channels
DEPRECATED; legacy shape accessor channels: use shape(1) instead.
Definition: Blob.cs:800
void SetData(T[] rgData, int nCount=-1, bool bSetCount=true)
Sets a number of items within the Blob's data.
Definition: Blob.cs:1922
int height
DEPRECATED; legacy shape accessor height: use shape(2) instead.
Definition: Blob.cs:808
T[] mutable_cpu_data
Get data from the GPU and bring it over to the host, or Set data from the Host and send it over to th...
Definition: Blob.cs:1461
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
Definition: Blob.cs:442
bool CompareShape(List< int > rgShape, bool bCompareCpuDataLen=false)
Compares the shape of this blob to another shape.
Definition: Blob.cs:2108
void CopyFrom(Blob< T > src, int nSrcOffset, int nDstOffset, int nCount, bool bCopyData, bool bCopyDiff)
Copy from a source Blob.
Definition: Blob.cs:903
void SetParameter(string strName, double dfVal)
Set a blob parameter.
Definition: Blob.cs:233
List< int > shape()
Returns an array where each element contains the shape of an axis of the Blob.
Definition: Blob.cs:684
int count()
Returns the total number of items in the Blob.
Definition: Blob.cs:739
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
Definition: Blob.cs:648
string Name
Get/set the name of the Blob.
Definition: Blob.cs:2184
int num
DEPRECATED; legacy shape accessor num: use shape(0) instead.
Definition: Blob.cs:792
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1479
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
An interface for the units of computation which can be composed into a Net.
Definition: Layer.cs:31
Log m_log
Specifies the Log for output.
Definition: Layer.cs:43
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
Definition: Layer.cs:47
void convert(BlobCollection< T > col)
Convert a collection of blobs from / to half size.
Definition: Layer.cs:535
T m_tZero
Specifies a generic type equal to 0.0.
Definition: Layer.cs:76
T m_tOne
Specifies a generic type equal to 1.0.
Definition: Layer.cs:72
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
Definition: Layer.cs:728
float convertF(T df)
Converts a generic to a float value.
Definition: Layer.cs:1359
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accommodate the shapes of the bottom blobs.
Phase m_phase
Specifies the Phase under which the Layer is run.
Definition: Layer.cs:51
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
Definition: Layer.cs:39
void Setup(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Implements common Layer setup functionality.
Definition: Layer.cs:439
static Layer< T > Create(CudaDnn< T > cuda, Log log, LayerParameter p, CancelEvent evtCancel, IXDatabaseBase db=null, TransferInput trxinput=null)
Create a new Layer based on the LayerParameter.
Definition: Layer.cs:1468
LayerParameter.LayerType m_type
Specifies the Layer type.
Definition: Layer.cs:35
The CustomData supports external data input via an external Assembly DLL that supports the ICustomTok...
override bool GetDataAvailabilityAt(int nIdx, bool bIncludeSrc, bool bIncludeTrg)
Returns true if data is available at the given index.
override List< string > RawData
Returns the raw data.
CustomListData(CancelEvent evtCancel, Log log, string strCustomDllFile, string strVocabInfo, int nBlockSizeSrc, int? nRandomSeed=null, Phase phase=Phase.NONE)
The constructor.
override string Detokenize(int nTokIdx, bool bIgnoreBos, bool bIgnoreEos)
Detokenize a single token.
override uint TokenSize
Returns the token size.
override Tuple< float[], float[]> GetDataAt(int nBatchSize, int nBlockSize, int[] rgnIdx)
Fill a batch of data from a specified array of indexes.
override List< int > Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize an input string using the internal vocabulary.
override Tuple< float[], float[]> GetData(int nBatchSize, int nBlockSize, InputData trgData, out int[] rgnIdx)
Retrieve random blocks from the source data where the data and target are the same but offset by one ...
override char EOS
Return the special end of sequence character.
override char BOS
Return the special begin of sequence character.
override uint VocabularySize
Returns the vocabulary size.
override string Detokenize(float[] rgfTokIdx, int nStartIdx, int nCount, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array into a string.
The InputData is an abstract class used to get training data and tokenize input data.
Definition: Interfaces.cs:113
abstract uint TokenSize
Returns the size of a single token (e.g. 1 for character data)
Definition: Interfaces.cs:138
abstract uint VocabularySize
Returns the size of the vocabulary.
Definition: Interfaces.cs:142
abstract Tuple< float[], float[]> GetData(int nBatchSize, int nBlockSize, InputData trgData, out int[] rgnIdx)
Gets a set of randomly selected source/target data, where the target may be null.
abstract Tuple< float[], float[]> GetDataAt(int nBatchSize, int nBlockSize, int[] rgnIdx)
Gets a set of source/target data from a specific index.
abstract List< int > Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize an input string using the internal vocabulary.
abstract bool GetDataAvailabilityAt(int nIdx, bool bIncludeSrc, bool bIncludeTrg)
Returns true if data is available at the given index.
Random m_random
Specifies the random object made available to the derived classes.
Definition: Interfaces.cs:117
abstract string Detokenize(int nTokIdx, bool bIgnoreBos, bool bIgnoreEos)
Detokenize a single token.
The TextListData manages parallel lists of data where the first list contains the encoder input data ...
override string Detokenize(float[] rgfTokIdx, int nStartIdx, int nCount, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array into a string.
override uint VocabularySize
Returns the number of unique characters in the data.
override uint TokenSize
The text data token size is a single character.
override List< string > RawData
Return the raw data.
override string Detokenize(int nTokIdx, bool bIgnoreBos, bool bIgnoreEos)
Detokenize a single token.
override Tuple< float[], float[]> GetData(int nBatchSize, int nBlockSize, InputData trgData, out int[] rgnIdx)
Retrieve random blocks from the source data where the data and target are the same but offset by one ...
override char EOS
Return the special end of sequence character.
override List< int > Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize an input string using the internal vocabulary.
Defines the vocabulary type to use.
override char BOS
Return the special begin of sequence character.
override Tuple< float[], float[]> GetDataAt(int nBatchSize, int nBlockSize, int[] rgnIdx)
Fill a batch of data from a specified array of indexes.
override bool GetDataAvailabilityAt(int nIdx, bool bIncludeSrc, bool bIncludeTrg)
Returns true if data is available at the given index.
TextListData(Log log, string strSrcFile, string strVocabFile, bool bIncludeTarget, TokenizedDataParameter.VOCABULARY_TYPE vocabType, int? nRandomSeed=null, Phase phase=Phase.NONE)
The constructor.
The TokenizedDataPairsLayer loads and tokenizes data for a transformer model where data is loaded in ...
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Run the Forward computation, which fills the data into the top (output) Blobs.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the top based on the parameter batch and block size.
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Not implemented - data Layers do not perform backward.
override string PostProcessFullOutput(Blob< T > blobSoftmax)
The PostProcessFullOutput allows derivative data layers to post-process the results,...
override void setup_internal_blobs(BlobCollection< T > col)
Add all internal blobs.
override int ExactNumTopBlobs
Returns the minimum number of required top (output) Blobs: enc_in, dec_in, dec_out,...
override bool SupportsPostProcessingLogits
Specifies that this layer supports post processing the logits.
override BlobCollection< T > PreProcessInput(PropertySet customInput, out int nSeqLen, BlobCollection< T > colBottom=null)
Preprocess the input and return as a set of bottom blobs.
override bool SupportsPreProcessing
Specifies that this layer supports preprocessing.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override List< Tuple< string, int, double > > PostProcessLogitsOutput(int nCurIdx, Blob< T > blobLogits, Layer< T > softmax, int nAxis, int nK=1)
Allows post processing the logits output data by converting the logits to and selecting from the prob...
override void dispose()
Release all internal blobs.
override bool PreProcessInput(string str, int? nTokIdx, BlobCollection< T > colBottom=null)
Preprocess the input and return as a set of bottom blobs.
List< int > Tokenize(string str, VOCABULARY vocab)
Tokenize an input string using the internal vocabulary.
TokenizedDataPairsLayer(CudaDnn< T > cuda, Log log, LayerParameter p, IXDatabaseBase db, CancelEvent evtCancel)
The TokenizedDataPairsLayer constructor.
string Detokenize(float[] rg, int nStartIdx, int nCount, VOCABULARY vocab)
Detokenize a set of tokens from the data specified.
override int? ExactNumBottomBlobs
Specifies the exact number of bottom blobs (TRAIN|TEST: 0, RUN:2 encin, decin)
uint GetVocabuarySize(VOCABULARY src)
Get the vocabulary size for the specified vocabulary source.
The VocabularyCharacters class manages the data vocabulary of characters.
The VocabularySentencePieces class manages the data vocabulary of words.
The VocabularyWords class manages the data vocabulary of words.
bool enable_cuda_impl
Specifies to use the low-level full cuda implementation of LayerNorm (default = false).
bool out_max_val
If true produce pairs (argmax, maxval)
int? axis
The axis along which to maximize – may be negative to index from the end (e.g., -1 for the last axis)...
Specifies the base parameter for all layers.
SoftmaxParameter softmax_param
Returns the parameter set when initialized with LayerType.SOFTMAX
ArgMaxParameter argmax_param
Returns the parameter set when initialized with LayerType.ARGMAX
TokenizedDataPairsParameter tokenized_data_pairs_param
Returns the parameter set when initialized with LayerType.TOKENIZED_DATA_PAIRS
Phase phase
Specifies the Phase for which this LayerParameter is run.
Specifies the layer type.
int axis
The axis along which to perform the softmax – may be negative to index from the end (e....
string vocab_data_url
Specifies the URL to the vocabulary data file used with the SENTENCEPIECE vocabulary type....
string vocab_data_dst_file
Specifies the destination file where the vocabulary data file data is downloaded. This pre-created vo...
string target_vocab_file
Specifies the target vocabulary file used with the SENTENCEPIECE vocabulary type. The vocabulary file...
string source_vocab_file
Specifies the source vocabulary file used with the SENTENCEPIECE vocabulary type. The vocabulary file...
string target
Specifies the data source based on the INPUT_TYPE used. Each dataset has both a training and testing ...
Specifies the parameters for the TokenizedDataLayer.
INPUT_TYPE input_type
Specifies data source input type.
string source
Specifies the data source based on the INPUT_TYPE used. Each dataset has both a training and testing ...
uint block_size
Specifies size of the block.
VOCABULARY_TYPE vocabulary_type
Specifies the vocabulary type to use.
int? seed
Specifies the seed used to initialize the random number generator (normally only for testing).
Defines the vocabulary type to use.
The IXDatabaseBase interface defines the general interface to the in-memory database.
Definition: Interfaces.cs:444
The ICustomTokenInput interface specifies the interface that all custom token inputs implement.
Definition: Interfaces.cs:88
List< Tuple< DateTime, int[], int[]> > LoadAllEncoderTokens(CancelEvent evtCancel, Log log, Phase phase, out int nVocabSize)
Load all encoder tokens and their associated date/time. evtCancel Specifies the cancel event....
List< Tuple< DateTime, int[], int[]> > LoadAllDecoderTokens(CancelEvent evtCancel, Log log, Phase phase, out int nVocabSize)
Load all decoder tokens and their associated date/time. evtCancel Specifies the cancel event....
The IVocabulary interface specifies the interface that all Vocabularies implement.
Definition: Interfaces.cs:14
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
Defines the Phase under which to run a Net.
Definition: Interfaces.cs:61
Specifies the special tokens.
Definition: Interfaces.cs:15
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
The MyCaffe.db.image namespace contains all image database related classes.
Definition: Database.cs:18
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
Definition: LayerFactory.cs:15
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12