MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
VocabularySentencePiece.cs
1using MyCaffe.basecode;
2using System;
3using System.Collections.Generic;
4using System.Diagnostics;
5using System.IO;
6using System.Linq;
7using System.Reflection;
8using System.Text;
9using System.Threading.Tasks;
10
11namespace MyCaffe.layers.gpt
12{
17 {
// Random number generator handed to the constructor; it is stored but not read by any member visible in this file.
18 Random m_random;
// Piece text -> score as loaded from the vocabulary file (keys have '_' and U+2581 trimmed from both ends).
19 Dictionary<string, double> m_rgPieces = new Dictionary<string, double>();
// Piece text -> token index; holds the placeholder value 1 per piece until Build() assigns the real indexes.
20 Dictionary<string, int> m_rgVocabKeyToIdx = new Dictionary<string, int>();
// Token index -> piece text; filled by Build() and used by Detokenize().
21 Dictionary<int, string> m_rgVocabIdxToKey = new Dictionary<int, string>();
// When true, Build() reserves a vocabulary entry for BOS and Detokenize() recognizes the BOS token.
22 bool m_bAddBos;
// When true, Build() reserves a vocabulary entry for EOS and Detokenize() recognizes the EOS token.
23 bool m_bAddEos;
24
/// <summary>
/// The constructor. Loads the piece/score table from the vocabulary file.
/// </summary>
/// <remarks>
/// Each line of the file is split on spaces and tabs; only lines that yield exactly two
/// fields (piece, score) with a parsable, non-zero score are kept.  Underscore and
/// U+2581 characters are trimmed from both ends of the piece, and when two pieces
/// collapse to the same key the first one read wins.
/// </remarks>
/// <param name="random">Specifies the random number generator to store.</param>
/// <param name="bAddBos">Specifies whether a BOS entry is reserved in the vocabulary.</param>
/// <param name="bAddEos">Specifies whether an EOS entry is reserved in the vocabulary.</param>
/// <param name="strVocabFile">Specifies the vocabulary file path; the '$ProgramData$' macro is replaced with the common application data folder.</param>
public VocabularySentencePiece(Random random, bool bAddBos, bool bAddEos, string strVocabFile)
{
    m_random = random;
    m_bAddBos = bAddBos;
    m_bAddEos = bAddEos;

    string strProgData = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData);
    strVocabFile = Utility.ReplaceMacro(strVocabFile, "$ProgramData$", strProgData);

    // Same number style that double.TryParse(string, out double) uses by default.
    System.Globalization.NumberStyles style = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

    foreach (string strLine in File.ReadAllLines(strVocabFile))
    {
        string[] rgstr = strLine.Split(' ', '\t');
        if (rgstr.Length != 2)
            continue;

        // The scores are machine-written, so they must be parsed with the invariant culture;
        // parsing with the current culture misreads '-3.25' on locales that use ',' as the decimal separator.
        double dfVal;
        if (!double.TryParse(rgstr[1], style, System.Globalization.CultureInfo.InvariantCulture, out dfVal) || dfVal == 0)
            continue;

        string strKey = rgstr[0].Trim('_', (char)9601);

        if (!m_rgPieces.ContainsKey(strKey))
            m_rgPieces.Add(strKey, dfVal);
    }
}
59
/// <summary>
/// Returns the number of entries currently held in the key-to-index vocabulary table.
/// </summary>
public int Count
{
    get
    {
        return m_rgVocabKeyToIdx.Count;
    }
}
67
/// <summary>
/// Tests whether a character is a digit, a punctuation mark or a symbol.
/// </summary>
/// <param name="ch">Specifies the character to test.</param>
/// <returns>Returns true when the character is a digit, punctuation or symbol; false otherwise.</returns>
private bool isSymbol(char ch)
{
    System.Globalization.UnicodeCategory category = char.GetUnicodeCategory(ch);

    bool bByCategory = category == System.Globalization.UnicodeCategory.OtherPunctuation
                    || category == System.Globalization.UnicodeCategory.OtherSymbol
                    || category == System.Globalization.UnicodeCategory.DecimalDigitNumber;

    return char.IsDigit(ch) || char.IsPunctuation(ch) || char.IsSymbol(ch) || bByCategory;
}
87
/// <summary>
/// Removes every white-space character from a string (not just leading and trailing ones).
/// </summary>
/// <param name="str">Specifies the string to strip; null is treated like an empty string.</param>
/// <returns>Returns the input with all white-space characters removed.</returns>
private string trim(string str)
{
    if (string.IsNullOrEmpty(str))
        return "";

    // A StringBuilder avoids re-allocating the result string for every character kept.
    StringBuilder sb = new StringBuilder(str.Length);

    foreach (char ch in str)
    {
        // char.IsWhiteSpace already covers the SpaceSeparator category, so no separate category test is needed.
        if (!char.IsWhiteSpace(ch))
            sb.Append(ch);
    }

    return sb.ToString();
}
102
/// <summary>
/// Registers the pieces of every space-separated word of a string in the vocabulary.
/// </summary>
/// <remarks>
/// Each word is consumed greedily from the left: the longest prefix found in the loaded
/// piece table is registered (with the placeholder index 1 until Build() runs) and removed,
/// and the search repeats on what is left.  When no prefix of the remaining text is a known
/// piece, the rest of that word is abandoned.
/// </remarks>
/// <param name="str">Specifies the text to add.</param>
public void Add(string str)
{
    foreach (string strWord in str.Split(' '))
    {
        string strRemaining = strWord;

        while (strRemaining.Length > 0)
        {
            // Find the longest prefix of the remaining text that is a known piece.
            int nLen = strRemaining.Length;
            while (nLen > 0 && !m_rgPieces.ContainsKey(strRemaining.Substring(0, nLen)))
            {
                nLen--;
            }

            // No known piece starts here, so the rest of this word is dropped.
            if (nLen == 0)
                break;

            string strPiece = strRemaining.Substring(0, nLen);
            if (!m_rgVocabKeyToIdx.ContainsKey(strPiece))
                m_rgVocabKeyToIdx.Add(strPiece, 1);

            strRemaining = strRemaining.Substring(nLen);
        }
    }
}
134
/// <summary>
/// Builds the vocabulary index tables from all pieces added so far.
/// </summary>
/// <remarks>
/// The pieces are sorted and prefixed with the reserved entries: pad (char 0) at index 0,
/// followed by BOS and then EOS when each is enabled.  The method may be called repeatedly:
/// both lookup tables are rebuilt from scratch, and reserved entries left in the table by a
/// previous build are removed before sorting so that the piece indexes stay the same.
/// </remarks>
/// <returns>Returns the number of entries in the vocabulary.</returns>
public int Build()
{
    string strPad = ((char)0).ToString();
    string strBos = BOS.ToString();
    string strEos = EOS.ToString();

    List<string> rgKeys = m_rgVocabKeyToIdx.Keys.ToList();

    // After an earlier Build() the table already holds the reserved entries; remove them so
    // they are not sorted in among the pieces, which would shift every index and leave gaps.
    rgKeys.RemoveAll(strKey => strKey == strPad || (m_bAddBos && strKey == strBos) || (m_bAddEos && strKey == strEos));

    // NOTE(review): the default string comparer is culture-sensitive, so the piece order (and
    // therefore the indexes) may depend on the current culture - confirm whether that is intended.
    rgKeys.Sort();

    // Both maps must be reset; leaving the index-to-key map populated makes a second Build() throw on a duplicate key.
    m_rgVocabKeyToIdx.Clear();
    m_rgVocabIdxToKey.Clear();

    if (m_bAddEos)
        rgKeys.Insert(0, strEos);
    if (m_bAddBos)
        rgKeys.Insert(0, strBos);
    rgKeys.Insert(0, strPad);

    // index 0 reserved for pad.
    for (int i = 0; i < rgKeys.Count; i++)
    {
        // Beyond the first three slots, skip any key whose first character is one of the reserved characters.
        if (i <= 2 || (rgKeys[i][0] != 0 && rgKeys[i][0] != BOS && rgKeys[i][0] != EOS))
        {
            m_rgVocabKeyToIdx.Add(rgKeys[i], i);
            m_rgVocabIdxToKey.Add(i, rgKeys[i]);
        }
    }

    return Count;
}
164
/// <summary>
/// Adds every space-separated word of a string to the vocabulary and then builds it.
/// </summary>
/// <param name="strData">Specifies the text from which the vocabulary is built.</param>
/// <returns>Returns the number of entries in the vocabulary.</returns>
public int BuildFromString(string strData)
{
    // Add() splits its argument on spaces itself, so the whole text can be handed over in one call.
    Add(strData);

    return Build();
}
180
/// <summary>
/// Returns the SPECIAL_TOKENS.BOS value as a character.
/// </summary>
public char BOS
{
    get
    {
        return (char)SPECIAL_TOKENS.BOS;
    }
}
188
/// <summary>
/// Returns the SPECIAL_TOKENS.EOS value as a character.
/// </summary>
public char EOS
{
    get
    {
        return (char)SPECIAL_TOKENS.EOS;
    }
}
196
/// <summary>
/// Creates a target sequence that is the source shifted left by one with EOS appended.
/// </summary>
/// <param name="rgSrc">Specifies the source token sequence.</param>
/// <returns>Returns the target sequence, which has the same length as the source (an empty source gives an empty target).</returns>
public int[] CreateTarget(int[] rgSrc)
{
    List<int> rgShifted = new List<int>(rgSrc);

    // Nothing to shift for an empty source.
    if (rgShifted.Count == 0)
        return rgShifted.ToArray();

    rgShifted.RemoveAt(0);
    rgShifted.Add(EOS);

    return rgShifted.ToArray();
}
214
/// <summary>
/// Tokenizes a single word into the index tokens of its pieces.
/// </summary>
/// <remarks>
/// The word is consumed greedily from the left using the longest prefix found in the loaded
/// piece table.  A matched piece that has no index in the built vocabulary is consumed
/// without emitting a token, and when no prefix of the remaining text is a known piece the
/// rest of the word is abandoned.
/// </remarks>
/// <param name="strWord">Specifies the word to tokenize.</param>
/// <param name="bMustExist">Not used by this implementation.</param>
/// <returns>Returns the list of index tokens, which may be empty.</returns>
public List<int> Tokenize(string strWord, bool bMustExist = true)
{
    List<int> rgTokens = new List<int>();
    string strRemaining = strWord;

    while (strRemaining.Length > 0)
    {
        // Find the longest prefix of the remaining text that is a known piece.
        int nLen = strRemaining.Length;
        while (nLen > 0 && !m_rgPieces.ContainsKey(strRemaining.Substring(0, nLen)))
        {
            nLen--;
        }

        // No known piece starts here, so the rest of the word is dropped.
        if (nLen == 0)
            break;

        int nIdx;
        if (m_rgVocabKeyToIdx.TryGetValue(strRemaining.Substring(0, nLen), out nIdx))
            rgTokens.Add(nIdx);

        strRemaining = strRemaining.Substring(nLen);
    }

    return rgTokens;
}
249
/// <summary>
/// Tokenizes a string of space-separated words.
/// </summary>
/// <param name="str">Specifies the text to tokenize; null or empty text gives an empty result.</param>
/// <param name="bAddBos">Specifies whether to place the BOS token at the start of the result.</param>
/// <param name="bAddEos">Specifies whether to place the EOS token at the end of the result.</param>
/// <returns>Returns the index tokens of all words in order.</returns>
public int[] Tokenize(string str, bool bAddBos, bool bAddEos)
{
    List<int> rgTokens = new List<int>();

    // BOS/EOS are only added when there is text to tokenize.
    if (!string.IsNullOrEmpty(str))
    {
        if (bAddBos)
            rgTokens.Add(BOS);

        foreach (string strWord in str.Split(' '))
        {
            rgTokens.AddRange(Tokenize(strWord));
        }

        if (bAddEos)
            rgTokens.Add(EOS);
    }

    return rgTokens.ToArray();
}
278
/// <summary>
/// Converts a single index token back into its text.
/// </summary>
/// <param name="nIdxToken">Specifies the index token to convert.</param>
/// <param name="bIgnoreBos">Specifies to return an empty string for BOS instead of "&lt;BOS&gt;".</param>
/// <param name="bIgnoreEos">Specifies to return an empty string for EOS instead of "&lt;EOS&gt;".</param>
/// <returns>Returns null for index 0 (pad), the BOS/EOS marker (or an empty string when ignored) for the special tokens, and the piece text otherwise.</returns>
/// <exception cref="Exception">Thrown when the token is not in the vocabulary.</exception>
public string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
{
    // Index 0 is the pad entry and has no text.
    if (nIdxToken == 0)
        return null;

    if (m_bAddBos && nIdxToken == BOS)
        return bIgnoreBos ? "" : "<BOS>";

    if (m_bAddEos && nIdxToken == EOS)
        return bIgnoreEos ? "" : "<EOS>";

    string strPiece;
    if (!m_rgVocabIdxToKey.TryGetValue(nIdxToken, out strPiece))
        throw new Exception("The token '" + nIdxToken.ToString() + "' is not in the vocabulary!");

    return strPiece;
}
317
/// <summary>
/// Converts an array of index tokens into a string with the pieces separated by single spaces.
/// </summary>
/// <param name="rgf">Specifies the index tokens; each value is truncated to an integer.</param>
/// <param name="bIgnoreBos">Specifies to leave BOS tokens out of the output.</param>
/// <param name="bIgnoreEos">Specifies to leave EOS tokens out of the output.</param>
/// <returns>Returns the detokenized text; tokens that produce no text (such as pad) are skipped.</returns>
public string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
{
    // Collect the pieces and join them once rather than growing a string for every token.
    List<string> rgParts = new List<string>(rgf.Length);

    foreach (float f in rgf)
    {
        string strPart = Detokenize((int)f, bIgnoreBos, bIgnoreEos);

        if (!string.IsNullOrEmpty(strPart))
            rgParts.Add(strPart);
    }

    // TrimEnd is kept so that any trailing spaces are still removed exactly as before.
    return string.Join(" ", rgParts).TrimEnd(' ');
}
339 }
340}
The Utility class provides general utility functions.
Definition: Utility.cs:35
static string ReplaceMacro(string strRaw, string strMacroName, string strReplacement)
The ReplaceMacro method is used to replace a set of macros in a given string.
Definition: Utility.cs:947
The VocabularySentencePiece class manages the data vocabulary of words.
int[] Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize a string of data.
char BOS
Returns the special BOS character.
int BuildFromString(string strData)
Build the vocabulary from a string.
List< int > Tokenize(string strWord, bool bMustExist=true)
Tokenize a word into the index tokens of its pieces.
char EOS
Returns the special EOS character.
void Add(string str)
Adds the pieces of each word in a string to the vocabulary.
int Count
Returns the size of the vocabulary.
int[] CreateTarget(int[] rgSrc)
Create a target that is offset from the source by one and ends with a EOS.
VocabularySentencePiece(Random random, bool bAddBos, bool bAddEos, string strVocabFile)
The constructor.
int Build()
Builds the vocabulary from all words added.
string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an index token into its corresponding character.
string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array into a string.
The IVocabulary interface specifies the interface that all Vocabularies implement.
Definition: Interfaces.cs:14
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
SPECIAL_TOKENS
Specifies the special tokens.
Definition: Interfaces.cs:15
The MyCaffe.layers.gpt namespace contains all GPT related layers.
Definition: LayerFactory.cs:15
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12