2using System.Collections.Generic;
6using System.Threading.Tasks;
16 Dictionary<string, int> m_rgVocabKeyToIdx =
new Dictionary<string, int>();
17 Dictionary<int, string> m_rgVocabIdxToKey =
new Dictionary<int, string>();
34 m_rgVocabKeyToIdx.Add(
BOS.ToString(), 1);
37 m_rgVocabKeyToIdx.Add(
EOS.ToString(), 2);
45 get {
return m_rgVocabKeyToIdx.Count + 1; }
48 private bool isSymbol(
char ch)
53 if (
char.IsPunctuation(ch))
56 if (
char.IsSymbol(ch))
59 System.Globalization.UnicodeCategory cat =
char.GetUnicodeCategory(ch);
60 if (cat ==
System.Globalization.UnicodeCategory.OtherPunctuation ||
61 cat ==
System.Globalization.UnicodeCategory.OtherSymbol ||
62 cat ==
System.Globalization.UnicodeCategory.DecimalDigitNumber)
68 private string trim(
string str)
72 foreach (
char ch
in str)
74 System.Globalization.UnicodeCategory cat =
char.GetUnicodeCategory(ch);
76 if (!
char.IsWhiteSpace(ch) && cat !=
System.Globalization.UnicodeCategory.SpaceSeparator)
87 public void Add(
string str)
89 string[] rgstr = str.Split(
' ');
91 foreach (
string strWord
in rgstr)
93 if (!
string.IsNullOrEmpty(strWord))
95 string strWord1 = trim(strWord.ToLower().Trim(
'\'',
'\"'));
96 if (
string.IsNullOrEmpty(strWord1))
99 while (strWord1.Length > 0 && isSymbol(strWord1[strWord1.Length-1]) && strWord1[strWord1.Length-1] !=
' ')
101 string strLast = strWord1[strWord1.Length - 1].ToString();
102 if (!m_rgVocabKeyToIdx.ContainsKey(strLast))
103 m_rgVocabKeyToIdx.Add(strLast, 1);
105 strWord1 = strWord1.Substring(0, strWord1.Length - 1);
108 strWord1 = trim(strWord1);
109 if (
string.IsNullOrEmpty(strWord1))
112 while (strWord1.Length > 0 && isSymbol(strWord1[0]) && strWord1[0] !=
' ')
114 string strFirst = strWord1[0].ToString();
115 if (!m_rgVocabKeyToIdx.ContainsKey(strFirst))
116 m_rgVocabKeyToIdx.Add(strFirst, 1);
118 strWord1 = strWord1.Substring(1);
121 strWord1 = trim(strWord1);
122 if (
string.IsNullOrEmpty(strWord1))
125 if (!m_rgVocabKeyToIdx.ContainsKey(strWord1))
126 m_rgVocabKeyToIdx.Add(strWord1, 1);
137 List<string> rgKeys = m_rgVocabKeyToIdx.Keys.ToList();
140 m_rgVocabKeyToIdx.Clear();
143 for (
int i = 0; i < rgKeys.Count; i++)
145 m_rgVocabKeyToIdx.Add(rgKeys[i], i + 1);
146 m_rgVocabIdxToKey.Add(i + 1, rgKeys[i]);
159 string[] rgstrWords = strData.Split(
' ');
160 foreach (
string strWord
in rgstrWords)
173 get {
return (
char)1; }
181 get {
return (
char)2; }
191 List<int> rgTrg =
new List<int>(rgSrc);
196 return rgTrg.ToArray();
205 public List<int>
Tokenize(
string strWord,
bool bMustExist =
true)
207 List<int> rgTokens =
new List<int>();
209 if (!
string.IsNullOrEmpty(strWord))
211 string strWord1 = trim(strWord.ToLower().Trim(
'\'',
'\"'));
212 if (
string.IsNullOrEmpty(strWord1))
215 while (strWord1.Length > 0 && isSymbol(strWord1[strWord1.Length - 1]) && strWord1[strWord1.Length - 1] !=
' ')
217 string strLast = strWord1[strWord1.Length - 1].ToString();
218 if (m_rgVocabKeyToIdx.ContainsKey(strLast))
219 rgTokens.Add(m_rgVocabKeyToIdx[strLast]);
221 strWord1 = strWord1.Substring(0, strWord1.Length - 1);
224 strWord1 = trim(strWord1);
225 if (
string.IsNullOrEmpty(strWord1))
228 while (strWord1.Length > 0 && isSymbol(strWord1[0]) && strWord1[0] !=
' ')
230 string strFirst = strWord1[0].ToString();
231 if (m_rgVocabKeyToIdx.ContainsKey(strFirst))
232 rgTokens.Add(m_rgVocabKeyToIdx[strFirst]);
234 strWord1 = strWord1.Substring(1);
237 strWord1 = trim(strWord1);
238 if (
string.IsNullOrEmpty(strWord1))
241 if (m_rgVocabKeyToIdx.ContainsKey(strWord1))
242 rgTokens.Add(m_rgVocabKeyToIdx[strWord1]);
255 public int[]
Tokenize(
string str,
bool bAddBos,
bool bAddEos)
257 List<int> rgTokens =
new List<int>();
259 string[] rgstr = str.Split(
' ');
260 foreach (
string strWord
in rgstr)
262 rgTokens.AddRange(
Tokenize(strWord));
266 rgTokens.Insert(0,
BOS);
271 return rgTokens.ToArray();
281 public string Detokenize(
int nIdxToken,
bool bIgnoreBos,
bool bIgnoreEos)
290 if (m_bAddBos && nIdxToken ==
BOS)
296 else if (m_bAddEos && nIdxToken ==
EOS)
304 if (!m_rgVocabIdxToKey.ContainsKey(nIdxToken))
305 throw new Exception(
"The token '" + nIdxToken.ToString() +
"' is not in the vocabulary!");
307 str += m_rgVocabIdxToKey[nIdxToken];
320 public string Detokenize(
float[] rgf,
bool bIgnoreBos,
bool bIgnoreEos)
324 foreach (
float f
in rgf)
326 string str1 =
Detokenize((
int)f, bIgnoreBos, bIgnoreEos);
328 if (!
string.IsNullOrEmpty(str1))
The VocabularyWord class manages the data vocabulary of words.
string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array of index tokens into a string.
List< int > Tokenize(string strWord, bool bMustExist=true)
Tokenize a word into its corresponding index tokens.
char EOS
Returns the special EOS character.
VocabularyWord(Random random, bool bAddBos, bool bAddEos)
The constructor.
int BuildFromString(string strData)
Build the vocabulary from a string.
int Count
Returns the size of the vocabulary.
char BOS
Returns the special BOS character.
string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an index token into its corresponding word.
int[] Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize a string of data.
int Build()
Builds the vocabulary from all words added.
int[] CreateTarget(int[] rgSrc)
Create a target that is offset from the source by one and ends with an EOS.
void Add(string str)
Adds the words (and any leading or trailing symbols) of a string to the vocabulary.
The IVocabulary interface specifies the interface that all Vocabularies implement.
The MyCaffe.layers.gpt namespace contains all GPT related layers.