3using System.Collections.Generic;
9using System.Threading.Tasks;
19 Dictionary<string, double> m_rgPieces =
new Dictionary<string, double>();
20 Dictionary<string, int> m_rgVocabKeyToIdx =
new Dictionary<string, int>();
21 Dictionary<int, string> m_rgVocabIdxToKey =
new Dictionary<int, string>();
34 string strProgData = Environment.GetFolderPath(Environment.SpecialFolder.CommonApplicationData);
37 string[] rgstrLines = File.ReadAllLines(strVocabFile);
39 foreach (
string strLine
in rgstrLines)
41 string[] rgstr = strLine.Split(
' ',
'\t');
42 if (rgstr.Length == 2)
45 if (
double.TryParse(rgstr[1], out dfVal) && dfVal != 0)
47 string strKey = rgstr[0].Trim(
'_', (
char)9601);
49 if (!m_rgPieces.ContainsKey(strKey))
50 m_rgPieces.Add(strKey, dfVal);
65 get {
return m_rgVocabKeyToIdx.Count; }
68 private bool isSymbol(
char ch)
73 if (
char.IsPunctuation(ch))
76 if (
char.IsSymbol(ch))
79 System.Globalization.UnicodeCategory cat =
char.GetUnicodeCategory(ch);
80 if (cat ==
System.Globalization.UnicodeCategory.OtherPunctuation ||
81 cat ==
System.Globalization.UnicodeCategory.OtherSymbol ||
82 cat ==
System.Globalization.UnicodeCategory.DecimalDigitNumber)
88 private string trim(
string str)
92 foreach (
char ch
in str)
94 System.Globalization.UnicodeCategory cat =
char.GetUnicodeCategory(ch);
96 if (!
char.IsWhiteSpace(ch) && cat !=
System.Globalization.UnicodeCategory.SpaceSeparator)
107 public void Add(
string str)
109 string[] rgstr = str.Split(
' ');
111 foreach (
string strWord
in rgstr)
113 string strWordA = strWord;
114 string strWord1 = strWordA;
116 while (strWord1.Length > 0)
118 if (m_rgPieces.ContainsKey(strWord1))
120 if (!m_rgVocabKeyToIdx.ContainsKey(strWord1))
121 m_rgVocabKeyToIdx.Add(strWord1, 1);
123 strWord1 = strWordA.Substring(strWord1.Length);
128 if (strWord1.Length > 0)
129 strWord1 = strWord1.Substring(0, strWord1.Length - 1);
141 List<string> rgKeys = m_rgVocabKeyToIdx.Keys.ToList();
144 m_rgVocabKeyToIdx.Clear();
147 rgKeys.Insert(0,
EOS.ToString());
149 rgKeys.Insert(0,
BOS.ToString());
150 rgKeys.Insert(0, ((
char)0).ToString());
153 for (
int i = 0; i < rgKeys.Count; i++)
155 if (i <= 2 || (rgKeys[i][0] != 0 && rgKeys[i][0] !=
BOS && rgKeys[i][0] !=
EOS))
157 m_rgVocabKeyToIdx.Add(rgKeys[i], i);
158 m_rgVocabIdxToKey.Add(i, rgKeys[i]);
172 string[] rgstrWords = strData.Split(
' ');
173 foreach (
string strWord
in rgstrWords)
204 List<int> rgTrg =
new List<int>(rgSrc);
206 if (rgSrc.Length > 0)
212 return rgTrg.ToArray();
221 public List<int>
Tokenize(
string strWord,
bool bMustExist =
true)
223 List<int> rgTokens =
new List<int>();
224 string strWordA = strWord;
225 string strWord1 = strWordA;
227 while (strWord1.Length > 0)
229 if (m_rgPieces.ContainsKey(strWord1))
231 if (m_rgVocabKeyToIdx.ContainsKey(strWord1))
232 rgTokens.Add(m_rgVocabKeyToIdx[strWord1]);
234 strWord1 = strWordA.Substring(strWord1.Length);
239 if (strWord1.Length > 0)
240 strWord1 = strWord1.Substring(0, strWord1.Length - 1);
257 public int[]
Tokenize(
string str,
bool bAddBos,
bool bAddEos)
259 List<int> rgTokens =
new List<int>();
261 if (
string.IsNullOrEmpty(str))
262 return rgTokens.ToArray();
264 string[] rgstr = str.Split(
' ');
265 foreach (
string strWord
in rgstr)
267 rgTokens.AddRange(
Tokenize(strWord));
271 rgTokens.Insert(0,
BOS);
276 return rgTokens.ToArray();
286 public string Detokenize(
int nIdxToken,
bool bIgnoreBos,
bool bIgnoreEos)
295 if (m_bAddBos && nIdxToken ==
BOS)
301 else if (m_bAddEos && nIdxToken ==
EOS)
309 if (!m_rgVocabIdxToKey.ContainsKey(nIdxToken))
310 throw new Exception(
"The token '" + nIdxToken.ToString() +
"' is not in the vocabulary!");
312 str += m_rgVocabIdxToKey[nIdxToken];
325 public string Detokenize(
float[] rgf,
bool bIgnoreBos,
bool bIgnoreEos)
329 foreach (
float f
in rgf)
331 string str1 =
Detokenize((
int)f, bIgnoreBos, bIgnoreEos);
333 if (!
string.IsNullOrEmpty(str1))
337 return str.TrimEnd(
' ');
The Utility class provides general utility functions.
static string ReplaceMacro(string strRaw, string strMacroName, string strReplacement)
The ReplaceMacro method is used to replace a set of macros in a given string.
The VocabularySentencePiece class manages the data vocabulary of words.
int[] Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize a string of data.
char BOS
Returns the special BOS character.
int BuildFromString(string strData)
Build the vocabulary from a string.
List< int > Tokenize(string strWord, bool bMustExist=true)
Tokenize a word into its corresponding list of index tokens.
char EOS
Returns the special EOS character.
void Add(string str)
Adds the known sentence pieces found in the words of a string to the vocabulary.
int Count
Returns the size of the vocabulary.
int[] CreateTarget(int[] rgSrc)
Create a target that is offset from the source by one and ends with an EOS.
VocabularySentencePiece(Random random, bool bAddBos, bool bAddEos, string strVocabFile)
The constructor.
int Build()
Builds the vocabulary from all words added.
string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an index token into its corresponding character.
string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array into a string.
The IVocabulary interface specifies the interface that all Vocabularies implement.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
SPECIAL_TOKENS
Specifies the special tokens.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...