2using System.Collections.Generic;
5using System.Threading.Tasks;
14 bool m_bEnablePad =
true;
16 Dictionary<char, int> m_rgVocabKeyToIdx =
new Dictionary<char, int>();
17 Dictionary<int, char> m_rgVocabIdxToKey =
new Dictionary<int, char>();
33 m_bEnablePad = bEnablePad;
36 m_rgVocabKeyToIdx.Add(
BOS, 1);
39 m_rgVocabKeyToIdx.Add(
EOS, 2);
47 get {
return m_rgVocabKeyToIdx.Count + ((m_bEnablePad) ? 1 : 0); }
54 public void Add(
char ch)
56 if (!m_rgVocabKeyToIdx.ContainsKey(ch))
57 m_rgVocabKeyToIdx.Add(ch, 1);
64 public void Add(
string str)
66 foreach (
char ch
in str)
78 List<char> rgKeys = m_rgVocabKeyToIdx.Keys.ToList();
81 m_rgVocabKeyToIdx.Clear();
83 int nPadOffset = (m_bEnablePad) ? 1 : 0;
86 for (
int i = 0; i < rgKeys.Count; i++)
88 m_rgVocabKeyToIdx.Add(rgKeys[i], i + nPadOffset);
89 m_rgVocabIdxToKey.Add(i + nPadOffset, rgKeys[i]);
102 foreach (
char ch
in strData)
115 get {
return (
char)1; }
123 get {
return (
char)2; }
133 List<int> rgTrg =
new List<int>(rgSrc);
138 return rgTrg.ToArray();
147 public List<int>
Tokenize(
string str1,
bool bMustExist =
true)
149 if (str1.Length != 1)
150 throw new Exception(
"The character must be a single character!");
152 List<int> rgTokens =
new List<int>();
155 if (!m_rgVocabKeyToIdx.ContainsKey(ch))
158 throw new Exception(
"The character '" + ch.ToString() +
" is not in the vocabulary!");
160 rgTokens.Add(m_random.Next(
Count));
163 rgTokens.Add(m_rgVocabKeyToIdx[ch]);
174 public int[]
Tokenize(
string str,
bool bAddBos,
bool bAddEos)
176 List<int> rgTokens =
new List<int>();
178 foreach (
char ch
in str)
180 rgTokens.AddRange(
Tokenize(ch.ToString()));
184 rgTokens.Insert(0,
BOS);
189 return rgTokens.ToArray();
199 public string Detokenize(
int nIdxToken,
bool bIgnoreBos,
bool bIgnoreEos)
203 if (m_bAddBos && nIdxToken ==
BOS)
209 else if (m_bAddEos && nIdxToken ==
EOS)
217 if (m_rgVocabIdxToKey.ContainsKey(nIdxToken))
218 str += m_rgVocabIdxToKey[nIdxToken];
219 else if (nIdxToken == 0)
222 throw new Exception(
"The token '" + nIdxToken.ToString() +
"' is not in the vocabulary!");
235 public string Detokenize(
float[] rgf,
bool bIgnoreBos,
bool bIgnoreEos)
239 foreach (
float f
in rgf)
241 string str1 =
Detokenize((
int)f, bIgnoreBos, bIgnoreEos);
243 if (!
string.IsNullOrEmpty(str1))
250 if (ch != 0 && ch !=
BOS && ch !=
EOS)
The VocabularyCharacter class manages the data vocabulary of characters.
void Add(char ch)
Adds a new character to the vocabulary.
string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an index token into its corresponding character.
int[] Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize a string of data.
int BuildFromString(string strData)
Build the vocabulary from a string.
List< int > Tokenize(string str1, bool bMustExist=true)
Tokenize a character into its corresponding index token.
char BOS
Returns the special BOS character.
int Build()
Builds the vocabulary from all characters added.
void Add(string str)
Add a string of characters to the vocabulary.
string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array into a string.
VocabularyCharacter(Random random, bool bAddBos, bool bAddEos, bool bEnablePad)
The constructor.
int[] CreateTarget(int[] rgSrc)
Create a target that is offset from the source by one and ends with an EOS.
char EOS
Returns the special EOS character.
int? Count
Returns the size of the vocabulary (one larger than the number of entries when padding is enabled).
The IVocabulary interface specifies the interface that all Vocabularies implement.
The MyCaffe.layers.gpt namespace contains all GPT related layers.