
Commit

Fix GLM4 Tokenization (#280)
* Fix GLM4 Tokenization

* Fix GLM4 Tokenization
apaniukov authored Oct 10, 2024
1 parent 1122e32 commit b4409fa
Showing 7 changed files with 1,493 additions and 3,550 deletions.
118 changes: 20 additions & 98 deletions README.md
@@ -459,23 +459,23 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tbody>
 <tr>
 <td >BPE</td>
-<td >96.77</td>
-<td >6062</td>
+<td >97.10</td>
+<td >4544</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >88.67</td>
-<td >7123</td>
+<td >88.32</td>
+<td >6633</td>
 </tr>
 <tr>
 <td >Tiktoken</td>
-<td >100.00</td>
-<td >261</td>
+<td >96.56</td>
+<td >524</td>
 </tr>
 <tr>
 <td >WordPiece</td>
-<td >99.11</td>
-<td >1353</td>
+<td >98.39</td>
+<td >747</td>
 </tr>
</tbody>
</table>
@@ -492,30 +492,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
</thead>
<tbody>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-j-6b</td>
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-neox-20b</td>
<td >95.92</td>
<td >245</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/pythia-12b-deduped</td>
<td >95.92</td>
<td >245</td>
</tr>
<tr>
<td >BPE</td>
<td >KoboldAI/fairseq-dense-13B</td>
<td >96.73</td>
<td >245</td>
</tr>
<tr>
<td >BPE</td>
<td >NousResearch/Meta-Llama-3-8B-Instruct</td>
@@ -558,12 +540,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >99.24</td>
<td >263</td>
</tr>
<tr>
<td >BPE</td>
<td >facebook/bart-large-mnli</td>
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >facebook/galactica-120b</td>
@@ -606,12 +582,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >sentence-transformers/all-roberta-large-v1</td>
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >stabilityai/stablecode-completion-alpha-3b-4k</td>
@@ -624,12 +594,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >100.00</td>
<td >245</td>
</tr>
<tr>
<td >BPE</td>
<td >stabilityai/stablelm-tuned-alpha-7b</td>
<td >95.92</td>
<td >245</td>
</tr>
<tr>
<td >BPE</td>
<td >tiiuae/falcon-7b</td>
@@ -674,32 +638,20 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
</tr>
<tr>
<td >SentencePiece</td>
<td >camembert-base</td>
<td >52.24</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >camembert-base_legacy</td>
<td >75.51</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >codellama/CodeLlama-7b-hf</td>
<td >96.73</td>
<td >baichuan-inc/Baichuan2-7B-Chat_legacy</td>
<td >100.00</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >codellama/CodeLlama-7b-hf_legacy</td>
<td >95.92</td>
<td >camembert-base</td>
<td >52.24</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >codellama/CodeLlama-7b-hf_sp_backend</td>
<td >94.29</td>
<td >camembert-base_legacy</td>
<td >75.51</td>
<td >245</td>
</tr>
<tr>
@@ -817,20 +769,20 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >261</td>
</tr>
<tr>
<td >WordPiece</td>
<td >ProsusAI/finbert</td>
<td >100.00</td>
<td >109</td>
<td >Tiktoken</td>
<td >THUDM/glm-4-9b-chat</td>
<td >93.16</td>
<td >263</td>
</tr>
<tr>
<td >WordPiece</td>
<td >bert-base-multilingual-cased</td>
<td >ProsusAI/finbert</td>
<td >100.00</td>
<td >109</td>
</tr>
<tr>
<td >WordPiece</td>
<td >bert-base-uncased</td>
<td >bert-base-multilingual-cased</td>
<td >100.00</td>
<td >109</td>
</tr>
@@ -846,36 +798,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >100.00</td>
<td >109</td>
</tr>
<tr>
<td >WordPiece</td>
<td >google/electra-base-discriminator</td>
<td >100.00</td>
<td >109</td>
</tr>
<tr>
<td >WordPiece</td>
<td >google/mobilebert-uncased</td>
<td >100.00</td>
<td >93</td>
</tr>
<tr>
<td >WordPiece</td>
<td >jhgan/ko-sbert-sts</td>
<td >100.00</td>
<td >109</td>
</tr>
<tr>
<td >WordPiece</td>
<td >prajjwal1/bert-mini</td>
<td >100.00</td>
<td >93</td>
</tr>
<tr>
<td >WordPiece</td>
<td >rajiv003/ernie-finetuned-qqp</td>
<td >100.00</td>
<td >93</td>
</tr>
<tr>
<td >WordPiece</td>
<td >rasa/LaBSE</td>
@@ -888,12 +816,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >100.00</td>
<td >109</td>
</tr>
<tr>
<td >WordPiece</td>
<td >squeezebert/squeezebert-uncased</td>
<td >100.00</td>
<td >93</td>
</tr>
</tbody>
</table>
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/hf_parser.py
@@ -1056,7 +1056,7 @@ def convert_tiktoken_model_tokenizer(
 )

 # (chat)GLM model adds spaces around <sop> token
-decoder_vocab = pipeline[3].vocab
+decoder_vocab = deepcopy(pipeline[3].vocab)
 sop_index = next((idx for idx, token in enumerate(decoder_vocab) if token == "<sop>".encode()), None)
 if sop_index is not None:
     decoder_vocab[sop_index] = " <sop> ".encode()
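The one-line fix above takes a copy of the pipeline vocab before patching the `<sop>` entry, so the padded `" <sop> "` string ends up only in the detokenizer's vocab instead of mutating the shared `pipeline[3].vocab` list in place. A minimal sketch of the aliasing problem and the fix, using an illustrative stand-in list rather than the library's actual pipeline object:

```python
from copy import deepcopy

# Illustrative stand-in for pipeline[3].vocab; not the library's real object.
pipeline_vocab = [b"hello", b"<sop>", b"world"]

# Before the fix: "decoder_vocab" is just another name for the same list, so
# padding <sop> for detokenization also corrupts the shared tokenizer vocab.
decoder_vocab = pipeline_vocab
decoder_vocab[1] = " <sop> ".encode()
assert pipeline_vocab[1] == b" <sop> "  # shared vocab mutated as a side effect

# After the fix: deepcopy gives the detokenizer its own list to patch.
pipeline_vocab = [b"hello", b"<sop>", b"world"]
decoder_vocab = deepcopy(pipeline_vocab)
sop_index = next((idx for idx, token in enumerate(decoder_vocab) if token == "<sop>".encode()), None)
if sop_index is not None:
    decoder_vocab[sop_index] = " <sop> ".encode()
assert pipeline_vocab[1] == b"<sop>"  # original vocab left untouched
```

For a flat list of bytes a shallow copy would behave the same; `deepcopy` simply mirrors the committed change and stays safe if vocab entries are ever nested.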
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/tiktoken_parser.py
@@ -44,7 +44,7 @@ def generate_vocab_and_merges(
     if len(merged) == 2:
         merges.append(merged)
     else:
-        added_tokens[token.decode("latin-1")] = rank
+        added_tokens[token] = rank

 # Also add special tokens
 vocab.update({string.encode(): idx for string, idx in encoding._special_tokens.items()})
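For context, `generate_vocab_and_merges` walks a tiktoken encoding's mergeable ranks, records entries that split into two known pieces as merges, and treats the rest as added tokens; the fix keeps the raw bytes as the added-token key instead of a `latin-1`-decoded string, so added tokens stay consistent with the bytes-keyed vocab built right below (note `string.encode()` for special tokens). A rough sketch of that flow under stated assumptions: the `recover_merge` helper is an approximation written for this example, not the repository's actual splitting routine, and `_mergeable_ranks`/`_special_tokens` are private tiktoken attributes.

```python
import tiktoken


def recover_merge(token: bytes, ranks: dict[bytes, int]) -> list[bytes]:
    """Find some split of `token` into two lower-ranked pieces, if one exists.

    Approximates recovering the BPE merge that produced `token`.
    """
    rank = ranks[token]
    for i in range(1, len(token)):
        left, right = token[:i], token[i:]
        if ranks.get(left, rank) < rank and ranks.get(right, rank) < rank:
            return [left, right]
    return [token]


encoding = tiktoken.get_encoding("cl100k_base")
ranks = encoding._mergeable_ranks  # bytes -> rank (private tiktoken attribute)

vocab: dict[bytes, int] = dict(ranks)
merges: list[list[bytes]] = []
added_tokens: dict[bytes, int] = {}

for token, rank in ranks.items():
    if len(token) == 1:
        continue  # single bytes are base vocab entries, never merge results
    merged = recover_merge(token, ranks)
    if len(merged) == 2:
        merges.append(merged)
    else:
        # the fix: keep bytes keys so added tokens match the bytes-keyed vocab
        added_tokens[token] = rank

# special tokens are plain strings in tiktoken, so encode them before merging in
vocab.update({string.encode(): idx for string, idx in encoding._special_tokens.items()})
```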
4 changes: 4 additions & 0 deletions python/openvino_tokenizers/tokenizer_pipeline.py
@@ -609,6 +609,10 @@ def from_tiktoken_encoding(
     if ref_idx in existing_indices:
         continue

+    # special tokens from reference vocab can be strings, not bytes
+    if isinstance(ref_token, str):
+        ref_token = ref_token.encode()
+
     vocab[ref_token] = ref_idx

 return cls(
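The four added lines guard against reference vocabularies that mix key types: ordinary entries arrive as bytes, but special tokens can arrive as plain strings, so they are encoded before being placed into the bytes-keyed vocab. A small self-contained sketch of that normalization; the toy `reference_vocab` below is made up for illustration.

```python
# Toy data: the main vocab is keyed by bytes, while the reference vocab mixes
# in a string-typed special token, as can happen with tiktoken-style references.
vocab: dict[bytes, int] = {b"hello": 0, b"world": 1}
reference_vocab = {b"hello": 0, b"world": 1, "<|endoftext|>": 2}

existing_indices = set(vocab.values())
for ref_token, ref_idx in reference_vocab.items():
    if ref_idx in existing_indices:
        continue  # this index is already covered by the main vocab

    # special tokens from reference vocab can be strings, not bytes
    if isinstance(ref_token, str):
        ref_token = ref_token.encode()

    vocab[ref_token] = ref_idx

assert vocab[b"<|endoftext|>"] == 2  # normalized to bytes like every other key
```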
2 changes: 1 addition & 1 deletion tests/pass_rates.json
@@ -1,3 +1,3 @@
 {
-"tests/tokenizers_test.py::test_": 0.9314420803782506
+"tests/tokenizers_test.py::test_": 0.9247631283121889
 }
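The stored baseline drops from roughly 0.9314 to 0.9248, consistent with the README report above, where the new GLM-4 Tiktoken cases pass at 93.16%. How this file gates the suite is not visible in this diff; the snippet below is only a hypothetical illustration of checking a run against such a baseline, not the repository's actual test harness.

```python
import json

# Hypothetical gate: compare the current run's pass rate for a test prefix
# against the stored baseline in tests/pass_rates.json. Only the file format
# comes from the diff; the comparison logic here is an assumption.
with open("tests/pass_rates.json") as f:
    baselines = json.load(f)

threshold = baselines["tests/tokenizers_test.py::test_"]
current_pass_rate = 0.9248  # e.g. passed / collected for tests matching the prefix

assert current_pass_rate >= threshold, (
    f"pass rate {current_pass_rate:.4f} is below the stored baseline {threshold:.4f}"
)
```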