Skip to content

Commit

Permalink
Add Test Tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
apaniukov committed Oct 7, 2024
1 parent 196f270 commit a73707c
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 283 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -459,8 +459,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tbody>
<tr>
<td >BPE</td>
-      <td >96.58</td>
-      <td >6076</td>
+      <td >96.77</td>
+      <td >6062</td>
</tr>
<tr>
<td >SentencePiece</td>
Expand Down Expand Up @@ -498,12 +498,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-neo-125m</td>
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-neox-20b</td>
Expand Down Expand Up @@ -588,6 +582,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >koalajun/Gemma-2-9b-it-Ko-Crypto-Translate</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
<td >BPE</td>
<td >laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
Expand Down
4 changes: 2 additions & 2 deletions python/openvino_tokenizers/hf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,9 +1104,9 @@ def convert_tiktoken_model_tokenizer(

# (chat)GLM model adds spaces around <sop> token
decoder_vocab = pipeline[3].vocab
-        sop_index = next((idx for idx, token in enumerate(decoder_vocab) if token == "<sop>"), None)
+        sop_index = next((idx for idx, token in enumerate(decoder_vocab) if token == "<sop>".encode()), None)
if sop_index is not None:
-            decoder_vocab[sop_index] = " <sop> "
+            decoder_vocab[sop_index] = " <sop> ".encode()

pipeline.add_steps(
[
Expand Down
16 changes: 8 additions & 8 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,14 +554,14 @@ def finalize(self) -> None:
if self.added_tokens is None:
return

-        vocab_set = set(self.vocab)
-        for (
-            token,
-            idx,
-        ) in sorted(self.added_tokens.items(), key=lambda x: (x[1], x[0])):
-            if token not in vocab_set:
-                if idx >= len(self.vocab):
-                    self.vocab.append(token)
+        size_diff = max(self.added_tokens.values()) - len(self.vocab) + 1
+        if size_diff > 0:
+            self.vocab.extend(type(self.vocab[0])() for _ in range(size_diff))
+
+        for token, idx in self.added_tokens.items():
+            if isinstance(self.vocab[0], bytes) and not isinstance(token, bytes):
+                token = token.encode()
+            self.vocab[idx] = token

@classmethod
def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep":
Expand Down
2 changes: 1 addition & 1 deletion tests/pass_rates.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
-  "tests/tokenizers_test.py::test_": 0.9306783665204185
+  "tests/tokenizers_test.py::test_": 0.9314235524626715
}
526 changes: 263 additions & 263 deletions tests/stats.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/tokenizers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
"stabilityai/stablecode-completion-alpha-3b-4k",
"stabilityai/stablelm-tuned-alpha-7b",
"databricks/dolly-v2-3b",
-    "EleutherAI/gpt-neo-125m",
+    "koalajun/Gemma-2-9b-it-Ko-Crypto-Translate",
"EleutherAI/gpt-j-6b",
"roberta-base",
"sentence-transformers/all-roberta-large-v1", # standin for setfit
Expand Down

0 comments on commit a73707c

Please sign in to comment.