Skip to content

Commit

Permalink
Add Test Tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
apaniukov committed Oct 7, 2024
1 parent 196f270 commit a73707c
Show file tree
Hide file tree
Showing 6 changed files with 283 additions and 283 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -459,8 +459,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tbody>
<tr>
<td >BPE</td>
-      <td >96.58</td>
-      <td >6076</td>
+      <td >96.77</td>
+      <td >6062</td>
</tr>
<tr>
<td >SentencePiece</td>
Expand Down Expand Up @@ -498,12 +498,6 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-neo-125m</td>
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >EleutherAI/gpt-neox-20b</td>
Expand Down Expand Up @@ -588,6 +582,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<td >95.40</td>
<td >261</td>
</tr>
<tr>
<td >BPE</td>
<td >koalajun/Gemma-2-9b-it-Ko-Crypto-Translate</td>
<td >100.00</td>
<td >247</td>
</tr>
<tr>
<td >BPE</td>
<td >laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
Expand Down
4 changes: 2 additions & 2 deletions python/openvino_tokenizers/hf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,9 +1104,9 @@ def convert_tiktoken_model_tokenizer(

# (chat)GLM model adds spaces around <sop> token
decoder_vocab = pipeline[3].vocab
-        sop_index = next((idx for idx, token in enumerate(decoder_vocab) if token == "<sop>"), None)
+        sop_index = next((idx for idx, token in enumerate(decoder_vocab) if token == "<sop>".encode()), None)
if sop_index is not None:
-            decoder_vocab[sop_index] = " <sop> "
+            decoder_vocab[sop_index] = " <sop> ".encode()

pipeline.add_steps(
[
Expand Down
16 changes: 8 additions & 8 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,14 +554,14 @@ def finalize(self) -> None:
if self.added_tokens is None:
return

-        vocab_set = set(self.vocab)
-        for (
-            token,
-            idx,
-        ) in sorted(self.added_tokens.items(), key=lambda x: (x[1], x[0])):
-            if token not in vocab_set:
-                if idx >= len(self.vocab):
-                    self.vocab.append(token)
+        size_diff = max(self.added_tokens.values()) - len(self.vocab) + 1
+        if size_diff > 0:
+            self.vocab.extend(type(self.vocab[0])() for _ in range(size_diff))
+
+        for token, idx in self.added_tokens.items():
+            if isinstance(self.vocab[0], bytes) and not isinstance(token, bytes):
+                token = token.encode()
+            self.vocab[idx] = token

@classmethod
def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep":
Expand Down
2 changes: 1 addition & 1 deletion tests/pass_rates.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
-  "tests/tokenizers_test.py::test_": 0.9306783665204185
+  "tests/tokenizers_test.py::test_": 0.9314235524626715
}
526 changes: 263 additions & 263 deletions tests/stats.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/tokenizers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
"stabilityai/stablecode-completion-alpha-3b-4k",
"stabilityai/stablelm-tuned-alpha-7b",
"databricks/dolly-v2-3b",
-    "EleutherAI/gpt-neo-125m",
+    "koalajun/Gemma-2-9b-it-Ko-Crypto-Translate",
"EleutherAI/gpt-j-6b",
"roberta-base",
"sentence-transformers/all-roberta-large-v1", # standin for setfit
Expand Down

0 comments on commit a73707c

Please sign in to comment.