Commit edcd88c

fix #1963 pretokenized sequences as input for flair.data.Sentence

ulf1 committed Nov 12, 2020
1 parent f5084d1, commit edcd88c
Showing 3 changed files with 31 additions and 2 deletions.
flair/data.py: 10 changes (8 additions & 2 deletions)
@@ -6,6 +6,7 @@
 
 from collections import Counter
 from collections import defaultdict
+from collections.abc import Iterable
 
 from deprecated import deprecated
 from flair.file_utils import Tqdm
@@ -525,19 +526,21 @@ class Sentence(DataPoint):
 
     def __init__(
         self,
-        text: str = None,
+        text: Union[str, List[str]] = None,
         use_tokenizer: Union[bool, Tokenizer] = True,
+        is_pretokenized: bool = False,
         language_code: str = None,
         start_position: int = None
     ):
         """
         Class to hold all meta related to a text (tokens, predictions, language code, ...)
-        :param text: original string
+        :param text: original string (sentence), or a list of string tokens (words)
         :param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`;
             more advanced options are :class:`SegtokTokenizer` to use segtok or :class:`SpacyTokenizer`
             to use the spaCy library, if available). Check the implementations of the abstract class
             Tokenizer or implement your own subclass if you need to. If, instead of providing a Tokenizer,
             this parameter is just set to True (deprecated), :class:`SegtokTokenizer` will be used.
+        :param is_pretokenized: flag indicating that "text" is a list of tokens (words)
         :param language_code: Language of the sentence
         :param start_position: Start char offset of the sentence in the superordinate document
         """
@@ -570,6 +573,9 @@ def __init__(
-        if text is not None:
+        if text is not None and not is_pretokenized:
             text = self._restore_windows_1252_characters(text)
             [self.add_token(token) for token in tokenizer.tokenize(text)]
+        elif is_pretokenized and isinstance(text, Iterable):
+            [self.add_token(self._restore_windows_1252_characters(token))
+             for token in text]
 
         # log a warning if the dataset is empty
         if text == "":
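
Taken together, the diff adds a second input mode to `Sentence`. Below is a minimal usage sketch under the assumption that this commit is installed; `is_pretokenized` defaults to `False`, so plain-string inputs keep their old behavior:

```python
from flair.data import Sentence

# default path: a plain string is split by the configured tokenizer
from_text = Sentence("The grass is green .")

# new path: each list element becomes exactly one token; no tokenizer is run
from_tokens = Sentence(["The", "grass", "is", "green", "."], is_pretokenized=True)

assert len(from_text) == len(from_tokens) == 5
```
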
resources/docs/TUTORIAL_1_BASICS.md: 16 changes (16 additions & 0 deletions)
@@ -109,6 +109,22 @@ You can write your own tokenization routine. Check the code of `flair.data.Tokenizer`
 (e.g. `flair.tokenization.SegtokTokenizer` or `flair.tokenization.SpacyTokenizer`) to get an idea of how to add
 your own tokenization method.

+### Using pretokenized sequences
+You can pass a pretokenized sequence as a list of words, e.g.
+
+```python
+from flair.data import Sentence
+my_sent = Sentence(['The', 'grass', 'is', 'green', '.'], is_pretokenized=True)
+print(my_sent)
+```
+
+This should print:
+
+```console
+Sentence: "The grass is green ."   [− Tokens: 5]
+```
+
+
 ## Adding Labels
 
 In Flair, any data point can be labeled. For instance, you can label a word or label a sentence:
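
One property of the new tutorial example worth spelling out: list elements are taken over verbatim, so word-internal punctuation is never split. A small illustration (the example words are mine, not from the commit):

```python
from flair.data import Sentence

# 'U.K.' stays a single token because no tokenizer runs on pretokenized input
sent = Sentence(['The', 'U.K.', 'economy'], is_pretokenized=True)
assert [token.text for token in sent] == ['The', 'U.K.', 'economy']
```
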
tests/test_data.py: 7 changes (7 additions & 0 deletions)
@@ -866,3 +866,10 @@ def test_sentence_to_dict():
     assert "Facebook, Inc." == dict["entities"][0]["text"]
     assert "Google" == dict["entities"][1]["text"]
     assert 0 == len(dict["labels"])
+
+
+def test_pretokenized():
+    pretoks = ['The', 'grass', 'is', 'green', '.']
+    sent = Sentence(pretoks, is_pretokenized=True)
+    for i, token in enumerate(sent):
+        assert token.text == pretoks[i]
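
A complementary check one could add next to this test (not part of the commit; it relies on `Sentence.__len__` and `to_tokenized_string()`, both long-standing flair APIs):

```python
def test_pretokenized_roundtrip():
    pretoks = ['The', 'grass', 'is', 'green', '.']
    sent = Sentence(pretoks, is_pretokenized=True)
    assert len(sent) == len(pretoks)
    assert sent.to_tokenized_string() == ' '.join(pretoks)
```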
