Merge pull request #556 from zalandoresearch/release-0.4.1
Release 0.4.1
Alan Akbik authored Feb 22, 2019
2 parents 40fbcbd + 8507359 commit eae6905
Showing 24 changed files with 498 additions and 301 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -8,5 +8,6 @@ install:
   - pip install -r requirements.txt -q
 before_script: cd tests
 script:
+  - pip freeze
   - 'if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then pytest --runintegration; fi'
   - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then pytest; fi'
2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ document embeddings, including our proposed **[Flair embeddings](https://drive.g
 * **A Pytorch NLP framework.** Our framework builds directly on [Pytorch](https://pytorch.org/), making it easy to
   train your own models and experiment with new approaches using Flair embeddings and classes.
 
-Now at [version 0.4.0](https://github.com/zalandoresearch/flair/releases)!
+Now at [version 0.4.1](https://github.com/zalandoresearch/flair/releases)!
 
 ## Comparison with State-of-the-Art
 
2 changes: 1 addition & 1 deletion flair/data.py
@@ -32,7 +32,7 @@ def __init__(self, add_unk=True):
     def add_item(self, item: str) -> int:
         """
         add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.
-        :param item: a string for which to assign an id
+        :param item: a string for which to assign an id.
         :return: ID of string
         """
         item = item.encode('utf-8')
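The docstring describes idempotent behavior: adding a string that is already in the dictionary returns its existing ID. A minimal sketch of that contract (the tag names below are illustrative only):

```python
from flair.data import Dictionary

# build a small label dictionary without the <unk> entry
tag_dictionary = Dictionary(add_unk=False)

first_id = tag_dictionary.add_item('PER')   # new string -> new ID
second_id = tag_dictionary.add_item('PER')  # already present -> same ID
assert first_id == second_id
```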
25 changes: 16 additions & 9 deletions flair/data_fetcher.py
@@ -192,7 +192,9 @@ def load_corpus(task: Union[NLPTask, str], base_path: [str, Path] = None) -> Tag
 
         # for text classifiers, we use our own special format
         if task in [NLPTask.IMDB.value, NLPTask.AG_NEWS.value, NLPTask.TREC_6.value, NLPTask.TREC_50.value]:
-            return NLPTaskDataFetcher.load_classification_corpus(data_folder)
+            use_tokenizer: bool = False if task in [NLPTask.TREC_6.value, NLPTask.TREC_50.value] else True
+
+            return NLPTaskDataFetcher.load_classification_corpus(data_folder, use_tokenizer=use_tokenizer)
 
         # NER corpus for Basque
         if task == NLPTask.NER_BASQUE.value:
@@ -285,7 +287,7 @@ def load_column_corpus(
 
     @staticmethod
     def load_ud_corpus(
-            data_folder: Union[str,Path],
+            data_folder: Union[str, Path],
             train_file=None,
             test_file=None,
             dev_file=None) -> TaggedCorpus:
@@ -326,10 +328,11 @@ def load_ud_corpus(
 
     @staticmethod
     def load_classification_corpus(
-            data_folder: Union[str,Path],
+            data_folder: Union[str, Path],
             train_file=None,
             test_file=None,
-            dev_file=None) -> TaggedCorpus:
+            dev_file=None,
+            use_tokenizer: bool = True) -> TaggedCorpus:
         """
         Helper function to get a TaggedCorpus from text classification-formatted task data
@@ -370,11 +373,14 @@ def load_classification_corpus(
         log.info("Dev: {}".format(dev_file))
         log.info("Test: {}".format(test_file))
 
-        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(train_file)
-        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(test_file)
+        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(train_file,
+                                                                                           use_tokenizer=use_tokenizer)
+        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(test_file,
+                                                                                          use_tokenizer=use_tokenizer)
 
         if dev_file is not None:
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(dev_file)
+            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(dev_file,
+                                                                                             use_tokenizer=use_tokenizer)
         else:
             sentences_dev: List[Sentence] = [sentences_train[i] for i in
                                              NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)]
@@ -383,7 +389,8 @@ def load_classification_corpus(
         return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
 
     @staticmethod
-    def read_text_classification_file(path_to_file: Union[str,Path], max_tokens_per_doc=-1) -> List[Sentence]:
+    def read_text_classification_file(path_to_file: Union[str, Path], max_tokens_per_doc=-1, use_tokenizer=True) -> \
+            List[Sentence]:
         """
         Reads a data file for text classification. The file should contain one document/text per line.
         The line should have the following format:
@@ -416,7 +423,7 @@ def read_text_classification_file(path_to_file: Union[str,Path], max_tokens_per_
                 text = line[l_len:].strip()
 
                 if text and labels:
-                    sentence = Sentence(text, labels=labels, use_tokenizer=True)
+                    sentence = Sentence(text, labels=labels, use_tokenizer=use_tokenizer)
                     if len(sentence) > max_tokens_per_doc and max_tokens_per_doc > 0:
                         sentence.tokens = sentence.tokens[:max_tokens_per_doc]
                     if len(sentence.tokens) > 0:
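Taken together, these changes let callers control tokenization when loading classification corpora: the built-in TREC tasks now skip the tokenizer because their text ships pre-tokenized, and `load_classification_corpus` exposes the switch directly. A minimal usage sketch against the 0.4.1 API shown above; `resources/tasks/my_task` is a hypothetical folder holding train/dev/test files in flair's classification format:

```python
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

# TREC_6 text is already tokenized, so load_corpus now disables the
# tokenizer for it (assumes the dataset is available in flair's data folder)
trec_corpus = NLPTaskDataFetcher.load_corpus(NLPTask.TREC_6)

# For a custom corpus, opt out of tokenization explicitly: pre-tokenized
# text is kept as-is (plain whitespace splitting), which speeds up loading.
my_corpus = NLPTaskDataFetcher.load_classification_corpus(
    'resources/tasks/my_task',  # hypothetical placeholder path
    use_tokenizer=False)

print(my_corpus)
```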
