GH-563: prepare release 0.4.3 #1037

Merged · 26 commits · Aug 26, 2019
Commits
378853d
add embeddings by @stefan-it and documentation
Aug 23, 2019
1c6d940
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
15725c5
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
f5d1770
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
bb86795
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
14a4e4e
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
5cdb765
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
feaf083
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
127526f
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
a3320c3
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
65ba382
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
e40a7ea
Update TUTORIAL_6_CORPUS.md
Aug 23, 2019
b177a69
Update TUTORIAL_6_CORPUS.md
Aug 23, 2019
7d9b0e2
Update EXPERIMENTS.md
Aug 23, 2019
604a85a
Update EXPERIMENTS.md
Aug 23, 2019
ab7399b
Update EXPERIMENTS.md
Aug 23, 2019
3f4c55c
Update EXPERIMENTS.md
Aug 23, 2019
8804529
Documentation of data loaders
Aug 23, 2019
a754922
Merge branch 'GH-563-prepare-release-0-4-3' of github.com:zalandorese…
Aug 23, 2019
81be6d8
embeddings: add support for large ELMo model (trained on 5.5B tokens)
stefan-it Aug 25, 2019
43c7b97
Merge pull request #1032 from zalandoresearch/more-elmo-models
yosipk Aug 26, 2019
a3ccf17
Added comments to datasets
Aug 26, 2019
fb5f6ed
documentation
Aug 26, 2019
cd5d58b
Update BYTE_PAIR_EMBEDDINGS.md
Aug 26, 2019
7f7823f
comments
Aug 26, 2019
6d56f9b
Merge branch 'GH-563-prepare-release-0-4-3' of github.com:zalandorese…
Aug 26, 2019
Files changed
476 changes: 418 additions & 58 deletions flair/datasets.py

Large diffs are not rendered by default.
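The `flair/datasets.py` diff is collapsed, but per the commit messages above it mainly adds and documents dataset loader classes. A minimal usage sketch of the `flair.datasets` API, with the class name taken from the EXPERIMENTS.md changes below (assumes flair 0.4.3 and an internet connection for the automatic download):

```python
from flair.data import Corpus
from flair.datasets import WNUT_17

# downloads the WNUT-17 data on first use and loads train/dev/test splits
corpus: Corpus = WNUT_17()
print(corpus)
```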

13 changes: 10 additions & 3 deletions flair/embeddings.py
@@ -39,7 +39,6 @@
from .nn import LockedDropout, WordDropout
from .data import Dictionary, Token, Sentence
from .file_utils import cached_path, open_inside_zip
-from .training_utils import log_line

log = logging.getLogger("flair")

@@ -730,6 +729,9 @@ def __init__(
if model == "medium":
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
if model in ["large", "5.5B"]:
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
if model == "pt" or model == "portuguese":
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_weights.hdf5"
@@ -1653,8 +1655,10 @@ def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512):
"es-forward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt",
"es-backward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt",
# Basque
"eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
"eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
"eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.2.pt",
"eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.2.pt",
"eu-v1-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
"eu-v1-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
"eu-v0-forward": f"{aws_path}/embeddings-v0.4/lm-eu-large-forward-v0.1.pt",
"eu-v0-backward": f"{aws_path}/embeddings-v0.4/lm-eu-large-backward-v0.1.pt",
# Persian
@@ -1713,6 +1717,9 @@ def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512):
"sv-backward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-backward-v0.1.pt",
"sv-v0-forward": f"{aws_path}/embeddings-v0.4/lm-sv-large-forward-v0.1.pt",
"sv-v0-backward": f"{aws_path}/embeddings-v0.4/lm-sv-large-backward-v0.1.pt",
+# Tamil
+"ta-forward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-forward-v0.1.pt",
+"ta-backward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-backward-v0.1.pt",
}

if type(model) == str:
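For reference, a short sketch of selecting the newly added model identifiers from this diff. The constructor names `ELMoEmbeddings` and `FlairEmbeddings` are assumed from the surrounding flair API; ELMo additionally requires the `allennlp` package:

```python
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings, FlairEmbeddings

# the new large ELMo model trained on 5.5B tokens ('5.5B' is an accepted alias)
elmo = ELMoEmbeddings('large')

# the newly added Tamil forward language model
ta_forward = FlairEmbeddings('ta-forward')

sentence = Sentence('The grass is green .')
elmo.embed(sentence)
ta_forward.embed(sentence)
```

Note that `eu-forward`/`eu-backward` now point to the v0.2 Basque models, while the previous v0.1 models remain reachable under the new `eu-v1-*` identifiers.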
14 changes: 14 additions & 0 deletions flair/models/sequence_tagger_model.py
@@ -82,6 +82,20 @@ def __init__(
train_initial_hidden_state: bool = False,
pickle_module: str = "pickle",
):
"""
Initializes a SequenceTagger
:param hidden_size: number of hidden states in RNN
:param embeddings: word embeddings used in tagger
:param tag_dictionary: dictionary of tags you want to predict
:param tag_type: string identifier for tag type
:param use_crf: if True use CRF decoder, else project directly to tag space
:param use_rnn: if True use RNN layer, otherwise use word embeddings directly
:param rnn_layers: number of RNN layers
:param dropout: dropout probability
:param word_dropout: word dropout probability
:param locked_dropout: locked dropout probability
:param train_initial_hidden_state: if True, trains initial hidden state of RNN
"""

super(SequenceTagger, self).__init__()

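A construction sketch matching the new docstring (a minimal example, not part of this diff; the corpus and embedding choices are placeholders):

```python
from flair.datasets import WNUT_17
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger

corpus = WNUT_17()
tag_dictionary = corpus.make_tag_dictionary(tag_type='ner')

tagger = SequenceTagger(
    hidden_size=256,                    # number of hidden states in the RNN
    embeddings=WordEmbeddings('glove'),
    tag_dictionary=tag_dictionary,
    tag_type='ner',
    use_crf=True,                       # CRF decoder instead of projecting directly to tag space
)
```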
8 changes: 8 additions & 0 deletions flair/models/text_classification_model.py
@@ -37,6 +37,14 @@ def __init__(
multi_label: bool = None,
multi_label_threshold: float = 0.5,
):
"""
Initializes a TextClassifier
:param document_embeddings: embeddings used to embed each data point
:param label_dictionary: dictionary of labels you want to predict
:param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction
or False to force single-label prediction
:param multi_label_threshold: If multi-label you can set the threshold to make predictions
"""

super(TextClassifier, self).__init__()

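Likewise, a hypothetical construction sketch for the documented `TextClassifier` parameters (the TREC_6 corpus and the document-embedding choice are placeholder assumptions, not part of this diff):

```python
from flair.datasets import TREC_6
from flair.embeddings import DocumentRNNEmbeddings, WordEmbeddings
from flair.models import TextClassifier

corpus = TREC_6()
label_dictionary = corpus.make_label_dictionary()

classifier = TextClassifier(
    document_embeddings=DocumentRNNEmbeddings([WordEmbeddings('glove')]),
    label_dictionary=label_dictionary,
    # multi_label is auto-detected by default; pass True/False to force it
)
```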
13 changes: 10 additions & 3 deletions flair/trainers/trainer.py
@@ -40,16 +40,24 @@ def __init__(
corpus: Corpus,
optimizer: torch.optim.Optimizer = SGD,
epoch: int = 0,
-loss: float = 10000.0,
optimizer_state: dict = None,
scheduler_state: dict = None,
use_tensorboard: bool = False,
):
"""
Initialize a model trainer
:param model: The model that you want to train. The model should inherit from flair.nn.Model
:param corpus: The dataset used to train the model, should be of type Corpus
:param optimizer: The optimizer to use (typically SGD or Adam)
:param epoch: The starting epoch (normally 0 but could be higher if you continue training model)
:param optimizer_state: Optimizer state (necessary if continue training from checkpoint)
:param scheduler_state: Scheduler state (necessary if continue training from checkpoint)
:param use_tensorboard: If True, writes out tensorboard information
"""
self.model: flair.nn.Model = model
self.corpus: Corpus = corpus
self.optimizer: torch.optim.Optimizer = optimizer
self.epoch: int = epoch
-self.loss: float = loss
self.scheduler_state: dict = scheduler_state
self.optimizer_state: dict = optimizer_state
self.use_tensorboard: bool = use_tensorboard
@@ -539,7 +547,6 @@ def load_from_checkpoint(
corpus,
optimizer,
epoch=checkpoint["epoch"],
loss=checkpoint["loss"],
optimizer_state=checkpoint["optimizer_state_dict"],
scheduler_state=checkpoint["scheduler_state_dict"],
)
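Finally, a sketch of constructing the trainer after this change; note that `loss` is no longer a constructor parameter and is no longer read from checkpoints. This reuses `tagger` and `corpus` from the sketch above, and the `train` call mirrors the EXPERIMENTS.md examples below:

```python
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(
    model=tagger,            # any flair.nn.Model, e.g. the SequenceTagger above
    corpus=corpus,           # the Corpus used for training
    epoch=0,                 # starting epoch; higher when continuing training
    use_tensorboard=False,   # set True to write out tensorboard information
)
trainer.train('resources/taggers/example-ner', max_epochs=150)
```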
67 changes: 37 additions & 30 deletions resources/docs/EXPERIMENTS.md
@@ -26,11 +26,11 @@ resources/tasks/conll_03/eng.testb
resources/tasks/conll_03/eng.train
```

-This allows the `NLPTaskDataFetcher` class to read the data into our data structures. Use the `NLPTask` enum to select
-the dataset, as follows:
+This allows the `CONLL_03()` corpus object to read the data into our data structures. Initialize the corpus as follows:

```python
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
+from flair.datasets import CONLL_03
+corpus: Corpus = CONLL_03(base_path='resources/tasks')
```

This gives you a `Corpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use.
@@ -41,12 +41,12 @@ The full code to get a state-of-the-art model for English NER is as follows:

```python
from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_03
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
+corpus: Corpus = CONLL_03(base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -83,6 +83,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
+train_with_dev=True,
max_epochs=150)
```

@@ -109,12 +110,12 @@ FastText word embeddings and German contextual string embeddings. The full code

```python
from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_03_GERMAN
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks')
+corpus: Corpus = CONLL_03_GERMAN(base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -145,6 +146,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
+train_with_dev=True,
max_epochs=150)
```

@@ -164,12 +166,12 @@ FastText word embeddings and German contextual string embeddings. The full code

```python
from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_03_DUTCH
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
+corpus: Corpus = CONLL_03_DUTCH()

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -200,6 +202,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
+train_with_dev=True,
max_epochs=150)
```

@@ -215,16 +218,16 @@ Data is included in Flair and will get automatically downloaded when you run the

#### Best Known Configuration
Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with
-FastText word embeddings and German contextual string embeddings. The full code then is as follows:
+FastText word embeddings for Twitter and crawls. The full code then is as follows:

```python
from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import WNUT_17
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
+corpus: Corpus = WNUT_17()

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -256,6 +259,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
train_with_dev=True,
max_epochs=150)
```

@@ -283,16 +287,18 @@ resources/tasks/onto-ner/eng.train
#### Best Known Configuration

Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with
-FastText embeddings (they work better on this dataset). The full code then is as follows:
+FastText embeddings (they work better on this dataset). You also need to provide a `column_format` for the `ColumnCorpus` object indicating which column in the training file holds the 'ner' information. The full code then is as follows:

```python
from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks')
+corpus: Corpus = ColumnCorpus('resources/tasks/onto-ner',
+column_format={0: 'text', 1: 'pos', 2: 'upos', 3: 'ner'},
+tag_to_bioes='ner')

# 2. what tag do we want to predict?
tag_type = 'ner'
@@ -324,8 +330,9 @@ trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
learning_rate=0.1,
-# it's a big dataset so maybe set embeddings_in_memory to False
-embeddings_in_memory=False)
+train_with_dev=True,
+# it's a big dataset so maybe set embeddings_storage_mode to 'none' (embeddings are not kept in memory)
+embeddings_storage_mode='none')
```


@@ -340,12 +347,12 @@ trainer.train('resources/taggers/example-ner',

Get the [Penn treebank](https://catalog.ldc.upenn.edu/ldc99t42) and follow the guidelines
in [Collins (2002)](http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf) to produce train, dev and test splits.
-Convert splits into CoNLLU-U format and place train, test and dev data in `resources/tasks/penn/` as follows:
+Convert splits into CoNLL-U format and place train, test and dev data in `/path/to/penn/` as follows:

```
-resources/tasks/penn/test.conll
-resources/tasks/penn/train.conll
-resources/tasks/penn/valid.conll
+/path/to/penn/test.conll
+/path/to/penn/train.conll
+/path/to/penn/valid.conll
```

Then, run the experiments with extvec embeddings and contextual string embeddings. Also, select 'pos' as `tag_type`,
@@ -355,12 +362,12 @@ so the algorithm knows that POS tags and not NER are to be predicted from this d

```python
from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import UniversalDependenciesCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks')
+corpus: Corpus = UniversalDependenciesCorpus(base_path='/path/to/penn')

# 2. what tag do we want to predict?
tag_type = 'pos'
@@ -389,10 +396,9 @@ from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

-trainer.train('resources/taggers/example-ner',
-max_epochs=150,
-# its a big dataset, so maybe set embeddings_in_memory=False
-embeddings_in_memory=True)
+trainer.train('resources/taggers/example-pos',
+train_with_dev=True,
+max_epochs=150)
```

## CoNLL-2000 Noun Phrase Chunking (English)
@@ -411,12 +417,12 @@ so the algorithm knows that chunking tags and not NER are to be predicted from t

```python
from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_2000
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000)
+corpus: Corpus = CONLL_2000()

# 2. what tag do we want to predict?
tag_type = 'np'
@@ -446,6 +452,7 @@

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

-trainer.train('resources/taggers/example-ner',
+trainer.train('resources/taggers/example-chunk',
+train_with_dev=True,
max_epochs=150)
```