From 378853d632a9a6853ea5830e260ca2788e2b36ec Mon Sep 17 00:00:00 2001 From: aakbik Date: Fri, 23 Aug 2019 17:22:36 +0200 Subject: [PATCH 01/23] add embeddings by @stefan-it and documentation --- flair/embeddings.py | 10 +- resources/docs/TUTORIAL_3_WORD_EMBEDDING.md | 145 +----- .../TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 493 ------------------ .../docs/embeddings/BYTE_PAIR_EMBEDDINGS.md | 26 + .../docs/embeddings/CHARACTER_EMBEDDINGS.md | 20 + .../embeddings/CLASSIC_WORD_EMBEDDINGS.md | 115 ++++ resources/docs/embeddings/ELMO_EMBEDDINGS.md | 32 ++ .../docs/embeddings/FASTTEXT_EMBEDDINGS.md | 23 + resources/docs/embeddings/FLAIR_EMBEDDINGS.md | 112 ++++ .../docs/embeddings/TRANSFOMER_EMBEDDINGS.md | 350 +++++++++++++ 10 files changed, 688 insertions(+), 638 deletions(-) create mode 100644 resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md create mode 100644 resources/docs/embeddings/CHARACTER_EMBEDDINGS.md create mode 100644 resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md create mode 100644 resources/docs/embeddings/ELMO_EMBEDDINGS.md create mode 100644 resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md create mode 100644 resources/docs/embeddings/FLAIR_EMBEDDINGS.md create mode 100644 resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md diff --git a/flair/embeddings.py b/flair/embeddings.py index 6e1750694..3b00bd2e0 100644 --- a/flair/embeddings.py +++ b/flair/embeddings.py @@ -39,7 +39,6 @@ from .nn import LockedDropout, WordDropout from .data import Dictionary, Token, Sentence from .file_utils import cached_path, open_inside_zip -from .training_utils import log_line log = logging.getLogger("flair") @@ -1653,8 +1652,10 @@ def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512): "es-forward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt", "es-backward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt", # Basque - "eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt", - "eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt", + "eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.2.pt", + "eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.2.pt", + "eu-v1-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt", + "eu-v1-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt", "eu-v0-forward": f"{aws_path}/embeddings-v0.4/lm-eu-large-forward-v0.1.pt", "eu-v0-backward": f"{aws_path}/embeddings-v0.4/lm-eu-large-backward-v0.1.pt", # Persian @@ -1713,6 +1714,9 @@ def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512): "sv-backward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-backward-v0.1.pt", "sv-v0-forward": f"{aws_path}/embeddings-v0.4/lm-sv-large-forward-v0.1.pt", "sv-v0-backward": f"{aws_path}/embeddings-v0.4/lm-sv-large-backward-v0.1.pt", + # Tamil + "ta-forward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-forward-v0.1.pt", + "ta-backward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-backward-v0.1.pt", } if type(model) == str: diff --git a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md index 220fe050b..5d037b5dc 100644 --- a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md +++ b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md @@ -11,7 +11,7 @@ All word embedding classes inherit from the `TokenEmbeddings` class and implemen call to embed your text. 
This means that for most users of Flair, the complexity of different embeddings remains hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text. -All embeddings produced with our methods are Pytorch vectors, so they can be immediately used for training and +All embeddings produced with our methods are PyTorch vectors, so they can be immediately used for training and fine-tuning. @@ -52,59 +52,7 @@ id string to the constructor of the `WordEmbeddings` class. Typically, you use the **two-letter language code** to init an embedding, so 'en' for English and 'de' for German and so on. By default, this will initialize FastText embeddings trained over Wikipedia. You can also always use FastText embeddings over Web crawls, by instantiating with '-crawl'. So 'de-crawl' -to use embeddings trained over German web crawls. - -For English, we provide a few more options, so -here you can choose between instantiating 'en-glove', 'en-extvec' and so on. - -The following embeddings are currently supported: - -| ID | Language | Embedding | -| ------------- | ------------- | ------------- | -| 'en-glove' (or 'glove') | English | GloVe embeddings | -| 'en-extvec' (or 'extvec') | English |Komninos embeddings | -| 'en-crawl' (or 'crawl') | English | FastText embeddings over Web crawls | -| 'en-twitter' (or 'twitter') | English | Twitter embeddings | -| 'en-turian' (or 'turian') | English | Turian embeddings (small) | -| 'en' (or 'en-news' or 'news') |English | FastText embeddings over news and wikipedia data | -| 'de' | German |German FastText embeddings | -| 'nl' | Dutch | Dutch FastText embeddings | -| 'fr' | French | French FastText embeddings | -| 'it' | Italian | Italian FastText embeddings | -| 'es' | Spanish | Spanish FastText embeddings | -| 'pt' | Portuguese | Portuguese FastText embeddings | -| 'ro' | Romanian | Romanian FastText embeddings | -| 'ca' | Catalan | Catalan FastText embeddings | -| 'sv' | Swedish | Swedish FastText embeddings | -| 'da' | Danish | Danish FastText embeddings | -| 'no' | Norwegian | Norwegian FastText embeddings | -| 'fi' | Finnish | Finnish FastText embeddings | -| 'pl' | Polish | Polish FastText embeddings | -| 'cz' | Czech | Czech FastText embeddings | -| 'sk' | Slovak | Slovak FastText embeddings | -| 'si' | Slovenian | Slovenian FastText embeddings | -| 'sr' | Serbian | Serbian FastText embeddings | -| 'hr' | Croatian | Croatian FastText embeddings | -| 'bg' | Bulgarian | Bulgarian FastText embeddings | -| 'ru' | Russian | Russian FastText embeddings | -| 'ar' | Arabic | Arabic FastText embeddings | -| 'he' | Hebrew | Hebrew FastText embeddings | -| 'tr' | Turkish | Turkish FastText embeddings | -| 'fa' | Persian | Persian FastText embeddings | -| 'ja' | Japanese | Japanese FastText embeddings | -| 'ko' | Korean | Korean FastText embeddings | -| 'zh' | Chinese | Chinese FastText embeddings | -| 'hi' | Hindi | Hindi FastText embeddings | -| 'id' | Indonesian | Indonesian FastText embeddings | -| 'eu' | Basque | Basque FastText embeddings | - -So, if you want to load German FastText embeddings, instantiate as follows: - -```python -german_embedding = WordEmbeddings('de') -``` - -Alternatively, if you want to load German FastText embeddings trained over crawls, instantiate as follows: +to use embeddings trained over German web crawls: ```python german_embedding = WordEmbeddings('de-crawl') @@ -112,96 +60,9 @@ german_embedding = WordEmbeddings('de-crawl') We generally recommend the FastText embeddings, or GloVe if 
you want a smaller model. -If you want to use any other embeddings (not listed in the list above), you can load those by calling -```python -custom_embedding = WordEmbeddings('path/to/your/custom/embeddings.gensim') -``` -If you want to load custom embeddings you need to make sure, that the custom embeddings are correctly formatted to -[gensim](https://radimrehurek.com/gensim/models/word2vec.html). - -You can, for example, convert [FastText embeddings](https://fasttext.cc/docs/en/crawl-vectors.html) to gensim using the -following code snippet: -```python -import gensim - -word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/path/to/fasttext/embeddings.txt', binary=False) -word_vectors.save('/path/to/converted') -``` - -However, FastText embeddings have the functionality of returning vectors for out of vocabulary words using the sub-word information. If you want to use this then try `FastTextEmbeddings` class. - - -## FastText Embeddings - -FastText Embeddings can give you vectors for out of vocabulary(oov) words by using the sub-word information. To use this functionality with Flair, use `FastTextEmbeddings` class as shown: - -```python -from flair.embeddings import FastTextEmbeddings - -# init embedding -embedding = FastTextEmbeddings('/path/to/local/custom_fasttext_embeddings.bin') - -# create a sentence -sentence = Sentence('The grass is green .') - -# embed words in sentence -embedding.embed(sentence) -``` - -You can initialize the class by passing the remote downloadable URL as well. - -```python -embedding = FastTextEmbeddings('/path/to/remote/downloadable/custom_fasttext_embeddings.bin', use_local=False) -``` - -## Character Embeddings - -Some embeddings - such as character-features - are not pre-trained but rather trained on the downstream task. Normally -this requires you to implement a [hierarchical embedding architecture](http://neuroner.com/NeuroNERengine_with_caption_no_figure.png). - -With Flair, you don't need to worry about such things. Just choose the appropriate -embedding class and character features will then automatically train during downstream task training. - -```python -from flair.embeddings import CharacterEmbeddings - -# init embedding -embedding = CharacterEmbeddings() - -# create a sentence -sentence = Sentence('The grass is green .') - -# embed words in sentence -embedding.embed(sentence) -``` - -## New: Byte Pair Embeddings - -We now also include the byte pair embeddings calulated by @bheinzerling that segment words into subsequences. -This can dramatically reduce the model size vis-a-vis using normal word embeddings at nearly the same accuracy. -So, if you want to train small models try out the new `BytePairEmbeddings` class. - -You initialize with a language code (275 languages supported), a number of 'syllables' (one of ) and -a number of dimensions (one of 50, 100, 200 or 300). The following initializes and uses byte pair embeddings -for English: - -```python -from flair.embeddings import BytePairEmbeddings - -# init embedding -embedding = BytePairEmbeddings('en') - -# create a sentence -sentence = Sentence('The grass is green .') - -# embed words in sentence -embedding.embed(sentence) -``` +## Flair Embeddings -More information can be found -on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page. -Given its memory advantages, we would be interested to hear from the community how well these embeddings work. 
## Stacked Embeddings diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index 384767e0e..afd9b2222 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -13,500 +13,7 @@ hidden behind this interface. Simply instantiate the embedding class you require All embeddings produced with our methods are Pytorch vectors, so they can be immediately used for training and fine-tuning. -## Flair Embeddings -Contextual string embeddings are [powerful embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing) - that capture latent syntactic-semantic information that goes beyond -standard word embeddings. Key differences are: (1) they are trained without any explicit notion of words and -thus fundamentally model words as sequences of characters. And (2) they are **contextualized** by their -surrounding text, meaning that the *same word will have different embeddings depending on its -contextual use*. - -With Flair, you can use these embeddings simply by instantiating the appropriate embedding class, same as standard word embeddings: - -```python -from flair.embeddings import FlairEmbeddings - -# init embedding -flair_embedding_forward = FlairEmbeddings('news-forward') - -# create a sentence -sentence = Sentence('The grass is green .') - -# embed words in sentence -flair_embedding_forward.embed(sentence) -``` - -You choose which embeddings you load by passing the appropriate string to the constructor of the `FlairEmbeddings` class. -Currently, the following contextual string embeddings are provided (note: replace '*X*' with either '*forward*' or '*backward*'): - -| ID | Language | Embedding | -| ------------- | ------------- | ------------- | -| 'multi-X' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News) | -| 'multi-X-fast' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News), CPU-friendly | -| 'news-X' | English | Trained with 1 billion word corpus | -| 'news-X-fast' | English | Trained with 1 billion word corpus, CPU-friendly | -| 'mix-X' | English | Trained with mixed corpus (Web, Wikipedia, Subtitles) | -| 'ar-X' | Arabic | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'bg-X' | Bulgarian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'bg-X-fast' | Bulgarian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or SETimes) | -| 'cs-X' | Czech | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'cs-v0-X' | Czech | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | -| 'de-X' | German | Trained with mixed corpus (Web, Wikipedia, Subtitles) | -| 'de-historic-ha-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Hamburger Anzeiger* | -| 'de-historic-wz-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Wiener Zeitung* | -| 'es-X' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia | -| 'es-X-fast' | Spanish | Added by 
[@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipediam CPU-friendly | -| 'eu-X' | Basque | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'eu-v0-X' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | -| 'fa-X' | Persian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'fi-X' | Finnish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'fr-X' | French | Added by [@mhham](https://github.com/mhham): Trained with French Wikipedia | -| 'he-X' | Hebrew | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'hi-X' | Hindi | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'hr-X' | Croatian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'id-X' | Indonesian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'it-X' | Italian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'ja-X' | Japanese | Added by [@frtacoa](https://github.com/zalandoresearch/flair/issues/527): Trained with 439M words of Japanese Web crawls (2048 hidden states, 2 layers)| -| 'nl-X' | Dutch | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'nl-v0-X' | Dutch | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | -| 'no-X' | Norwegian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'pl-X' | Polish | Added by [@borchmann](https://github.com/applicaai/poleval-2018): Trained with web crawls (Polish part of CommonCrawl) | -| 'pl-opus-X' | Polish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'pt-X' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): LM embeddings | -| 'sl-X' | Slovenian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'sl-v0-X' | Slovenian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia and OpenSubtitles2018) | -| 'sv-X' | Swedish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'sv-v0-X' | Swedish | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or OpenSubtitles2018) | -| 'pubmed-X' | English | Added by [@jessepeng](https://github.com/zalandoresearch/flair/pull/519): Trained with 5% of PubMed abstracts until 2015 (1150 hidden states, 3 layers)| - - -So, if you want to load embeddings from the German forward LM model, instantiate the method as follows: - -```python -flair_de_forward = FlairEmbeddings('de-forward') -``` - -And if you want to load embeddings from the Bulgarian backward LM model, instantiate the method as follows: - -```python -flair_bg_backward = FlairEmbeddings('bg-backward') -``` - -## Recommended Flair Usage - -We recommend combining both forward and backward Flair embeddings. 
Depending on the task, we also recommend adding standard word embeddings into the mix. So, our recommended `StackedEmbedding` for most English tasks is: - - -```python -from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings - -# create a StackedEmbedding object that combines glove and forward/backward flair embeddings -stacked_embeddings = StackedEmbeddings([ - WordEmbeddings('glove'), - FlairEmbeddings('news-forward'), - FlairEmbeddings('news-backward'), - ]) -``` - -That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences. - -```python -sentence = Sentence('The grass is green .') - -# just embed a sentence using the StackedEmbedding as you would with any single embedding. -stacked_embeddings.embed(sentence) - -# now check out the embedded tokens. -for token in sentence: - print(token) - print(token.embedding) -``` -Words are now embedded using a concatenation of three different embeddings. This combination often gives state-of-the-art accuracy. - - -## PyTorch-Transformers - -Thanks to the brilliant [`pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) library from [Hugging Face](https://github.com/huggingface), -Flair is able to support various Transformer-based architectures like BERT or XLNet. - -The following embeddings can be used in Flair: - -* `BertEmbeddings` -* `OpenAIGPTEmbeddings` -* `OpenAIGPT2Embeddings` -* `TransformerXLEmbeddings` -* `XLNetEmbeddings` -* `XLMEmbeddings` -* `RoBERTaEmbeddings` - -This section shows how to use these Transformer-based architectures in Flair and is heavily based on the excellent -[PyTorch-Transformers pre-trained models documentation](https://huggingface.co/pytorch-transformers/pretrained_models.html). - -### BERT Embeddings - -[BERT embeddings](https://arxiv.org/pdf/1810.04805.pdf) were developed by Devlin et al. (2018) and are a different kind -of powerful word embedding based on a bidirectional transformer architecture. -The embeddings itself are wrapped into our simple embedding interface, so that they can be used like any other -embedding. - -```python -from flair.embeddings import BertEmbeddings - -# init embedding -embedding = BertEmbeddings() - -# create a sentence -sentence = Sentence('The grass is green .') - -# embed words in sentence -embedding.embed(sentence) -``` - -The `BertEmbeddings` class has several arguments: - -| Argument | Default | Description -| -------------------- | ------------------- | ------------------------------------------------- -| `bert_model_or_path` | `bert-base-uncased` | Defines BERT model or points to user-defined path -| `layers` | `-1,-2,-3,-4` | Defines the to be used layers of the Transformer-based model -| `pooling_operation` | `first` | See [Pooling operation section](#Pooling-operation). -| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix). 
- -You can load any of the pre-trained BERT models by providing `bert_model_or_path` during initialization: - -| Model | Details -| ------------------------------------------------------- | ----------------------------------------------- -| `bert-base-uncased` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | Trained on lower-cased English text -| `bert-large-uncased` | 24-layer, 1024-hidden, 16-heads, 340M parameters -| | Trained on lower-cased English text -| `bert-base-cased` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | Trained on cased English text -| `bert-large-cased` | 24-layer, 1024-hidden, 16-heads, 340M parameters -| | Trained on cased English text -| `bert-base-multilingual-uncased` | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters -| | Trained on lower-cased text in the top 102 languages with the largest Wikipedias -| | (see [details](https://github.com/google-research/bert/blob/master/multilingual.md)) -| `bert-base-multilingual-cased` | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters -| | Trained on cased text in the top 104 languages with the largest Wikipedias -| | (see [details](https://github.com/google-research/bert/blob/master/multilingual.md)) -| `bert-base-chinese` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | Trained on cased Chinese Simplified and Traditional text -| `bert-base-german-cased` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | Trained on cased German text by Deepset.ai -| | (see [details on deepset.ai website](https://deepset.ai/german-bert)) -| `bert-large-uncased-whole-word-masking` | 24-layer, 1024-hidden, 16-heads, 340M parameters -| | Trained on lower-cased English text using Whole-Word-Masking -| | (see [details](https://github.com/google-research/bert/#bert)) -| `bert-large-cased-whole-word-masking` | 24-layer, 1024-hidden, 16-heads, 340M parameters -| | Trained on cased English text using Whole-Word-Masking -| | (see [details](https://github.com/google-research/bert/#bert)) -| `bert-large-uncased-whole-word-masking-finetuned-squad` | 24-layer, 1024-hidden, 16-heads, 340M parameters -| | The `bert-large-uncased-whole-word-masking` model fine-tuned on SQuAD (see details of fine-tuning in the -| | [example section of PyTorch-Transformers](https://github.com/huggingface/pytorch-transformers/tree/master/examples)) -| `bert-large-cased-whole-word-masking-finetuned-squad` | 24-layer, 1024-hidden, 16-heads, 340M parameters -| | The `bert-large-cased-whole-word-masking` model fine-tuned on SQuAD -| | (see [details of fine-tuning in the example section](https://huggingface.co/pytorch-transformers/examples.html)) -| `bert-base-cased-finetuned-mrpc` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | The `bert-base-cased` model fine-tuned on MRPC -| | (see [details of fine-tuning in the example section of PyTorch-Transformers](https://huggingface.co/pytorch-transformers/examples.html)) - -## OpenAI GPT Embeddings - -The OpenAI GPT model was proposed by [Radford et. al (2018)](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf). -GPT is an uni-directional Transformer-based model. 
- -The following example shows how to use the `OpenAIGPTEmbeddings`: - -```python -from flair.embeddings import OpenAIGPTEmbeddings - -# init embedding -embedding = OpenAIGPTEmbeddings() - -# create a sentence -sentence = Sentence('Berlin and Munich are nice cities .') - -# embed words in sentence -embedding.embed(sentence) -``` - -The `OpenAIGPTEmbeddings` class has several arguments: - -| Argument | Default | Description -| ------------------------------- | ------------ | ------------------------------------------------- -| `pretrained_model_name_or_path` | `openai-gpt` | Defines name or path of GPT model -| `layers` | `1` | Defines the to be used layers of the Transformer-based model -| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation) -| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix) - -## OpenAI GPT-2 Embeddings - -The OpenAI GPT-2 model was proposed by [Radford et. al (2018)](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf). -GPT-2 is also an uni-directional Transformer-based model, that was trained on a larger corpus compared to the GPT model. - -The GPT-2 model can be used with the `OpenAIGPT2Embeddings` class: - -```python -from flair.embeddings import OpenAIGPT2Embeddings - -# init embedding -embedding = OpenAIGPT2Embeddings() - -# create a sentence -sentence = Sentence('The Englischer Garten is a large public park in the centre of Munich .') - -# embed words in sentence -embedding.embed(sentence) -``` - -The `OpenAIGPT2Embeddings` class has several arguments: - -| Argument | Default | Description -| ------------------------------- | ------------- | ------------------------------------------------- -| `pretrained_model_name_or_path` | `gpt2-medium` | Defines name or path of GPT-2 model -| `layers` | `1` | Defines the to be used layers of the Transformer-based model -| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation) -| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix) - -Following GPT-2 models can be used: - -| Model | Details -| ------------- | ----------------------------------------------- -| `gpt2` | 12-layer, 768-hidden, 12-heads, 117M parameters -| | OpenAI GPT-2 English model -| `gpt2-medium` | 24-layer, 1024-hidden, 16-heads, 345M parameters -| | OpenAI's Medium-sized GPT-2 English model - -## Transformer-XL Embeddings - -The Transformer-XL model was proposed by [Dai et. al (2019)](https://arxiv.org/abs/1901.02860). -It is an uni-directional Transformer-based model with relative positioning embeddings. 
- -The Transformer-XL model can be used with the `TransformerXLEmbeddings` class: - -```python -from flair.embeddings import TransformerXLEmbeddings - -# init embedding -embedding = TransformerXLEmbeddings() - -# create a sentence -sentence = Sentence('The Berlin Zoological Garden is the oldest and best-known zoo in Germany .') - -# embed words in sentence -embedding.embed(sentence) -``` - -The following arguments can be passed to the `TransformerXLEmbeddings` class: - -| Argument | Default | Description -| ------------------------------- | ------------------ | ------------------------------------------------- -| `pretrained_model_name_or_path` | `transfo-xl-wt103` | Defines name or path of Transformer-XL model -| `layers` | `1,2,3` | Defines the to be used layers of the Transformer-based model -| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix) - -Notice: The Transformer-XL model (trained on WikiText-103) is a word-based language model. Thus, no subword tokenization -is necessary is needed (`pooling_operation` is not needed). - -## XLNet Embeddings - -The XLNet model was proposed by [Yang et. al (2019)](https://arxiv.org/abs/1906.08237). -It is an extension of the Transformer-XL model using an autoregressive method to learn bi-directional contexts. - -The XLNet model can be used with the `XLNetEmbeddings` class: - -```python -from flair.embeddings import XLNetEmbeddings - -# init embedding -embedding = XLNetEmbeddings() - -# create a sentence -sentence = Sentence('The HofbrÀuhaus is a beer hall in Munich .') - -# embed words in sentence -embedding.embed(sentence) -``` - -The following arguments can be passed to the `XLNetEmbeddings` class: - -| Argument | Default | Description -| ------------------------------- | ------------------- | ------------------------------------------------- -| `pretrained_model_name_or_path` | `xlnet-large-cased` | Defines name or path of XLNet model -| `layers` | `1` | Defines the to be used layers of the Transformer-based model -| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation) -| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix) - -Following XLNet models can be used: - -| Model | Details -| ------------------ | ----------------------------------------------- -| `xlnet-base-cased` | 12-layer, 768-hidden, 12-heads, 110M parameters -| | XLNet English model -| `xlnet-large-cased`| 24-layer, 1024-hidden, 16-heads, 340M parameters -| | XLNet Large English model - -## XLM Embeddings - -The XLM model was proposed by [Lample and Conneau (2019)](https://arxiv.org/abs/1901.07291). -It extends the generative pre-training approach for English to multiple languages and show the effectiveness of -cross-lingual pretraining. 
- -The XLM model can be used with the `XLMEmbeddings` class: - -```python -from flair.embeddings import XLMEmbeddings - -# init embedding -embedding = XLMEmbeddings() - -# create a sentence -sentence = Sentence('The BER is an international airport under construction near Berlin .') - -# embed words in sentence -embedding.embed(sentence) -``` - -The following arguments can be passed to the `XLMEmbeddings` class: - -| Argument | Default | Description -| ------------------------------- | ------------------- | ------------------------------------------------- -| `pretrained_model_name_or_path` | `xlm-mlm-en-2048` | Defines name or path of XLM model -| `layers` | `1` | Defines the to be used layers of the Transformer-based model -| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation) -| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix) - -Following XLM models can be used: - -| Model | Details -| ------------------------- | ------------------------------------------------------------------------------------------------------- -| `xlm-mlm-en-2048` | 12-layer, 1024-hidden, 8-heads -| | XLM English model -| `xlm-mlm-ende-1024` | 6-layer, 1024-hidden, 8-heads -| | XLM English-German Multi-language model -| `xlm-mlm-enfr-1024` | 6-layer, 1024-hidden, 8-heads -| | XLM English-French Multi-language model -| `xlm-mlm-enro-1024` | 6-layer, 1024-hidden, 8-heads -| | XLM English-Romanian Multi-language model -| `xlm-mlm-xnli15-1024` | 12-layer, 1024-hidden, 8-heads -| | XLM Model pre-trained with MLM on the [15 XNLI languages](https://github.com/facebookresearch/XNLI) -| `xlm-mlm-tlm-xnli15-1024` | 12-layer, 1024-hidden, 8-heads -| | XLM Model pre-trained with MLM + TLM on the [15 XNLI languages](https://github.com/facebookresearch/XNLI) -| `xlm-clm-enfr-1024` | 12-layer, 1024-hidden, 8-heads -| | XLM English model trained with CLM (Causal Language Modeling) -| `xlm-clm-ende-1024` | 6-layer, 1024-hidden, 8-heads -| | XLM English-German Multi-language model trained with CLM (Causal Language Modeling) - -## RoBERTa Embeddings - -The RoBERTa (**R**obustly **o**ptimized **BERT** pre-training **a**pproach) model was proposed by [Liu et. al (2019)](https://arxiv.org/abs/1907.11692), -and uses an improved pre-training procedure to train a BERT model on a large corpus. 
- -It can be used with the `RoBERTaEmbeddings` class: - -```python -from flair.embeddings import RoBERTaEmbeddings - -# init embedding -embedding = RoBERTaEmbeddings() - -# create a sentence -sentence = Sentence("The Oktoberfest is the world's largest Volksfest .") - -# embed words in sentence -embedding.embed(sentence) -``` - -The following arguments can be passed to the `RoBERTaEmbeddings` class: - -| Argument | Default | Description -| ------------------------------- | --------------- | ------------------------------------------------- -| `pretrained_model_name_or_path` | `roberta-base` | Defines name or path of RoBERTa model -| `layers` | `-1` | Defines the to be used layers of the Transformer-based model -| `pooling_operation` | `first` | [Pooling operation section](#Pooling-operation) -| `use_scalar_mix` | `False` | [Scalar mix section](#Scalar-mix) - -Following XLM models can be used: - -| Model | Details -| -------------------- | ------------------------------------------------------------------------------------------------------- -| `roberta-base` | 12-layer, 768-hidden, 12-heads -| | RoBERTa English model -| `roberta-large` | 24-layer, 1024-hidden, 16-heads -| | RoBERTa English model -| `roberta-large-mnli` | 24-layer, 1024-hidden, 16-heads -| | RoBERTa English model, finetuned on MNLI - -### Pooling operation - -Most of the Transformer-based models (except Transformer-XL) use subword tokenization. E.g. the following -token `puppeteer` could be tokenized into the subwords: `pupp`, `##ete` and `##er`. - -We implement different pooling operations for these subwords to generate the final token representation: - -* `first`: only the embedding of the first subword is used -* `last`: only the embedding of the last subword is used -* `first_last`: embeddings of the first and last subwords are concatenated and used -* `mean`: a `torch.mean` over all subword embeddings is calculated and used - -### Scalar mix - -The Transformer-based models have a certain number of layers. [Liu et. al (2019)](https://arxiv.org/abs/1903.08855) -propose a technique called scalar mix, that computes a parameterised scalar mixture of user-defined layers. - -This technique is very useful, because for some downstream tasks like NER or PoS tagging it can be unclear which -layer(s) of a Transformer-based model perform well, and per-layer analysis can take a lot of time. - -To use scalar mix, all Transformer-based embeddings in Flair come with a `use_scalar_mix` argument. The following -example shows how to use scalar mix for a base RoBERTa model on all layers: - -```python -from flair.embeddings import RoBERTaEmbeddings - -# init embedding -embedding = RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-base", layers="0,1,2,3,4,5,6,7,8,9,10,11,12", - pooling_operation="first", use_scalar_mix=True) - -# create a sentence -sentence = Sentence("The Oktoberfest is the world's largest Volksfest .") - -# embed words in sentence -embedding.embed(sentence) -``` - -## ELMo Embeddings - -[ELMo embeddings](http://www.aclweb.org/anthology/N18-1202) were presented by Peters et al. in 2018. They are using -a bidirectional recurrent neural network to predict the next word in a text. -We are using the implementation of [AllenNLP](https://allennlp.org/elmo). As this implementation comes with a lot of -sub-dependencies, which we don't want to include in Flair, you need to first install the library via -`pip install allennlp` before you can use it in Flair. 
-Using the embeddings is as simple as using any other embedding type: - -```python -from flair.embeddings import ELMoEmbeddings - -# init embedding -embedding = ELMoEmbeddings() - -# create a sentence -sentence = Sentence('The grass is green .') - -# embed words in sentence -embedding.embed(sentence) -``` - -AllenNLP provides the following pre-trained models. To use any of the following models inside Flair -simple specify the embedding id when initializing the `ELMoEmbeddings`. - -| ID | Language | Embedding | -| ------------- | ------------- | ------------- | -| 'small' | English | 1024-hidden, 1 layer, 14.6M parameters | -| 'medium' | English | 2048-hidden, 1 layer, 28.0M parameters | -| 'original' | English | 4096-hidden, 2 layers, 93.6M parameters | -| 'pt' | Portuguese | | -| 'pubmed' | English biomedical data | [more information](https://allennlp.org/elmo) | ## Combining BERT and Flair diff --git a/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md b/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md new file mode 100644 index 000000000..83097ebe1 --- /dev/null +++ b/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md @@ -0,0 +1,26 @@ +## New: Byte Pair Embeddings + +We now also include the byte pair embeddings calulated by @bheinzerling that segment words into subsequences. +This can dramatically reduce the model size vis-a-vis using normal word embeddings at nearly the same accuracy. +So, if you want to train small models try out the new `BytePairEmbeddings` class. + +You initialize with a language code (275 languages supported), a number of 'syllables' (one of ) and +a number of dimensions (one of 50, 100, 200 or 300). The following initializes and uses byte pair embeddings +for English: + +```python +from flair.embeddings import BytePairEmbeddings + +# init embedding +embedding = BytePairEmbeddings('en') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` + +More information can be found +on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page. +Given its memory advantages, we would be interested to hear from the community how well these embeddings work. \ No newline at end of file diff --git a/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md b/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md new file mode 100644 index 000000000..d498719c5 --- /dev/null +++ b/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md @@ -0,0 +1,20 @@ +## Character Embeddings + +Some embeddings - such as character-features - are not pre-trained but rather trained on the downstream task. Normally +this requires you to implement a [hierarchical embedding architecture](http://neuroner.com/NeuroNERengine_with_caption_no_figure.png). + +With Flair, you don't need to worry about such things. Just choose the appropriate +embedding class and character features will then automatically train during downstream task training. 
+ +```python +from flair.embeddings import CharacterEmbeddings + +# init embedding +embedding = CharacterEmbeddings() + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` \ No newline at end of file diff --git a/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md b/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md new file mode 100644 index 000000000..c7836c769 --- /dev/null +++ b/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md @@ -0,0 +1,115 @@ +# Classic Word Embeddings + +Classic word embeddings are static and word-level, meaning that each distinct word gets exactly one pre-computed +embedding. Most embeddings fall under this class, including the popular GloVe or Komninos embeddings. + +Simply instantiate the WordEmbeddings class and pass a string identifier of the embedding you wish to load. So, if +you want to use GloVe embeddings, pass the string 'glove' to the constructor: + +```python +from flair.embeddings import WordEmbeddings + +# init embedding +glove_embedding = WordEmbeddings('glove') +``` +Now, create an example sentence and call the embedding's `embed()` method. You can also pass a list of sentences to +this method since some embedding types make use of batching to increase speed. + +```python +# create sentence. +sentence = Sentence('The grass is green .') + +# embed a sentence using glove. +glove_embedding.embed(sentence) + +# now check out the embedded tokens. +for token in sentence: + print(token) + print(token.embedding) +``` + +This prints out the tokens and their embeddings. GloVe embeddings are Pytorch vectors of dimensionality 100. + +You choose which pre-trained embeddings you load by passing the appropriate +id string to the constructor of the `WordEmbeddings` class. Typically, you use +the **two-letter language code** to init an embedding, so 'en' for English and +'de' for German and so on. By default, this will initialize FastText embeddings trained over Wikipedia. +You can also always use FastText embeddings over Web crawls, by instantiating with '-crawl'. So 'de-crawl' +to use embeddings trained over German web crawls. + +For English, we provide a few more options, so +here you can choose between instantiating 'en-glove', 'en-extvec' and so on. 
+ +The following embeddings are currently supported: + +| ID | Language | Embedding | +| ------------- | ------------- | ------------- | +| 'en-glove' (or 'glove') | English | GloVe embeddings | +| 'en-extvec' (or 'extvec') | English |Komninos embeddings | +| 'en-crawl' (or 'crawl') | English | FastText embeddings over Web crawls | +| 'en-twitter' (or 'twitter') | English | Twitter embeddings | +| 'en-turian' (or 'turian') | English | Turian embeddings (small) | +| 'en' (or 'en-news' or 'news') |English | FastText embeddings over news and wikipedia data | +| 'de' | German |German FastText embeddings | +| 'nl' | Dutch | Dutch FastText embeddings | +| 'fr' | French | French FastText embeddings | +| 'it' | Italian | Italian FastText embeddings | +| 'es' | Spanish | Spanish FastText embeddings | +| 'pt' | Portuguese | Portuguese FastText embeddings | +| 'ro' | Romanian | Romanian FastText embeddings | +| 'ca' | Catalan | Catalan FastText embeddings | +| 'sv' | Swedish | Swedish FastText embeddings | +| 'da' | Danish | Danish FastText embeddings | +| 'no' | Norwegian | Norwegian FastText embeddings | +| 'fi' | Finnish | Finnish FastText embeddings | +| 'pl' | Polish | Polish FastText embeddings | +| 'cz' | Czech | Czech FastText embeddings | +| 'sk' | Slovak | Slovak FastText embeddings | +| 'si' | Slovenian | Slovenian FastText embeddings | +| 'sr' | Serbian | Serbian FastText embeddings | +| 'hr' | Croatian | Croatian FastText embeddings | +| 'bg' | Bulgarian | Bulgarian FastText embeddings | +| 'ru' | Russian | Russian FastText embeddings | +| 'ar' | Arabic | Arabic FastText embeddings | +| 'he' | Hebrew | Hebrew FastText embeddings | +| 'tr' | Turkish | Turkish FastText embeddings | +| 'fa' | Persian | Persian FastText embeddings | +| 'ja' | Japanese | Japanese FastText embeddings | +| 'ko' | Korean | Korean FastText embeddings | +| 'zh' | Chinese | Chinese FastText embeddings | +| 'hi' | Hindi | Hindi FastText embeddings | +| 'id' | Indonesian | Indonesian FastText embeddings | +| 'eu' | Basque | Basque FastText embeddings | + +So, if you want to load German FastText embeddings, instantiate as follows: + +```python +german_embedding = WordEmbeddings('de') +``` + +Alternatively, if you want to load German FastText embeddings trained over crawls, instantiate as follows: + +```python +german_embedding = WordEmbeddings('de-crawl') +``` + +We generally recommend the FastText embeddings, or GloVe if you want a smaller model. + +If you want to use any other embeddings (not listed in the list above), you can load those by calling +```python +custom_embedding = WordEmbeddings('path/to/your/custom/embeddings.gensim') +``` +If you want to load custom embeddings you need to make sure, that the custom embeddings are correctly formatted to +[gensim](https://radimrehurek.com/gensim/models/word2vec.html). + +You can, for example, convert [FastText embeddings](https://fasttext.cc/docs/en/crawl-vectors.html) to gensim using the +following code snippet: +```python +import gensim + +word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/path/to/fasttext/embeddings.txt', binary=False) +word_vectors.save('/path/to/converted') +``` + +However, FastText embeddings have the functionality of returning vectors for out of vocabulary words using the sub-word information. If you want to use this then try `FastTextEmbeddings` class. 
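+A minimal sketch of that class (it is documented in the dedicated FastText embeddings page added in this change; the path below is a placeholder for your own `.bin` model):
+
+```python
+from flair.embeddings import FastTextEmbeddings
+
+# load a FastText model that can also produce vectors for out-of-vocabulary words
+oov_embedding = FastTextEmbeddings('/path/to/local/custom_fasttext_embeddings.bin')
+```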
+
diff --git a/resources/docs/embeddings/ELMO_EMBEDDINGS.md b/resources/docs/embeddings/ELMO_EMBEDDINGS.md
new file mode 100644
index 000000000..155a4063e
--- /dev/null
+++ b/resources/docs/embeddings/ELMO_EMBEDDINGS.md
@@ -0,0 +1,32 @@
+## ELMo Embeddings
+
+[ELMo embeddings](http://www.aclweb.org/anthology/N18-1202) were presented by Peters et al. in 2018. They use
+a bidirectional recurrent neural network to predict the next word in a text.
+We are using the implementation of [AllenNLP](https://allennlp.org/elmo). As this implementation comes with a lot of
+sub-dependencies, which we don't want to include in Flair, you need to first install the library via
+`pip install allennlp` before you can use it in Flair.
+Using the embeddings is as simple as using any other embedding type:
+
+```python
+from flair.embeddings import ELMoEmbeddings
+
+# init embedding
+embedding = ELMoEmbeddings()
+
+# create a sentence
+sentence = Sentence('The grass is green .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+AllenNLP provides the following pre-trained models. To use any of the following models inside Flair
+simply specify the embedding id when initializing the `ELMoEmbeddings`.
+
+| ID | Language | Embedding |
+| ------------- | ------------- | ------------- |
+| 'small' | English | 1024-hidden, 1 layer, 14.6M parameters |
+| 'medium' | English | 2048-hidden, 1 layer, 28.0M parameters |
+| 'original' | English | 4096-hidden, 2 layers, 93.6M parameters |
+| 'pt' | Portuguese | |
+| 'pubmed' | English biomedical data | [more information](https://allennlp.org/elmo) |
\ No newline at end of file
diff --git a/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md b/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md
new file mode 100644
index 000000000..02b953fdd
--- /dev/null
+++ b/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md
@@ -0,0 +1,23 @@
+## FastText Embeddings
+
+FastText embeddings can give you vectors for out-of-vocabulary (OOV) words by using the sub-word information. To use this functionality with Flair, use the `FastTextEmbeddings` class as shown:
+
+```python
+from flair.embeddings import FastTextEmbeddings
+
+# init embedding
+embedding = FastTextEmbeddings('/path/to/local/custom_fasttext_embeddings.bin')
+
+# create a sentence
+sentence = Sentence('The grass is green .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+You can also initialize the class by passing a remote downloadable URL.
+
+```python
+embedding = FastTextEmbeddings('/path/to/remote/downloadable/custom_fasttext_embeddings.bin', use_local=False)
+```
+
diff --git a/resources/docs/embeddings/FLAIR_EMBEDDINGS.md b/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
new file mode 100644
index 000000000..bdd65392e
--- /dev/null
+++ b/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
@@ -0,0 +1,112 @@
+
+## Flair Embeddings
+
+Contextual string embeddings are [powerful embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing)
+ that capture latent syntactic-semantic information that goes beyond
+standard word embeddings. Key differences are: (1) they are trained without any explicit notion of words and
+thus fundamentally model words as sequences of characters. And (2) they are **contextualized** by their
+surrounding text, meaning that the *same word will have different embeddings depending on its
+contextual use*.
+ +With Flair, you can use these embeddings simply by instantiating the appropriate embedding class, same as standard word embeddings: + +```python +from flair.embeddings import FlairEmbeddings + +# init embedding +flair_embedding_forward = FlairEmbeddings('news-forward') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +flair_embedding_forward.embed(sentence) +``` + +You choose which embeddings you load by passing the appropriate string to the constructor of the `FlairEmbeddings` class. +Currently, the following contextual string embeddings are provided (note: replace '*X*' with either '*forward*' or '*backward*'): + +| ID | Language | Embedding | +| ------------- | ------------- | ------------- | +| 'multi-X' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News) | +| 'multi-X-fast' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News), CPU-friendly | +| 'news-X' | English | Trained with 1 billion word corpus | +| 'news-X-fast' | English | Trained with 1 billion word corpus, CPU-friendly | +| 'mix-X' | English | Trained with mixed corpus (Web, Wikipedia, Subtitles) | +| 'ar-X' | Arabic | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'bg-X' | Bulgarian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'bg-X-fast' | Bulgarian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or SETimes) | +| 'cs-X' | Czech | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'cs-v0-X' | Czech | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'de-X' | German | Trained with mixed corpus (Web, Wikipedia, Subtitles) | +| 'de-historic-ha-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Hamburger Anzeiger* | +| 'de-historic-wz-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Wiener Zeitung* | +| 'es-X' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia | +| 'es-X-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipediam CPU-friendly | +| 'eu-X' | Basque | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'eu-v0-X' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'fa-X' | Persian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'fi-X' | Finnish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'fr-X' | French | Added by [@mhham](https://github.com/mhham): Trained with French Wikipedia | +| 'he-X' | Hebrew | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'hi-X' | Hindi | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'hr-X' | Croatian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'id-X' | Indonesian | 
Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'it-X' | Italian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'ja-X' | Japanese | Added by [@frtacoa](https://github.com/zalandoresearch/flair/issues/527): Trained with 439M words of Japanese Web crawls (2048 hidden states, 2 layers)| +| 'nl-X' | Dutch | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'nl-v0-X' | Dutch | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'no-X' | Norwegian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'pl-X' | Polish | Added by [@borchmann](https://github.com/applicaai/poleval-2018): Trained with web crawls (Polish part of CommonCrawl) | +| 'pl-opus-X' | Polish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'pt-X' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): LM embeddings | +| 'sl-X' | Slovenian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'sl-v0-X' | Slovenian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia and OpenSubtitles2018) | +| 'sv-X' | Swedish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'sv-v0-X' | Swedish | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or OpenSubtitles2018) | +| 'pubmed-X' | English | Added by [@jessepeng](https://github.com/zalandoresearch/flair/pull/519): Trained with 5% of PubMed abstracts until 2015 (1150 hidden states, 3 layers)| + + +So, if you want to load embeddings from the German forward LM model, instantiate the method as follows: + +```python +flair_de_forward = FlairEmbeddings('de-forward') +``` + +And if you want to load embeddings from the Bulgarian backward LM model, instantiate the method as follows: + +```python +flair_bg_backward = FlairEmbeddings('bg-backward') +``` + +## Recommended Flair Usage + +We recommend combining both forward and backward Flair embeddings. Depending on the task, we also recommend adding standard word embeddings into the mix. So, our recommended `StackedEmbedding` for most English tasks is: + + +```python +from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings + +# create a StackedEmbedding object that combines glove and forward/backward flair embeddings +stacked_embeddings = StackedEmbeddings([ + WordEmbeddings('glove'), + FlairEmbeddings('news-forward'), + FlairEmbeddings('news-backward'), + ]) +``` + +That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences. + +```python +sentence = Sentence('The grass is green .') + +# just embed a sentence using the StackedEmbedding as you would with any single embedding. +stacked_embeddings.embed(sentence) + +# now check out the embedded tokens. +for token in sentence: + print(token) + print(token.embedding) +``` +Words are now embedded using a concatenation of three different embeddings. This combination often gives state-of-the-art accuracy. 
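+
+Note that this change also registers a few new Flair LM checkpoints in `flair/embeddings.py` that are not yet listed in the table above, e.g. Tamil ('ta-forward' / 'ta-backward') and updated Basque models (with 'eu-v1-X' keeping the previous version). Assuming they behave like the other IDs, they load the same way (a minimal sketch):
+
+```python
+from flair.embeddings import FlairEmbeddings
+
+# load the newly registered Tamil forward model (ID taken from flair/embeddings.py in this change)
+flair_ta_forward = FlairEmbeddings('ta-forward')
+```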
+ diff --git a/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md b/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md new file mode 100644 index 000000000..e3c6fdb6b --- /dev/null +++ b/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md @@ -0,0 +1,350 @@ + +## PyTorch-Transformers + +Thanks to the brilliant [`pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) library from [Hugging Face](https://github.com/huggingface), +Flair is able to support various Transformer-based architectures like BERT or XLNet. + +The following embeddings can be used in Flair: + +* `BertEmbeddings` +* `OpenAIGPTEmbeddings` +* `OpenAIGPT2Embeddings` +* `TransformerXLEmbeddings` +* `XLNetEmbeddings` +* `XLMEmbeddings` +* `RoBERTaEmbeddings` + +This section shows how to use these Transformer-based architectures in Flair and is heavily based on the excellent +[PyTorch-Transformers pre-trained models documentation](https://huggingface.co/pytorch-transformers/pretrained_models.html). + +### BERT Embeddings + +[BERT embeddings](https://arxiv.org/pdf/1810.04805.pdf) were developed by Devlin et al. (2018) and are a different kind +of powerful word embedding based on a bidirectional transformer architecture. +The embeddings itself are wrapped into our simple embedding interface, so that they can be used like any other +embedding. + +```python +from flair.embeddings import BertEmbeddings + +# init embedding +embedding = BertEmbeddings() + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` + +The `BertEmbeddings` class has several arguments: + +| Argument | Default | Description +| -------------------- | ------------------- | ------------------------------------------------- +| `bert_model_or_path` | `bert-base-uncased` | Defines BERT model or points to user-defined path +| `layers` | `-1,-2,-3,-4` | Defines the to be used layers of the Transformer-based model +| `pooling_operation` | `first` | See [Pooling operation section](#Pooling-operation). +| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix). 
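+
+These defaults can be overridden at construction time. A minimal sketch (argument names are taken from the table above; the chosen model, layers and pooling operation are only illustrative):
+
+```python
+from flair.embeddings import BertEmbeddings
+
+# use a cased BERT model, take the last two layers and average all subword embeddings per token
+embedding = BertEmbeddings(bert_model_or_path='bert-base-cased',
+                           layers='-1,-2',
+                           pooling_operation='mean')
+```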
+ +You can load any of the pre-trained BERT models by providing `bert_model_or_path` during initialization: + +| Model | Details +| ------------------------------------------------------- | ----------------------------------------------- +| `bert-base-uncased` | 12-layer, 768-hidden, 12-heads, 110M parameters +| | Trained on lower-cased English text +| `bert-large-uncased` | 24-layer, 1024-hidden, 16-heads, 340M parameters +| | Trained on lower-cased English text +| `bert-base-cased` | 12-layer, 768-hidden, 12-heads, 110M parameters +| | Trained on cased English text +| `bert-large-cased` | 24-layer, 1024-hidden, 16-heads, 340M parameters +| | Trained on cased English text +| `bert-base-multilingual-uncased` | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters +| | Trained on lower-cased text in the top 102 languages with the largest Wikipedias +| | (see [details](https://github.com/google-research/bert/blob/master/multilingual.md)) +| `bert-base-multilingual-cased` | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters +| | Trained on cased text in the top 104 languages with the largest Wikipedias +| | (see [details](https://github.com/google-research/bert/blob/master/multilingual.md)) +| `bert-base-chinese` | 12-layer, 768-hidden, 12-heads, 110M parameters +| | Trained on cased Chinese Simplified and Traditional text +| `bert-base-german-cased` | 12-layer, 768-hidden, 12-heads, 110M parameters +| | Trained on cased German text by Deepset.ai +| | (see [details on deepset.ai website](https://deepset.ai/german-bert)) +| `bert-large-uncased-whole-word-masking` | 24-layer, 1024-hidden, 16-heads, 340M parameters +| | Trained on lower-cased English text using Whole-Word-Masking +| | (see [details](https://github.com/google-research/bert/#bert)) +| `bert-large-cased-whole-word-masking` | 24-layer, 1024-hidden, 16-heads, 340M parameters +| | Trained on cased English text using Whole-Word-Masking +| | (see [details](https://github.com/google-research/bert/#bert)) +| `bert-large-uncased-whole-word-masking-finetuned-squad` | 24-layer, 1024-hidden, 16-heads, 340M parameters +| | The `bert-large-uncased-whole-word-masking` model fine-tuned on SQuAD (see details of fine-tuning in the +| | [example section of PyTorch-Transformers](https://github.com/huggingface/pytorch-transformers/tree/master/examples)) +| `bert-large-cased-whole-word-masking-finetuned-squad` | 24-layer, 1024-hidden, 16-heads, 340M parameters +| | The `bert-large-cased-whole-word-masking` model fine-tuned on SQuAD +| | (see [details of fine-tuning in the example section](https://huggingface.co/pytorch-transformers/examples.html)) +| `bert-base-cased-finetuned-mrpc` | 12-layer, 768-hidden, 12-heads, 110M parameters +| | The `bert-base-cased` model fine-tuned on MRPC +| | (see [details of fine-tuning in the example section of PyTorch-Transformers](https://huggingface.co/pytorch-transformers/examples.html)) + +## OpenAI GPT Embeddings + +The OpenAI GPT model was proposed by [Radford et. al (2018)](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf). +GPT is an uni-directional Transformer-based model. 
+
+The following example shows how to use the `OpenAIGPTEmbeddings`:
+
+```python
+from flair.data import Sentence
+from flair.embeddings import OpenAIGPTEmbeddings
+
+# init embedding
+embedding = OpenAIGPTEmbeddings()
+
+# create a sentence
+sentence = Sentence('Berlin and Munich are nice cities .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+The `OpenAIGPTEmbeddings` class has several arguments:
+
+| Argument | Default | Description
+| ------------------------------- | ------------ | -------------------------------------------------
+| `pretrained_model_name_or_path` | `openai-gpt` | Defines name or path of GPT model
+| `layers` | `1` | Defines which layers of the Transformer-based model to use
+| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation)
+| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix)
+
+## OpenAI GPT-2 Embeddings
+
+The OpenAI GPT-2 model was proposed by [Radford et al. (2019)](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf).
+GPT-2 is also a uni-directional Transformer-based model that was trained on a larger corpus than the GPT model.
+
+The GPT-2 model can be used with the `OpenAIGPT2Embeddings` class:
+
+```python
+from flair.data import Sentence
+from flair.embeddings import OpenAIGPT2Embeddings
+
+# init embedding
+embedding = OpenAIGPT2Embeddings()
+
+# create a sentence
+sentence = Sentence('The Englischer Garten is a large public park in the centre of Munich .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+The `OpenAIGPT2Embeddings` class has several arguments:
+
+| Argument | Default | Description
+| ------------------------------- | ------------- | -------------------------------------------------
+| `pretrained_model_name_or_path` | `gpt2-medium` | Defines name or path of GPT-2 model
+| `layers` | `1` | Defines which layers of the Transformer-based model to use
+| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation)
+| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix)
+
+The following GPT-2 models can be used:
+
+| Model | Details
+| ------------- | -----------------------------------------------
+| `gpt2` | 12-layer, 768-hidden, 12-heads, 117M parameters
+| | OpenAI GPT-2 English model
+| `gpt2-medium` | 24-layer, 1024-hidden, 16-heads, 345M parameters
+| | OpenAI's Medium-sized GPT-2 English model
+
+## Transformer-XL Embeddings
+
+The Transformer-XL model was proposed by [Dai et al. (2019)](https://arxiv.org/abs/1901.02860).
+It is a uni-directional Transformer-based model with relative positional embeddings.
+
+The Transformer-XL model can be used with the `TransformerXLEmbeddings` class:
+
+```python
+from flair.data import Sentence
+from flair.embeddings import TransformerXLEmbeddings
+
+# init embedding
+embedding = TransformerXLEmbeddings()
+
+# create a sentence
+sentence = Sentence('The Berlin Zoological Garden is the oldest and best-known zoo in Germany .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+The following arguments can be passed to the `TransformerXLEmbeddings` class:
+
+| Argument | Default | Description
+| ------------------------------- | ------------------ | -------------------------------------------------
+| `pretrained_model_name_or_path` | `transfo-xl-wt103` | Defines name or path of Transformer-XL model
+| `layers` | `1,2,3` | Defines which layers of the Transformer-based model to use
+| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix)
+
+Note: The Transformer-XL model (trained on WikiText-103) is a word-based language model. Thus, no subword tokenization
+is needed and no `pooling_operation` has to be specified.
+
+## XLNet Embeddings
+
+The XLNet model was proposed by [Yang et al. (2019)](https://arxiv.org/abs/1906.08237).
+It is an extension of the Transformer-XL model using an autoregressive method to learn bi-directional contexts.
+
+The XLNet model can be used with the `XLNetEmbeddings` class:
+
+```python
+from flair.data import Sentence
+from flair.embeddings import XLNetEmbeddings
+
+# init embedding
+embedding = XLNetEmbeddings()
+
+# create a sentence
+sentence = Sentence('The Hofbräuhaus is a beer hall in Munich .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+The following arguments can be passed to the `XLNetEmbeddings` class:
+
+| Argument | Default | Description
+| ------------------------------- | ------------------- | -------------------------------------------------
+| `pretrained_model_name_or_path` | `xlnet-large-cased` | Defines name or path of XLNet model
+| `layers` | `1` | Defines which layers of the Transformer-based model to use
+| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation)
+| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix)
+
+The following XLNet models can be used:
+
+| Model | Details
+| ------------------ | -----------------------------------------------
+| `xlnet-base-cased` | 12-layer, 768-hidden, 12-heads, 110M parameters
+| | XLNet English model
+| `xlnet-large-cased`| 24-layer, 1024-hidden, 16-heads, 340M parameters
+| | XLNet Large English model
+
+## XLM Embeddings
+
+The XLM model was proposed by [Lample and Conneau (2019)](https://arxiv.org/abs/1901.07291).
+It extends the generative pre-training approach for English to multiple languages and shows the effectiveness of
+cross-lingual pretraining.
+
+The XLM model can be used with the `XLMEmbeddings` class:
+
+```python
+from flair.data import Sentence
+from flair.embeddings import XLMEmbeddings
+
+# init embedding
+embedding = XLMEmbeddings()
+
+# create a sentence
+sentence = Sentence('The BER is an international airport under construction near Berlin .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+The following arguments can be passed to the `XLMEmbeddings` class:
+
+| Argument | Default | Description
+| ------------------------------- | ------------------- | -------------------------------------------------
+| `pretrained_model_name_or_path` | `xlm-mlm-en-2048` | Defines name or path of XLM model
+| `layers` | `1` | Defines which layers of the Transformer-based model to use
+| `pooling_operation` | `first_last` | See [Pooling operation section](#Pooling-operation)
+| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix)
+
+The following XLM models can be used:
+
+| Model | Details
+| ------------------------- | -------------------------------------------------------------------------------------------------------
+| `xlm-mlm-en-2048` | 12-layer, 1024-hidden, 8-heads
+| | XLM English model
+| `xlm-mlm-ende-1024` | 6-layer, 1024-hidden, 8-heads
+| | XLM English-German Multi-language model
+| `xlm-mlm-enfr-1024` | 6-layer, 1024-hidden, 8-heads
+| | XLM English-French Multi-language model
+| `xlm-mlm-enro-1024` | 6-layer, 1024-hidden, 8-heads
+| | XLM English-Romanian Multi-language model
+| `xlm-mlm-xnli15-1024` | 12-layer, 1024-hidden, 8-heads
+| | XLM Model pre-trained with MLM on the [15 XNLI languages](https://github.com/facebookresearch/XNLI)
+| `xlm-mlm-tlm-xnli15-1024` | 12-layer, 1024-hidden, 8-heads
+| | XLM Model pre-trained with MLM + TLM on the [15 XNLI languages](https://github.com/facebookresearch/XNLI)
+| `xlm-clm-enfr-1024` | 12-layer, 1024-hidden, 8-heads
+| | XLM English model trained with CLM (Causal Language Modeling)
+| `xlm-clm-ende-1024` | 6-layer, 1024-hidden, 8-heads
+| | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)
+
+## RoBERTa Embeddings
+
+The RoBERTa (**R**obustly **o**ptimized **BERT** pre-training **a**pproach) model was proposed by [Liu et al. (2019)](https://arxiv.org/abs/1907.11692),
+and uses an improved pre-training procedure to train a BERT model on a large corpus.
+
+It can be used with the `RoBERTaEmbeddings` class:
+
+```python
+from flair.data import Sentence
+from flair.embeddings import RoBERTaEmbeddings
+
+# init embedding
+embedding = RoBERTaEmbeddings()
+
+# create a sentence
+sentence = Sentence("The Oktoberfest is the world's largest Volksfest .")
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+The following arguments can be passed to the `RoBERTaEmbeddings` class:
+
+| Argument | Default | Description
+| ------------------------------- | --------------- | -------------------------------------------------
+| `pretrained_model_name_or_path` | `roberta-base` | Defines name or path of RoBERTa model
+| `layers` | `-1` | Defines which layers of the Transformer-based model to use
+| `pooling_operation` | `first` | See [Pooling operation section](#Pooling-operation)
+| `use_scalar_mix` | `False` | See [Scalar mix section](#Scalar-mix)
+
+The following RoBERTa models can be used:
+
+| Model | Details
+| -------------------- | -------------------------------------------------------------------------------------------------------
+| `roberta-base` | 12-layer, 768-hidden, 12-heads
+| | RoBERTa English model
+| `roberta-large` | 24-layer, 1024-hidden, 16-heads
+| | RoBERTa English model
+| `roberta-large-mnli` | 24-layer, 1024-hidden, 16-heads
+| | RoBERTa English model, fine-tuned on MNLI
+
+## Pooling operation
+
+Most of the Transformer-based models (except Transformer-XL) use subword tokenization. For example, the
+token `puppeteer` could be tokenized into the subwords `pupp`, `##ete` and `##er`.
+
+We implement different pooling operations for these subwords to generate the final token representation:
+
+* `first`: only the embedding of the first subword is used
+* `last`: only the embedding of the last subword is used
+* `first_last`: embeddings of the first and last subwords are concatenated and used
+* `mean`: a `torch.mean` over all subword embeddings is calculated and used
+
+## Scalar mix
+
+The Transformer-based models have a certain number of layers. [Liu et al. (2019)](https://arxiv.org/abs/1903.08855)
+propose a technique called scalar mix, which computes a parameterised scalar mixture of user-defined layers.
+
+This technique is very useful because for some downstream tasks like NER or PoS tagging it can be unclear which
+layer(s) of a Transformer-based model perform well, and per-layer analysis can take a lot of time.
+
+To use scalar mix, all Transformer-based embeddings in Flair come with a `use_scalar_mix` argument.
The following +example shows how to use scalar mix for a base RoBERTa model on all layers: + +```python +from flair.embeddings import RoBERTaEmbeddings + +# init embedding +embedding = RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-base", layers="0,1,2,3,4,5,6,7,8,9,10,11,12", + pooling_operation="first", use_scalar_mix=True) + +# create a sentence +sentence = Sentence("The Oktoberfest is the world's largest Volksfest .") + +# embed words in sentence +embedding.embed(sentence) +``` From 1c6d940564496cbe46673100fa643a13229e7cdb Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:32:37 +0200 Subject: [PATCH 02/23] Update TUTORIAL_3_WORD_EMBEDDING.md --- resources/docs/TUTORIAL_3_WORD_EMBEDDING.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md index 5d037b5dc..938cc273d 100644 --- a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md +++ b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md @@ -11,10 +11,11 @@ All word embedding classes inherit from the `TokenEmbeddings` class and implemen call to embed your text. This means that for most users of Flair, the complexity of different embeddings remains hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text. +This tutorial introduces some common embeddings. For an overview of all supported embeddings, check here. + All embeddings produced with our methods are PyTorch vectors, so they can be immediately used for training and fine-tuning. - ## Classic Word Embeddings Classic word embeddings are static and word-level, meaning that each distinct word gets exactly one pre-computed From 15725c5157058555fa60e87a0db7aa7caf9fc801 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:36:12 +0200 Subject: [PATCH 03/23] Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md --- .../TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index afd9b2222..1197c5b77 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -1,19 +1,29 @@ -# Tutorial 4: BERT, ELMo, and Flair Embeddings +# Tutorial 4: List of all embeddings -Next to standard WordEmbeddings and CharacterEmbeddings, we also provide classes for BERT, ELMo and Flair embeddings. These embeddings enable you to train truly state-of-the-art NLP models. +This is not so much a tutorial, but rather a list of all embeddings that we currently support in Flair. We assume that you're familiar with the [base types](/resources/docs/TUTORIAL_1_BASICS.md) of this library as well as [standard word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md), in particular the `StackedEmbeddings` class. -This tutorial explains how to use these embeddings. We assume that you're familiar with the [base types](/resources/docs/TUTORIAL_1_BASICS.md) of this library as well as [standard word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md), in particular the `StackedEmbeddings` class. - -## Embeddings +## Overview All word embedding classes inherit from the `TokenEmbeddings` class and implement the `embed()` method which you need to call to embed your text. This means that for most users of Flair, the complexity of different embeddings remains hidden behind this interface. 
Simply instantiate the embedding class you require and call `embed()` to embed your text. -All embeddings produced with our methods are Pytorch vectors, so they can be immediately used for training and -fine-tuning. - - +The following word embeddings are currently supported: + +| Class | Type | Paper | +| ------------- | ------------- | ------------- | +| BytePairEmbeddings | Subword-level word embeddings | | +| CharacterEmbeddings | Task-trained character-level embeddings of words | | +| ELMoEmbeddings | Contextualized word-level embeddings | | +| FastTextEmbeddings | Word embeddings with subword features | | +| FlairEmbeddings | Contextualized character-level embeddings | | +| WordEmbeddings | Classic word embeddings | | +| BertEmbeddings | Embeddings from pretrained BERT | | +| OpenAIGPTEmbeddings and OpenAIGPT2Embeddings | Embeddings from pretrained OpenAIGPT models | | +| TransformerXLEmbeddings | Embeddings from pretrained transformer-XL | | +| XLNetEmbeddings | Embeddings from pretrained XLNet | | +| XLMEmbeddings | Embeddings from pretrained XLM | | +| RoBERTaEmbeddings | Embeddings from RoBERTa | | ## Combining BERT and Flair From f5d1770fb9276bc14cf95503c2ca1ed422493d91 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:38:17 +0200 Subject: [PATCH 04/23] Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md --- .../TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index 1197c5b77..01e79388e 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -1,4 +1,4 @@ -# Tutorial 4: List of all embeddings +# Tutorial 4: List of All Word Embeddings This is not so much a tutorial, but rather a list of all embeddings that we currently support in Flair. We assume that you're familiar with the [base types](/resources/docs/TUTORIAL_1_BASICS.md) of this library as well as [standard word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md), in particular the `StackedEmbeddings` class. 
@@ -12,18 +12,18 @@ The following word embeddings are currently supported: | Class | Type | Paper | | ------------- | ------------- | ------------- | -| BytePairEmbeddings | Subword-level word embeddings | | -| CharacterEmbeddings | Task-trained character-level embeddings of words | | -| ELMoEmbeddings | Contextualized word-level embeddings | | -| FastTextEmbeddings | Word embeddings with subword features | | -| FlairEmbeddings | Contextualized character-level embeddings | | -| WordEmbeddings | Classic word embeddings | | -| BertEmbeddings | Embeddings from pretrained BERT | | -| OpenAIGPTEmbeddings and OpenAIGPT2Embeddings | Embeddings from pretrained OpenAIGPT models | | -| TransformerXLEmbeddings | Embeddings from pretrained transformer-XL | | -| XLNetEmbeddings | Embeddings from pretrained XLNet | | -| XLMEmbeddings | Embeddings from pretrained XLM | | -| RoBERTaEmbeddings | Embeddings from RoBERTa | | +| `BertEmbeddings` | Embeddings from pretrained BERT | | +| `BytePairEmbeddings` | Subword-level word embeddings | | +| `CharacterEmbeddings` | Task-trained character-level embeddings of words | | +| `ELMoEmbeddings` | Contextualized word-level embeddings | | +| `FastTextEmbeddings` | Word embeddings with subword features | | +| `FlairEmbeddings` | Contextualized character-level embeddings | | +| `OpenAIGPTEmbeddings` and `OpenAIGPT2Embeddings` | Embeddings from pretrained OpenAIGPT models | | +| `RoBERTaEmbeddings` | Embeddings from RoBERTa | | +| `TransformerXLEmbeddings` | Embeddings from pretrained transformer-XL | | +| `WordEmbeddings` | Classic word embeddings | | +| `XLNetEmbeddings` | Embeddings from pretrained XLNet | | +| `XLMEmbeddings` | Embeddings from pretrained XLM | | ## Combining BERT and Flair From bb867950882dfa279de484d7f2868bd4ceaf69a3 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:39:54 +0200 Subject: [PATCH 05/23] Update TUTORIAL_3_WORD_EMBEDDING.md --- resources/docs/TUTORIAL_3_WORD_EMBEDDING.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md index 938cc273d..7d70fa555 100644 --- a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md +++ b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md @@ -9,12 +9,11 @@ library. All word embedding classes inherit from the `TokenEmbeddings` class and implement the `embed()` method which you need to call to embed your text. This means that for most users of Flair, the complexity of different embeddings remains -hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text. +hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text. All embeddings produced with our methods are PyTorch vectors, so they can be immediately used for training and +fine-tuning. -This tutorial introduces some common embeddings. For an overview of all supported embeddings, check here. +This tutorial introduces some common embeddings and shows you how to use them. For more details on these embeddings and an overview of all supported embeddings, check [here](/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md). -All embeddings produced with our methods are PyTorch vectors, so they can be immediately used for training and -fine-tuning. 
## Classic Word Embeddings From 14a4e4e445c57a2f4a725a2cc44253b80c689330 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:41:13 +0200 Subject: [PATCH 06/23] Update TUTORIAL_3_WORD_EMBEDDING.md --- resources/docs/TUTORIAL_3_WORD_EMBEDDING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md index 7d70fa555..9bedc4e36 100644 --- a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md +++ b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md @@ -45,7 +45,7 @@ for token in sentence: print(token.embedding) ``` -This prints out the tokens and their embeddings. GloVe embeddings are Pytorch vectors of dimensionality 100. +This prints out the tokens and their embeddings. GloVe embeddings are PyTorch vectors of dimensionality 100. You choose which pre-trained embeddings you load by passing the appropriate id string to the constructor of the `WordEmbeddings` class. Typically, you use From 5cdb765e92440447820c57461aec496dec51d5be Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:50:13 +0200 Subject: [PATCH 07/23] Update TUTORIAL_3_WORD_EMBEDDING.md --- resources/docs/TUTORIAL_3_WORD_EMBEDDING.md | 58 ++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md index 9bedc4e36..b4bf2018c 100644 --- a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md +++ b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md @@ -63,16 +63,44 @@ We generally recommend the FastText embeddings, or GloVe if you want a smaller m ## Flair Embeddings +Contextual string embeddings are [powerful embeddings](https://www.aclweb.org/anthology/C18-1139/) + that capture latent syntactic-semantic information that goes beyond +standard word embeddings. Key differences are: (1) they are trained without any explicit notion of words and +thus fundamentally model words as sequences of characters. And (2) they are **contextualized** by their +surrounding text, meaning that the *same word will have different embeddings depending on its +contextual use*. + +With Flair, you can use these embeddings simply by instantiating the appropriate embedding class, same as standard word embeddings: + +```python +from flair.embeddings import FlairEmbeddings + +# init embedding +flair_embedding_forward = FlairEmbeddings('news-forward') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +flair_embedding_forward.embed(sentence) +``` + +You choose which embeddings you load by passing the appropriate string to the constructor of the `FlairEmbeddings` class. For all supported languages, there is a forward and a backward model. You can load a model for a language by using the **two-letter language code** followed by a hyphen and either **forward** or **backward**. So, if you want to load the forward and backward Flair models for German, do it like this: + +```python +# init forward embedding for German +flair_embedding_forward = FlairEmbeddings('de-forward') +flair_embedding_backward = FlairEmbeddings('de-backward') +``` ## Stacked Embeddings Stacked embeddings are one of the most important concepts of this library. You can use them to combine different embeddings together, for instance if you want to use both traditional embeddings together with contextual string -embeddings (see next chapter). -Stacked embeddings allow you to mix and match. 
We find that a combination of embeddings often gives best results. +embeddings. Stacked embeddings allow you to mix and match. We find that a combination of embeddings often gives best results. All you need to do is use the `StackedEmbeddings` class and instantiate it by passing a list of embeddings that you wish -to combine. For instance, lets combine classic GloVe embeddings with character embeddings. This is effectively the architecture proposed in (Lample et al., 2016). +to combine. For instance, lets combine classic GloVe embeddings with forward and backward Flair embeddings. This is a combination that we generally recommend to most users, especially for sequence labeling. First, instantiate the two embeddings you wish to combine: @@ -82,20 +110,25 @@ from flair.embeddings import WordEmbeddings, CharacterEmbeddings # init standard GloVe embedding glove_embedding = WordEmbeddings('glove') -# init standard character embeddings -character_embeddings = CharacterEmbeddings() +# init Flair forward and backwards embeddings +flair_embedding_forward = FlairEmbeddings('news-forward') +flair_embedding_backward = FlairEmbeddings('news-backward') ``` Now instantiate the `StackedEmbeddings` class and pass it a list containing these two embeddings. ```python -from flair.embeddings import StackedEmbeddings - -# now create the StackedEmbedding object that combines all embeddings -stacked_embeddings = StackedEmbeddings( - embeddings=[glove_embedding, character_embeddings]) +from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings + +# create a StackedEmbedding object that combines glove and forward/backward flair embeddings +stacked_embeddings = StackedEmbeddings([ + glove_embedding, + flair_embedding_forward, + flair_embedding_backward, + ]) ``` + That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences. ```python @@ -110,11 +143,12 @@ for token in sentence: print(token.embedding) ``` -Words are now embedded using a concatenation of two different embeddings. This means that the resulting embedding +Words are now embedded using a concatenation of three different embeddings. This means that the resulting embedding vector is still a single PyTorch vector. ## Next -You can now either look into [BERT, ELMo, and Flair embeddings](/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md), +To get more details on this embeddings and a full overview of all embeddings that we support, you can look into this +[tutorial](/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md), or go directly to the tutorial about [loading your corpus](/resources/docs/TUTORIAL_6_CORPUS.md), which is a pre-requirement for [training your own models](/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md). 
From feaf08309cb59fdc6372fa1cff2ca164fde1edad Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:51:53 +0200 Subject: [PATCH 08/23] Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md --- resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index 01e79388e..8f01c9ff6 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -12,7 +12,7 @@ The following word embeddings are currently supported: | Class | Type | Paper | | ------------- | ------------- | ------------- | -| `BertEmbeddings` | Embeddings from pretrained BERT | | +| [`BertEmbeddings`](/resources/docs/embeddings/BERT_EMBEDDINGS.md) | Embeddings from pretrained BERT | | | `BytePairEmbeddings` | Subword-level word embeddings | | | `CharacterEmbeddings` | Task-trained character-level embeddings of words | | | `ELMoEmbeddings` | Contextualized word-level embeddings | | From 127526fef6b3753a5f8eba845ff5bbe31ae56363 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:52:42 +0200 Subject: [PATCH 09/23] Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md --- resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index 8f01c9ff6..f97616e63 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -12,7 +12,7 @@ The following word embeddings are currently supported: | Class | Type | Paper | | ------------- | ------------- | ------------- | -| [`BertEmbeddings`](/resources/docs/embeddings/BERT_EMBEDDINGS.md) | Embeddings from pretrained BERT | | +| [`BertEmbeddings`](embeddings/BERT_EMBEDDINGS.md) | Embeddings from pretrained BERT | | | `BytePairEmbeddings` | Subword-level word embeddings | | | `CharacterEmbeddings` | Task-trained character-level embeddings of words | | | `ELMoEmbeddings` | Contextualized word-level embeddings | | From a3320c3a9391b7a733d54e94bbdbc79d8b3ea2a0 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 17:57:05 +0200 Subject: [PATCH 10/23] Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md --- .../TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index f97616e63..592451ed5 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -12,18 +12,19 @@ The following word embeddings are currently supported: | Class | Type | Paper | | ------------- | ------------- | ------------- | -| [`BertEmbeddings`](embeddings/BERT_EMBEDDINGS.md) | Embeddings from pretrained BERT | | -| `BytePairEmbeddings` | Subword-level word embeddings | | -| `CharacterEmbeddings` | Task-trained character-level embeddings of words | | -| `ELMoEmbeddings` | Contextualized word-level embeddings | | -| `FastTextEmbeddings` | Word embeddings with subword features | | -| `FlairEmbeddings` | Contextualized character-level embeddings | | -| `OpenAIGPTEmbeddings` and `OpenAIGPT2Embeddings` | Embeddings from pretrained OpenAIGPT models | | -| `RoBERTaEmbeddings` | 
Embeddings from RoBERTa | | -| `TransformerXLEmbeddings` | Embeddings from pretrained transformer-XL | | -| `WordEmbeddings` | Classic word embeddings | | -| `XLNetEmbeddings` | Embeddings from pretrained XLNet | | -| `XLMEmbeddings` | Embeddings from pretrained XLM | | +| [`BertEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained BERT | | +| [`BytePairEmbeddings`](/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md) | Subword-level word embeddings | | +| [`CharacterEmbeddings`](/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md) | Task-trained character-level embeddings of words | | +| [`ELMoEmbeddings`](/resources/docs/embeddings/ELMO_EMBEDDINGS.md) | Contextualized word-level embeddings | | +| [`FastTextEmbeddings`](/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md) | Word embeddings with subword features | | +| [`FlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Contextualized character-level embeddings | | +| [`PooledFlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Pooled variant of `FlairEmbeddings` | | +| [`OpenAIGPTEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) and [`OpenAIGPT2Embeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained OpenAIGPT models | | +| [`RoBERTaEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from RoBERTa | | +| [`TransformerXLEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained transformer-XL | | +| [`WordEmbeddings`](/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md) | Classic word embeddings | | +| [`XLNetEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained XLNet | | +| [`XLMEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained XLM | | ## Combining BERT and Flair From 65ba3825795e5a5f9f295b43ff4fcaf4fd8ded0e Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 18:03:35 +0200 Subject: [PATCH 11/23] Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md --- .../TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index 592451ed5..daf1c64a2 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -12,13 +12,13 @@ The following word embeddings are currently supported: | Class | Type | Paper | | ------------- | ------------- | ------------- | -| [`BertEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained BERT | | -| [`BytePairEmbeddings`](/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md) | Subword-level word embeddings | | -| [`CharacterEmbeddings`](/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md) | Task-trained character-level embeddings of words | | -| [`ELMoEmbeddings`](/resources/docs/embeddings/ELMO_EMBEDDINGS.md) | Contextualized word-level embeddings | | -| [`FastTextEmbeddings`](/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md) | Word embeddings with subword features | | -| [`FlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Contextualized character-level embeddings | | -| [`PooledFlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Pooled variant of `FlairEmbeddings` | | +| 
[`BertEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained BERT | [Devlin et. al, 2018](https://www.aclweb.org/anthology/N19-1423/) | +| [`BytePairEmbeddings`](/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md) | Subword-level word embeddings | [Heinzerling and Strube, 2018](https://www.aclweb.org/anthology/L18-1473) | +| [`CharacterEmbeddings`](/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md) | Task-trained character-level embeddings of words | [Lample et al., 2016](https://www.aclweb.org/anthology/N16-1030) | +| [`ELMoEmbeddings`](/resources/docs/embeddings/ELMO_EMBEDDINGS.md) | Contextualized word-level embeddings | [Peters et al., 2018](https://aclweb.org/anthology/N18-1202) | +| [`FastTextEmbeddings`](/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md) | Word embeddings with subword features | [Bojanowski et al., 2017](https://aclweb.org/anthology/Q17-1010) | +| [`FlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Contextualized character-level embeddings | [Akbik et. al, 2018](https://www.aclweb.org/anthology/C18-1139/) | +| [`PooledFlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Pooled variant of `FlairEmbeddings` | [Akbik et. al, 2019](https://www.aclweb.org/anthology/N19-1078/) | | [`OpenAIGPTEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) and [`OpenAIGPT2Embeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained OpenAIGPT models | | | [`RoBERTaEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from RoBERTa | | | [`TransformerXLEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained transformer-XL | | @@ -31,9 +31,7 @@ The following word embeddings are currently supported: You can very easily mix and match Flair, ELMo, BERT and classic word embeddings. All you need to do is instantiate each embedding you wish to combine and use them in a StackedEmbedding. -For instance, let's say we want to combine the multilingual Flair and BERT embeddings to train a hyper-powerful multilingual downstream task model. - -First, instantiate the embeddings you wish to combine: +For instance, let's say we want to combine the multilingual Flair and BERT embeddings to train a hyper-powerful multilingual downstream task model. First, instantiate the embeddings you wish to combine: ```python from flair.embeddings import FlairEmbeddings, BertEmbeddings From e40a7ea3a1a3a1fb9119840a23af9414be59d3e6 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 18:07:55 +0200 Subject: [PATCH 12/23] Update TUTORIAL_6_CORPUS.md --- resources/docs/TUTORIAL_6_CORPUS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index 9b250fc35..c1fcc7e72 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -227,11 +227,11 @@ data the first time you call the corresponding constructor ID. The following dat | 'WASSA_SADNESS' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (sadness) | -So to load the 20 newsgroups corpus for text classification, simply do: +So to load the IMDB corpus for sentiment text classification, simply do: ```python import flair.datasets -corpus = flair.datasets.NEWSGROUPS() +corpus = flair.datasets.IMDB() ``` This downloads and sets up everything you need to train your model. 
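+
+As a quick sanity check, you can inspect what was downloaded. This is a minimal sketch that assumes only the standard `Corpus` splits (`train`, `dev`, `test`) used throughout this tutorial:
+
+```python
+import flair.datasets
+
+# load the corpus as above
+corpus = flair.datasets.IMDB()
+
+# print the number of sentences in each split
+print(len(corpus.train))
+print(len(corpus.dev))
+print(len(corpus.test))
+```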
From b177a69d089cc6b5b643e8ff483bb573d8a6356d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 18:09:07 +0200 Subject: [PATCH 13/23] Update TUTORIAL_6_CORPUS.md --- resources/docs/TUTORIAL_6_CORPUS.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index c1fcc7e72..172e7c65f 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -339,10 +339,7 @@ corpus: Corpus = CSVClassificationCorpus(data_folder, #### FastText Format -If using `CSVClassificationCorpus` is not practical, you may format your data to the -FastText format, in which each line in the file represents a -text document. A document can have one or multiple labels that are defined at the beginning of the line starting with -the prefix `__label__`. This looks like this: +If using `CSVClassificationCorpus` is not practical, you may format your data to the FastText format, in which each line in the file represents a text document. A document can have one or multiple labels that are defined at the beginning of the line starting with the prefix `__label__`. This looks like this: ```bash __label__ From 7d9b0e28bb962d41bf9751eb7a9a3337fadb63e8 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 18:19:12 +0200 Subject: [PATCH 14/23] Update EXPERIMENTS.md --- resources/docs/EXPERIMENTS.md | 36 ++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/resources/docs/EXPERIMENTS.md b/resources/docs/EXPERIMENTS.md index 6d24cbeb4..5606f6ada 100644 --- a/resources/docs/EXPERIMENTS.md +++ b/resources/docs/EXPERIMENTS.md @@ -26,11 +26,11 @@ resources/tasks/conll_03/eng.testb resources/tasks/conll_03/eng.train ``` -This allows the `NLPTaskDataFetcher` class to read the data into our data structures. Use the `NLPTask` enum to select -the dataset, as follows: +This allows the `CONLL_03()` corpus object to read the data into our data structures. Initialize the corpus as follows: ```python -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks') +from flair.datasets import CONLL_03 +corpus: Corpus = CONLL_03(base_path='resources/tasks') ``` This gives you a `Corpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use. @@ -41,12 +41,12 @@ The full code to get a state-of-the-art model for English NER is as follows: ```python from flair.data import Corpus -from flair.data_fetcher import NLPTaskDataFetcher, NLPTask +from flair.datasets import CONLL_03 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings from typing import List # 1. get the corpus -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks') +corpus: Corpus = CONLL_03(base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'ner' @@ -109,12 +109,12 @@ FastText word embeddings and German contextual string embeddings. The full code ```python from flair.data import Corpus -from flair.data_fetcher import NLPTaskDataFetcher, NLPTask +from flair.datasets import CONLL_03_GERMAN from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings from typing import List # 1. 
get the corpus -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks') +corpus: Corpus = CONLL_03_GERMAN(base_path='resources/tasks') # 2. what tag do we want to predict? tag_type = 'ner' @@ -164,12 +164,12 @@ FastText word embeddings and German contextual string embeddings. The full code ```python from flair.data import Corpus -from flair.data_fetcher import NLPTaskDataFetcher, NLPTask +from flair.datasets import CONLL_03_DUTCH from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings from typing import List # 1. get the corpus -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks') +corpus: Corpus = CONLL_03_DUTCH() # 2. what tag do we want to predict? tag_type = 'ner' @@ -215,16 +215,16 @@ Data is included in Flair and will get automatically downloaded when you run the #### Best Known Configuration Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with -FastText word embeddings and German contextual string embeddings. The full code then is as follows: +FastText word embeddings for twitter and crawls. The full code then is as follows: ```python from flair.data import Corpus -from flair.data_fetcher import NLPTaskDataFetcher, NLPTask +from flair.datasets import WNUT_17 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks') +corpus: Corpus = WNUT_17() # 2. what tag do we want to predict? tag_type = 'ner' @@ -283,16 +283,18 @@ resources/tasks/onto-ner/eng.train #### Best Known Configuration Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with -FastText embeddings (they work better on this dataset). The full code then is as follows: +FastText embeddings (they work better on this dataset). You also need to provide a `column_format` for the `ColumnCorpus` object indicating which column in the training file is the 'ner' information. The full code then is as follows: ```python from flair.data import Corpus -from flair.data_fetcher import NLPTaskDataFetcher, NLPTask +from flair.datasets import ColumnCorpus from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks') +corpus: Corpus = flair.datasets.ColumnCorpus('resources/tasks/onto-ner', + column_format={0: 'text', 1: 'pos', 2: 'upos', 3: 'ner'}, + tag_to_bioes='ner') # 2. what tag do we want to predict? 
tag_type = 'ner' @@ -324,8 +326,8 @@ trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-ner', learning_rate=0.1, - # it's a big dataset so maybe set embeddings_in_memory to False - embeddings_in_memory=False) + # it's a big dataset so maybe set embeddings_storage_mode to 'none' (embeddings are not kept in memory) + embeddings_storage_mode='none') ``` From 604a85a39bbea77ac57c873a96d60b3e9d420b05 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 18:30:47 +0200 Subject: [PATCH 15/23] Update EXPERIMENTS.md --- resources/docs/EXPERIMENTS.md | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/resources/docs/EXPERIMENTS.md b/resources/docs/EXPERIMENTS.md index 5606f6ada..3cf7a3359 100644 --- a/resources/docs/EXPERIMENTS.md +++ b/resources/docs/EXPERIMENTS.md @@ -342,12 +342,12 @@ trainer.train('resources/taggers/example-ner', Get the [Penn treebank](https://catalog.ldc.upenn.edu/ldc99t42) and follow the guidelines in [Collins (2002)](http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf) to produce train, dev and test splits. -Convert splits into CoNLLU-U format and place train, test and dev data in `resources/tasks/penn/` as follows: +Convert splits into CoNLLU-U format and place train, test and dev data in `/path/to/penn/` as follows: ``` -resources/tasks/penn/test.conll -resources/tasks/penn/train.conll -resources/tasks/penn/valid.conll +/path/to/penn/test.conll +/path/to/penn/train.conll +/path/to/penn/valid.conll ``` Then, run the experiments with extvec embeddings and contextual string embeddings. Also, select 'pos' as `tag_type`, @@ -357,12 +357,12 @@ so the algorithm knows that POS tags and not NER are to be predicted from this d ```python from flair.data import Corpus -from flair.data_fetcher import NLPTaskDataFetcher, NLPTask +from flair.datasets import UniversalDependenciesCorpus from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks') +corpus: Corpus = UniversalDependenciesCorpus(base_path='/path/to/penn') # 2. what tag do we want to predict? tag_type = 'pos' @@ -391,10 +391,8 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) -trainer.train('resources/taggers/example-ner', - max_epochs=150, - # its a big dataset, so maybe set embeddings_in_memory=False - embeddings_in_memory=True) +trainer.train('resources/taggers/example-pos', + max_epochs=150) ``` ## CoNLL-2000 Noun Phrase Chunking (English) @@ -413,12 +411,12 @@ so the algorithm knows that chunking tags and not NER are to be predicted from t ```python from flair.data import Corpus -from flair.data_fetcher import NLPTaskDataFetcher, NLPTask +from flair.datasets import CONLL_2000 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings from typing import List # 1. get the corpus -corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000) +corpus: Corpus = CONLL_2000() # 2. what tag do we want to predict? 
tag_type = 'np' @@ -448,6 +446,6 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) -trainer.train('resources/taggers/example-ner', +trainer.train('resources/taggers/example-chunk', max_epochs=150) ``` From ab7399b95b629f6bfa89805d97ecd0930c5b69d6 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 18:46:27 +0200 Subject: [PATCH 16/23] Update EXPERIMENTS.md --- resources/docs/EXPERIMENTS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/resources/docs/EXPERIMENTS.md b/resources/docs/EXPERIMENTS.md index 3cf7a3359..99798e82b 100644 --- a/resources/docs/EXPERIMENTS.md +++ b/resources/docs/EXPERIMENTS.md @@ -145,6 +145,7 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-ner', + train_with_dev=True, max_epochs=150) ``` @@ -200,6 +201,7 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-ner', + train_with_dev=True, max_epochs=150) ``` @@ -256,6 +258,7 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-ner', + train_with_dev=True, max_epochs=150) ``` @@ -326,6 +329,7 @@ trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-ner', learning_rate=0.1, + train_with_dev=True, # it's a big dataset so maybe set embeddings_storage_mode to 'none' (embeddings are not kept in memory) embeddings_storage_mode='none') ``` @@ -392,6 +396,7 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-pos', + train_with_dev=True, max_epochs=150) ``` @@ -447,5 +452,6 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-chunk', + train_with_dev=True, max_epochs=150) ``` From 3f4c55cccc52fd91d3387c75af628fa35c9b03ef Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 23 Aug 2019 18:46:51 +0200 Subject: [PATCH 17/23] Update EXPERIMENTS.md --- resources/docs/EXPERIMENTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/docs/EXPERIMENTS.md b/resources/docs/EXPERIMENTS.md index 99798e82b..bebc8d838 100644 --- a/resources/docs/EXPERIMENTS.md +++ b/resources/docs/EXPERIMENTS.md @@ -83,6 +83,7 @@ from flair.trainers import ModelTrainer trainer: ModelTrainer = ModelTrainer(tagger, corpus) trainer.train('resources/taggers/example-ner', + train_with_dev=True, max_epochs=150) ``` From 880452913d370d3bc51498b935c564e58491ed2e Mon Sep 17 00:00:00 2001 From: aakbik Date: Fri, 23 Aug 2019 19:06:46 +0200 Subject: [PATCH 18/23] Documentation of data loaders --- flair/datasets.py | 423 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 365 insertions(+), 58 deletions(-) diff --git a/flair/datasets.py b/flair/datasets.py index 077535d85..0c8895168 100644 --- a/flair/datasets.py +++ b/flair/datasets.py @@ -896,8 +896,22 @@ def __getitem__(self, index: int = 0) -> Sentence: class CONLL_03(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, ): + """ + Initialize the CoNLL-03 corpus. This is only possible if you've manually downloaded it to your machine. 
+ Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put it into some folder. Then point + the base_path parameter in the constructor to this folder + :param base_path: Path to the CoNLL-03 corpus on your machine + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict + POS tags or chunks respectively + :param in_memory: If True, keeps dataset in memory giving speedups in training. + """ + if type(base_path) == str: + base_path: Path = Path(base_path) # column format columns = {0: "text", 1: "pos", 2: "np", 3: "ner"} @@ -926,8 +940,22 @@ def __init__( class CONLL_03_GERMAN(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, ): + """ + Initialize the CoNLL-03 corpus for German. This is only possible if you've manually downloaded it to your machine. + Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put it into some folder. Then point + the base_path parameter in the constructor to this folder + :param base_path: Path to the CoNLL-03 corpus on your machine + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'lemma', 'pos' or 'np' to predict + word lemmas, POS tags or chunks respectively + :param in_memory: If True, keeps dataset in memory giving speedups in training. + """ + if type(base_path) == str: + base_path: Path = Path(base_path) # column format columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"} @@ -956,8 +984,23 @@ def __init__( class CONLL_03_DUTCH(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, ): + """ + Initialize the CoNLL-03 corpus for Dutch. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -982,8 +1025,22 @@ def __init__( class CONLL_03_SPANISH(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, ): + """ + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, should not be changed + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "ner"} @@ -1008,8 +1065,21 @@ def __init__( class CONLL_2000(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "np", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "np", + in_memory: bool = True, ): + """ + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags + :param in_memory: If True, keeps dataset in memory giving speedups in training. + """ + if type(base_path) == str: + base_path: Path = Path(base_path) # column format columns = {0: "text", 1: "pos", 2: "np"} @@ -1059,8 +1129,21 @@ def __init__( class GERMEVAL(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, ): + """ + Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/home/ and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + if type(base_path) == str: + base_path: Path = Path(base_path) # column format columns = {1: "text", 2: "ner"} @@ -1091,7 +1174,10 @@ def __init__( class IMDB(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1146,7 +1232,11 @@ def __init__(self, base_path=None, in_memory: bool = False): class NEWSGROUPS(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1229,8 +1319,13 @@ def __init__(self, base_path=None, in_memory: bool = False): class NER_BASQUE(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, ): + if type(base_path) == str: + base_path: Path = Path(base_path) # column format columns = {0: "text", 1: "ner"} @@ -1271,7 +1366,11 @@ def __init__( class TREC_50(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1325,7 +1424,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class TREC_6(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1379,7 +1482,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_ENGLISH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1402,7 +1509,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_GERMAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1423,7 +1534,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_GERMAN_HDT(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1465,7 +1580,11 @@ def __init__(self, base_path=None, in_memory: bool = False): class UD_DUTCH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = 
True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1490,7 +1609,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_FRENCH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1510,7 +1633,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_ITALIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1532,7 +1659,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_SPANISH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1548,11 +1679,15 @@ def __init__(self, base_path=None, in_memory: bool = True): cached_path( f"{ud_path}/es_gsd-ud-train.conllu", Path("datasets") / dataset_name ) - super(UD_SPANISH, self).__init__(data_folder) + super(UD_SPANISH, self).__init__(data_folder, in_memory=in_memory) class UD_PORTUGUESE(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1576,7 +1711,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_ROMANIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1596,7 +1735,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_CATALAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1620,7 +1763,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_POLISH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1641,7 +1788,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_CZECH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + + if type(base_path) == str: + base_path: Path = 
Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1690,7 +1841,11 @@ def __init__(self, base_path=None, in_memory: bool = False): class UD_SLOVAK(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1711,7 +1866,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_SWEDISH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1736,7 +1895,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_DANISH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1757,7 +1920,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_NORWEGIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1782,7 +1949,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_FINNISH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1803,7 +1974,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_SLOVENIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1824,7 +1999,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_CROATIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1845,7 +2024,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_SERBIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1866,7 +2049,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_BULGARIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool 
= True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1887,7 +2074,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_ARABIC(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1909,7 +2100,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_HEBREW(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1929,7 +2124,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_TURKISH(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1952,7 +2151,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_PERSIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,7 +2180,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_RUSSIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2002,7 +2209,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_HINDI(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2025,7 +2236,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_INDONESIAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2046,7 +2261,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_JAPANESE(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2067,7 +2286,11 @@ def __init__(self, 
base_path=None, in_memory: bool = True): class UD_CHINESE(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2088,7 +2311,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_KOREAN(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2113,7 +2340,11 @@ def __init__(self, base_path=None, in_memory: bool = True): class UD_BASQUE(UniversalDependenciesCorpus): - def __init__(self, base_path=None, in_memory: bool = True): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2160,7 +2391,11 @@ def _download_wassa_if_not_there(emotion, data_folder, dataset_name): class WASSA_ANGER(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2178,7 +2413,11 @@ def __init__(self, base_path=None, in_memory: bool = False): class WASSA_FEAR(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2196,7 +2435,11 @@ def __init__(self, base_path=None, in_memory: bool = False): class WASSA_JOY(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2214,7 +2457,11 @@ def __init__(self, base_path=None, in_memory: bool = False): class WASSA_SADNESS(ClassificationCorpus): - def __init__(self, base_path=None, in_memory: bool = False): + def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False): + + if type(base_path) == str: + base_path: Path = Path(base_path) + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2275,8 +2522,14 @@ def _download_wikiner(language_code: str, dataset_name: str): class WIKINER_ENGLISH(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2298,8 +2551,14 @@ def __init__( class WIKINER_GERMAN(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == 
str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2321,8 +2580,14 @@ def __init__( class WIKINER_DUTCH(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2344,8 +2609,14 @@ def __init__( class WIKINER_FRENCH(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2367,8 +2638,14 @@ def __init__( class WIKINER_ITALIAN(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2390,8 +2667,14 @@ def __init__( class WIKINER_SPANISH(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2413,8 +2696,14 @@ def __init__( class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2436,8 +2725,14 @@ def __init__( class WIKINER_POLISH(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2459,8 +2754,14 @@ def __init__( class WIKINER_RUSSIAN(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = False + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "pos", 2: "ner"} @@ -2482,8 +2783,14 @@ def __init__( class WNUT_17(ColumnCorpus): def __init__( - self, base_path=None, tag_to_bioes: str = "ner", in_memory: bool = True + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, ): + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format columns = {0: "text", 1: "ner"} From 81be6d83b06043e4a9a7893446185688ec80f015 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 26 Aug 2019 00:08:23 +0200 Subject: [PATCH 19/23] embeddings: add support for large ELMo model (trained on 5.5B tokens) --- flair/embeddings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flair/embeddings.py 
b/flair/embeddings.py index 3b00bd2e0..b4b549fea 100644 --- a/flair/embeddings.py +++ b/flair/embeddings.py @@ -729,6 +729,9 @@ def __init__( if model == "medium": options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" + if model in ["large", "5.5B"]: + options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json" + weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5" if model == "pt" or model == "portuguese": options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_options.json" weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_weights.hdf5" From a3ccf178e6bf4714940d6684d9edab159b39867e Mon Sep 17 00:00:00 2001 From: aakbik Date: Mon, 26 Aug 2019 12:08:37 +0200 Subject: [PATCH 20/23] Added comments to datasets --- flair/datasets.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/flair/datasets.py b/flair/datasets.py index 0c8895168..d62c4a93f 100644 --- a/flair/datasets.py +++ b/flair/datasets.py @@ -38,6 +38,8 @@ def __init__( :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train :param tag_to_bioes: whether to convert to BIOES tagging scheme + :param comment_symbol: if set, lines that begin with this symbol are treated as comments + :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads :return: a Corpus with annotated train, dev and test data """ @@ -140,6 +142,7 @@ def __init__( :param train_file: the name of the train file :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train + :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads :return: a Corpus with annotated train, dev and test data """ if type(data_folder) == str: @@ -198,6 +201,10 @@ def __init__( :param train_file: the name of the train file :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train + :param use_tokenizer: If True, tokenizes the dataset, otherwise uses whitespace tokenization + :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens + :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars + :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings :return: a Corpus with annotated train, dev and test data """ @@ -285,9 +292,14 @@ def __init__( Instantiates a Corpus for text classification from CSV column formatted data :param data_folder: base folder with the task data + :param column_name_map: a column name map that indicates which column is text and which the label(s) :param train_file: the name of the train file :param test_file: the name of the test file :param dev_file: the name of the dev file, if None, dev data is sampled from train + :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens + :param 
max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars + :param use_tokenizer: If True, tokenizes the dataset, otherwise uses whitespace tokenization + :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings :param fmtparams: additional parameters for the CSV file reader :return: a Corpus with annotated train, dev and test data """ @@ -375,7 +387,15 @@ def __init__( class SentenceDataset(FlairDataset): + """ + A simple Dataset object to wrap a List of Sentence + """ + def __init__(self, sentences: Union[Sentence, List[Sentence]]): + """ + Instantiate SentenceDataset + :param sentences: Sentence or List of Sentence that make up SentenceDataset + """ # cast to list if necessary if type(sentences) == Sentence: sentences = [sentences] @@ -401,6 +421,15 @@ def __init__( comment_symbol: str = None, in_memory: bool = True, ): + """ + Instantiates a column dataset (typically used for sequence labeling or word-level prediction). + + :param path_to_column_file: path to the file with the column-formatted data + :param column_name_map: a map specifying the column format + :param tag_to_bioes: whether to convert to BIOES tagging scheme + :param comment_symbol: if set, lines that begin with this symbol are treated as comments + :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads + """ assert path_to_column_file.exists() self.path_to_column_file = path_to_column_file self.tag_to_bioes = tag_to_bioes @@ -533,6 +562,12 @@ def __getitem__(self, index: int = 0) -> Sentence: class UniversalDependenciesDataset(FlairDataset): def __init__(self, path_to_conll_file: Path, in_memory: bool = True): + """ + Instantiates a column dataset in CoNLL-U format. + + :param path_to_conll_file: Path to the CoNLL-U formatted file + :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads + """ assert path_to_conll_file.exists() self.in_memory = in_memory @@ -664,6 +699,20 @@ def __init__( skip_header: bool = False, **fmtparams, ): + """ + Instantiates a Dataset for text classification from CSV column formatted data + + :param path_to_file: path to the file with the CSV data + :param column_name_map: a column name map that indicates which column is text and which the label(s) + :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens + :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars + :param use_tokenizer: If True, tokenizes the dataset, otherwise uses whitespace tokenization + :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings + :param skip_header: If True, skips first line because it is header + :param fmtparams: additional parameters for the CSV file reader + :return: a Corpus with annotated train, dev and test data + """ + if type(path_to_file) == str: path_to_file: Path = Path(path_to_file) @@ -794,6 +843,10 @@ def __init__( __label__ __label__ :param path_to_file: the path to the data file :param max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is. 
+ :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens + :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars + :param use_tokenizer: If True, tokenizes the dataset, otherwise uses whitespace tokenization + :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings :return: list of sentences """ if type(path_to_file) == str: From fb5f6ed7c36e8e48279014e1029ec3cb1e544329 Mon Sep 17 00:00:00 2001 From: aakbik Date: Mon, 26 Aug 2019 13:42:48 +0200 Subject: [PATCH 21/23] documentation --- .../docs/embeddings/BYTE_PAIR_EMBEDDINGS.md | 18 ++++++--- .../docs/embeddings/CHARACTER_EMBEDDINGS.md | 38 ++++++++++++------- .../embeddings/CLASSIC_WORD_EMBEDDINGS.md | 2 +- resources/docs/embeddings/ELMO_EMBEDDINGS.md | 2 +- .../docs/embeddings/FASTTEXT_EMBEDDINGS.md | 32 +++++++++++++++- resources/docs/embeddings/FLAIR_EMBEDDINGS.md | 3 +- .../docs/embeddings/TRANSFOMER_EMBEDDINGS.md | 7 ++-- 7 files changed, 73 insertions(+), 29 deletions(-) diff --git a/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md b/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md index 83097ebe1..3a21cf595 100644 --- a/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md +++ b/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md @@ -1,8 +1,9 @@ -## New: Byte Pair Embeddings +# Byte Pair Embeddings -We now also include the byte pair embeddings calulated by @bheinzerling that segment words into subsequences. -This can dramatically reduce the model size vis-a-vis using normal word embeddings at nearly the same accuracy. -So, if you want to train small models try out the new `BytePairEmbeddings` class. +`BytePairEmbeddings` are word embeddings that are precomputed on the subword-level. This means that they are able to +embed any word by splitting words into subwords and looking up their embeddings. BytePairEmbeddings were proposed +and computed by @bheinzerling who found that they offer nearly the same accuracy as word embeddings, but at a fraction +of the model size. So they are a great choice if you want to train small models. You initialize with a language code (275 languages supported), a number of 'syllables' (one of ) and a number of dimensions (one of 50, 100, 200 or 300). The following initializes and uses byte pair embeddings @@ -23,4 +24,11 @@ embedding.embed(sentence) More information can be found on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page. -Given its memory advantages, we would be interested to hear from the community how well these embeddings work. \ No newline at end of file + +`BytePairEmbeddings` also have a multilingual model capable of embedding any word in any language. + You can instantiate it with: + +```python +# init embedding +embedding = BytePairEmbeddings('multi') +``` \ No newline at end of file diff --git a/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md b/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md index d498719c5..5e1400abf 100644 --- a/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md +++ b/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md @@ -1,20 +1,30 @@ -## Character Embeddings +# Character Embeddings -Some embeddings - such as character-features - are not pre-trained but rather trained on the downstream task. Normally -this requires you to implement a [hierarchical embedding architecture](http://neuroner.com/NeuroNERengine_with_caption_no_figure.png). +`CharacterEmbeddings` allow you to add character-level word embeddings during model training. 
Note that these embeddings +are randomly initialized when you initialize the class, so they are not meaningful unless you train them on a specific +downstream task. -With Flair, you don't need to worry about such things. Just choose the appropriate -embedding class and character features will then automatically train during downstream task training. +For instance, the standard sequence labeling architecture used by Lample et al. is a combination of classic word embeddings +with task-trained character features. Normally this would require you to implement a +[hierarchical embedding architecture](http://neuroner.com/NeuroNERengine_with_caption_no_figure.png) in which character-level +embeddings for each word are computed using an RNN and then concatenated with word embeddings. + +In Flair, we simplify this by treating `CharacterEmbeddings` just like any other embedding class. To reproduce the +Lample architecture, you need only combine them with standard `WordEmbeddings` in an embedding stack: -```python -from flair.embeddings import CharacterEmbeddings -# init embedding -embedding = CharacterEmbeddings() +```python +# init embedding stack +embedding = StackedEmbeddings( + [ + # standard word embeddings + WordEmbeddings('glove'), -# create a sentence -sentence = Sentence('The grass is green .') + # character-level features + CharacterEmbeddings(), + ] +) +``` -# embed words in sentence -embedding.embed(sentence) -``` \ No newline at end of file +If you pass this stacked embedding to a train method, the character-level features will now automatically be trained +for your downstream task. \ No newline at end of file diff --git a/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md b/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md index c7836c769..a5c780284 100644 --- a/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md +++ b/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md @@ -3,7 +3,7 @@ Classic word embeddings are static and word-level, meaning that each distinct word gets exactly one pre-computed embedding. Most embeddings fall under this class, including the popular GloVe or Komninos embeddings. -Simply instantiate the WordEmbeddings class and pass a string identifier of the embedding you wish to load. So, if +Simply instantiate the `WordEmbeddings` class and pass a string identifier of the embedding you wish to load. So, if you want to use GloVe embeddings, pass the string 'glove' to the constructor: ```python diff --git a/resources/docs/embeddings/ELMO_EMBEDDINGS.md b/resources/docs/embeddings/ELMO_EMBEDDINGS.md index 155a4063e..6c6448b68 100644 --- a/resources/docs/embeddings/ELMO_EMBEDDINGS.md +++ b/resources/docs/embeddings/ELMO_EMBEDDINGS.md @@ -1,4 +1,4 @@ -## ELMo Embeddings +# ELMo Embeddings [ELMo embeddings](http://www.aclweb.org/anthology/N18-1202) were presented by Peters et al. in 2018. They are using a bidirectional recurrent neural network to predict the next word in a text. diff --git a/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md b/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md index 02b953fdd..dfb86475f 100644 --- a/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md +++ b/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md @@ -1,6 +1,7 @@ -## FastText Embeddings +# FastText Embeddings -FastText Embeddings can give you vectors for out of vocabulary(oov) words by using the sub-word information. 
To use this functionality with Flair, use `FastTextEmbeddings` class as shown: +FastText Embeddings can give you vectors for out of vocabulary(oov) words by using the sub-word information. + To use this functionality with Flair, use `FastTextEmbeddings` class as shown: ```python from flair.embeddings import FastTextEmbeddings @@ -21,3 +22,30 @@ You can initialize the class by passing the remote downloadable URL as well. embedding = FastTextEmbeddings('/path/to/remote/downloadable/custom_fasttext_embeddings.bin', use_local=False) ``` +Note that FastText embeddings typically have huge models resulting in equally huge models for downstream tasks. + +Alternatively, you can use FastText embeddings without the oov functionality by using normal `WordEmbeddings` which +are smaller and get +the oov functionality from the `BytePairEmbeddings` which are tiny. So, instead of using English `FastTextEmbeddings` +with oov handling, you could use this stack: + +```python +from flair.embeddings import WordEmbeddings, BytePairEmbeddings, StackedEmbeddings + +# init embedding +embedding = StackedEmbeddings( + [ + # standard FastText word embeddings for English + WordEmbeddings('en'), + # Byte pair embeddings for English + BytePairEmbeddings('en'), + ] +) + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` + diff --git a/resources/docs/embeddings/FLAIR_EMBEDDINGS.md b/resources/docs/embeddings/FLAIR_EMBEDDINGS.md index bdd65392e..8331d7acb 100644 --- a/resources/docs/embeddings/FLAIR_EMBEDDINGS.md +++ b/resources/docs/embeddings/FLAIR_EMBEDDINGS.md @@ -1,5 +1,4 @@ - -## Flair Embeddings +# Flair Embeddings Contextual string embeddings are [powerful embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing) that capture latent syntactic-semantic information that goes beyond diff --git a/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md b/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md index e3c6fdb6b..0a489b450 100644 --- a/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md +++ b/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md @@ -1,5 +1,4 @@ - -## PyTorch-Transformers +# PyTorch-Transformers Thanks to the brilliant [`pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) library from [Hugging Face](https://github.com/huggingface), Flair is able to support various Transformer-based architectures like BERT or XLNet. @@ -17,7 +16,7 @@ The following embeddings can be used in Flair: This section shows how to use these Transformer-based architectures in Flair and is heavily based on the excellent [PyTorch-Transformers pre-trained models documentation](https://huggingface.co/pytorch-transformers/pretrained_models.html). -### BERT Embeddings +## BERT Embeddings [BERT embeddings](https://arxiv.org/pdf/1810.04805.pdf) were developed by Devlin et al. (2018) and are a different kind of powerful word embedding based on a bidirectional transformer architecture. @@ -324,7 +323,7 @@ We implement different pooling operations for these subwords to generate the fin * `first_last`: embeddings of the first and last subwords are concatenated and used * `mean`: a `torch.mean` over all subword embeddings is calculated and used -### Scalar mix +## Scalar mix The Transformer-based models have a certain number of layers. [Liu et. al (2019)](https://arxiv.org/abs/1903.08855) propose a technique called scalar mix, that computes a parameterised scalar mixture of user-defined layers. 
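The scalar mix operation described in that documentation can be summarised in a few lines of plain PyTorch. The snippet below is a minimal illustrative sketch of the technique itself (softmax-normalised per-layer weights plus a global gamma, following Liu et al., 2019), not Flair's internal implementation; the function and variable names are chosen for illustration only.

```python
import torch

def scalar_mix(layer_tensors, scalar_parameters, gamma):
    """Parameterised weighted sum over per-layer representations."""
    # normalise the learnable per-layer weights so they sum to one
    normed_weights = torch.softmax(scalar_parameters, dim=0)
    # weighted sum over layers, rescaled by a global gamma parameter
    mixed = sum(w * t for w, t in zip(normed_weights, layer_tensors))
    return gamma * mixed

# toy usage: mix three layers of a 5-token, 16-dimensional representation
layers = [torch.randn(5, 16) for _ in range(3)]
weights = torch.nn.Parameter(torch.zeros(3))   # learnable, trained with the model
gamma = torch.nn.Parameter(torch.ones(1))      # learnable scale
mixed = scalar_mix(layers, weights, gamma)     # shape: (5, 16)
```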
From cd5d58b1e8a93ae17e63f857e068435f30a37231 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Mon, 26 Aug 2019 13:44:26 +0200 Subject: [PATCH 22/23] Update BYTE_PAIR_EMBEDDINGS.md --- resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md b/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md index 3a21cf595..f8ebc6306 100644 --- a/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md +++ b/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md @@ -1,7 +1,7 @@ # Byte Pair Embeddings `BytePairEmbeddings` are word embeddings that are precomputed on the subword-level. This means that they are able to -embed any word by splitting words into subwords and looking up their embeddings. BytePairEmbeddings were proposed +embed any word by splitting words into subwords and looking up their embeddings. `BytePairEmbeddings` were proposed and computed by @bheinzerling who found that they offer nearly the same accuracy as word embeddings, but at a fraction of the model size. So they are a great choice if you want to train small models. @@ -31,4 +31,4 @@ on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page. ```python # init embedding embedding = BytePairEmbeddings('multi') -``` \ No newline at end of file +``` From 7f7823fe5e0c409adf7316d2e5aa7a0b99abfcd4 Mon Sep 17 00:00:00 2001 From: aakbik Date: Mon, 26 Aug 2019 17:10:51 +0200 Subject: [PATCH 23/23] comments --- flair/models/sequence_tagger_model.py | 14 ++++++++++++++ flair/models/text_classification_model.py | 8 ++++++++ flair/trainers/trainer.py | 13 ++++++++++--- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 0755671d1..e421a1df8 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -82,6 +82,20 @@ def __init__( train_initial_hidden_state: bool = False, pickle_module: str = "pickle", ): + """ + Initializes a SequenceTagger + :param hidden_size: number of hidden states in RNN + :param embeddings: word embeddings used in tagger + :param tag_dictionary: dictionary of tags you want to predict + :param tag_type: string identifier for tag type + :param use_crf: if True use CRF decoder, else project directly to tag space + :param use_rnn: if True use RNN layer, otherwise use word embeddings directly + :param rnn_layers: number of RNN layers + :param dropout: dropout probability + :param word_dropout: word dropout probability + :param locked_dropout: locked dropout probability + :param train_initial_hidden_state: if True, trains initial hidden state of RNN + """ super(SequenceTagger, self).__init__() diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index c66489097..041682cb0 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -37,6 +37,14 @@ def __init__( multi_label: bool = None, multi_label_threshold: float = 0.5, ): + """ + Initializes a TextClassifier + :param document_embeddings: embeddings used to embed each data point + :param label_dictionary: dictionary of labels you want to predict + :param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction + or False to force single-label prediction + :param multi_label_threshold: If multi-label you can set the threshold to make predictions + """ super(TextClassifier, self).__init__() diff --git 
a/flair/trainers/trainer.py b/flair/trainers/trainer.py index e3057fa53..89b081470 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -40,16 +40,24 @@ def __init__( corpus: Corpus, optimizer: torch.optim.Optimizer = SGD, epoch: int = 0, - loss: float = 10000.0, optimizer_state: dict = None, scheduler_state: dict = None, use_tensorboard: bool = False, ): + """ + Initialize a model trainer + :param model: The model that you want to train. The model should inherit from flair.nn.Model + :param corpus: The dataset used to train the model, should be of type Corpus + :param optimizer: The optimizer to use (typically SGD or Adam) + :param epoch: The starting epoch (normally 0 but could be higher if you continue training model) + :param optimizer_state: Optimizer state (necessary if continue training from checkpoint) + :param scheduler_state: Scheduler state (necessary if continue training from checkpoint) + :param use_tensorboard: If True, writes out tensorboard information + """ self.model: flair.nn.Model = model self.corpus: Corpus = corpus self.optimizer: torch.optim.Optimizer = optimizer self.epoch: int = epoch - self.loss: float = loss self.scheduler_state: dict = scheduler_state self.optimizer_state: dict = optimizer_state self.use_tensorboard: bool = use_tensorboard @@ -539,7 +547,6 @@ def load_from_checkpoint( corpus, optimizer, epoch=checkpoint["epoch"], - loss=checkpoint["loss"], optimizer_state=checkpoint["optimizer_state_dict"], scheduler_state=checkpoint["scheduler_state_dict"], )
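Taken together, the docstrings added in this patch series describe the typical training setup. The following is a short end-to-end sketch of how these constructors are meant to be combined; the corpus, embedding, output path and hyperparameter values are illustrative choices, not part of the patch.

```python
from flair.datasets import WNUT_17
from flair.embeddings import WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# the corpus auto-downloads on first use; base_path now accepts str or Path
corpus = WNUT_17(in_memory=True)

# build the tag dictionary for the tag type to predict
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")

# a SequenceTagger configured with the parameters documented above
tagger = SequenceTagger(
    hidden_size=256,
    embeddings=WordEmbeddings("glove"),
    tag_dictionary=tag_dictionary,
    tag_type="ner",
    use_crf=True,
)

# the trainer is now initialized without a separate 'loss' argument
trainer = ModelTrainer(tagger, corpus)
trainer.train("resources/taggers/example-ner", max_epochs=10)
```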