Merge pull request #556 from zalandoresearch/release-0.4.1
Release 0.4.1
Alan Akbik authored Feb 22, 2019
2 parents 40fbcbd + 8507359 commit eae6905
Showing 24 changed files with 498 additions and 301 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -8,5 +8,6 @@ install:
   - pip install -r requirements.txt -q
 before_script: cd tests
 script:
+  - pip freeze
   - 'if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then pytest --runintegration; fi'
   - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then pytest; fi'
2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ document embeddings, including our proposed **[Flair embeddings](https://drive.g
 * **A Pytorch NLP framework.** Our framework builds directly on [Pytorch](https://pytorch.org/), making it easy to
   train your own models and experiment with new approaches using Flair embeddings and classes.
 
-Now at [version 0.4.0](https://github.com/zalandoresearch/flair/releases)!
+Now at [version 0.4.1](https://github.com/zalandoresearch/flair/releases)!
 
 ## Comparison with State-of-the-Art
 
2 changes: 1 addition & 1 deletion flair/data.py
@@ -32,7 +32,7 @@ def __init__(self, add_unk=True):
     def add_item(self, item: str) -> int:
         """
         add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.
-        :param item: a string for which to assign an id
+        :param item: a string for which to assign an id.
         :return: ID of string
         """
         item = item.encode('utf-8')
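The docstring describes idempotent behavior: adding a string that is already in the dictionary returns its existing ID. A minimal sketch of that contract (the tag names below are illustrative only):

```python
from flair.data import Dictionary

# build a small label dictionary without the <unk> entry
tag_dictionary = Dictionary(add_unk=False)

first_id = tag_dictionary.add_item('PER')   # new string -> new ID
second_id = tag_dictionary.add_item('PER')  # already present -> same ID
assert first_id == second_id
```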
25 changes: 16 additions & 9 deletions flair/data_fetcher.py
@@ -192,7 +192,9 @@ def load_corpus(task: Union[NLPTask, str], base_path: [str, Path] = None) -> Tag
 
         # for text classifiers, we use our own special format
         if task in [NLPTask.IMDB.value, NLPTask.AG_NEWS.value, NLPTask.TREC_6.value, NLPTask.TREC_50.value]:
-            return NLPTaskDataFetcher.load_classification_corpus(data_folder)
+            use_tokenizer: bool = False if task in [NLPTask.TREC_6.value, NLPTask.TREC_50.value] else True
+
+            return NLPTaskDataFetcher.load_classification_corpus(data_folder, use_tokenizer=use_tokenizer)
 
         # NER corpus for Basque
         if task == NLPTask.NER_BASQUE.value:
@@ -285,7 +287,7 @@ def load_column_corpus(
 
     @staticmethod
     def load_ud_corpus(
-            data_folder: Union[str,Path],
+            data_folder: Union[str, Path],
             train_file=None,
             test_file=None,
             dev_file=None) -> TaggedCorpus:
@@ -326,10 +328,11 @@ def load_ud_corpus(
 
     @staticmethod
     def load_classification_corpus(
-            data_folder: Union[str,Path],
+            data_folder: Union[str, Path],
             train_file=None,
             test_file=None,
-            dev_file=None) -> TaggedCorpus:
+            dev_file=None,
+            use_tokenizer: bool = True) -> TaggedCorpus:
         """
         Helper function to get a TaggedCorpus from text classification-formatted task data
@@ -370,11 +373,14 @@ def load_classification_corpus(
         log.info("Dev: {}".format(dev_file))
         log.info("Test: {}".format(test_file))
 
-        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(train_file)
-        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(test_file)
+        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(train_file,
+                                                                                           use_tokenizer=use_tokenizer)
+        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(test_file,
+                                                                                          use_tokenizer=use_tokenizer)
 
         if dev_file is not None:
-            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(dev_file)
+            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_text_classification_file(dev_file,
+                                                                                             use_tokenizer=use_tokenizer)
         else:
             sentences_dev: List[Sentence] = [sentences_train[i] for i in
                                              NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)]
@@ -383,7 +389,8 @@ def load_classification_corpus(
         return TaggedCorpus(sentences_train, sentences_dev, sentences_test)
 
     @staticmethod
-    def read_text_classification_file(path_to_file: Union[str,Path], max_tokens_per_doc=-1) -> List[Sentence]:
+    def read_text_classification_file(path_to_file: Union[str, Path], max_tokens_per_doc=-1, use_tokenizer=True) -> \
+            List[Sentence]:
         """
         Reads a data file for text classification. The file should contain one document/text per line.
         The line should have the following format:
@@ -416,7 +423,7 @@ def read_text_classification_file(path_to_file: Union[str,Path], max_tokens_per_
                 text = line[l_len:].strip()
 
                 if text and labels:
-                    sentence = Sentence(text, labels=labels, use_tokenizer=True)
+                    sentence = Sentence(text, labels=labels, use_tokenizer=use_tokenizer)
                     if len(sentence) > max_tokens_per_doc and max_tokens_per_doc > 0:
                         sentence.tokens = sentence.tokens[:max_tokens_per_doc]
                     if len(sentence.tokens) > 0:
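Taken together, these changes let callers control tokenization when loading classification corpora: the built-in TREC tasks now skip the tokenizer because their text ships pre-tokenized, and `load_classification_corpus` exposes the switch directly. A minimal usage sketch against the 0.4.1 API shown above; `resources/tasks/my_task` is a hypothetical folder holding train/dev/test files in flair's classification format:

```python
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

# TREC_6 text is already tokenized, so load_corpus now disables the
# tokenizer for it (assumes the dataset is available in flair's data folder)
trec_corpus = NLPTaskDataFetcher.load_corpus(NLPTask.TREC_6)

# For a custom corpus, opt out of tokenization explicitly: pre-tokenized
# text is kept as-is (plain whitespace splitting), which speeds up loading.
my_corpus = NLPTaskDataFetcher.load_classification_corpus(
    'resources/tasks/my_task',  # hypothetical placeholder path
    use_tokenizer=False)

print(my_corpus)
```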
