Google doc strings #3164

Merged: 12 commits, Apr 19, 2023
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]

docstring-convention=google
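
For reference, this option is presumably consumed by the flake8-docstrings plugin and selects pydocstyle's Google-style checks. A minimal sketch of a Google-style docstring on a hypothetical function, purely to illustrate the convention being enforced (not part of this PR):

def scale(value: float, factor: float = 1.0) -> float:
    """Scale a value by a constant factor.

    Args:
        value: The number to scale.
        factor: Multiplier applied to value. Defaults to 1.0.

    Returns:
        The scaled value.
    """
    return value * factor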
104 changes: 51 additions & 53 deletions flair/data.py
@@ -43,9 +43,7 @@ def _len_dataset(dataset: Optional[Dataset]) -> int:


class Dictionary:
"""
This class holds a dictionary that maps strings to IDs, used to generate one-hot encodings of strings.
"""
"""This class holds a dictionary that maps strings to IDs, used to generate one-hot encodings of strings."""

def __init__(self, add_unk=True):
# init dictionaries
@@ -65,8 +63,8 @@ def remove_item(self, item: str):
del self.item2idx[bytes_item]

def add_item(self, item: str) -> int:
"""
add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.
"""Add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.

:param item: a string for which to assign an id.
:return: ID of string
"""
@@ -77,8 +75,8 @@ def add_item(self, item: str) -> int:
return self.item2idx[bytes_item]

def get_idx_for_item(self, item: str) -> int:
"""
returns the ID of the string, otherwise 0
"""Returns the ID of the string, otherwise 0.

:param item: string for which ID is requested
:return: ID of string, otherwise 0
"""
@@ -95,8 +93,8 @@ def get_idx_for_item(self, item: str) -> int:
raise IndexError

def get_idx_for_items(self, items: List[str]) -> List[int]:
"""
returns the IDs for each item of the list of string, otherwise 0 if not found
"""Returns the IDs for each item of the list of string, otherwise 0 if not found.

:param items: List of string for which IDs are requested
:return: List of ID of strings
"""
@@ -198,9 +196,10 @@ def __str__(self):


class Label:
"""
This class represents a label. Each label has a value and optionally a confidence score. The
score needs to be between 0.0 and 1.0. Default value for the score is 1.0.
"""This class represents a label.

Each label has a value and optionally a confidence score. The score needs to be between 0.0 and 1.0.
Default value for the score is 1.0.
"""

def __init__(self, data_point: "DataPoint", value: str, score: float = 1.0):
@@ -253,10 +252,11 @@ def unlabeled_identifier(self):


class DataPoint:
"""
This is the parent class of all data points in Flair (including Token, Sentence, Image, etc.). Each DataPoint
must be embeddable (hence the abstract property embedding() and methods to() and clear_embeddings()). Also,
each DataPoint may have Labels in several layers of annotation (hence the functions add_label(), get_labels()
"""This is the parent class of all data points in Flair.

Examples of data points are Token, Sentence, Image, etc.
Each DataPoint must be embeddable (hence the abstract property embedding() and methods to() and clear_embeddings()).
Also, each DataPoint may have Labels in several layers of annotation (hence the functions add_label(), get_labels()
and the property 'label').
"""

@@ -456,9 +456,9 @@ def remove_labels(self, typename: str):


class Token(_PartOfSentence):
"""
This class represents one word in a tokenized sentence. Each token may have any number of tags. It may also point
to its head in a dependency tree.
"""This class represents one word in a tokenized sentence.

Each token may have any number of tags. It may also point to its head in a dependency tree.
"""

def __init__(
@@ -530,30 +530,24 @@ def __repr__(self):
return self.__str__()

def add_label(self, typename: str, value: str, score: float = 1.0):
"""
The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
Therefore, labels get added only to the Sentence if it exists
"""
# The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
# Therefore, labels get added only to the Sentence if it exists
if self.sentence:
super().add_label(typename=typename, value=value, score=score)
else:
DataPoint.add_label(self, typename=typename, value=value, score=score)

def set_label(self, typename: str, value: str, score: float = 1.0):
"""
The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
Therefore, labels get set only to the Sentence if it exists
"""
# The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
# Therefore, labels get set only to the Sentence if it exists
if self.sentence:
super().set_label(typename=typename, value=value, score=score)
else:
DataPoint.set_label(self, typename=typename, value=value, score=score)
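
To illustrate the behaviour described in the comments above, a rough sketch using the public flair API (assumed usage for illustration only, not part of this diff):

from flair.data import Sentence, Token

# Token attached to a Sentence: the label is registered on the Sentence
sentence = Sentence("Berlin is nice")
sentence[0].add_label("ner", "LOC")

# stand-alone Token without a Sentence: the label stays on the Token itself
token = Token("Berlin")
token.add_label("ner", "LOC")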


class Span(_PartOfSentence):
"""
This class represents one textual span consisting of Tokens.
"""
"""This class represents one textual span consisting of Tokens."""

def __new__(self, tokens: List[Token]):
# check if the span already exists. If so, return it
@@ -674,9 +668,7 @@ def embedding(self):


class Sentence(DataPoint):
"""
A Sentence is a list of tokens and is used to represent a sentence or text fragment.
"""
"""A Sentence is a list of tokens and is used to represent a sentence or text fragment."""

def __init__(
self,
@@ -685,8 +677,9 @@ def __init__(
language_code: str = None,
start_position: int = 0,
):
"""
Class to hold all meta related to a text (tokens, predictions, language code, ...)
"""Class to hold all metadata related to a text.

Metadata can be tokens, predictions, language code, ...
:param text: original string (sentence), or a list of string tokens (words)
:param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`)
more advanced options are :class:`SegTokTokenizer` to use segtok or :class:`SpacyTokenizer`
@@ -905,9 +898,10 @@ def to_plain_string(self):
return plain.rstrip()

def infer_space_after(self):
"""
Heuristics in case you wish to infer whitespace_after values for tokenized text. This is useful for some old NLP
tasks (such as CoNLL-03 and CoNLL-2000) that provide only tokenized data with no info of original whitespacing.
"""Heuristics in case you wish to infer whitespace_after values for tokenized text.

This is useful for some old NLP tasks (such as CoNLL-03 and CoNLL-2000) that provide only tokenized data with
no information about the original whitespace.
:return:
"""
last_token = None
@@ -1034,8 +1028,9 @@ def to_windows_1252(match):
return re.sub(r"[\u0080-\u0099]", to_windows_1252, text)

def next_sentence(self):
"""
Get the next sentence in the document (works only if context is set through dataloader or elsewhere)
"""Get the next sentence in the document.

This only works if context is set through the dataloader or elsewhere.
:return: next Sentence in document if set, otherwise None
"""
if self._next_sentence is not None:
@@ -1050,8 +1045,9 @@ def next_sentence(self):
return None

def previous_sentence(self):
"""
Get the previous sentence in the document (works only if context is set through dataloader or elsewhere)
"""Get the previous sentence in the document.

Works only if context is set through the dataloader or elsewhere.
:return: previous Sentence in document if set, otherwise None
"""
if self._previous_sentence is not None:
@@ -1066,7 +1062,8 @@ def previous_sentence(self):
return None

def is_context_set(self) -> bool:
"""
"""Determines if this sentence has a context of sentences before or after set.

Return True or False depending on whether context is set (for instance in dataloader or elsewhere).
:return: True if context is set, else False
"""
@@ -1317,8 +1314,8 @@ def _filter_empty_sentences(dataset) -> Dataset:
return subset

def make_vocab_dictionary(self, max_tokens=-1, min_freq=1) -> Dictionary:
"""
Creates a dictionary of all tokens contained in the corpus.
"""Creates a dictionary of all tokens contained in the corpus.

By defining `max_tokens` you can set the maximum number of tokens that should be contained in the dictionary.
If there are more than `max_tokens` tokens in the corpus, the most frequent tokens are added first.
If `min_freq` is set to a value greater than 1, only tokens occurring more than `min_freq` times are considered
@@ -1358,9 +1355,9 @@ def _downsample_to_proportion(dataset: Dataset, proportion: float):
return splits[0]

def obtain_statistics(self, label_type: str = None, pretty_print: bool = True) -> Union[dict, str]:
"""
Print statistics about the class distribution (only labels of sentences are taken into account) and sentence
sizes.
"""Print statistics about the class distribution and sentence sizes.

Only labels of sentences are taken into account.
"""
json_data = {
"TRAIN": self._obtain_statistics_for(self.train, "TRAIN", label_type),
@@ -1435,8 +1432,8 @@ def __str__(self) -> str:
def make_label_dictionary(
self, label_type: str, min_count: int = -1, add_unk: bool = True, add_dev_test: bool = False
) -> Dictionary:
"""
Creates a dictionary of all labels assigned to the sentences in the corpus.
"""Creates a dictionary of all labels assigned to the sentences in the corpus.

:return: dictionary of labels
"""
if min_count > 0 and not add_unk:
@@ -1517,8 +1514,8 @@ def add_label_noise(
split: str = "train",
noise_transition_matrix: Optional[Dict[str, List[float]]] = None,
):
"""
Generates uniform label noise distribution in the chosen dataset split.
"""Generates uniform label noise distribution in the chosen dataset split.

:label_type: the type of labels for which the noise should be simulated.
:labels: an array with unique labels of said type (retrievable from label dictionary).
:noise_share: the desired share of noise in the train split.
@@ -1728,7 +1725,8 @@ def cummulative_sizes(self):


def iob2(tags):
"""
"""Converts the tags to the IOB2 format.

Check that tags have a valid IOB format.
Tags in IOB1 format are converted to IOB2.
"""
22 changes: 9 additions & 13 deletions flair/datasets/base.py
@@ -39,13 +39,11 @@ def __init__(


class FlairDatapointDataset(FlairDataset, Generic[DT]):
"""
A simple Dataset object to wrap a List of Datapoints, for example Sentences
"""
"""A simple Dataset object to wrap a List of Datapoints, for example Sentences."""

def __init__(self, datapoints: Union[DT, List[DT]]):
"""
Instantiate FlairDatapointDataset
"""Instantiate FlairDatapointDataset.

:param datapoints: DT or List of DT that make up FlairDatapointDataset
"""
# cast to list if necessary
@@ -70,17 +68,15 @@ def __init__(self, sentences: Union[Sentence, List[Sentence]]):


class StringDataset(FlairDataset):
"""
A Dataset taking string as input and returning Sentence during iteration
"""
"""A Dataset taking string as input and returning Sentence during iteration."""

def __init__(
self,
texts: Union[str, List[str]],
use_tokenizer: Union[bool, Tokenizer] = SpaceTokenizer(),
):
"""
Instantiate StringDataset
"""Instantiate StringDataset.

:param texts: a string or List of strings that make up StringDataset
:param use_tokenizer: Custom tokenizer to use (default is SpaceTokenizer,
more advanced options are SegTokTokenizer to use segtok or SpacyTokenizer to use Spacy library models
@@ -121,8 +117,8 @@ def __init__(
in_memory: bool = True,
tag_type: str = "class",
):
"""
Reads Mongo collections. Each collection should contain one document/text per item.
"""Reads Mongo collections.

Each collection should contain one document/text per item.

Each item should have the following format:
{
@@ -147,7 +144,6 @@
:param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
:return: list of sentences
"""

# first, check if pymongo is installed
try:
import pymongo