Google doc strings #3164

Merged: 12 commits, Apr 19, 2023
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]

docstring-convention=google
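
For reference, this option is presumably consumed by the flake8-docstrings plugin and selects pydocstyle's Google-style checks. A minimal sketch of a Google-style docstring on a hypothetical function, purely to illustrate the convention being enforced (not part of this PR):

def scale(value: float, factor: float = 1.0) -> float:
    """Scale a value by a constant factor.

    Args:
        value: The number to scale.
        factor: Multiplier applied to value. Defaults to 1.0.

    Returns:
        The scaled value.
    """
    return value * factor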
104 changes: 51 additions & 53 deletions flair/data.py
@@ -43,9 +43,7 @@ def _len_dataset(dataset: Optional[Dataset]) -> int:


class Dictionary:
"""
This class holds a dictionary that maps strings to IDs, used to generate one-hot encodings of strings.
"""
"""This class holds a dictionary that maps strings to IDs, used to generate one-hot encodings of strings."""

def __init__(self, add_unk=True):
# init dictionaries
@@ -65,8 +63,8 @@ def remove_item(self, item: str):
del self.item2idx[bytes_item]

def add_item(self, item: str) -> int:
"""
add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.
"""Add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.

:param item: a string for which to assign an id.
:return: ID of string
"""
@@ -77,8 +75,8 @@ def add_item(self, item: str) -> int:
return self.item2idx[bytes_item]

def get_idx_for_item(self, item: str) -> int:
"""
returns the ID of the string, otherwise 0
"""Returns the ID of the string, otherwise 0.

:param item: string for which ID is requested
:return: ID of string, otherwise 0
"""
@@ -95,8 +93,8 @@ def get_idx_for_item(self, item: str) -> int:
raise IndexError

def get_idx_for_items(self, items: List[str]) -> List[int]:
"""
returns the IDs for each item of the list of string, otherwise 0 if not found
"""Returns the IDs for each item of the list of string, otherwise 0 if not found.

:param items: List of string for which IDs are requested
:return: List of ID of strings
"""
@@ -198,9 +196,10 @@ def __str__(self):


class Label:
"""
This class represents a label. Each label has a value and optionally a confidence score. The
score needs to be between 0.0 and 1.0. Default value for the score is 1.0.
"""This class represents a label.

Each label has a value and optionally a confidence score. The score needs to be between 0.0 and 1.0.
Default value for the score is 1.0.
"""

def __init__(self, data_point: "DataPoint", value: str, score: float = 1.0):
@@ -253,10 +252,11 @@ def unlabeled_identifier(self):


class DataPoint:
"""
This is the parent class of all data points in Flair (including Token, Sentence, Image, etc.). Each DataPoint
must be embeddable (hence the abstract property embedding() and methods to() and clear_embeddings()). Also,
each DataPoint may have Labels in several layers of annotation (hence the functions add_label(), get_labels()
"""This is the parent class of all data points in Flair.

Examples of data points are Token, Sentence, Image, etc.
Each DataPoint must be embeddable (hence the abstract property embedding() and methods to() and clear_embeddings()).
Also, each DataPoint may have Labels in several layers of annotation (hence the functions add_label(), get_labels()
and the property 'label').
"""

@@ -456,9 +456,9 @@ def remove_labels(self, typename: str):


class Token(_PartOfSentence):
"""
This class represents one word in a tokenized sentence. Each token may have any number of tags. It may also point
to its head in a dependency tree.
"""This class represents one word in a tokenized sentence.

Each token may have any number of tags. It may also point to its head in a dependency tree.
"""

def __init__(
@@ -530,30 +530,24 @@ def __repr__(self):
return self.__str__()

def add_label(self, typename: str, value: str, score: float = 1.0):
"""
The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
Therefore, labels get added only to the Sentence if it exists
"""
# The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
# Therefore, labels get added only to the Sentence if it exists
if self.sentence:
super().add_label(typename=typename, value=value, score=score)
else:
DataPoint.add_label(self, typename=typename, value=value, score=score)

def set_label(self, typename: str, value: str, score: float = 1.0):
"""
The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
Therefore, labels get set only to the Sentence if it exists
"""
# The Token is a special _PartOfSentence in that it may be initialized without a Sentence.
# Therefore, labels get set only to the Sentence if it exists
if self.sentence:
super().set_label(typename=typename, value=value, score=score)
else:
DataPoint.set_label(self, typename=typename, value=value, score=score)
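
To illustrate the behaviour described in the comments above, a rough sketch using the public flair API (assumed usage for illustration only, not part of this diff):

from flair.data import Sentence, Token

# Token attached to a Sentence: the label is registered on the Sentence
sentence = Sentence("Berlin is nice")
sentence[0].add_label("ner", "LOC")

# stand-alone Token without a Sentence: the label stays on the Token itself
token = Token("Berlin")
token.add_label("ner", "LOC")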


class Span(_PartOfSentence):
"""
This class represents one textual span consisting of Tokens.
"""
"""This class represents one textual span consisting of Tokens."""

def __new__(self, tokens: List[Token]):
# check if the span already exists. If so, return it
@@ -674,9 +668,7 @@ def embedding(self):


class Sentence(DataPoint):
"""
A Sentence is a list of tokens and is used to represent a sentence or text fragment.
"""
"""A Sentence is a list of tokens and is used to represent a sentence or text fragment."""

def __init__(
self,
@@ -685,8 +677,9 @@ def __init__(
language_code: str = None,
start_position: int = 0,
):
"""
Class to hold all meta related to a text (tokens, predictions, language code, ...)
"""Class to hold all metadata related to a text.

Metadata can be tokens, predictions, language code, ...
:param text: original string (sentence), or a list of string tokens (words)
:param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`)
more advanced options are :class:`SegTokTokenizer` to use segtok or :class:`SpacyTokenizer`
@@ -905,9 +898,10 @@ def to_plain_string(self):
return plain.rstrip()

def infer_space_after(self):
"""
Heuristics in case you wish to infer whitespace_after values for tokenized text. This is useful for some old NLP
tasks (such as CoNLL-03 and CoNLL-2000) that provide only tokenized data with no info of original whitespacing.
"""Heuristics in case you wish to infer whitespace_after values for tokenized text.

This is useful for some old NLP tasks (such as CoNLL-03 and CoNLL-2000) that provide only tokenized data with
no information about the original whitespace.
:return:
"""
last_token = None
@@ -1034,8 +1028,9 @@ def to_windows_1252(match):
return re.sub(r"[\u0080-\u0099]", to_windows_1252, text)

def next_sentence(self):
"""
Get the next sentence in the document (works only if context is set through dataloader or elsewhere)
"""Get the next sentence in the document.

This only works if context is set through the dataloader or elsewhere.
:return: next Sentence in document if set, otherwise None
"""
if self._next_sentence is not None:
@@ -1050,8 +1045,9 @@ def next_sentence(self):
return None

def previous_sentence(self):
"""
Get the previous sentence in the document (works only if context is set through dataloader or elsewhere)
"""Get the previous sentence in the document.

Works only if context is set through the dataloader or elsewhere.
:return: previous Sentence in document if set, otherwise None
"""
if self._previous_sentence is not None:
@@ -1066,7 +1062,8 @@ def previous_sentence(self):
return None

def is_context_set(self) -> bool:
"""
"""Determines if this sentence has a context of sentences before or after set.

Return True or False depending on whether context is set (for instance in dataloader or elsewhere).
:return: True if context is set, else False
"""
@@ -1317,8 +1314,8 @@ def _filter_empty_sentences(dataset) -> Dataset:
return subset

def make_vocab_dictionary(self, max_tokens=-1, min_freq=1) -> Dictionary:
"""
Creates a dictionary of all tokens contained in the corpus.
"""Creates a dictionary of all tokens contained in the corpus.

By defining `max_tokens` you can set the maximum number of tokens that should be contained in the dictionary.
If there are more than `max_tokens` tokens in the corpus, the most frequent tokens are added first.
If `min_freq` is set to a value greater than 1, only tokens occurring more than `min_freq` times are considered
@@ -1358,9 +1355,9 @@ def _downsample_to_proportion(dataset: Dataset, proportion: float):
return splits[0]

def obtain_statistics(self, label_type: str = None, pretty_print: bool = True) -> Union[dict, str]:
"""
Print statistics about the class distribution (only labels of sentences are taken into account) and sentence
sizes.
"""Print statistics about the class distribution and sentence sizes.

Only labels of sentences are taken into account.
"""
json_data = {
"TRAIN": self._obtain_statistics_for(self.train, "TRAIN", label_type),
@@ -1435,8 +1432,8 @@ def __str__(self) -> str:
def make_label_dictionary(
self, label_type: str, min_count: int = -1, add_unk: bool = True, add_dev_test: bool = False
) -> Dictionary:
"""
Creates a dictionary of all labels assigned to the sentences in the corpus.
"""Creates a dictionary of all labels assigned to the sentences in the corpus.

:return: dictionary of labels
"""
if min_count > 0 and not add_unk:
@@ -1517,8 +1514,8 @@ def add_label_noise(
split: str = "train",
noise_transition_matrix: Optional[Dict[str, List[float]]] = None,
):
"""
Generates uniform label noise distribution in the chosen dataset split.
"""Generates uniform label noise distribution in the chosen dataset split.

:label_type: the type of labels for which the noise should be simulated.
:labels: an array with unique labels of said type (retrievable from label dictionary).
:noise_share: the desired share of noise in the train split.
@@ -1728,7 +1725,8 @@ def cummulative_sizes(self):


def iob2(tags):
"""
"""Converts the tags to the IOB2 format.

Check that tags have a valid IOB format.
Tags in IOB1 format are converted to IOB2.
"""
22 changes: 9 additions & 13 deletions flair/datasets/base.py
@@ -39,13 +39,11 @@ def __init__(


class FlairDatapointDataset(FlairDataset, Generic[DT]):
"""
A simple Dataset object to wrap a List of Datapoints, for example Sentences
"""
"""A simple Dataset object to wrap a List of Datapoints, for example Sentences."""

def __init__(self, datapoints: Union[DT, List[DT]]):
"""
Instantiate FlairDatapointDataset
"""Instantiate FlairDatapointDataset.

:param datapoints: DT or List of DT that make up FlairDatapointDataset
"""
# cast to list if necessary
@@ -70,17 +68,15 @@ def __init__(self, sentences: Union[Sentence, List[Sentence]]):


class StringDataset(FlairDataset):
"""
A Dataset taking string as input and returning Sentence during iteration
"""
"""A Dataset taking string as input and returning Sentence during iteration."""

def __init__(
self,
texts: Union[str, List[str]],
use_tokenizer: Union[bool, Tokenizer] = SpaceTokenizer(),
):
"""
Instantiate StringDataset
"""Instantiate StringDataset.

:param texts: a string or List of strings that make up StringDataset
:param use_tokenizer: Custom tokenizer to use (default is SpaceTokenizer,
more advanced options are SegTokTokenizer to use segtok or SpacyTokenizer to use Spacy library models
@@ -121,8 +117,8 @@ def __init__(
in_memory: bool = True,
tag_type: str = "class",
):
"""
Reads Mongo collections. Each collection should contain one document/text per item.
"""Reads Mongo collections.

Each collection should contain one document/text per item.

Each item should have the following format:
{
@@ -147,7 +144,6 @@
:param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
:return: list of sentences
"""

# first, check if pymongo is installed
try:
import pymongo