Deid logging update #466

Merged 3 commits on Jul 29, 2024
2 changes: 2 additions & 0 deletions medcat/ner/transformers_ner.py
@@ -214,6 +214,7 @@ def train(self,
 
 
         # Encode dataset
+        # Note: tokenizer.encode performs chunking
         encoded_dataset = dataset.map(
             lambda examples: self.tokenizer.encode(examples, ignore_subwords=False),
             batched=True,
@@ -261,6 +262,7 @@ def eval(self, json_path: Union[str, list, None] = None, dataset=None, ignore_ex
             cache_dir='/tmp/')
 
         # Encode dataset
+        # Note: tokenizer.encode performs chunking
        encoded_dataset = dataset.map(
             lambda examples: self.tokenizer.encode(examples, ignore_subwords=False),
             batched=True,
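
For context (not part of this diff): the chunking interacts with `datasets.Dataset.map` because `batched=True` allows the mapped function to return more rows than it received, which is how one long document becomes several max_len sequences. A minimal, hypothetical sketch of that shape, using a character-level stand-in for `tokenizer.encode` (the real encoder chunks token ids and labels, not raw characters):

```python
from datasets import Dataset

MAX_LEN = 10  # stand-in for the tokenizer's max_len

def encode(examples):
    # Hypothetical stand-in for TransformersTokenizerNER.encode:
    # emit one output row per MAX_LEN-character chunk of each input text.
    out = {"text": [], "chunk_id": []}
    for text in examples["text"]:
        for i in range(0, len(text), MAX_LEN):
            out["text"].append(text[i:i + MAX_LEN])
            out["chunk_id"].append(i // MAX_LEN)
    return out

dataset = Dataset.from_dict({"text": ["short", "a much longer document that needs chunking"]})
# remove_columns drops the originals so the new, longer columns define the row count
encoded = dataset.map(encode, batched=True, remove_columns=dataset.column_names)
print(len(dataset), "->", len(encoded))  # 2 -> 6 (1 chunk + 5 chunks)
```
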
8 changes: 7 additions & 1 deletion medcat/tokenizers/transformers_ner.py
@@ -1,6 +1,9 @@
 import dill
 from typing import Optional, Dict
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class TransformersTokenizerNER(object):
@@ -35,7 +38,7 @@ def calculate_label_map(self, dataset) -> None:

     def encode(self, examples: Dict, ignore_subwords: bool = False) -> Dict:
         """Used with huggingface datasets map function to convert medcat_ner dataset into the
-        appropriate form for NER with BERT. It will split long text segments into max_len sequences.
+        appropriate form for NER with BERT. It will split long text segments into max_len sequences (performs chunking).
 
         Args:
             examples (Dict):
@@ -92,6 +95,9 @@ def encode(self, examples: Dict, ignore_subwords: bool = False) -> Dict:
                     labels.append(self.label_map['X'])
 
                 if len(input_ids) >= self.max_len:
+                    logger.debug(
+                        "Document exceeding max length encountered. Length of current document is %d. Performing chunking...",
+                        len(tokens['offset_mapping']))
                     # Split into multiple examples if too long
                     examples['input_ids'].append(input_ids)
                     examples['labels'].append(labels)
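
Usage note (not part of this diff): `logger.debug` output is opt-in. Since the module creates its logger with `logging.getLogger(__name__)`, the logger's name follows the import path, so a consumer can enable the chunking message for this module alone; a minimal sketch:

```python
import logging

logging.basicConfig(level=logging.INFO)
# Enable debug output only for the tokenizer module that logs the chunking message
logging.getLogger("medcat.tokenizers.transformers_ner").setLevel(logging.DEBUG)
```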