From 1f796fa015254313ece867a5055acc0af9960428 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 10 May 2023 11:29:13 +0300 Subject: [PATCH 01/47] CU-863gntc58 Add parent to child relationship getter to UMLS preprocessing --- medcat/utils/preprocess_umls.py | 91 +++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 9 deletions(-) diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index 0b3505981..b192b58b3 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -1,6 +1,8 @@ from typing import List, Union import pandas as pd +import tqdm +import os _DEFAULT_COLUMNS: list = [ "CUI", @@ -20,7 +22,7 @@ "STR", "SRL", "SUPPRESS", - "CVF", + "CVF", ] _DEFAULT_SEM_TYPE_COLUMNS: list = [ @@ -32,12 +34,24 @@ "CVF", ] +_DEFAULT_MRHIER_COLUMNS: list = [ + "CUI", + "AUI", + "CXN", + "PAUI", + "SAB", + "RELA", + "PTR", + "HCD", + "CVF", +] + medcat_csv_mapper: dict = { 'CUI': 'cui', 'STR': 'name', 'SAB': 'ontologies', 'ISPREF': 'name_status', - 'TUI': 'type_ids', # from MRSTY.RRF + 'TUI': 'type_ids', # from MRSTY.RRF } @@ -57,11 +71,13 @@ class UMLS: def __init__(self, main_file_name: str, sem_types_file: str, allow_languages: list = ['ENG'], sep: str = '|'): self.main_file_name = main_file_name self.sem_types_file = sem_types_file - self.main_columns = list(_DEFAULT_COLUMNS) # copy - self.sem_types_columns = list(_DEFAULT_SEM_TYPE_COLUMNS) # copy + self.main_columns = list(_DEFAULT_COLUMNS) # copy + self.sem_types_columns = list(_DEFAULT_SEM_TYPE_COLUMNS) # copy + self.mrhier_columns = list(_DEFAULT_MRHIER_COLUMNS) # copy self.sep = sep # copy in case of default list - self.allow_langugages = list(allow_languages) if allow_languages else allow_languages + self.allow_langugages = list( + allow_languages) if allow_languages else allow_languages def to_concept_df(self) -> pd.DataFrame: """Create a concept DataFrame. 
@@ -72,7 +88,8 @@ def to_concept_df(self) -> pd.DataFrame: """ # target columns: # cui, name, name_status, ontologies, description_type_ids, type_ids - df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False) + df = pd.read_csv(self.main_file_name, + names=self.main_columns, sep=self.sep, index_col=False) # filter languages if self.allow_langugages: @@ -82,7 +99,8 @@ def to_concept_df(self) -> pd.DataFrame: # get TUI - sem_types = pd.read_csv(self.sem_types_file, names=self.sem_types_columns, sep=self.sep, index_col=False) + sem_types = pd.read_csv( + self.sem_types_file, names=self.sem_types_columns, sep=self.sep, index_col=False) df = df.merge(sem_types) # rename columns @@ -109,7 +127,8 @@ def map_umls2snomed(self) -> pd.DataFrame: Returns: pd.DataFrame: Dataframe that contains the SCUI (source CUI) as well as the UMLS CUI for each applicable concept """ - df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False, dtype={'SCUI': 'str'}) + df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False, dtype={'SCUI': 'str'}) # get only SNOMED-CT US based concepts that have a SNOMED-CT (source) CUI df = df[df.SAB == 'SNOMEDCT_US'][df.SCUI.notna()] # sort by SCUI @@ -154,7 +173,8 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: Returns: pd.DataFrame: DataFrame that has the target source codes """ - df = pd.read_csv(self.main_file_name, names=self.main_columns, sep=self.sep, index_col=False, dtype={'CODE': 'str'}) + df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False, dtype={'CODE': 'str'}) # get the specified source(s) if isinstance(sources, list): df = df[df.SAB.isin(sources)][df.CODE.notna()] @@ -166,6 +186,48 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: df = df[['CODE',] + [col for col in df.columns.values if col != 'CODE']] return df + def get_pt2ch(self) 
-> dict: + path = self.main_file_name.rsplit('/', 1)[0] + hier_file = f"{path}/MRHIER.RRF" + + if not os.path.exists(hier_file): + raise ValueError( + f'Expected MRHIER.RRF to exist within the same parent folder ({path})') + + conso_df = pd.read_csv(self.main_file_name, names=self.main_columns, + sep=self.sep, index_col=False) + + hier_df = pd.read_csv(hier_file, sep=self.sep, index_col=False, + header=None, names=self.mrhier_columns) + + # filter languages + if self.allow_langugages: + conso_df = conso_df[conso_df["LAT"].isin(self.allow_langugages)] + + # merge dataframes + merged_df = pd.merge(conso_df, hier_df, on=['AUI', 'CUI']) + + # create a AUI -> CUI map + aui_cui = dict(zip(merged_df["AUI"], merged_df["CUI"])) + + # only keep CUI and parent AUI + cui_parent = merged_df[['CUI', 'PAUI']] + # only include CUIs with a parent + cui_parent = cui_parent[cui_parent['PAUI'].notna()] + + # create dict + pt2ch: dict[str, set[str]] = {} + for _, row in tqdm.tqdm(cui_parent.iterrows(), total=len(cui_parent.index)): + cur_cui = row['CUI'] + paui = row['PAUI'] + parent_cui = aui_cui[paui] + if cur_cui not in pt2ch: + pt2ch[cur_cui] = set() + pt2ch[cur_cui].add(parent_cui) + for k, v in pt2ch.items(): + pt2ch[k] = list(v) + return pt2ch + if __name__ == '__main__': import sys @@ -187,3 +249,14 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: to_ICD10_man = umls.map_umls2source(sources=['ICD10']) print('As ICD-10(MAN):') print(to_ICD10_man.head()) + pt2ch = umls.get_pt2ch() + print('Get parent-child dict', len(pt2ch), + '' if len(pt2ch) > 1_000 else pt2ch) + import random + random_4_keys = random.sample(list(pt2ch.keys()), k=4) + + def _get_name(cui: str) -> str: + return df[df['cui'] == cui]['name'].iloc[0] + print('FF RAW ', [f"{k}:{pt2ch[k]}" for k in random_4_keys]) + print('FIRST FEW', [ + (f"{_get_name(key)} ({key})", [f"{_get_name(child)} ({child})" for child in pt2ch[key]]) for key in random_4_keys]) From 
47215e9f49e25f6d8b8e6c7e2ebeb214c861e90b Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 10 May 2023 12:56:44 +0300 Subject: [PATCH 02/47] CU-863gntc58 Only use ISA relationships --- medcat/utils/preprocess_umls.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index b192b58b3..8048c28aa 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -187,6 +187,23 @@ def map_umls2source(self, sources: Union[str, List[str]]) -> pd.DataFrame: return df def get_pt2ch(self) -> dict: + """Generates a parent to children dict. + + It goes through all the < # TODO + + The resulting dictionary maps a CUI to a list of CUIs that + consider that CUI as their parent. + + PS: + This expects the MRHIER.RRF file to also exist in the same folder + as the MRCONSO.RRF file. + + Raises: + ValueError: If the MRHIER.RRF file wasn't found + + Returns: + dict: The dictionary of parent CUI and their children. 
+ """ path = self.main_file_name.rsplit('/', 1)[0] hier_file = f"{path}/MRHIER.RRF" @@ -204,12 +221,15 @@ def get_pt2ch(self) -> dict: if self.allow_langugages: conso_df = conso_df[conso_df["LAT"].isin(self.allow_langugages)] + # create a AUI -> CUI map + aui_cui = dict(zip(conso_df["AUI"], conso_df["CUI"])) + + # filter ISA relationships + hier_df = hier_df[hier_df['RELA'] == 'isa'] + # merge dataframes merged_df = pd.merge(conso_df, hier_df, on=['AUI', 'CUI']) - # create a AUI -> CUI map - aui_cui = dict(zip(merged_df["AUI"], merged_df["CUI"])) - # only keep CUI and parent AUI cui_parent = merged_df[['CUI', 'PAUI']] # only include CUIs with a parent From 9d04fbfb5900080a329a2a493e46b6675714d504 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 10 May 2023 14:24:23 +0300 Subject: [PATCH 03/47] Make sure parents do not have themselves as children --- medcat/utils/preprocess_umls.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index 8048c28aa..6557ff087 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -245,6 +245,8 @@ def get_pt2ch(self) -> dict: pt2ch[cur_cui] = set() pt2ch[cur_cui].add(parent_cui) for k, v in pt2ch.items(): + if k in v: + v.remove(k) pt2ch[k] = list(v) return pt2ch @@ -272,11 +274,17 @@ def get_pt2ch(self) -> dict: pt2ch = umls.get_pt2ch() print('Get parent-child dict', len(pt2ch), '' if len(pt2ch) > 1_000 else pt2ch) + all_vals = [len(v) for v in pt2ch.values()] + print('LEN of VALS:', sum(all_vals), 'max', + max(all_vals), 'min', min(all_vals), 'mean', sum(all_vals) / len(all_vals)) import random random_4_keys = random.sample(list(pt2ch.keys()), k=4) def _get_name(cui: str) -> str: - return df[df['cui'] == cui]['name'].iloc[0] + matches = df[df['cui'] == cui] + if len(matches.index) == 0: + return 'N/A' # UNKNOWN + return matches['name'].iloc[0] print('FF RAW ', [f"{k}:{pt2ch[k]}" for k in random_4_keys]) 
print('FIRST FEW', [ (f"{_get_name(key)} ({key})", [f"{_get_name(child)} ({child})" for child in pt2ch[key]]) for key in random_4_keys]) From 69abf16a5258d5f8e33e265da9a814dada004c6d Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 10 May 2023 15:13:19 +0300 Subject: [PATCH 04/47] CU-863gntc58 Only keep preferred names --- medcat/utils/preprocess_umls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index 6557ff087..544813cfa 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -224,6 +224,9 @@ def get_pt2ch(self) -> dict: # create a AUI -> CUI map aui_cui = dict(zip(conso_df["AUI"], conso_df["CUI"])) + # remove non-preferred from conso + conso_df = conso_df[conso_df['ISPREF'] == 'Y'] + # filter ISA relationships hier_df = hier_df[hier_df['RELA'] == 'isa'] From 21aec90630d6a1f0619fe824c9b8c02e698172e1 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 09:22:28 +0100 Subject: [PATCH 05/47] CU-346mpwz Add memory optimiser for CDB --- medcat/utils/memory_optimiser.py | 163 +++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 medcat/utils/memory_optimiser.py diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py new file mode 100644 index 000000000..8e3f0cbd5 --- /dev/null +++ b/medcat/utils/memory_optimiser.py @@ -0,0 +1,163 @@ +from typing import Any, Dict, Iterator, List, Tuple + +from medcat.cdb import CDB + + +class DelegatingDict: + + def __init__(self, delegate: Dict[str, List[Any]], nr: int, + nr_of_overall_items: int = 8) -> None: + self.delegate = delegate + self.nr = nr + self.nr_of_overall_items = nr_of_overall_items + + def _generate_empty_entry(self) -> List[Any]: + return [None for _ in range(self.nr_of_overall_items)] + + def __getitem__(self, key: str) -> Any: + val = self.delegate[key][self.nr] + if val is None: + raise KeyError + return val + + def get(self, key: str, default: 
Any) -> Any: + try: + return self[key] + except KeyError: + return default + + def __setitem__(self, key: str, value: Any) -> None: + if key not in self.delegate: + self.delegate[key] = self._generate_empty_entry() + self.delegate[key][self.nr] = value + + def __contains__(self, key: str) -> bool: + return key in self.delegate and self.delegate[key][self.nr] is not None + + def keys(self) -> Iterator[str]: + for key in self.delegate.keys(): + if key in self: + yield key + + def items(self) -> Iterator[Tuple[str, Any]]: + for key in self: + yield key, self[key] + + def values(self) -> Iterator[Any]: + for key in self: + yield self[key] + + def __iter__(self) -> Iterator[str]: + yield from self.keys() + + def __len__(self) -> int: + return len(list(self.keys())) + + +def perform_optimisation(cdb: CDB) -> None: + """Attempts to optimise the memory footprint of the CDB. + + Does so by unifying the following dicts: + + cui2names (Dict[str, Set[str]]): + From cui to all names assigned to it. Mainly used for subsetting (maybe even only). + cui2snames (Dict[str, Set[str]]): + From cui to all sub-names assigned to it. Only used for subsetting. + cui2context_vectors (Dict[str, Dict[str, np.array]]): + From cui to a dictionary of different kinds of context vectors. Normally you would have here + a short and a long context vector - they are calculated separately. + cui2count_train (Dict[str, int]): + From CUI to the number of training examples seen. + cui2tags (Dict[str, List[str]]): + From CUI to a list of tags. This can be used to tag concepts for grouping of whatever. + cui2type_ids (Dict[str, Set[str]]): + From CUI to type id (e.g. TUI in UMLS). + cui2preferred_name (Dict[str, str]): + From CUI to the preferred name for this concept. + cui2average_confidence (Dict[str, str]): + Used for dynamic thresholding. Holds the average confidence for this CUI given the training examples. 
+ + They will all be included in 1 dict with CUI keys and a list of values for each pre-existing dict. + + Args: + cdb (CDB): The CDB to modify. + """ + dict_names_to_combine = [ + "cui2names", "cui2snames", "cui2context_vectors", + "cui2count_train", "cui2tags", "cui2type_ids", + "cui2preferred_name", "cui2average_confidence" + ] + dicts = [getattr(cdb, dict_name) for dict_name in dict_names_to_combine] + cui2many, delegators = map_to_many(dicts) + for delegator, name in zip(delegators, dict_names_to_combine): + setattr(cdb, name, delegator) + cdb.cui2many = cui2many + cdb.is_dirty = True + + +def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: + one2many: Dict[str, List[Any]] = {} + delegators: list[DelegatingDict] = [] + for nr, d in enumerate(dicts): + delegator = DelegatingDict( + one2many, nr, nr_of_overall_items=len(dicts)) + for key, value in d.items(): + if key not in one2many: + one2many[key] = delegator._generate_empty_entry() + one2many[key][nr] = value + delegators.append(delegator) + return one2many, delegators + + +# TODO - remove anything below +def main(file_name: str): + import dill + d1 = {'c1': ['n11', 'n12'], + 'c2': ['n21', 'n22']} + d2 = {'c1': 'n11', + 'c2': 'n22'} + one2many, (delegate1, delegate2) = map_to_many([d1, d2]) + print('DEL1, DEL2', delegate1, delegate2) + print('O2M ', one2many) + print('DEL1', delegate1.delegate) + print('DEL2', delegate2.delegate) + print('COMP1', delegate1.delegate is one2many) + print('COMP2', delegate2.delegate is one2many) + print('COMP3', delegate1.delegate is delegate2.delegate) + to_save = {'one2many': one2many, + 'delegate1': delegate1, + 'delegate2': delegate2} + print('SAVING to', file_name) + with open(file_name, 'wb') as f: + dill.dump(to_save, f) + print('Done saving, now LOADING') + with open(file_name, 'rb') as f: + data = dill.load(f) + print('GOT/loaded', data) + print('FOR each key') + o2m = data['one2many'] + del1, del2 = data['delegate1'], 
data['delegate2'] + print('DEL1, DEL2', del1.delegate, del2.delegate) + print('COMP1', del1.delegate is o2m) + print('COMP2', del2.delegate is o2m) + print('COMP3', del1.delegate is del2.delegate) + print('KEYS', list(one2many)) + for key in one2many: + print('KEY', key) + print(one2many[key], '\nvs\n', o2m[key]) + print('Through delegates') + print('DELEGATE1') + print(delegate1[key], 'vs', del1[key]) + print('DELEGATE2') + print(delegate2[key], 'vs', del2[key]) + # changing o2m should change del1 and/or del2 as well + o2m['c10'] = [['f11', 'f50'], 'f50'] + print('o2m', o2m) + print('And for c10 in delegates') + print('del1', del1['c10']) + print('del2', del2['c10']) + + +if __name__ == "__main__": + import sys + main(*sys.argv[1:]) From b795a862fd62077b1a1c498ddeb073d281dafc46 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 09:50:32 +0100 Subject: [PATCH 06/47] CU-346mpwz Add name2 to memory optimiser for CDB --- medcat/utils/memory_optimiser.py | 35 ++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 8e3f0cbd5..49417d0e8 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -54,6 +54,16 @@ def __len__(self) -> int: return len(list(self.keys())) +def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> None: + dicts = [getattr(cdb, dict_name) + for dict_name in dict_names_to_combine] + one2many, delegators = map_to_many(dicts) + for delegator, name in zip(delegators, dict_names_to_combine): + setattr(cdb, name, delegator) + setattr(cdb, to_many_name, one2many) + cdb.is_dirty = True + + def perform_optimisation(cdb: CDB) -> None: """Attempts to optimise the memory footprint of the CDB. @@ -77,22 +87,31 @@ def perform_optimisation(cdb: CDB) -> None: cui2average_confidence (Dict[str, str]): Used for dynamic thresholding. 
Holds the average confidence for this CUI given the training examples. + name2cuis (Dict[str, List[str]]): + Map fro concept name to CUIs - one name can map to multiple CUIs. + name2cuis2status (Dict[str, Dict[str, str]]): + What is the status for a given name and cui pair - each name can be: + P - Preferred, A - Automatic (e.g. let medcat decide), N - Not common. + name2count_train (Dict[str, str]): + Counts how often did a name appear during training. + They will all be included in 1 dict with CUI keys and a list of values for each pre-existing dict. Args: cdb (CDB): The CDB to modify. """ - dict_names_to_combine = [ + # cui2<...> -> cui2many + cui_dict_names_to_combine = [ "cui2names", "cui2snames", "cui2context_vectors", "cui2count_train", "cui2tags", "cui2type_ids", - "cui2preferred_name", "cui2average_confidence" + "cui2preferred_name", "cui2average_confidence", ] - dicts = [getattr(cdb, dict_name) for dict_name in dict_names_to_combine] - cui2many, delegators = map_to_many(dicts) - for delegator, name in zip(delegators, dict_names_to_combine): - setattr(cdb, name, delegator) - cdb.cui2many = cui2many - cdb.is_dirty = True + _optimise(cdb, 'cui2many', cui_dict_names_to_combine) + # name2<...> -> name2many + name_dict_names_to_combine = [ + "cui2names", "name2cuis2status", "cui2preferred_name", + ] + _optimise(cdb, 'name2many', name_dict_names_to_combine) def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: From 9a76a27e390c69a6fdc0b39d3c9ba4793e223d44 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 10:23:09 +0100 Subject: [PATCH 07/47] CU-346mpwz Add keys/items/values views to memory optimiser fake dicts --- medcat/utils/memory_optimiser.py | 49 ++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 49417d0e8..a6ba1b352 100644 --- a/medcat/utils/memory_optimiser.py +++ 
b/medcat/utils/memory_optimiser.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Iterator, List, Tuple +from typing import Any, Dict, KeysView, ValuesView, ItemsView, Iterator, List, Tuple from medcat.cdb import CDB @@ -35,23 +35,54 @@ def __contains__(self, key: str) -> bool: return key in self.delegate and self.delegate[key][self.nr] is not None def keys(self) -> Iterator[str]: - for key in self.delegate.keys(): - if key in self: - yield key + return self.KeysView(self.delegate.keys(), self) def items(self) -> Iterator[Tuple[str, Any]]: - for key in self: - yield key, self[key] + return self.ItemsView(self) def values(self) -> Iterator[Any]: - for key in self: - yield self[key] + return self.ValuesView(self) def __iter__(self) -> Iterator[str]: yield from self.keys() def __len__(self) -> int: - return len(list(self.keys())) + return len(self.keys()) + + class KeysView: + def __init__(self, keys: KeysView, parent: 'DelegatingDict'): + self._keys = keys + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._keys: + if key in self._parent: + yield key + + def __len__(self) -> int: + return len([_ for _ in self]) + + class ItemsView: + def __init__(self, parent: 'DelegatingDict') -> None: + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._parent: + yield key, self._parent[key] + + def __len__(self) -> int: + return len(self._parent) + + class ValuesView: + def __init__(self, parent: 'DelegatingDict') -> None: + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._parent: + yield self._parent[key] + + def __len__(self) -> int: + return len(self._parent) def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> None: From 35a98582e8a539f3ddcef2777b267c9c484ec789 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 10:27:24 +0100 Subject: [PATCH 08/47] CU-346mpwz Fix keys/items/values views in memory optimiser fake dicts --- 
medcat/utils/memory_optimiser.py | 85 +++++++++++++++++--------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index a6ba1b352..3577ca636 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -3,6 +3,44 @@ from medcat.cdb import CDB +class _KeysView: + def __init__(self, keys: KeysView, parent: 'DelegatingDict'): + self._keys = keys + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._keys: + if key in self._parent: + yield key + + def __len__(self) -> int: + return len([_ for _ in self]) + + +class _ItemsView: + def __init__(self, parent: 'DelegatingDict') -> None: + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._parent: + yield key, self._parent[key] + + def __len__(self) -> int: + return len(self._parent) + + +class _ValuesView: + def __init__(self, parent: 'DelegatingDict') -> None: + self._parent = parent + + def __iter__(self) -> Iterator[Any]: + for key in self._parent: + yield self._parent[key] + + def __len__(self) -> int: + return len(self._parent) + + class DelegatingDict: def __init__(self, delegate: Dict[str, List[Any]], nr: int, @@ -34,14 +72,14 @@ def __setitem__(self, key: str, value: Any) -> None: def __contains__(self, key: str) -> bool: return key in self.delegate and self.delegate[key][self.nr] is not None - def keys(self) -> Iterator[str]: - return self.KeysView(self.delegate.keys(), self) + def keys(self) -> _KeysView: + return _KeysView(self.delegate.keys(), self) - def items(self) -> Iterator[Tuple[str, Any]]: - return self.ItemsView(self) + def items(self) -> _ItemsView: + return _ItemsView(self) - def values(self) -> Iterator[Any]: - return self.ValuesView(self) + def values(self) -> _ValuesView: + return _ValuesView(self) def __iter__(self) -> Iterator[str]: yield from self.keys() @@ -49,41 +87,6 @@ def __iter__(self) -> Iterator[str]: def 
__len__(self) -> int: return len(self.keys()) - class KeysView: - def __init__(self, keys: KeysView, parent: 'DelegatingDict'): - self._keys = keys - self._parent = parent - - def __iter__(self) -> Iterator[Any]: - for key in self._keys: - if key in self._parent: - yield key - - def __len__(self) -> int: - return len([_ for _ in self]) - - class ItemsView: - def __init__(self, parent: 'DelegatingDict') -> None: - self._parent = parent - - def __iter__(self) -> Iterator[Any]: - for key in self._parent: - yield key, self._parent[key] - - def __len__(self) -> int: - return len(self._parent) - - class ValuesView: - def __init__(self, parent: 'DelegatingDict') -> None: - self._parent = parent - - def __iter__(self) -> Iterator[Any]: - for key in self._parent: - yield self._parent[key] - - def __len__(self) -> int: - return len(self._parent) - def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> None: dicts = [getattr(cdb, dict_name) From acca90be139b88470eb241e500736bdf7c9db5e1 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 11:29:20 +0100 Subject: [PATCH 09/47] CU-346mpwz Add option to optimise or not cui and/or name based dicts in memory optimiser --- medcat/utils/memory_optimiser.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 3577ca636..177182089 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -98,7 +98,8 @@ def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> cdb.is_dirty = True -def perform_optimisation(cdb: CDB) -> None: +def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, + optimise_names: bool = True) -> None: """Attempts to optimise the memory footprint of the CDB. Does so by unifying the following dicts: @@ -133,19 +134,23 @@ def perform_optimisation(cdb: CDB) -> None: Args: cdb (CDB): The CDB to modify. 
+ optimise_cuis (bool, optional): Whether to optimise cui2<...> dicts. Defaults to True. + optimise_names (bool, optional): Whether to optimise name2<...> dicts. Defaults to True. """ # cui2<...> -> cui2many - cui_dict_names_to_combine = [ - "cui2names", "cui2snames", "cui2context_vectors", - "cui2count_train", "cui2tags", "cui2type_ids", - "cui2preferred_name", "cui2average_confidence", - ] - _optimise(cdb, 'cui2many', cui_dict_names_to_combine) + if optimise_cuis: + cui_dict_names_to_combine = [ + "cui2names", "cui2snames", "cui2context_vectors", + "cui2count_train", "cui2tags", "cui2type_ids", + "cui2preferred_name", "cui2average_confidence", + ] + _optimise(cdb, 'cui2many', cui_dict_names_to_combine) # name2<...> -> name2many - name_dict_names_to_combine = [ - "cui2names", "name2cuis2status", "cui2preferred_name", - ] - _optimise(cdb, 'name2many', name_dict_names_to_combine) + if optimise_names: + name_dict_names_to_combine = [ + "cui2names", "name2cuis2status", "cui2preferred_name", + ] + _optimise(cdb, 'name2many', name_dict_names_to_combine) def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: From 8635c19c4cb455e6e34444307deaf0bf3a80d124 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 11:33:00 +0100 Subject: [PATCH 10/47] CU-346mpwz Make default memory optimiser omit name2... 
optimising; add comment regarding this in docstring --- medcat/utils/memory_optimiser.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 177182089..df64e2baa 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -99,9 +99,14 @@ def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, - optimise_names: bool = True) -> None: + optimise_names: bool = False) -> None: """Attempts to optimise the memory footprint of the CDB. + This can perform optimisation for cui2<...> and name2<...> dicts. + However, by default, only cui2many optimisation will be done. + This is because at the time of writing, there were not enough name2<...> + dicts to be able to benefit from the optimisation. + Does so by unifying the following dicts: cui2names (Dict[str, Set[str]]): @@ -135,7 +140,7 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, Args: cdb (CDB): The CDB to modify. optimise_cuis (bool, optional): Whether to optimise cui2<...> dicts. Defaults to True. - optimise_names (bool, optional): Whether to optimise name2<...> dicts. Defaults to True. + optimise_names (bool, optional): Whether to optimise name2<...> dicts. Defaults to False. 
""" # cui2<...> -> cui2many if optimise_cuis: From 48bee48710ff7f5d8e353c75281f5376952a298d Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 11:33:55 +0100 Subject: [PATCH 11/47] CU-346mpwz Remove unused/legacy code from memory optimiser --- medcat/utils/memory_optimiser.py | 54 -------------------------------- 1 file changed, 54 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index df64e2baa..356d9ca94 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -170,57 +170,3 @@ def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List one2many[key][nr] = value delegators.append(delegator) return one2many, delegators - - -# TODO - remove anything below -def main(file_name: str): - import dill - d1 = {'c1': ['n11', 'n12'], - 'c2': ['n21', 'n22']} - d2 = {'c1': 'n11', - 'c2': 'n22'} - one2many, (delegate1, delegate2) = map_to_many([d1, d2]) - print('DEL1, DEL2', delegate1, delegate2) - print('O2M ', one2many) - print('DEL1', delegate1.delegate) - print('DEL2', delegate2.delegate) - print('COMP1', delegate1.delegate is one2many) - print('COMP2', delegate2.delegate is one2many) - print('COMP3', delegate1.delegate is delegate2.delegate) - to_save = {'one2many': one2many, - 'delegate1': delegate1, - 'delegate2': delegate2} - print('SAVING to', file_name) - with open(file_name, 'wb') as f: - dill.dump(to_save, f) - print('Done saving, now LOADING') - with open(file_name, 'rb') as f: - data = dill.load(f) - print('GOT/loaded', data) - print('FOR each key') - o2m = data['one2many'] - del1, del2 = data['delegate1'], data['delegate2'] - print('DEL1, DEL2', del1.delegate, del2.delegate) - print('COMP1', del1.delegate is o2m) - print('COMP2', del2.delegate is o2m) - print('COMP3', del1.delegate is del2.delegate) - print('KEYS', list(one2many)) - for key in one2many: - print('KEY', key) - print(one2many[key], '\nvs\n', o2m[key]) - print('Through delegates') - 
print('DELEGATE1') - print(delegate1[key], 'vs', del1[key]) - print('DELEGATE2') - print(delegate2[key], 'vs', del2[key]) - # changing o2m should change del1 and/or del2 as well - o2m['c10'] = [['f11', 'f50'], 'f50'] - print('o2m', o2m) - print('And for c10 in delegates') - print('del1', del1['c10']) - print('del2', del2['c10']) - - -if __name__ == "__main__": - import sys - main(*sys.argv[1:]) From 5a39b2a41b3cab177e3a470b8a2be68c81d608ae Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 11:34:49 +0100 Subject: [PATCH 12/47] CU-346mpwz Add tests for memory optimiser --- tests/utils/test_memory_optimiser.py | 68 ++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 tests/utils/test_memory_optimiser.py diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py new file mode 100644 index 000000000..615eee598 --- /dev/null +++ b/tests/utils/test_memory_optimiser.py @@ -0,0 +1,68 @@ +from medcat.utils import memory_optimiser + +import unittest + + +class DelegatingDictTests(unittest.TestCase): + _dict = {'c1': [None, 0], 'c2': [1, None]} + + def setUp(self) -> None: + self.del_dict1 = memory_optimiser.DelegatingDict(self._dict, 0, 2) + self.del_dict2 = memory_optimiser.DelegatingDict(self._dict, 1, 2) + self.delegators = [self.del_dict1, self.del_dict2] + self.names = ['delegator 1', 'delegator 2'] + self.expected_lens = [len( + [v[nr] for v in self._dict.values() if v[nr] is not None] + ) for nr in range(len(self._dict[list(self._dict.keys())[0]]))] + + def test_delegating_dict_has_correct_keys(self): + for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): + with self.subTest(name): + self.assertEqual(len(delegator.keys()), exp_len) + + def test_delegating_dict_has_same_number_of_keys_and_values(self): + for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): + with self.subTest(name): + self.assertEqual(len(delegator.keys()), exp_len) + 
self.assertEqual(len(delegator.values()), exp_len) + + def test_delegating_dict_has_same_number_of_items_and_iter_values(self): + for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): + with self.subTest(name): + self.assertEqual(len(delegator.items()), exp_len) + # __iter__ -> list -> len + self.assertEqual(len(list(delegator)), exp_len) + + def test_delegator_do_not_have_None_values(self): + for delegator, name in zip(self.delegators, self.names): + for key, val in delegator.items(): + with self.subTest(f"{name}: {key}"): + self.assertIsNotNone(val) + + def test_delegator_keys_in_original(self): + for delegator, name in zip(self.delegators, self.names): + for key in delegator.keys(): + with self.subTest(f"{name}: {key}"): + self.assertIn(key, self._dict) + + def test_delegator_keys_in_container(self): + for delegator, name in zip(self.delegators, self.names): + for key in delegator.keys(): + with self.subTest(f"{name}: {key}"): + self.assertIn(key, delegator) + + def test_delegator_get_gets_key(self, def_value='#DEFAULT#'): + for delegator, name in zip(self.delegators, self.names): + for key in delegator.keys(): + with self.subTest(f"{name}: {key}"): + val = delegator.get(key, def_value) + self.assertIsNot(val, def_value) + + def test_delegator_get_defaults_non_existant_key(self, def_value='#DEFAULT#'): + for delegator, name in zip(self.delegators, self.names): + for key in self._dict.keys(): + if key in delegator: + continue + with self.subTest(f"{name}: {key}"): + val = delegator.get(key, def_value) + self.assertIs(val, def_value) From 9999d73a30c68952bcd2e195681bcb832ffe2410 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 14:27:16 +0100 Subject: [PATCH 13/47] CU-346mpwz Add tests memory optimised CDB --- tests/utils/test_memory_optimiser.py | 62 ++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index 
615eee598..b47642d0c 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -1,6 +1,12 @@ from medcat.utils import memory_optimiser import unittest +import tempfile +import os +import shutil +from medcat.cat import CAT +from medcat.cdb import CDB +from medcat.vocab import Vocab class DelegatingDictTests(unittest.TestCase): @@ -66,3 +72,59 @@ def test_delegator_get_defaults_non_existant_key(self, def_value='#DEFAULT#'): with self.subTest(f"{name}: {key}"): val = delegator.get(key, def_value) self.assertIs(val, def_value) + + +class OperationalTests(unittest.TestCase): + temp_folder = tempfile.TemporaryDirectory() + temp_cdb_path = os.path.join(temp_folder.name, 'cat.cdb') + # importing here so it's in the local namespace + # otherwise, all of its parts would get run again + from tests.test_cat import CATTests + test_callable_with_single_text = CATTests.test_callable_with_single_text + test_callable_with_single_empty_text = CATTests.test_callable_with_single_empty_text + test_callable_with_single_none_text = CATTests.test_callable_with_single_none_text + test_get_entities = CATTests.test_get_entities + test_get_entities_including_text = CATTests.test_get_entities_including_text + test_get_entities_multi_texts = CATTests.test_get_entities_multi_texts + test_get_entities_multi_texts_including_text = CATTests.test_get_entities_multi_texts_including_text + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + memory_optimiser.perform_optimisation(cls.cdb) + cls.vocab = Vocab.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "vocab.dat")) + cls.cdb.config.general.spacy_model = "en_core_web_md" + cls.cdb.config.ner.min_name_len = 2 + cls.cdb.config.ner.upper_case_limit_len = 3 + cls.cdb.config.general.spell_check = True + cls.cdb.config.linking.train_count_threshold = 10 + 
cls.cdb.config.linking.similarity_threshold = 0.3 + cls.cdb.config.linking.train = True + cls.cdb.config.linking.disamb_length_limit = 5 + cls.cdb.config.general.full_unlink = True + cls.meta_cat_dir = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "tmp") + cls.undertest = CAT(cdb=cls.cdb, config=cls.cdb.config, + vocab=cls.vocab, meta_cats=[]) + cls._linkng_filters = cls.undertest.config.linking.filters.copy_of() + + # # add tests from CAT tests + + @classmethod + def tearDownClass(cls) -> None: + cls.temp_folder.cleanup() + cls.undertest.destroy_pipe() + if os.path.exists(cls.meta_cat_dir): + shutil.rmtree(cls.meta_cat_dir) + + def tearDown(self) -> None: + self.cdb.config.annotation_output.include_text_in_output = False + # need to make sure linking filters are not retained beyond a test scope + self.undertest.config.linking.filters = self._linkng_filters.copy_of() + + def test_cdb_has_one2many(self, one2many_name='cui2many'): + self.assertTrue(hasattr(self.cdb, one2many_name)) + one2many = getattr(self.cdb, one2many_name) + self.assertIsInstance(one2many, dict) From 0bbeb2feb997d7f53fa2a089c02a7ce70712534a Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 14:30:23 +0100 Subject: [PATCH 14/47] CU-346mpwz Make dict names available within memory optimiser --- medcat/utils/memory_optimiser.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 356d9ca94..5d9d8010e 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -3,6 +3,17 @@ from medcat.cdb import CDB +CUI_DICT_NAMES_TO_COMBINE = [ + "cui2names", "cui2snames", "cui2context_vectors", + "cui2count_train", "cui2tags", "cui2type_ids", + "cui2preferred_name", "cui2average_confidence", +] + +NAME_DICT_NAMES_TO_COMBINE = [ + "cui2names", "name2cuis2status", "cui2preferred_name", +] + + class _KeysView: def __init__(self, keys: KeysView, parent: 
'DelegatingDict'): self._keys = keys @@ -144,18 +155,10 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, """ # cui2<...> -> cui2many if optimise_cuis: - cui_dict_names_to_combine = [ - "cui2names", "cui2snames", "cui2context_vectors", - "cui2count_train", "cui2tags", "cui2type_ids", - "cui2preferred_name", "cui2average_confidence", - ] - _optimise(cdb, 'cui2many', cui_dict_names_to_combine) + _optimise(cdb, 'cui2many', CUI_DICT_NAMES_TO_COMBINE) # name2<...> -> name2many if optimise_names: - name_dict_names_to_combine = [ - "cui2names", "name2cuis2status", "cui2preferred_name", - ] - _optimise(cdb, 'name2many', name_dict_names_to_combine) + _optimise(cdb, 'name2many', NAME_DICT_NAMES_TO_COMBINE) def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: From df98418750477803f88d993fc3d693112a5cb19e Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 14:32:49 +0100 Subject: [PATCH 15/47] CU-346mpwz Add separate tests for memory optimised CDB --- tests/utils/test_memory_optimiser.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index b47642d0c..7441778c8 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -74,6 +74,26 @@ def test_delegator_get_defaults_non_existant_key(self, def_value='#DEFAULT#'): self.assertIs(val, def_value) +class MemoryOptimisingTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + memory_optimiser.perform_optimisation(cls.cdb) + + def test_cdb_has_one2many(self, one2many_name='cui2many'): + self.assertTrue(hasattr(self.cdb, one2many_name)) + one2many = getattr(self.cdb, one2many_name) + self.assertIsInstance(one2many, dict) + + def test_cdb_has_delegating_dicts(self): + for 
dict_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE: + with self.subTest(dict_name): + d = getattr(self.cdb, dict_name) + self.assertIsInstance(d, memory_optimiser.DelegatingDict) + + class OperationalTests(unittest.TestCase): temp_folder = tempfile.TemporaryDirectory() temp_cdb_path = os.path.join(temp_folder.name, 'cat.cdb') @@ -123,8 +143,3 @@ def tearDown(self) -> None: self.cdb.config.annotation_output.include_text_in_output = False # need to make sure linking filters are not retained beyond a test scope self.undertest.config.linking.filters = self._linkng_filters.copy_of() - - def test_cdb_has_one2many(self, one2many_name='cui2many'): - self.assertTrue(hasattr(self.cdb, one2many_name)) - one2many = getattr(self.cdb, one2many_name) - self.assertIsInstance(one2many, dict) From f5df9647350b742c21a1d291ceef505673bf3665 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 5 Jun 2023 16:23:22 +0100 Subject: [PATCH 16/47] CU-346mpwz Remove unused imports in memory optimiser --- medcat/utils/memory_optimiser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 5d9d8010e..7b7f623a6 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, KeysView, ValuesView, ItemsView, Iterator, List, Tuple +from typing import Any, Dict, KeysView, Iterator, List, Tuple from medcat.cdb import CDB From f2f0b35644ed3fda5a61783c470a26f70fb4b7d9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 16:49:43 +0100 Subject: [PATCH 17/47] CU-346mpwz Move some encoding and decoding stuff within serialisation to their own module --- medcat/utils/saving/coding.py | 157 ++++++++++++++++++++++++++++++ medcat/utils/saving/serializer.py | 50 ++++------ 2 files changed, 175 insertions(+), 32 deletions(-) create mode 100644 medcat/utils/saving/coding.py diff --git a/medcat/utils/saving/coding.py b/medcat/utils/saving/coding.py new
file mode 100644 index 000000000..ca8088669 --- /dev/null +++ b/medcat/utils/saving/coding.py @@ -0,0 +1,157 @@ +from typing import Any, Protocol, runtime_checkable, List, Union, Type, Optional, Callable + +import json + + +@runtime_checkable +class EncodeableObject(Protocol): + + def to_dict(self) -> dict: + """Converts the object to a dict. + + Returns: + dict: The dict to be serialised. + """ + + +class UnsuitableObject(ValueError): + pass + + +class PartEncoder(Protocol): + + def try_encode(self, obj: object) -> Any: + """Try to encode an object + + Args: + obj (object): The object to encode + + Raises: + UnsuitableObject: If the object is unsuitable for encoding. + + Returns: + Any: The encoded object + """ + + +SET_IDENTIFIER = '==SET==' + + +class SetEncoder(PartEncoder): + """JSONEncoder (and decoder) for sets. + + Generally, JSON doesn't support serializing of sets natively. + This encoder adds a set identifier to the data when being serialized + and provides a method to read said identifier upon decoding.""" + + def try_encode(self, obj): + if isinstance(obj, set): + return {SET_IDENTIFIER: list(obj)} + raise UnsuitableObject() + + +class PartDecoder(Protocol): + + def try_decode(dct: dict) -> Union[dict, Any]: + """Try to decode the dictionary. + + Args: + dct (dict): The dict to decode. + + Returns: + Union[dict, Any]: The dict if unable to decode, the decoded object otherwise + """ + + +class SetDecoder(PartDecoder): + + def try_decode(self, dct: dict) -> Union[dict, set]: + """Decode sets from input dicts. 
+ + Args: + dct (dict): The input dict + + Returns: + Union[dict, set]: The original dict if this was not a serialized set, the set otherwise + """ + if SET_IDENTIFIER in dct: + print('SET decoder') + return set(dct[SET_IDENTIFIER]) + return dct + + +PostProcessor = Callable[[Any], None] # CDB -> None + +DEFAULT_ENCODERS: List[Type[PartEncoder]] = [SetEncoder, ] +DEFAULT_DECODERS: List[Type[PartDecoder]] = [SetDecoder, ] +LOADING_POSTPROCESSORS: List[PostProcessor] = [] + + +def register_encoder_decoder(encoder: Optional[Type[PartEncoder]], + decoder: Optional[Type[PartDecoder]], + loading_postprocessor: Optional[PostProcessor]): + if encoder: + print('Registering encoder', encoder) + DEFAULT_ENCODERS.append(encoder) + if decoder: + print('Registering decoder', decoder) + DEFAULT_DECODERS.append(decoder) + if loading_postprocessor: + print('Registering postprocessor', loading_postprocessor) + LOADING_POSTPROCESSORS.append(loading_postprocessor) + + +class CustomDelegatingEncoder(json.JSONEncoder): + + def __init__(self, delegates: List[PartEncoder], *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self._delegates = delegates + + def default(self, obj): + for delegator in self._delegates: + try: + return delegator.try_encode(obj) + except UnsuitableObject: + pass + return json.JSONEncoder.default(self, obj) + + @classmethod + def def_inst(cls, *args, **kwargs) -> 'CustomDelegatingDecoder': + print('def inst of', cls, ':', len(DEFAULT_ENCODERS)) + return cls([_cls() for _cls in DEFAULT_ENCODERS], *args, **kwargs) + + +class CustomDelegatingDecoder(json.JSONDecoder): + _def_inst: Optional['CustomDelegatingDecoder'] = None + + def __init__(self, delegates: List[PartDecoder]) -> None: + self._delegates = delegates + + def object_hook(self, dct: dict) -> Any: + for delegator in self._delegates: + ret_val = delegator.try_decode(dct) + if ret_val is not dct: + print('DECODER result', type(ret_val), + # (len(ret_val) + # if hasattr(ret_val, '__len__') else 
ret_val) + ) + return ret_val + return dct + + @classmethod + def def_inst(cls) -> 'CustomDelegatingDecoder': + if cls._def_inst is None: + cls._def_inst = cls([_cls() for _cls in DEFAULT_DECODERS]) + print('def inst of', type(cls._def_inst), ':', len(DEFAULT_DECODERS)) + return cls._def_inst + + +def default_hook(dct: dict) -> Any: + cdd = CustomDelegatingDecoder.def_inst() + return cdd.object_hook(dct) + + +def default_postprocessing(cdb) -> None: + for pp in LOADING_POSTPROCESSORS: + print('DOING POSTPROCESSING w', pp) + pp(cdb) diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py index c08124831..9813e77af 100644 --- a/medcat/utils/saving/serializer.py +++ b/medcat/utils/saving/serializer.py @@ -5,11 +5,13 @@ """ import os import logging -from typing import cast, Dict, Optional, Union +from typing import cast, Dict, Optional import dill import json from medcat.config import Config +from medcat.utils.saving.coding import CustomDelegatingEncoder, default_hook, default_postprocessing + logger = logging.getLogger(__name__) @@ -17,35 +19,8 @@ __SPECIALITY_NAMES_NAME = set( ["name2cuis", "name2cuis2status", "name_isupper"]) __SPECIALITY_NAMES_OTHER = set(["snames", "addl_info"]) -SPECIALITY_NAMES = __SPECIALITY_NAMES_CUI | __SPECIALITY_NAMES_NAME | __SPECIALITY_NAMES_OTHER - - -class SetEncode(json.JSONEncoder): - """JSONEncoder (and decoder) for sets. - - Generally, JSON doesn't support serializing of sets natively. - This encoder adds a set identifier to the data when being serialized - and provides a method to read said identifier upon decoding.""" - SET_IDENTIFIER = '==SET==' - - def default(self, obj): - if isinstance(obj, set): - return {SetEncode.SET_IDENTIFIER: list(obj)} - return json.JSONEncoder.default(self, obj) - - @staticmethod - def set_decode(dct: dict) -> Union[dict, set]: - """Decode sets from input dicts. 
- - Args: - dct (dict): The input dict - - Returns: - Union[dict, set]: The original dict if this was not a serialized set, the set otherwise - """ - if SetEncode.SET_IDENTIFIER in dct: - return set(dct[SetEncode.SET_IDENTIFIER]) - return dct +ONE2MANY = set(['cui2many', 'name2many']) # these may or may not exist +SPECIALITY_NAMES = __SPECIALITY_NAMES_CUI | __SPECIALITY_NAMES_NAME | __SPECIALITY_NAMES_OTHER | ONE2MANY class JsonSetSerializer: @@ -75,7 +50,7 @@ def write(self, d: dict) -> None: logger.info('Writing data for "%s" into "%s"', self.name, self.file_name) with open(self.file_name, 'w') as f: - json.dump(d, f, cls=SetEncode) + json.dump(d, f, cls=CustomDelegatingEncoder.def_inst) def read(self) -> dict: """Read the json file specified by this serializer. @@ -85,7 +60,8 @@ def read(self) -> dict: """ logger.info('Reading data for %s from %s', self.name, self.file_name) with open(self.file_name, 'r') as f: - data = json.load(f, object_hook=SetEncode.set_decode) + data = json.load( + f, object_hook=default_hook) return data @@ -168,6 +144,8 @@ def serialize(self, cdb, overwrite: bool = False) -> None: dill.dump(to_save, f) if self.jsons is not None: for name in SPECIALITY_NAMES: + if name not in cdb.__dict__: + continue # in case cui2many doesn't exit self.jsons[name].write(cdb.__dict__[name]) def deserialize(self, cdb_cls): @@ -199,5 +177,13 @@ def deserialize(self, cdb_cls): # if applicable if self.jsons is not None: for name in SPECIALITY_NAMES: + if not os.path.exists(self.jsons[name].file_name): + continue # in case of non-memory-optimised where cui2many doesn't exist + _val = self.jsons[name].read() cdb.__dict__[name] = self.jsons[name].read() + # if anything has + # been registered to postprocess the CDBs + example_dict = 'cui2type_ids' + val = cdb.__dict__[example_dict] + default_postprocessing(cdb) return cdb From 7e4259b5173a061c7a3087be2ab4749aff678d20 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 16:50:20 +0100 Subject: [PATCH 
18/47] CU-346mpwz Add tests for encoding/decoding stuff --- tests/utils/saving/test_coding.py | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tests/utils/saving/test_coding.py diff --git a/tests/utils/saving/test_coding.py b/tests/utils/saving/test_coding.py new file mode 100644 index 000000000..c60a3b1f2 --- /dev/null +++ b/tests/utils/saving/test_coding.py @@ -0,0 +1,77 @@ +from medcat.utils.saving import coding + +import json + +import unittest + + +class SetEncodeTests(unittest.TestCase): + string2sets_dict1 = {'s1': set(['v1', 'v2', 'v3']), + 's2': set(['u1', 'u2', 'u3'])} + string2sets_dict2 = {'p1': set([1, 2, 3]), + 'p2': set([3, 4, 5])} + + def serialise(self, d: dict) -> str: + return json.dumps(d, cls=coding.CustomDelegatingEncoder.def_inst) + + def _helper_serialises(self, d: dict): + s = self.serialise(d) + self.assertIsInstance(s, str) + + def test_sets_of_strings_serialise(self): + self._helper_serialises(self.string2sets_dict1) + + def test_sets_of_ints_serialise(self): + self._helper_serialises(self.string2sets_dict2) + + def _helper_keys_in_json(self, d: dict): + s = self.serialise(d) + for k in d.keys(): + with self.subTest(k): + self.assertIn(str(k), s) + + def test_sos_keys_in_json(self): + self._helper_keys_in_json(self.string2sets_dict1) + + def test_soi_keys_in_json(self): + self._helper_keys_in_json(self.string2sets_dict2) + + def _helper_values_in_json(self, d: dict): + s = self.serialise(d) + for key, v in d.items(): + for nr, el in enumerate(v): + with self.subTest(f"Key: {key}; Element {nr}"): + self.assertIn(str(el), s) + + def test_sos_values_in_json(self): + self._helper_values_in_json(self.string2sets_dict1) + + def test_soi_values_in_json(self): + self._helper_values_in_json(self.string2sets_dict2) + + +class SetDecodeTests(unittest.TestCase): + + def deserialise(self, s: str) -> dict: + return json.loads(s, object_hook=coding.default_hook) + + def setUp(self) -> None: + self.encoder = 
SetEncodeTests() + self.encoded1 = self.encoder.serialise(self.encoder.string2sets_dict1) + self.encoded2 = self.encoder.serialise(self.encoder.string2sets_dict2) + + def test_sos_decodes(self): + d = self.deserialise(self.encoded1) + self.assertIsInstance(d, dict) + + def test_soi_decodes(self): + d = self.deserialise(self.encoded2) + self.assertIsInstance(d, dict) + + def test_sos_decodes_to_identical(self): + d = self.deserialise(self.encoded1) + self.assertEqual(d, self.encoder.string2sets_dict1) + + def test_soi_decodes_to_identical(self): + d = self.deserialise(self.encoded2) + self.assertEqual(d, self.encoder.string2sets_dict2) From c448b52fb06ef871c0f18ea872d71be982c96bca Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 16:52:14 +0100 Subject: [PATCH 19/47] CU-346mpwz Add encoding/decoding for delegating dict as well as postprocessing for delegation linking with json serialisation --- medcat/utils/memory_optimiser.py | 66 ++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 7b7f623a6..941dfa710 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -1,6 +1,7 @@ -from typing import Any, Dict, KeysView, Iterator, List, Tuple +from typing import Any, Dict, KeysView, Iterator, List, Tuple, Union from medcat.cdb import CDB +from medcat.utils.saving.coding import EncodeableObject, PartEncoder, PartDecoder, UnsuitableObject, register_encoder_decoder CUI_DICT_NAMES_TO_COMBINE = [ @@ -8,10 +9,14 @@ "cui2count_train", "cui2tags", "cui2type_ids", "cui2preferred_name", "cui2average_confidence", ] +ONE2MANY = 'cui2many' NAME_DICT_NAMES_TO_COMBINE = [ "cui2names", "name2cuis2status", "cui2preferred_name", ] +NAME2MANY = 'name2many' + +DELEGATING_DICT_IDENTIFIER = '==DELEGATING_DICT==' class _KeysView: @@ -98,6 +103,50 @@ def __iter__(self) -> Iterator[str]: def __len__(self) -> int: return len(self.keys()) + def 
to_dict(self) -> dict: + return {'delegate': None, + 'nr': self.nr, + 'nr_of_overall_items': self.nr_of_overall_items} + + def __eq__(self, __value: object) -> bool: + if not isinstance(__value, DelegatingDict): + return False + return self.delegate == __value.delegate and self.nr == __value.nr + + def __hash__(self) -> int: + return hash((self.delegate, self.nr)) + + +class DelegatingDictEncoder(PartEncoder): + + def try_encode(self, obj): + if isinstance(obj, DelegatingDict): + return {DELEGATING_DICT_IDENTIFIER: obj.to_dict()} + raise UnsuitableObject() + + +class DelegatingDictDecoder(PartDecoder): + + def try_decode(self, dct: dict) -> Union[dict, EncodeableObject]: + if DELEGATING_DICT_IDENTIFIER in dct: + info = dct[DELEGATING_DICT_IDENTIFIER] + delegate = info['delegate'] + nr = info['nr'] + overall = info['nr_of_overall_items'] + return DelegatingDict(delegate, nr, overall) + return dct + + +def attempt_fix_after_load(cdb: CDB): + _attempt_fix_after_load(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) + _attempt_fix_after_load(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + + +# register encoder and decoders +register_encoder_decoder(encoder=DelegatingDictEncoder, + decoder=DelegatingDictDecoder, + loading_postprocessor=attempt_fix_after_load) + def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> None: dicts = [getattr(cdb, dict_name) @@ -155,10 +204,21 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, """ # cui2<...> -> cui2many if optimise_cuis: - _optimise(cdb, 'cui2many', CUI_DICT_NAMES_TO_COMBINE) + _optimise(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) # name2<...> -> name2many if optimise_names: - _optimise(cdb, 'name2many', NAME_DICT_NAMES_TO_COMBINE) + _optimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + + +def _attempt_fix_after_load(cdb: CDB, one2many_name: str, dict_names: List[str]): + if not hasattr(cdb, one2many_name): + return + one2many = getattr(cdb, one2many_name) + for dict_name in dict_names: + d = 
getattr(cdb, dict_name) + if not isinstance(d, DelegatingDict): + raise ValueError(f'Unknown type for {dict_name}: {type(d)}') + d.delegate = one2many def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: From c191a40a73f134b8693eb0b83919edcc8f560884 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 16:52:54 +0100 Subject: [PATCH 20/47] CU-346mpwz Fix decision upon JSON deserialisation of CDB when loading model pack --- medcat/cat.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index cd6aa769d..852bb8048 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -40,7 +40,7 @@ from medcat.vocab import Vocab from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER -from medcat.utils.saving.serializer import SPECIALITY_NAMES +from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY logger = logging.getLogger(__name__) # separate logger from the package-level one @@ -353,7 +353,8 @@ def load_model_pack(cls, # Load the CDB cdb_path = os.path.join(model_pack_path, "cdb.dat") - has_jsons = len(glob.glob(os.path.join(model_pack_path, '*.json'))) >= len(SPECIALITY_NAMES) + nr_of_jsons_expected = len(SPECIALITY_NAMES) - len(ONE2MANY) + has_jsons = len(glob.glob(os.path.join(model_pack_path, '*.json'))) >= nr_of_jsons_expected json_path = model_pack_path if has_jsons else None logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format') cdb = CDB.load(cdb_path, json_path) From 7ad50adefe6704af36b7b1a63fc1707510ed5207 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 16:53:46 +0100 Subject: [PATCH 21/47] CU-346mpwz Adapt serialisation tests to the potential one2many mappings --- tests/utils/saving/test_serialization.py | 26 ++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/utils/saving/test_serialization.py 
b/tests/utils/saving/test_serialization.py index 6313906dc..d2b753772 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -9,7 +9,9 @@ from medcat.cat import CAT from medcat.vocab import Vocab -from medcat.utils.saving.serializer import JsonSetSerializer, CDBSerializer, SPECIALITY_NAMES +from medcat.utils.saving.serializer import JsonSetSerializer, CDBSerializer, SPECIALITY_NAMES, ONE2MANY + +import medcat.utils.saving.coding as _ class JSONSerialoizationTests(unittest.TestCase): @@ -42,6 +44,11 @@ def test_round_trip(self): self.ser.serialize(self.cdb, overwrite=True) cdb = self.ser.deserialize(CDB) for name in SPECIALITY_NAMES: + if name in ONE2MANY: + # ignore cui2many and name2many + # since they don't exist if/when + # optimisation hasn't been done + continue with self.subTest(name): orig = getattr(self.cdb, name) now = getattr(cdb, name) @@ -75,6 +82,7 @@ def setUp(self) -> None: self.dill_model_pack.name) def test_dill_to_json(self): + self._check_cdb(self.undertest.cdb, prefix='TDJ') model_pack_path = self.undertest.create_model_pack( self.json_model_pack.name, cdb_format='json') model_pack_folder = os.path.join( @@ -82,11 +90,19 @@ def test_dill_to_json(self): json_path = os.path.join(model_pack_folder, "*.json") jsons = glob.glob(json_path) # there is also a model_card.json - self.assertGreaterEqual(len(jsons), len(SPECIALITY_NAMES)) + # but nothing for cui2many or name2many + # so can remove the length of ONE2MANY + self.assertGreaterEqual(len(jsons), len( + SPECIALITY_NAMES) - len(ONE2MANY)) for json in jsons: with self.subTest(f'JSON {json}'): if json.endswith('model_card.json'): continue # ignore model card here + if any(name in json for name in ONE2MANY): + # ignore cui2many and name2many + # since they don't exist if/when + # optimisation hasn't been done + continue self.assertTrue( any(special_name in json for special_name in SPECIALITY_NAMES)) return model_pack_folder @@ -99,6 +115,7 @@ def 
test_load_json(self): def test_round_trip(self): folder = self.test_dill_to_json() # make sure the files exist cat = CAT.load_model_pack(folder) + self._check_cdb(cat.cdb, prefix='TRT') # The spacy model has full path in the loaded model, thus won't be equal cat.config.general.spacy_model = os.path.basename( cat.config.general.spacy_model) @@ -128,6 +145,11 @@ def test_round_trip(self): self.assertEqual(cat.vocab.unigram_table, self.undertest.vocab.unigram_table) for name in SPECIALITY_NAMES: + if name in ONE2MANY: + # ignore cui2many and name2many + # since they don't exist if/when + # optimisation hasn't been done + continue with self.subTest(f'CDB Name {name}'): self.assertEqual(cat.cdb.__dict__[ name], self.undertest.cdb.__dict__[name]) From b6d99e1c173342b68713678a2910c800fab0ccc4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 16:54:40 +0100 Subject: [PATCH 22/47] CU-346mpwz Add tests for memory optimisation, including JSON serialisation ones --- tests/utils/test_memory_optimiser.py | 87 ++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index 7441778c8..231aa50a2 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -4,9 +4,11 @@ import tempfile import os import shutil +import json from medcat.cat import CAT from medcat.cdb import CDB from medcat.vocab import Vocab +from medcat.utils.saving import coding class DelegatingDictTests(unittest.TestCase): @@ -74,6 +76,74 @@ def test_delegator_get_defaults_non_existant_key(self, def_value='#DEFAULT#'): self.assertIs(val, def_value) +class DelegatingDictJsonTests(unittest.TestCase): + _dict = {'c5': [None, 10], 'c6': [11, None]} + + def setUp(self) -> None: + self.del_dict1 = memory_optimiser.DelegatingDict(self._dict, 0, 2) + self.del_dict2 = memory_optimiser.DelegatingDict(self._dict, 1, 2) + self.delegators = [self.del_dict1, self.del_dict2] + self.master_dict = 
{'one2many': self._dict, + 'part1': self.del_dict1, + 'part2': self.del_dict2} + + def serialise_master(self) -> str: + return json.dumps(self.master_dict, + cls=coding.CustomDelegatingEncoder.def_inst) + + def deserialise(self, s: str, one2many_name='one2many') -> dict: + d = json.loads(s, object_hook=coding.default_hook) + one2many = d[one2many_name] + for key, value in d.items(): + if key == one2many_name: + continue + if value.delegate is None: + value.delegate = one2many + return d + + def test_dict_of_delegation_serialises(self): + s = self.serialise_master() + self.assertIsInstance(s, str) + + def test_dod_ser_has_keys(self): + s = self.serialise_master() + for key in self.master_dict: + with self.subTest(key): + self.assertIn(key, s) + + def test_dod_ser_one2many_has_sub_keys(self): + s = self.serialise_master() + for key in self.master_dict['one2many']: + with self.subTest(key): + self.assertIn(key, s) + + def test_round_trip(self): + s = self.serialise_master() + d = self.deserialise(s) + self.assertIsInstance(d, dict) + + def test_round_trip_equal(self): + s = self.serialise_master() + d = self.deserialise(s) + self.assertEqual(d, self.master_dict) + + +class UnOptimisingTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + + def test_unoptimised_cdb_does_not_have_cui2many(self): + self.assertFalse(hasattr(self.cdb, 'cui2many')) + + def test_unoptmised_cdb_does_not_have_delegating_dicts(self): + for key, val in self.cdb.__dict__.items(): + with self.subTest(key): + self.assertNotIsInstance(val, memory_optimiser.DelegatingDict) + + class MemoryOptimisingTests(unittest.TestCase): @classmethod @@ -97,6 +167,7 @@ def test_cdb_has_delegating_dicts(self): class OperationalTests(unittest.TestCase): temp_folder = tempfile.TemporaryDirectory() temp_cdb_path = os.path.join(temp_folder.name, 'cat.cdb') + json_path = 
temp_cdb_path.rsplit(os.path.sep, 1)[0] # importing here so it's in the local namespace # otherwise, all of its parts would get run again from tests.test_cat import CATTests @@ -143,3 +214,19 @@ def tearDown(self) -> None: self.cdb.config.annotation_output.include_text_in_output = False # need to make sure linking filters are not retained beyond a test scope self.undertest.config.linking.filters = self._linkng_filters.copy_of() + + def test_optimised_cdb_has_cui2many(self): + self.assertTrue(hasattr(self.cdb, 'cui2many')) + + def test_can_be_saved_as_json(self): + self.cdb.save(self.temp_cdb_path, json_path=self.json_path) + + def test_can_be_loaded_as_json(self): + self.test_can_be_saved_as_json() + cdb = CDB.load(self.temp_cdb_path, self.json_path) + self.assertEqual(self.cdb.cui2many, cdb.cui2many) + for del_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE: + d = getattr(cdb, del_name) + with self.subTest(del_name): + self.assertIsInstance(d, memory_optimiser.DelegatingDict) + self.assertIs(cdb.cui2many, d.delegate) From eb569d5548bc78ec0ecd44411c89ffe948021f25 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 16:56:12 +0100 Subject: [PATCH 23/47] CU-346mpwz Remove debug print statements --- medcat/utils/saving/coding.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/medcat/utils/saving/coding.py b/medcat/utils/saving/coding.py index ca8088669..8a8b33817 100644 --- a/medcat/utils/saving/coding.py +++ b/medcat/utils/saving/coding.py @@ -75,7 +75,6 @@ def try_decode(self, dct: dict) -> Union[dict, set]: Union[dict, set]: The original dict if this was not a serialized set, the set otherwise """ if SET_IDENTIFIER in dct: - print('SET decoder') return set(dct[SET_IDENTIFIER]) return dct @@ -91,13 +90,10 @@ def register_encoder_decoder(encoder: Optional[Type[PartEncoder]], decoder: Optional[Type[PartDecoder]], loading_postprocessor: Optional[PostProcessor]): if encoder: - print('Registering encoder', encoder) DEFAULT_ENCODERS.append(encoder) 
if decoder: - print('Registering decoder', decoder) DEFAULT_DECODERS.append(decoder) if loading_postprocessor: - print('Registering postprocessor', loading_postprocessor) LOADING_POSTPROCESSORS.append(loading_postprocessor) @@ -117,7 +113,6 @@ def default(self, obj): @classmethod def def_inst(cls, *args, **kwargs) -> 'CustomDelegatingDecoder': - print('def inst of', cls, ':', len(DEFAULT_ENCODERS)) return cls([_cls() for _cls in DEFAULT_ENCODERS], *args, **kwargs) @@ -131,10 +126,6 @@ def object_hook(self, dct: dict) -> Any: for delegator in self._delegates: ret_val = delegator.try_decode(dct) if ret_val is not dct: - print('DECODER result', type(ret_val), - # (len(ret_val) - # if hasattr(ret_val, '__len__') else ret_val) - ) return ret_val return dct @@ -142,7 +133,6 @@ def object_hook(self, dct: dict) -> Any: def def_inst(cls) -> 'CustomDelegatingDecoder': if cls._def_inst is None: cls._def_inst = cls([_cls() for _cls in DEFAULT_DECODERS]) - print('def inst of', type(cls._def_inst), ':', len(DEFAULT_DECODERS)) return cls._def_inst @@ -153,5 +143,4 @@ def default_hook(dct: dict) -> Any: def default_postprocessing(cdb) -> None: for pp in LOADING_POSTPROCESSORS: - print('DOING POSTPROCESSING w', pp) pp(cdb) From a2cfe73436146d21a9bccf2961d3a8b20ab3fdc4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 17:02:58 +0100 Subject: [PATCH 24/47] CU-346mpwz Remove debug methods from tests --- tests/utils/saving/test_serialization.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index d2b753772..9beb7f1b1 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -82,7 +82,6 @@ def setUp(self) -> None: self.dill_model_pack.name) def test_dill_to_json(self): - self._check_cdb(self.undertest.cdb, prefix='TDJ') model_pack_path = self.undertest.create_model_pack( self.json_model_pack.name, cdb_format='json') model_pack_folder = 
os.path.join( @@ -115,7 +114,6 @@ def test_load_json(self): def test_round_trip(self): folder = self.test_dill_to_json() # make sure the files exist cat = CAT.load_model_pack(folder) - self._check_cdb(cat.cdb, prefix='TRT') # The spacy model has full path in the loaded model, thus won't be equal cat.config.general.spacy_model = os.path.basename( cat.config.general.spacy_model) From bc7908270b8cfe4811b674f6c1b6e2488fd8e49a Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 17:03:38 +0100 Subject: [PATCH 25/47] CU-346mpwz Fix method signatures in encoding/decoding methods --- medcat/utils/saving/coding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/utils/saving/coding.py b/medcat/utils/saving/coding.py index 8a8b33817..c03e6816f 100644 --- a/medcat/utils/saving/coding.py +++ b/medcat/utils/saving/coding.py @@ -52,7 +52,7 @@ def try_encode(self, obj): class PartDecoder(Protocol): - def try_decode(dct: dict) -> Union[dict, Any]: + def try_decode(self, dct: dict) -> Union[dict, Any]: """Try to decode the dictionary. 
Args: @@ -112,7 +112,7 @@ def default(self, obj): return json.JSONEncoder.default(self, obj) @classmethod - def def_inst(cls, *args, **kwargs) -> 'CustomDelegatingDecoder': + def def_inst(cls, *args, **kwargs) -> 'CustomDelegatingEncoder': return cls([_cls() for _cls in DEFAULT_ENCODERS], *args, **kwargs) From 3b7c44f277dc27218de3c853e0bc3e24d443c562 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 17:09:10 +0100 Subject: [PATCH 26/47] CU-346mpwz Fix typing issue in serialiser when passing encoder --- medcat/utils/saving/serializer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py index 9813e77af..cf91bdde4 100644 --- a/medcat/utils/saving/serializer.py +++ b/medcat/utils/saving/serializer.py @@ -5,7 +5,7 @@ """ import os import logging -from typing import cast, Dict, Optional +from typing import cast, Dict, Optional, Type import dill import json @@ -50,7 +50,11 @@ def write(self, d: dict) -> None: logger.info('Writing data for "%s" into "%s"', self.name, self.file_name) with open(self.file_name, 'w') as f: - json.dump(d, f, cls=CustomDelegatingEncoder.def_inst) + # the def_inst method, when called, + # returns the right type of object anyway + + json.dump(d, f, cls=cast(Type[json.JSONEncoder], + CustomDelegatingEncoder.def_inst)) def read(self) -> dict: """Read the json file specified by this serializer. 
From 48e0dac1359bb367e8ac545d4815668671a7e50a Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 17:09:49 +0100 Subject: [PATCH 27/47] CU-346mpwz Relax typing restrictions for umls preprocessing / parent2child mapping --- medcat/utils/preprocess_umls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index 544813cfa..78fd8fc9f 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -239,7 +239,7 @@ def get_pt2ch(self) -> dict: cui_parent = cui_parent[cui_parent['PAUI'].notna()] # create dict - pt2ch: dict[str, set[str]] = {} + pt2ch: dict = {} for _, row in tqdm.tqdm(cui_parent.iterrows(), total=len(cui_parent.index)): cur_cui = row['CUI'] paui = row['PAUI'] From 05638be8af1fce674b9e2b069bf79661e1b5ac84 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 17:24:05 +0100 Subject: [PATCH 28/47] CU-346mpwz Remove some debug variables --- medcat/utils/saving/serializer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py index cf91bdde4..d82df751c 100644 --- a/medcat/utils/saving/serializer.py +++ b/medcat/utils/saving/serializer.py @@ -183,11 +183,8 @@ def deserialize(self, cdb_cls): for name in SPECIALITY_NAMES: if not os.path.exists(self.jsons[name].file_name): continue # in case of non-memory-optimised where cui2many doesn't exist - _val = self.jsons[name].read() cdb.__dict__[name] = self.jsons[name].read() # if anything has - # been registered to postprocess the CDBs - example_dict = 'cui2type_ids' - val = cdb.__dict__[example_dict] + # been registered to postprocess the CDBs default_postprocessing(cdb) return cdb From 82c1f54b91fd4a3c23e16371c417f949eedcb2f6 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 6 Jun 2023 17:38:58 +0100 Subject: [PATCH 29/47] CU-346mpwz Fix remnant merge conflict --- medcat/utils/preprocess_umls.py | 5 +---- 1 file 
changed, 1 insertion(+), 4 deletions(-) diff --git a/medcat/utils/preprocess_umls.py b/medcat/utils/preprocess_umls.py index 5cc6a2be7..7c47f451a 100644 --- a/medcat/utils/preprocess_umls.py +++ b/medcat/utils/preprocess_umls.py @@ -3,10 +3,7 @@ import pandas as pd import tqdm import os -<<<<<<< HEAD -======= -from typing import Dict, Set ->>>>>>> 564d15caa6d328733f8da8b56b4dfebc2982bc4d +from typing import Dict _DEFAULT_COLUMNS: list = [ "CUI", From f6af4a0cb607f870a6a5c6e6e2d015619746d4b8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 7 Jun 2023 09:19:43 +0100 Subject: [PATCH 30/47] CU-346mpwz Add item removal and popping to delegating dict --- medcat/utils/memory_optimiser.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 941dfa710..ccc5cd341 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, KeysView, Iterator, List, Tuple, Union +from typing import Any, Dict, KeysView, Iterator, List, Tuple, Union, Optional from medcat.cdb import CDB from medcat.utils.saving.coding import EncodeableObject, PartEncoder, PartDecoder, UnsuitableObject, register_encoder_decoder @@ -116,6 +116,17 @@ def __eq__(self, __value: object) -> bool: def __hash__(self) -> int: return hash((self.delegate, self.nr)) + def __delitem__(self, key: str) -> None: + self[key] = None + + def pop(self, key: str, default: Optional[Any] = None) -> Any: + if key in self: + item = self[key] + else: + item = default + del self[key] + return item + class DelegatingDictEncoder(PartEncoder): From 144bbdb5e060d3e7963a01d54b99a416023f7104 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 7 Jun 2023 09:19:57 +0100 Subject: [PATCH 31/47] CU-346mpwz Add item removal and popping tests to delegating dict --- tests/utils/test_memory_optimiser.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) 
diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index 231aa50a2..5df71f08f 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -15,13 +15,33 @@ class DelegatingDictTests(unittest.TestCase): _dict = {'c1': [None, 0], 'c2': [1, None]} def setUp(self) -> None: - self.del_dict1 = memory_optimiser.DelegatingDict(self._dict, 0, 2) - self.del_dict2 = memory_optimiser.DelegatingDict(self._dict, 1, 2) + # deep copy so that the original remains unchanged + _dict = dict((k, v.copy() + ) for k, v in self._dict.items()) + self.del_dict1 = memory_optimiser.DelegatingDict(_dict, 0, 2) + self.del_dict2 = memory_optimiser.DelegatingDict(_dict, 1, 2) self.delegators = [self.del_dict1, self.del_dict2] self.names = ['delegator 1', 'delegator 2'] self.expected_lens = [len( - [v[nr] for v in self._dict.values() if v[nr] is not None] - ) for nr in range(len(self._dict[list(self._dict.keys())[0]]))] + [v[nr] for v in _dict.values() if v[nr] is not None] + ) for nr in range(len(_dict[list(_dict.keys())[0]]))] + + def test_removal(self, key='c2'): + self.assertIn(key, self.del_dict1) + del self.del_dict1[key] + self.assertNotIn(key, self.del_dict1) + + def test_pop_no_def_existing(self, key='c2'): + self.assertIn(key, self.del_dict1) + val = self.del_dict1.pop(key) + self.assertNotIn(key, self.del_dict1) + self.assertIs(val, self._dict[key][0]) + + def test_pop_def_non_existing(self, key='c1', def_val='DEF VAL'): + self.assertNotIn(key, self.del_dict1) + val = self.del_dict1.pop(key, def_val) + self.assertNotIn(key, self.del_dict1) + self.assertIs(val, def_val) def test_delegating_dict_has_correct_keys(self): for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): From 366c4873b82d120619e94d2b976d4d01866c66eb Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 7 Jun 2023 09:22:25 +0100 Subject: [PATCH 32/47] CU-346mpwz Add item adding/setting tests to delegating dict --- 
tests/utils/test_memory_optimiser.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index 5df71f08f..a171d2927 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -43,6 +43,21 @@ def test_pop_def_non_existing(self, key='c1', def_val='DEF VAL'): self.assertNotIn(key, self.del_dict1) self.assertIs(val, def_val) + def test_adding_exiting_key_nonexist_value(self, key: str = 'c1'): + self.assertNotIn(key, self.del_dict1) + self.del_dict1[key] = 'value' + self.assertIn(key, self.del_dict1) + + def test_adding_nonexiting_key(self, key: str = 'nek1'): + self.assertNotIn(key, self.del_dict1) + self.del_dict1[key] = 'value-NEW' + self.assertIn(key, self.del_dict1) + + def test_adding_nonexiting_key_not_affect_other(self, key: str = 'nek2'): + self.assertNotIn(key, self.del_dict2) + self.del_dict1[key] = 'value-NEW-2' + self.assertNotIn(key, self.del_dict2) + def test_delegating_dict_has_correct_keys(self): for delegator, exp_len, name in zip(self.delegators, self.expected_lens, self.names): with self.subTest(name): From 7273c5353f75c6cfe902a6637ec7d6b7a14a6e00 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 7 Jun 2023 09:42:22 +0100 Subject: [PATCH 33/47] CU-346mpwz Fix typing issue (List vs list) --- medcat/utils/memory_optimiser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index ccc5cd341..a93d381cf 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -234,7 +234,7 @@ def _attempt_fix_after_load(cdb: CDB, one2many_name: str, dict_names: List[str]) def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: one2many: Dict[str, List[Any]] = {} - delegators: list[DelegatingDict] = [] + delegators: List[DelegatingDict] = [] for nr, d in enumerate(dicts): delegator = 
DelegatingDict( one2many, nr, nr_of_overall_items=len(dicts)) From 579b59d9caa9b64607cf404a33b7c9dcae1d314d Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 3 Jul 2023 11:44:45 +0100 Subject: [PATCH 34/47] CU-346mpwz Add possibility of memory-optimising for snames as well --- medcat/cdb.py | 10 ++++--- medcat/utils/memory_optimiser.py | 40 ++++++++++++++++++++++++++-- tests/utils/test_memory_optimiser.py | 28 +++++++++++++++++++ 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/medcat/cdb.py b/medcat/cdb.py index 8a58166b8..e5361fc72 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -180,9 +180,13 @@ def remove_cui(self, cui: str) -> None: for name, cuis2status in self.name2cuis2status.items(): if cui in cuis2status: del cuis2status[cui] - self.snames = set() - for cuis in self.cui2snames.values(): - self.snames |= cuis + if isinstance(self.snames, set): + # if this is a memory optimised CDB, this won't be a set + # but it also won't need to be changed since it + # relies directly on cui2snames + self.snames = set() + for cuis in self.cui2snames.values(): + self.snames |= cuis self.name2count_train = {name: len(cuis) for name, cuis in self.name2cuis.items()} self.is_dirty = True diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index a93d381cf..613953214 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, KeysView, Iterator, List, Tuple, Union, Optional +from typing import Any, Dict, KeysView, Iterator, List, Tuple, Union, Optional, Set from medcat.cdb import CDB from medcat.utils.saving.coding import EncodeableObject, PartEncoder, PartDecoder, UnsuitableObject, register_encoder_decoder @@ -128,6 +128,22 @@ def pop(self, key: str, default: Optional[Any] = None) -> Any: return item +class DelegatingValueSet: + + def __init__(self, delegate: Dict[str, Set[str]]) -> None: + self.delegate = delegate + + def update(self, other: Any) -> None: + # do 
nothing since the value will be updated in delegate + pass + + def __contains__(self, value: str) -> bool: + for cui_value in self.delegate.values(): + if value in cui_value: + return True + return False + + class DelegatingDictEncoder(PartEncoder): def try_encode(self, obj): @@ -169,8 +185,22 @@ def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> cdb.is_dirty = True +def _optimise_snames(cdb: CDB, cui2snames: str, snames_attr: str = 'snames') -> None: + """Optimise the snames part of a CDB. + + Args: + cdb (CDB): The CDB to optimise snames on. + one2many_name (str): The cui2snames dict name to delegate to. + snames_attr (str, optional): The `snames` attribute name. Defaults to 'snames'. + """ + delegate = getattr(cdb, cui2snames) + dvs = DelegatingValueSet(delegate) + setattr(cdb, snames_attr, dvs) + + def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, - optimise_names: bool = False) -> None: + optimise_names: bool = False, + optimise_snames: bool = False) -> None: """Attempts to optimise the memory footprint of the CDB. This can perform optimisation for cui2<...> and name2<...> dicts. @@ -206,12 +236,15 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, name2count_train (Dict[str, str]): Counts how often did a name appear during training. + It can also delegate the `snames` set to use the various sets in `cui2snames` instead. + They will all be included in 1 dict with CUI keys and a list of values for each pre-existing dict. Args: cdb (CDB): The CDB to modify. optimise_cuis (bool, optional): Whether to optimise cui2<...> dicts. Defaults to True. optimise_names (bool, optional): Whether to optimise name2<...> dicts. Defaults to False. + optimise_snames (bool, optional): Whether to optimise `snames` set. Defaults to False. 
""" # cui2<...> -> cui2many if optimise_cuis: @@ -219,6 +252,9 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, # name2<...> -> name2many if optimise_names: _optimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + if optimise_snames: + # check snames based on cui2sanmes + _optimise_snames(cdb, "cui2snames") def _attempt_fix_after_load(cdb: CDB, one2many_name: str, dict_names: List[str]): diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index a171d2927..e07cbb8cb 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -265,3 +265,31 @@ def test_can_be_loaded_as_json(self): with self.subTest(del_name): self.assertIsInstance(d, memory_optimiser.DelegatingDict) self.assertIs(cdb.cui2many, d.delegate) + + +class DelegatingValueSetTests(unittest.TestCase): + + def setUp(self) -> None: + self.delegate = {'a': set('abcd'), + 'b': set('efghij'), + 'c': set('lm'), # skip k + 'd': set('qrst'), # skip a bunch + } + self.original = set([v for s in self.delegate for v in s]) + + def test_DelegatingValueSet_constructs(self): + dvs = memory_optimiser.DelegatingValueSet(self.delegate) + self.assertIsInstance(dvs, memory_optimiser.DelegatingValueSet) + + def test_DelegatingValueSet_contains_values(self): + dvs = memory_optimiser.DelegatingValueSet(self.delegate) + for v in self.original: + with self.subTest(f'Check: {v}'): + self.assertIn(v, dvs) + + def test_DelegatingValueSet_contains_incorrect_values(self, + to_check=set('kopuvwxyz')): + dvs = memory_optimiser.DelegatingValueSet(self.delegate) + for v in to_check: + with self.subTest(f'Check: {v}'): + self.assertNotIn(v, dvs) From efd06c5e956c591d3c3ff4d4e1826a912f7dc84b Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 3 Jul 2023 12:06:10 +0100 Subject: [PATCH 35/47] CU-346mpwz Add comment regarding memory-optimising for filtering by CUI to CDB --- medcat/cdb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/medcat/cdb.py 
b/medcat/cdb.py index e5361fc72..df3594573 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -544,6 +544,10 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None: This also will not remove any data from cdb.addl_info - as this field can contain data of unknown structure. + As a side note, if the CDB has been memory-optimised, filtering will undo this memory optimisation. + This is because the dicts being involved will be rewritten. + However, the memory optimisation can be performed again afterwards. + Args: cuis_to_keep (List[str]): CUIs that will be kept, the rest will be removed (not completely, look above). From 440524647604fac661b58450a39742a0596a2442 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 3 Jul 2023 12:06:45 +0100 Subject: [PATCH 36/47] CU-346mpwz Add sname based memory optimisation tests --- tests/utils/test_memory_optimiser.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index e07cbb8cb..6606719ba 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -185,7 +185,7 @@ class MemoryOptimisingTests(unittest.TestCase): def setUpClass(cls) -> None: cls.cdb = CDB.load(os.path.join(os.path.dirname( os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) - memory_optimiser.perform_optimisation(cls.cdb) + memory_optimiser.perform_optimisation(cls.cdb, optimise_snames=True) def test_cdb_has_one2many(self, one2many_name='cui2many'): self.assertTrue(hasattr(self.cdb, one2many_name)) @@ -198,6 +198,16 @@ def test_cdb_has_delegating_dicts(self): d = getattr(self.cdb, dict_name) self.assertIsInstance(d, memory_optimiser.DelegatingDict) + def test_has_delegating_set(self): + self.assertIsInstance( + self.cdb.snames, memory_optimiser.DelegatingValueSet) + + def test_delegating_set_has_values(self): + for values in self.cdb.cui2snames.values(): + for val in values: + with 
self.subTest(f'Checking {val}'): + self.assertIn(val, self.cdb.snames) + class OperationalTests(unittest.TestCase): temp_folder = tempfile.TemporaryDirectory() @@ -218,7 +228,7 @@ class OperationalTests(unittest.TestCase): def setUpClass(cls) -> None: cls.cdb = CDB.load(os.path.join(os.path.dirname( os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) - memory_optimiser.perform_optimisation(cls.cdb) + memory_optimiser.perform_optimisation(cls.cdb, optimise_snames=True) cls.vocab = Vocab.load(os.path.join(os.path.dirname( os.path.realpath(__file__)), "..", "..", "examples", "vocab.dat")) cls.cdb.config.general.spacy_model = "en_core_web_md" From d19633e6078dbf6a426b32ca2091f47f422623f7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 3 Jul 2023 12:07:15 +0100 Subject: [PATCH 37/47] CU-346mpwz Add json serialisation capabilities to snames delegation --- medcat/utils/memory_optimiser.py | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 613953214..36905c4bd 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -18,6 +18,8 @@ DELEGATING_DICT_IDENTIFIER = '==DELEGATING_DICT==' +DELEGATING_SET_IDENTIFIER = '==DELEGATING_SET==' + class _KeysView: def __init__(self, keys: KeysView, parent: 'DelegatingDict'): @@ -143,6 +145,9 @@ def __contains__(self, value: str) -> bool: return True return False + def to_dict(self) -> dict: + return {'delegate': None} + class DelegatingDictEncoder(PartEncoder): @@ -164,15 +169,43 @@ def try_decode(self, dct: dict) -> Union[dict, EncodeableObject]: return dct +class DelegatingValueSetEncoder(PartEncoder): + + def try_encode(self, obj): + if isinstance(obj, DelegatingValueSet): + return {DELEGATING_SET_IDENTIFIER: obj.to_dict()} + raise UnsuitableObject() + + +class DelegatingValueSetDecoder(PartDecoder): + + def try_decode(self, dct: dict) -> Union[dict, EncodeableObject]: + if 
DELEGATING_SET_IDENTIFIER in dct: + info = dct[DELEGATING_SET_IDENTIFIER] + delegate = info['delegate'] + return DelegatingValueSet(delegate) + return dct + + def attempt_fix_after_load(cdb: CDB): _attempt_fix_after_load(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) _attempt_fix_after_load(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) +def attempt_fix_snames_after_load(cdb: CDB, snames_attr_name: str = 'snames'): + snames = getattr(cdb, snames_attr_name) + if isinstance(snames, DelegatingValueSet) and snames.delegate is None: + snames = DelegatingValueSet(cdb.cui2snames) + setattr(cdb, snames_attr_name, snames) + + # register encoder and decoders register_encoder_decoder(encoder=DelegatingDictEncoder, decoder=DelegatingDictDecoder, loading_postprocessor=attempt_fix_after_load) +register_encoder_decoder(encoder=DelegatingValueSetEncoder, + decoder=DelegatingValueSetDecoder, + loading_postprocessor=attempt_fix_snames_after_load) def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> None: From 6690f15df7ba4cd43a7f8673becbc337e4b35a2a Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 3 Jul 2023 13:35:24 +0100 Subject: [PATCH 38/47] CU-346mpwz Make sname optimisation default for memory optimisation --- medcat/utils/memory_optimiser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 36905c4bd..a00a92b4c 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -233,7 +233,7 @@ def _optimise_snames(cdb: CDB, cui2snames: str, snames_attr: str = 'snames') -> def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, optimise_names: bool = False, - optimise_snames: bool = False) -> None: + optimise_snames: bool = True) -> None: """Attempts to optimise the memory footprint of the CDB. This can perform optimisation for cui2<...> and name2<...> dicts. 
@@ -277,7 +277,7 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, cdb (CDB): The CDB to modify. optimise_cuis (bool, optional): Whether to optimise cui2<...> dicts. Defaults to True. optimise_names (bool, optional): Whether to optimise name2<...> dicts. Defaults to False. - optimise_snames (bool, optional): Whether to optimise `snames` set. Defaults to False. + optimise_snames (bool, optional): Whether to optimise `snames` set. Defaults to True. """ # cui2<...> -> cui2many if optimise_cuis: From bf5f1e3c6271d054b3dbc5810358205fca177481 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 5 Jul 2023 10:14:20 +0100 Subject: [PATCH 39/47] CU-346mpwz Fix typo in serialisation tests --- tests/utils/saving/test_serialization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index 9beb7f1b1..f0cc75de1 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -14,8 +14,8 @@ import medcat.utils.saving.coding as _ -class JSONSerialoizationTests(unittest.TestCase): - folder = os.path.join('temp', 'JSONSerialoizationTests') +class JSONSerializationTests(unittest.TestCase): + folder = os.path.join('temp', 'JSONSerializationTests') def setUp(self) -> None: return super().setUp() From 0f984b080c0f0f148eb4092281c057991f3792fa Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 09:37:59 +0100 Subject: [PATCH 40/47] CU-346mpwz Add variable to keep track of current memory optimisation info to CDB --- medcat/cdb.py | 3 ++ medcat/model.py | 64 ++++++++++++++++++++++++++++++++ medcat/utils/memory_optimiser.py | 3 ++ 3 files changed, 70 insertions(+) create mode 100644 medcat/model.py diff --git a/medcat/cdb.py b/medcat/cdb.py index df3594573..8e9448192 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -95,6 +95,7 @@ def __init__(self, config: Union[Config, None] = None) -> None: self._optim_params = None 
self.is_dirty = False self._hash: Optional[str] = None + self._memory_optimised_parts: Set[str] = set() def get_name(self, cui: str) -> str: """Returns preferred name if it exists, otherwise it will return @@ -611,6 +612,8 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None: self.cui2type_ids = new_cui2type_ids self.cui2preferred_name = new_cui2preferred_name self.is_dirty = True + # reset memory optimisation state + self._memory_optimised_parts.clear() def make_stats(self): stats = {} diff --git a/medcat/model.py b/medcat/model.py new file mode 100644 index 000000000..4aa4930e2 --- /dev/null +++ b/medcat/model.py @@ -0,0 +1,64 @@ +from abc import abstractmethod, ABC +from typing import Any, Dict, List, Optional + +from spacy.tokens import Doc + +from pydantic import BaseModel + +from medcat.config import FakeDict + + +class EntityDescriptor(BaseModel): + pretty_name: SyntaxWarning + cui: str + type_ids: List[str] + types: List[str] + source_value: str + detected_name: str + acc: float + context_similarity: float + start: int + end: int + icd10: List[str] + ontologies: List[str] + snomed: List[str] + id: int + meta_anns: Dict + + +class ExtractedEntities(BaseModel, FakeDict): + entities: Dict[int, EntityDescriptor] = {} + tokens: list = [] + + +class AbstractModel(ABC): + + @abstractmethod + def __call__(self, *args: Any, **kwds: Any) -> Optional[Doc]: + """_summary_ + + Returns: + Any: _description_ + """ + + def get_entities(self, text: str) -> ExtractedEntities: + """_summary_ + + Args: + text (str): _description_ + + Returns: + EntitiesFormat: _description _ + """ + pass + + @abstractmethod + def train(self, ): + """_summary_ + """ + + +def main(): + am = AbstractModel() + ents = am.get_entities('') + ents diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index a00a92b4c..85305e2f8 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -282,12 +282,15 @@ def 
perform_optimisation(cdb: CDB, optimise_cuis: bool = True, # cui2<...> -> cui2many if optimise_cuis: _optimise(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) + cdb._memory_optimised_parts.add('CUIS') # name2<...> -> name2many if optimise_names: _optimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + cdb._memory_optimised_parts.add('NAMES') if optimise_snames: # check snames based on cui2sanmes _optimise_snames(cdb, "cui2snames") + cdb._memory_optimised_parts.add('snames') def _attempt_fix_after_load(cdb: CDB, one2many_name: str, dict_names: List[str]): From f75a1d078c8015fa7b24128fd06d241aa9277b12 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 09:54:43 +0100 Subject: [PATCH 41/47] CU-346mpwz Add default cui2snames to sname optimisations; make sure sname optimisation dirties the CDB --- medcat/utils/memory_optimiser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 85305e2f8..dadb6040a 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -218,17 +218,19 @@ def _optimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]) -> cdb.is_dirty = True -def _optimise_snames(cdb: CDB, cui2snames: str, snames_attr: str = 'snames') -> None: +def _optimise_snames(cdb: CDB, cui2snames: str = 'cui2snames', + snames_attr: str = 'snames') -> None: """Optimise the snames part of a CDB. Args: cdb (CDB): The CDB to optimise snames on. - one2many_name (str): The cui2snames dict name to delegate to. + one2many_name (str): The cui2snames dict name to delegate to. Defaults to 'cui2snames'. snames_attr (str, optional): The `snames` attribute name. Defaults to 'snames'. 
""" delegate = getattr(cdb, cui2snames) dvs = DelegatingValueSet(delegate) setattr(cdb, snames_attr, dvs) + cdb.is_dirty = True def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, @@ -289,7 +291,7 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, cdb._memory_optimised_parts.add('NAMES') if optimise_snames: # check snames based on cui2sanmes - _optimise_snames(cdb, "cui2snames") + _optimise_snames(cdb) cdb._memory_optimised_parts.add('snames') From d73b4e0e650006e1bd59d170d8b5e3bf03b7574d Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 10:03:01 +0100 Subject: [PATCH 42/47] CU-346mpwz Add method to undo CDB memory optimisation --- medcat/utils/memory_optimiser.py | 40 ++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index dadb6040a..5a54e86d9 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -306,6 +306,46 @@ def _attempt_fix_after_load(cdb: CDB, one2many_name: str, dict_names: List[str]) d.delegate = one2many +def _unoptimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]): + # remove one2many attribute + # the references still exist on each delegator + delattr(cdb, to_many_name) + + delegating_dicts: List[Dict[str, Any]] = [getattr(cdb, dict_name) + for dict_name in dict_names_to_combine] + for del_dict, dict_name in zip(delegating_dicts, dict_names_to_combine): + raw_dict = dict(del_dict.items()) + setattr(cdb, dict_name, raw_dict) + cdb.is_dirty = True + + +def _unoptimise_snames(cdb: CDB, cui2snames: str = 'cui2snames', + snames_attr: str = 'snames') -> None: + # rebuild snames + delegate: Dict[str, Set[str]] = getattr(cdb, cui2snames) + snames = set() + for values in delegate.values(): + snames.update(values) + setattr(cdb, snames_attr, snames) + cdb.is_dirty = True + + +def unoptimise_cdb(cdb: CDB): + """This undoes all the (potential) memory optimisations done in 
`perform_optimisation`. + + This method relies on `CDB._memory_optimised_parts` to be up to date. + + Args: + cdb (CDB): The CDB to work on. + """ + if 'CUIS' in cdb._memory_optimised_parts: + _unoptimise(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) + if 'NAMES' in cdb._memory_optimised_parts: + _unoptimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) + if 'snames' in cdb._memory_optimised_parts: + _unoptimise_snames(cdb) + + def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: one2many: Dict[str, List[Any]] = {} delegators: List[DelegatingDict] = [] From 4fdfbfc6ef74ff44864cdac5a131f946f0f05825 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 10:03:47 +0100 Subject: [PATCH 43/47] CU-346mpwz Add tests for undoing CDB memory optimisation --- tests/utils/test_memory_optimiser.py | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index 6606719ba..c94f1daf5 100644 --- a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -209,6 +209,40 @@ def test_delegating_set_has_values(self): self.assertIn(val, self.cdb.snames) +class MemoryUnoptimisingTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + + def test_optimisation_round_trip_cuis(self): + cui_dicts_before = [getattr(self.cdb, dict_name) + for dict_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE] + memory_optimiser.perform_optimisation(self.cdb) + memory_optimiser.unoptimise_cdb(self.cdb) + cui_dicts_after = [getattr(self.cdb, dict_name) + for dict_name in memory_optimiser.CUI_DICT_NAMES_TO_COMBINE] + for before, after, name in zip(cui_dicts_before, + cui_dicts_after, + memory_optimiser.CUI_DICT_NAMES_TO_COMBINE): + with self.subTest(f'{name}'): + self.assertIsInstance(before, dict) + 
self.assertIsInstance(after, dict) + self.assertEquals(len(before), len(after)) + self.assertEquals(before, after) + + def test_optimisation_round_trip_snames(self): + snames_before = self.cdb.snames + memory_optimiser.perform_optimisation(self.cdb) + memory_optimiser.unoptimise_cdb(self.cdb) + snames_after = self.cdb.snames + self.assertIsInstance(snames_before, set) + self.assertIsInstance(snames_after, set) + self.assertEquals(len(snames_before), len(snames_after)) + self.assertEquals(snames_before, snames_after) + + class OperationalTests(unittest.TestCase): temp_folder = tempfile.TemporaryDirectory() temp_cdb_path = os.path.join(temp_folder.name, 'cat.cdb') From 22eb13d61cd7fcd3cdefdd38410747987b07d473 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 10:08:16 +0100 Subject: [PATCH 44/47] CU-346mpwz Clear memory optimised parts if/when undoing optimisations --- medcat/utils/memory_optimiser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 5a54e86d9..8f6bfda04 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -316,6 +316,7 @@ def _unoptimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]): for del_dict, dict_name in zip(delegating_dicts, dict_names_to_combine): raw_dict = dict(del_dict.items()) setattr(cdb, dict_name, raw_dict) + cdb._memory_optimised_parts.clear() cdb.is_dirty = True From 1f60f0236949940d95c5117a6374797ebf3c7971 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 10:19:01 +0100 Subject: [PATCH 45/47] CU-346mpwz Remove accidentally added file/module --- medcat/model.py | 64 ------------------------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 medcat/model.py diff --git a/medcat/model.py b/medcat/model.py deleted file mode 100644 index 4aa4930e2..000000000 --- a/medcat/model.py +++ /dev/null @@ -1,64 +0,0 @@ -from abc import abstractmethod, ABC -from typing import Any, 
Dict, List, Optional - -from spacy.tokens import Doc - -from pydantic import BaseModel - -from medcat.config import FakeDict - - -class EntityDescriptor(BaseModel): - pretty_name: SyntaxWarning - cui: str - type_ids: List[str] - types: List[str] - source_value: str - detected_name: str - acc: float - context_similarity: float - start: int - end: int - icd10: List[str] - ontologies: List[str] - snomed: List[str] - id: int - meta_anns: Dict - - -class ExtractedEntities(BaseModel, FakeDict): - entities: Dict[int, EntityDescriptor] = {} - tokens: list = [] - - -class AbstractModel(ABC): - - @abstractmethod - def __call__(self, *args: Any, **kwds: Any) -> Optional[Doc]: - """_summary_ - - Returns: - Any: _description_ - """ - - def get_entities(self, text: str) -> ExtractedEntities: - """_summary_ - - Args: - text (str): _description_ - - Returns: - EntitiesFormat: _description _ - """ - pass - - @abstractmethod - def train(self, ): - """_summary_ - """ - - -def main(): - am = AbstractModel() - ents = am.get_entities('') - ents From 57ceef11b1a8fad129d383110753b1ccc7170959 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 10:20:16 +0100 Subject: [PATCH 46/47] CU-346mpwz Add more straight forward optimisation part names; Fix memory optimisation part clearing --- medcat/utils/memory_optimiser.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/medcat/utils/memory_optimiser.py b/medcat/utils/memory_optimiser.py index 8f6bfda04..e8328734d 100644 --- a/medcat/utils/memory_optimiser.py +++ b/medcat/utils/memory_optimiser.py @@ -20,6 +20,11 @@ DELEGATING_SET_IDENTIFIER = '==DELEGATING_SET==' +# these will be used in CDB._memory_optimised_parts +CUIS_PART = 'CUIS' +NAMES_PART = 'NAMES' +SNAMES_PART = 'snames' + class _KeysView: def __init__(self, keys: KeysView, parent: 'DelegatingDict'): @@ -284,15 +289,15 @@ def perform_optimisation(cdb: CDB, optimise_cuis: bool = True, # cui2<...> -> cui2many if optimise_cuis: _optimise(cdb, 
ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) - cdb._memory_optimised_parts.add('CUIS') + cdb._memory_optimised_parts.add(CUIS_PART) # name2<...> -> name2many if optimise_names: _optimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) - cdb._memory_optimised_parts.add('NAMES') + cdb._memory_optimised_parts.add(NAMES_PART) if optimise_snames: # check snames based on cui2sanmes _optimise_snames(cdb) - cdb._memory_optimised_parts.add('snames') + cdb._memory_optimised_parts.add(SNAMES_PART) def _attempt_fix_after_load(cdb: CDB, one2many_name: str, dict_names: List[str]): @@ -316,7 +321,6 @@ def _unoptimise(cdb: CDB, to_many_name: str, dict_names_to_combine: List[str]): for del_dict, dict_name in zip(delegating_dicts, dict_names_to_combine): raw_dict = dict(del_dict.items()) setattr(cdb, dict_name, raw_dict) - cdb._memory_optimised_parts.clear() cdb.is_dirty = True @@ -339,12 +343,13 @@ def unoptimise_cdb(cdb: CDB): Args: cdb (CDB): The CDB to work on. """ - if 'CUIS' in cdb._memory_optimised_parts: + if CUIS_PART in cdb._memory_optimised_parts: _unoptimise(cdb, ONE2MANY, CUI_DICT_NAMES_TO_COMBINE) - if 'NAMES' in cdb._memory_optimised_parts: + if NAMES_PART in cdb._memory_optimised_parts: _unoptimise(cdb, NAME2MANY, NAME_DICT_NAMES_TO_COMBINE) - if 'snames' in cdb._memory_optimised_parts: + if SNAMES_PART in cdb._memory_optimised_parts: _unoptimise_snames(cdb) + cdb._memory_optimised_parts.clear() def map_to_many(dicts: List[Dict[str, Any]]) -> Tuple[Dict[str, List[Any]], List[DelegatingDict]]: From 9bc89055254ff8b5d4e6e6808d50f436f1e86964 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 6 Jul 2023 10:21:00 +0100 Subject: [PATCH 47/47] CU-346mpwz Add further tests for memory optimisation (dirty state, checking optimised parts) --- tests/utils/test_memory_optimiser.py | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/utils/test_memory_optimiser.py b/tests/utils/test_memory_optimiser.py index c94f1daf5..5f59f5274 100644 --- 
a/tests/utils/test_memory_optimiser.py +++ b/tests/utils/test_memory_optimiser.py @@ -178,6 +178,13 @@ def test_unoptmised_cdb_does_not_have_delegating_dicts(self): with self.subTest(key): self.assertNotIsInstance(val, memory_optimiser.DelegatingDict) + + def test_unoptimised_knows_has_no_optimised_parts(self): + self.assertFalse(self.cdb._memory_optimised_parts, + "Should have empty optimised parts") + + def test_simply_loaded_model_not_dirty(self): + self.assertFalse(self.cdb.is_dirty) + class MemoryOptimisingTests(unittest.TestCase): @@ -187,6 +194,24 @@ def setUpClass(cls) -> None: os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) memory_optimiser.perform_optimisation(cls.cdb, optimise_snames=True) + def test_is_dirty(self): + self.assertTrue(self.cdb.is_dirty, + "Should be dirty after optimisation") + + def test_knows_optimised(self): + self.assertTrue(self.cdb._memory_optimised_parts, + "Should have non-empty `_memory_optimised_parts`") + + def test_knows_correct_parts_optimised(self, should_be=['CUIS', 'snames']): + for name in should_be: + with self.subTest(name): + self.assertIn(name, self.cdb._memory_optimised_parts) + + def test_knows_incorrect_parts_NOT_optimised(self, should_not_be=['NAMES']): + for name in should_not_be: + with self.subTest(name): + self.assertNotIn(name, self.cdb._memory_optimised_parts) + def test_cdb_has_one2many(self, one2many_name='cui2many'): self.assertTrue(hasattr(self.cdb, one2many_name)) one2many = getattr(self.cdb, one2many_name) @@ -242,6 +267,17 @@ def test_optimisation_round_trip_snames(self): self.assertEquals(len(snames_before), len(snames_after)) self.assertEquals(snames_before, snames_after) + + def test_optimisation_round_trip_dirty(self): + memory_optimiser.perform_optimisation(self.cdb) + memory_optimiser.unoptimise_cdb(self.cdb) + self.assertTrue(self.cdb.is_dirty) + + def test_optimisation_round_trip_no_optimised_parts(self): + memory_optimiser.perform_optimisation(self.cdb) + 
memory_optimiser.unoptimise_cdb(self.cdb) + self.assertFalse(self.cdb._memory_optimised_parts, + "Should have no optimised parts") + class OperationalTests(unittest.TestCase): temp_folder = tempfile.TemporaryDirectory()