CogStack · mart-r · Jul 6, 2023 · May 10, 2023 · May 10, 2023 · May 10, 2023
diff --git a/medcat/cat.py b/medcat/cat.py
@@ -40,7 +40,7 @@
 from medcat.vocab import Vocab
 from medcat.utils.decorators import deprecated
 from medcat.ner.transformers_ner import TransformersNER
-from medcat.utils.saving.serializer import SPECIALITY_NAMES
+from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY
 
 
 logger = logging.getLogger(__name__) # separate logger from the package-level one
@@ -353,7 +353,8 @@ def load_model_pack(cls,
 
         # Load the CDB
         cdb_path = os.path.join(model_pack_path, "cdb.dat")
-        has_jsons = len(glob.glob(os.path.join(model_pack_path, '*.json'))) >= len(SPECIALITY_NAMES)
+        nr_of_jsons_expected = len(SPECIALITY_NAMES) - len(ONE2MANY)
+        has_jsons = len(glob.glob(os.path.join(model_pack_path, '*.json'))) >= nr_of_jsons_expected
         json_path = model_pack_path if has_jsons else None
         logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format')
         cdb = CDB.load(cdb_path, json_path)

diff --git a/medcat/cdb.py b/medcat/cdb.py
@@ -95,6 +95,7 @@ def __init__(self, config: Union[Config, None] = None) -> None:
         self._optim_params = None
         self.is_dirty = False
         self._hash: Optional[str] = None
+        self._memory_optimised_parts: Set[str] = set()
 
     def get_name(self, cui: str) -> str:
         """Returns preferred name if it exists, otherwise it will return
@@ -180,9 +181,13 @@ def remove_cui(self, cui: str) -> None:
         for name, cuis2status in self.name2cuis2status.items():
             if cui in cuis2status:
                 del cuis2status[cui]
-        self.snames = set()
-        for cuis in self.cui2snames.values():
-            self.snames |= cuis
+        if isinstance(self.snames, set):
+            # if this is a memory-optimised CDB, snames won't be a set,
+            # but it also won't need to be rebuilt here since it
+            # relies directly on cui2snames
+            self.snames = set()
+            for cuis in self.cui2snames.values():
+                self.snames |= cuis
         self.name2count_train = {name: len(cuis) for name, cuis in self.name2cuis.items()}
         self.is_dirty = True
 
@@ -540,6 +545,10 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None:
         This also will not remove any data from cdb.addl_info - as this field can contain data of
         unknown structure.
 
+        As a side note, if the CDB has been memory-optimised, filtering will undo this memory optimisation.
+        This is because the dicts involved will be rewritten.
+        However, the memory optimisation can be performed again afterwards.
+
         Args:
             cuis_to_keep (List[str]):
                 CUIs that will be kept, the rest will be removed (not completely, look above).
@@ -603,6 +612,8 @@ def filter_by_cui(self, cuis_to_keep: Union[List[str], Set[str]]) -> None:
         self.cui2type_ids = new_cui2type_ids
         self.cui2preferred_name = new_cui2preferred_name
         self.is_dirty = True
+        # reset memory optimisation state
+        self._memory_optimised_parts.clear()
 
     def make_stats(self):
         stats = {}