From 2f0fbfbed24281e57b55e5d3090c2307fe313a04 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:55:18 +0100 Subject: [PATCH 01/10] CU-8693n892x: Save environment/dependency snapshot upon model pack creation --- medcat/cat.py | 7 +++ medcat/utils/saving/envsnapshot.py | 21 ++++++++ tests/utils/saving/test_envsnapshot.py | 69 ++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 medcat/utils/saving/envsnapshot.py create mode 100644 tests/utils/saving/test_envsnapshot.py diff --git a/medcat/cat.py b/medcat/cat.py index 8df7526b7..e327cf550 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -40,6 +40,7 @@ from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY +from medcat.utils.saving.envsnapshot import get_environment_info from medcat.stats.stats import get_stats from medcat.utils.filters import set_project_filters @@ -315,6 +316,12 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M with open(model_card_path, 'w') as f: json.dump(self.get_model_card(as_dict=True), f, indent=2) + # add a dependency snapshot + env_info = get_environment_info() + env_info_path = os.path.join(save_dir_path, "environment_snapshot.json") + with open(env_info_path, 'w') as f: + json.dump(env_info, f) + # Zip everything shutil.make_archive(os.path.join(_save_dir_path, model_pack_name), 'zip', root_dir=save_dir_path) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py new file mode 100644 index 000000000..bb3852422 --- /dev/null +++ b/medcat/utils/saving/envsnapshot.py @@ -0,0 +1,21 @@ +from typing import List, Tuple, Dict, Any + +import pkg_resources +import platform + + + +def get_installed_packages() -> List[Tuple[str, str]]: + installed_packages = [] + for package in pkg_resources.working_set: + installed_packages.append([package.project_name, package.version]) + return installed_packages + + +def get_environment_info() -> Dict[str, Any]: + return { + "dependencies": get_installed_packages(), + "os": platform.platform(), + "cpu_architecture": platform.machine(), + "python_version": platform.python_version() + } diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py new file mode 100644 index 000000000..7d974641c --- /dev/null +++ b/tests/utils/saving/test_envsnapshot.py @@ -0,0 +1,69 @@ +from typing import Any +import platform +import os +import tempfile +import json + +from medcat.cat import CAT +from medcat.utils.saving import envsnapshot + +import unittest + + +class EnvSnapshotAloneTests(unittest.TestCase): + + def setUp(self) -> None: + self.env_info = envsnapshot.get_environment_info() + + def test_info_is_dict(self): + self.assertIsInstance(self.env_info, dict) + + def test_info_is_not_empty(self): + self.assertTrue(self.env_info) + + def assert_has_target(self, target: str, expected: Any): + self.assertIn(target, self.env_info) + py_ver = self.env_info[target] + self.assertEqual(py_ver, expected) + + def test_has_os(self): + self.assert_has_target("os", platform.platform()) + + def test_has_py_ver(self): + self.assert_has_target("python_version", platform.python_version()) + + def test_has_cpu_arch(self): + self.assert_has_target("cpu_architecture", platform.machine()) + + def test_has_dependencies(self, name: str = "dependencies"): + # NOTE: just making sure it's a anon-empty list + self.assertIn(name, self.env_info) + deps = self.env_info[name] + self.assertTrue(deps) + + +CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") + + +class EnvSnapshotInCATTests(unittest.TestCase): + expected_env = envsnapshot.get_environment_info() + + @classmethod + def setUpClass(cls) -> None: + cls.cat = CAT.load_model_pack(CAT_PATH) + cls._temp_dir = tempfile.TemporaryDirectory() + mpn = cls.cat.create_model_pack(cls._temp_dir.name) + cls.cat_folder = os.path.join(cls._temp_dir.name, mpn) + cls.envrion_file_path = os.path.join(cls.cat_folder, "environment_snapshot.json") + + def test_has_environment(self): + self.assertTrue(os.path.exists(self.envrion_file_path)) + + def test_eviron_saved(self): + with open(self.envrion_file_path) as f: + saved_info: dict = json.load(f) + self.assertEqual(saved_info.keys(), self.expected_env.keys()) + for k in saved_info: + with self.subTest(k): + v1, v2 = saved_info[k], self.expected_env[k] + self.assertEqual(v1, v2) From d9dac3fe86ee39c03f440db24419913bc1a16d89 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:57:50 +0100 Subject: [PATCH 02/10] CU-8693n892x: Fix typing for env snapshot module --- medcat/utils/saving/envsnapshot.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index bb3852422..ef996ca12 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -1,11 +1,10 @@ -from typing import List, Tuple, Dict, Any +from typing import List, Dict, Any import pkg_resources import platform - -def get_installed_packages() -> List[Tuple[str, str]]: +def get_installed_packages() -> List[List[str]]: installed_packages = [] for package in pkg_resources.working_set: installed_packages.append([package.project_name, package.version]) From c3e7a758726a222292e594d2e93150af5a3c9485 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:58:14 +0100 Subject: [PATCH 03/10] CU-8693n892x: Add test for env file existance in .zip --- tests/utils/saving/test_envsnapshot.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index 7d974641c..e43b1c64c 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -3,6 +3,7 @@ import os import tempfile import json +import zipfile from medcat.cat import CAT from medcat.utils.saving import envsnapshot @@ -10,6 +11,11 @@ import unittest +def list_zip_contents(zip_file_path): + with zipfile.ZipFile(zip_file_path, 'r') as zip_ref: + return zip_ref.namelist() + + class EnvSnapshotAloneTests(unittest.TestCase): def setUp(self) -> None: @@ -43,6 +49,7 @@ def test_has_dependencies(self, name: str = "dependencies"): CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") +ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" class EnvSnapshotInCATTests(unittest.TestCase): @@ -54,7 +61,7 @@ def setUpClass(cls) -> None: cls._temp_dir = tempfile.TemporaryDirectory() mpn = cls.cat.create_model_pack(cls._temp_dir.name) cls.cat_folder = os.path.join(cls._temp_dir.name, mpn) - cls.envrion_file_path = os.path.join(cls.cat_folder, "environment_snapshot.json") + cls.envrion_file_path = os.path.join(cls.cat_folder, ENV_SNAPSHOT_FILE_NAME) def test_has_environment(self): self.assertTrue(os.path.exists(self.envrion_file_path)) @@ -67,3 +74,7 @@ def test_eviron_saved(self): with self.subTest(k): v1, v2 = saved_info[k], self.expected_env[k] self.assertEqual(v1, v2) + + def test_zip_has_env_snapshot(self): + filenames = list_zip_contents(self.cat_folder + ".zip") + self.assertIn(ENV_SNAPSHOT_FILE_NAME, filenames) From f48e08402d92801f63d0e00e789aa63eea5b7c58 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 15:59:47 +0100 Subject: [PATCH 04/10] CU-8693n892x: Add doc strings --- medcat/utils/saving/envsnapshot.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index ef996ca12..4f0b0dd46 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -5,6 +5,11 @@ def get_installed_packages() -> List[List[str]]: + """Get the installed packages and their versions. + + Returns: + List[List[str]]: List of lists. Each item contains of a dependency name and version. + """ installed_packages = [] for package in pkg_resources.working_set: installed_packages.append([package.project_name, package.version]) @@ -12,6 +17,13 @@ def get_installed_packages() -> List[List[str]]: def get_environment_info() -> Dict[str, Any]: + """Get the current environment information. + + This includes dependency versions, the OS, the CPU architecture and the python version. + + Returns: + Dict[str, Any]: _description_ + """ return { "dependencies": get_installed_packages(), "os": platform.platform(), From 2ca3be8b154d7ef4fa14833e6abb0d65e15ab8a7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 16:42:45 +0100 Subject: [PATCH 05/10] CU-8693n892x: Centralise env snapshot file name --- medcat/cat.py | 4 ++-- medcat/utils/saving/envsnapshot.py | 3 +++ tests/utils/saving/test_envsnapshot.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index e327cf550..9fbd38271 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -40,7 +40,7 @@ from medcat.utils.decorators import deprecated from medcat.ner.transformers_ner import TransformersNER from medcat.utils.saving.serializer import SPECIALITY_NAMES, ONE2MANY -from medcat.utils.saving.envsnapshot import get_environment_info +from medcat.utils.saving.envsnapshot import get_environment_info, ENV_SNAPSHOT_FILE_NAME from medcat.stats.stats import get_stats from medcat.utils.filters import set_project_filters @@ -318,7 +318,7 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M # add a dependency snapshot env_info = get_environment_info() - env_info_path = os.path.join(save_dir_path, "environment_snapshot.json") + env_info_path = os.path.join(save_dir_path, ENV_SNAPSHOT_FILE_NAME) with open(env_info_path, 'w') as f: json.dump(env_info, f) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index 4f0b0dd46..2fb5ace6d 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -4,6 +4,9 @@ import platform +ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" + + def get_installed_packages() -> List[List[str]]: """Get the installed packages and their versions. diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index e43b1c64c..36fb0dc83 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -49,7 +49,7 @@ def test_has_dependencies(self, name: str = "dependencies"): CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") -ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" +ENV_SNAPSHOT_FILE_NAME = envsnapshot.ENV_SNAPSHOT_FILE_NAME class EnvSnapshotInCATTests(unittest.TestCase): From 46a52b69f8109be916e8d83dc33c33b58c905d42 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 16 May 2024 16:44:43 +0100 Subject: [PATCH 06/10] CU-8693n892x: Add env snapshot file to exceptions in serialisation tests --- tests/utils/saving/test_serialization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index c2c44da16..6f636e3f0 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -10,6 +10,7 @@ from medcat.vocab import Vocab from medcat.utils.saving.serializer import JsonSetSerializer, CDBSerializer, SPECIALITY_NAMES, ONE2MANY +from medcat.utils.saving.envsnapshot import ENV_SNAPSHOT_FILE_NAME import medcat.utils.saving.coding as _ @@ -60,6 +61,7 @@ class ModelCreationTests(unittest.TestCase): json_model_pack = tempfile.TemporaryDirectory() EXAMPLES = os.path.join(os.path.dirname( os.path.realpath(__file__)), "..", "..", "..", "examples") + EXCEPTIONAL_JSONS = ['model_card.json', ENV_SNAPSHOT_FILE_NAME] @classmethod def setUpClass(cls) -> None: @@ -95,7 +97,7 @@ def test_dill_to_json(self): SPECIALITY_NAMES) - len(ONE2MANY)) for json in jsons: with self.subTest(f'JSON {json}'): - if json.endswith('model_card.json'): + if any(json.endswith(exception) for exception in self.EXCEPTIONAL_JSONS): continue # ignore model card here if any(name in json for name in ONE2MANY): # ignore cui2many and name2many From 401fe8c64582797ea2052867d913e9570bf53747 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 29 May 2024 16:21:45 +0100 Subject: [PATCH 07/10] CU-8693n892x: Only list direct dependencies --- medcat/utils/saving/envsnapshot.py | 39 +++++++++++++++++++++++++- tests/utils/saving/test_envsnapshot.py | 20 +++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index 2fb5ace6d..2a2a76eae 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -1,10 +1,44 @@ -from typing import List, Dict, Any +from typing import List, Dict, Any, Set +import os +import re import pkg_resources import platform ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" +SETUP_PY_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "setup.py")) +SETUP_PY_REGEX = re.compile("install_requires=\[([\s\S]*?)\]") + + +def get_direct_dependencies() -> Set[str]: + """Get the set of direct dependeny names. + + The current implementation reads setup.py for the install_requires + keyword argument, evaluates the list, removes the versions and returns + the names as a set. + + Raises: + FileNotFoundError: If the setup.py file was not found. + ValueError: If found different sets of instal lrequirements. + + Returns: + Set[str]: The set of direct dependeny names. + """ + if not os.path.exists(SETUP_PY_PATH): + raise FileNotFoundError(f"{SETUP_PY_PATH} does not exist.") + with open(SETUP_PY_PATH) as f: + setup_py_code = f.read() + found = SETUP_PY_REGEX.findall(setup_py_code) + if not found: + raise ValueError("Did not find install requirements in setup.py") + if len(found) > 1: + raise ValueError("Ambiguous install requirements in setup.py") + deps_str = found[0] + # evaluate list of dependencies (including potential version pins) + deps: List[str] = eval("[" + deps_str + "]") + # remove versions where applicable + return set(re.split("[<=>~]", dep)[0] for dep in deps) def get_installed_packages() -> List[List[str]]: @@ -13,8 +47,11 @@ def get_installed_packages() -> List[List[str]]: Returns: List[List[str]]: List of lists. Each item contains of a dependency name and version. """ + direct_deps = get_direct_dependencies() installed_packages = [] for package in pkg_resources.working_set: + if package.project_name not in direct_deps: + continue installed_packages.append([package.project_name, package.version]) return installed_packages diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index 36fb0dc83..937a4dfe2 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -16,6 +16,26 @@ def list_zip_contents(zip_file_path): return zip_ref.namelist() +class DirectDependenciesTests(unittest.TestCase): + + def setUp(self) -> None: + self.direct_deps = envsnapshot.get_direct_dependencies() + + def test_nonempty(self): + self.assertTrue(self.direct_deps) + + def test_does_not_contain_versions(self, version_starters: str = '<=>~'): + for dep in self.direct_deps: + for vs in version_starters: + with self.subTest(f"DEP '{dep}' check for '{vs}'"): + self.assertNotIn(vs, dep) + + def test_deps_are_installed_packages(self): + for dep in self.direct_deps: + with self.subTest(f"Has '{dep}'"): + envsnapshot.pkg_resources.require(dep) + + class EnvSnapshotAloneTests(unittest.TestCase): def setUp(self) -> None: From 7f8a1643e1227f3d54fbe3ea1468773661ddfb40 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 29 May 2024 16:30:02 +0100 Subject: [PATCH 08/10] CU-8693n892x: Add test that verifies all direct dependencies are listed in environment --- tests/utils/saving/test_envsnapshot.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index 937a4dfe2..16bee1ffb 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -67,6 +67,11 @@ def test_has_dependencies(self, name: str = "dependencies"): deps = self.env_info[name] self.assertTrue(deps) + def test_all_direct_dependencies_are_installed(self): + deps = self.env_info['dependencies'] + direct_deps = envsnapshot.get_direct_dependencies() + self.assertEqual(len(deps), len(direct_deps)) + CAT_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples") ENV_SNAPSHOT_FILE_NAME = envsnapshot.ENV_SNAPSHOT_FILE_NAME From c7abf15483064dbaf37e7825a512dd8bcf0e331c Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Jun 2024 15:48:09 +0100 Subject: [PATCH 09/10] CU-8693n892x: Update to using importlib for required dependencies --- medcat/utils/saving/envsnapshot.py | 30 ++++++++------------------ tests/utils/saving/test_envsnapshot.py | 28 ++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/medcat/utils/saving/envsnapshot.py b/medcat/utils/saving/envsnapshot.py index 2a2a76eae..526fc608a 100644 --- a/medcat/utils/saving/envsnapshot.py +++ b/medcat/utils/saving/envsnapshot.py @@ -1,43 +1,31 @@ from typing import List, Dict, Any, Set -import os import re import pkg_resources import platform +from importlib_metadata import distribution ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" -SETUP_PY_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "setup.py")) -SETUP_PY_REGEX = re.compile("install_requires=\[([\s\S]*?)\]") def get_direct_dependencies() -> Set[str]: """Get the set of direct dependeny names. - The current implementation reads setup.py for the install_requires - keyword argument, evaluates the list, removes the versions and returns - the names as a set. + The current implementation uses importlib_metadata to figure out + the names of the required packages and removes their version info. Raises: - FileNotFoundError: If the setup.py file was not found. - ValueError: If found different sets of instal lrequirements. + ValueError: If the unlikely event that the dependencies are unable to be obtained. Returns: Set[str]: The set of direct dependeny names. """ - if not os.path.exists(SETUP_PY_PATH): - raise FileNotFoundError(f"{SETUP_PY_PATH} does not exist.") - with open(SETUP_PY_PATH) as f: - setup_py_code = f.read() - found = SETUP_PY_REGEX.findall(setup_py_code) - if not found: - raise ValueError("Did not find install requirements in setup.py") - if len(found) > 1: - raise ValueError("Ambiguous install requirements in setup.py") - deps_str = found[0] - # evaluate list of dependencies (including potential version pins) - deps: List[str] = eval("[" + deps_str + "]") - # remove versions where applicable + package_name = __package__.split(".")[0] + dist = distribution(package_name) + deps = dist.metadata.get_all('Requires-Dist') + if not deps: + raise ValueError("Unable to identify dependencies") return set(re.split("[<=>~]", dep)[0] for dep in deps) diff --git a/tests/utils/saving/test_envsnapshot.py b/tests/utils/saving/test_envsnapshot.py index 16bee1ffb..4cb3827b0 100644 --- a/tests/utils/saving/test_envsnapshot.py +++ b/tests/utils/saving/test_envsnapshot.py @@ -4,6 +4,7 @@ import tempfile import json import zipfile +import re from medcat.cat import CAT from medcat.utils.saving import envsnapshot @@ -16,6 +17,29 @@ def list_zip_contents(zip_file_path): return zip_ref.namelist() + +ENV_SNAPSHOT_FILE_NAME = "environment_snapshot.json" +SETUP_PY_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "setup.py")) +SETUP_PY_REGEX = re.compile("install_requires=\[([\s\S]*?)\]") + + +def get_direct_dependencies_regex() -> set: + if not os.path.exists(SETUP_PY_PATH): + raise FileNotFoundError(f"{SETUP_PY_PATH} does not exist.") + with open(SETUP_PY_PATH) as f: + setup_py_code = f.read() + found = SETUP_PY_REGEX.findall(setup_py_code) + if not found: + raise ValueError("Did not find install requirements in setup.py") + if len(found) > 1: + raise ValueError("Ambiguous install requirements in setup.py") + deps_str = found[0] + # evaluate list of dependencies (including potential version pins) + deps: list = eval("[" + deps_str + "]") + # remove versions where applicable + return set(re.split("[<=>~]", dep)[0] for dep in deps) + + class DirectDependenciesTests(unittest.TestCase): def setUp(self) -> None: @@ -35,6 +59,10 @@ def test_deps_are_installed_packages(self): with self.subTest(f"Has '{dep}'"): envsnapshot.pkg_resources.require(dep) + def test_deps_are_same_as_per_regex(self): + regex_deps = get_direct_dependencies_regex() + self.assertEqual(regex_deps, self.direct_deps) + class EnvSnapshotAloneTests(unittest.TestCase): From 205f26234f8cf5fcc5bb6a7f48b9347250149e87 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Jun 2024 15:53:01 +0100 Subject: [PATCH 10/10] DO NOT INCLUDE: only run relevant test --- .github/workflows/main.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7c7a2b742..02c55cd12 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,15 +25,9 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements-dev.txt - - name: Check types - run: | - python -m mypy --follow-imports=normal medcat - - name: Lint - run: | - flake8 medcat - name: Test run: | - timeout 17m python -m unittest discover + python -m unittest tests.utils.saving.test_envsnapshot publish-to-test-pypi: