cleaning up data api file cache #737

Merged
5 commits merged on Jun 19, 2023
Changes from 2 commits

42 changes: 41 additions & 1 deletion climada/test/test_api_client.py
@@ -19,8 +19,8 @@
Test save module.
"""
from pathlib import Path
import tempfile
import unittest
from shutil import rmtree

import numpy as np

@@ -233,6 +233,46 @@ def test_multiplicity_split(self):
self.assertEqual(straight, {'b': '1'})
self.assertEqual(multi, {'country_name': ['x', 'y', 'z']})

def test_purge_cache(self):
client = Client()

active_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v2")
outdated_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v1")
test_ds = client.get_dataset_info(data_type="storm_europe", name="test_storm_europe_icon_2021012800", version="v1", status="test_dataset")
expired_ds = client.get_dataset_info(data_type="tropical_cyclone", name="rename_files2", version="v1", status="expired")

with tempfile.TemporaryDirectory() as temp_dir:
for ds in [active_ds, outdated_ds, test_ds, expired_ds]:
client.download_dataset(dataset=ds, target_dir=Path(temp_dir))
self.assertEqual( # outdated dataset present
1,
len(list(Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').iterdir()))
)
self.assertEqual( # expired data set present
1,
len(list(Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').iterdir()))
)

client.purge_cache(target_dir=temp_dir)
self.assertFalse( # outdated data set removed
Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').is_dir()
)
self.assertFalse( # expired data set removed
Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').is_dir()
)
self.assertEqual( # test files are still there
3,
Member:
Why are there 3 test files?

Collaborator Author (@emanuel-schmid, Jun 19, 2023):
That's just the nature of that test dataset. Datasets can have any number of files.

Member:
Thanks. I meant, why does this particular test result in 3 files?

Collaborator Author:
Beats me. I picked it for being small (in file size) and expired. I suspect it's an experimental dataset that was used to explore the data api itself.

Member:
Should we use a better-known test file then? The test looks rather mysterious like this.

Collaborator Author:
Oh. Wrong. Sorry. The one with 3 files is used in TestStormEurope.test_icon_read. Reading icon files takes a directory as input and collects data from there. Having more than one makes complete sense. And the test is fairly known. Apart from that I think it doesn't really matter which dataset we pick as long as size is acceptable and status and version make a difference.

Member:
All right, if it is clear to you, I am fine with it.

len(list(Path(temp_dir).joinpath('hazard/storm_europe/test_storm_europe_icon_2021012800/v1').iterdir()))
)

client.purge_cache(target_dir=temp_dir, keep_testfiles=False)
self.assertTrue( # uptodate active dataset file still there
Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v2/LitPop_150arcsec_ABW.hdf5').exists()
)
self.assertFalse( # test data removed, empty directories removed
Path(temp_dir).joinpath('hazard/').exists()
)


def rm_empty_dir(folder):
for subfolder in folder.iterdir():
61 changes: 57 additions & 4 deletions climada/util/api_client.py
@@ -23,6 +23,7 @@
import hashlib
import json
import logging
from os.path import commonprefix
from pathlib import Path
from urllib.parse import quote, unquote, urlsplit, urlunsplit
import time
@@ -595,13 +596,13 @@ def _download_file(self, local_path, fileinfo, check=checksize, retries=3):
downloaded = self._tracked_download(remote_url=fileinfo.url, local_path=local_path)
if not downloaded.enddownload:
raise Download.Failed("Download seems to be in progress, please try again later"
" or remove cache entry by calling"
f" `Client.purge_cache(Path('{local_path}'))`!")
" or remove cache entry from database by calling"
f" `Client.purge_cache_db(Path('{local_path}'))`!")
Member:
I think this is a bit too difficult to understand. When should a user purge the cache? When should a user wait for the download? Any way to help the user better here?

Collaborator Author:
The exception is more verbose now.

Member:
Very clear now!
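
Editor's note, for illustration only and not part of the diff: a minimal sketch of how a caller might act on this exception, assuming no other process is actually downloading the file. The variable `stuck_file` is a placeholder for the path reported in the exception message, not part of the API.

from pathlib import Path

from climada.util.api_client import Client, Download

client = Client()
dataset = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v2")
try:
    client.download_dataset(dataset=dataset)
except Download.Failed:
    # If the download is not actually running elsewhere, the tracking entry is
    # stale (e.g. after a power outage): remove it from the database and retry.
    stuck_file = Path("LitPop_150arcsec_ABW.hdf5")  # placeholder for the path in the message
    Client.purge_cache_db(stuck_file)
    client.download_dataset(dataset=dataset)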

try:
check(local_path, fileinfo)
except Download.Failed as dlf:
local_path.unlink(missing_ok=True)
self.purge_cache(local_path)
self.purge_cache_db(local_path)
raise dlf
return local_path
except Download.Failed as dle:
@@ -663,7 +664,7 @@ def _organize_path(dataset, target_dir):
return target_dir

@staticmethod
def purge_cache(local_path):
def purge_cache_db(local_path):
"""Removes entry from the sqlite database that keeps track of files downloaded by
`cached_download`. This may be necessary in case a previous attempt has failed
in an uncontrolled way (power outage or the like).
@@ -1009,3 +1010,55 @@ def into_files_df(dataset_infos):
"""
return Client.into_datasets_df(dataset_infos) \
.merge(pd.DataFrame([dsfile for ds in dataset_infos for dsfile in ds.files]))

def purge_cache(self, target_dir=SYSTEM_DIR, keep_testfiles=True):
"""Removes downloaded dataset files from the given directory if they have been downloaded
with the API client, if they are beneath the given directory and if one of the following
is the case:
- their status is neither 'active' nor 'test_dataset'
= their status is 'test_dataset' and keep_testfiles is set to False
Member:
Suggested change
= their status is 'test_dataset' and keep_testfiles is set to False
- their status is 'test_dataset' and keep_testfiles is set to False

- their status is 'active' and they are outdated, i.e., there is a dataset with the same
data_type and name but a newer version.

Parameters
----------
target_dir : Path or str, optional
files downloaded beneath this directory and empty subdirectories will be removed.
default: SYSTEM_DIR
keep_testfiles : bool, optional
if set to True, files from datasets with status 'test_dataset' will not be removed.
default: True
"""

# collect urls from datasets that should not be removed
test_datasets = self.list_dataset_infos(status='test_dataset') if keep_testfiles else []
test_urls = set(filinf.url for dsinf in test_datasets for filinf in dsinf.files)
Member:
I do not understand the short variables dsinf and filinf. Are these some standard abbreviations?

Collaborator Author:
They're called ds_info and file_info now.


active_datasets = self.list_dataset_infos(status='active', version='newest')
active_urls = set(filinf.url for dsinf in active_datasets for filinf in dsinf.files)

not_to_be_removed = test_urls.union(active_urls)

# make a list of downloaded files that could be removed
to_be_removed = [d for d in Download.select() if d.url not in not_to_be_removed]

# helper function for filtering by target_dir
target_dir = Path(target_dir).absolute()

# remove files and sqlite db entries
for obsolete in to_be_removed:
opath = Path(obsolete.path)
if opath.exists() and Path(commonprefix([target_dir, opath])) == target_dir:
opath.unlink()
obsolete.delete_instance()

# clean up: remove all empty directories beneath target_dir
def rm_empty_dirs(directory: Path):
for subdir in directory.iterdir():
if subdir.is_dir():
rm_empty_dirs(subdir)
try:
directory.rmdir()
except OSError:
Member:
What error is ignored here? I do not understand.

Collaborator Author:
I've added an inline comment # raised when directory is not empty

pass
rm_empty_dirs(target_dir)
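
Editor's note, not from the PR: a hedged usage sketch of the new method, assuming the default SYSTEM_DIR cache and a reachable data API. The custom directory path below is an illustrative placeholder.

from pathlib import Path

from climada.util.api_client import Client

client = Client()

# Drop outdated, expired and otherwise inactive dataset files from the default
# cache directory, keeping files that belong to test datasets (the default).
client.purge_cache()

# Clean a custom download directory and remove test dataset files as well.
client.purge_cache(target_dir=Path("/tmp/climada_cache"), keep_testfiles=False)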