cleaning up data api file cache (#737)
* api_client.Client: introduce purge_cache method that deletes obsolete files from ~/climada/data

* api_client.purge_cache: simplification

* util.api_client: improved readability

* doc: Client.purge_cache

* doc.api_client: cosmetics
emanuel-schmid authored Jun 19, 2023
1 parent 654f9cb commit c15fc53
Showing 4 changed files with 121 additions and 7 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -23,7 +23,8 @@ Removed:
- `climada.util.coordinates.match_centroids` method for matching (hazard) centroids to GeoDataFrames [#602](https://github.com/CLIMADA-project/climada_python/pull/602)
- 'Extra' requirements `doc`, `test`, and `dev` for Python package [#712](https://github.com/CLIMADA-project/climada_python/pull/712)
- Added method `Exposures.centroids_total_value` to replace the functionality of `Exposures.affected_total_value`. This method is temporary and deprecated. [#702](https://github.com/CLIMADA-project/climada_python/pull/702)

- New method `climada.util.api_client.Client.purge_cache`: utility function to remove outdated files from the local file system to free disk space.
([#737](https://github.com/CLIMADA-project/climada_python/pull/737))

### Changed

42 changes: 41 additions & 1 deletion climada/test/test_api_client.py
@@ -19,8 +19,8 @@
Test save module.
"""
from pathlib import Path
import tempfile
import unittest
from shutil import rmtree

import numpy as np

@@ -233,6 +233,46 @@ def test_multiplicity_split(self):
self.assertEqual(straight, {'b': '1'})
self.assertEqual(multi, {'country_name': ['x', 'y', 'z']})

def test_purge_cache(self):
client = Client()

active_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v2")
outdated_ds = client.get_dataset_info(data_type="litpop", name="LitPop_150arcsec_ABW", version="v1")
test_ds = client.get_dataset_info(data_type="storm_europe", name="test_storm_europe_icon_2021012800", version="v1", status="test_dataset")
expired_ds = client.get_dataset_info(data_type="tropical_cyclone", name="rename_files2", version="v1", status="expired")

with tempfile.TemporaryDirectory() as temp_dir:
for ds in [active_ds, outdated_ds, test_ds, expired_ds]:
client.download_dataset(dataset=ds, target_dir=Path(temp_dir))
self.assertEqual( # outdated dataset present
1,
len(list(Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').iterdir()))
)
self.assertEqual( # expired data set present
1,
len(list(Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').iterdir()))
)

client.purge_cache(target_dir=temp_dir)
self.assertFalse( # outdated data set removed
Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v1').is_dir()
)
self.assertFalse( # expired data set removed
Path(temp_dir).joinpath('hazard/tropical_cyclone/rename_files2/v1').is_dir()
)
self.assertEqual( # test files are still there
3,
len(list(Path(temp_dir).joinpath('hazard/storm_europe/test_storm_europe_icon_2021012800/v1').iterdir()))
)

client.purge_cache(target_dir=temp_dir, keep_testfiles=False)
        self.assertTrue( # up-to-date active dataset file still there
Path(temp_dir).joinpath('exposures/litpop/LitPop_150arcsec_ABW/v2/LitPop_150arcsec_ABW.hdf5').exists()
)
self.assertFalse( # test data removed, empty directories removed
Path(temp_dir).joinpath('hazard/').exists()
)


def rm_empty_dir(folder):
for subfolder in folder.iterdir():
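The paths asserted in `test_purge_cache` follow the cache layout `<group>/<data type>/<name>/<version>/<file name>` beneath the download directory. A minimal sketch of that layout, assuming the default cache location `~/climada/data` mentioned in the commit message:

```python
# Minimal sketch, assuming the default cache root ~/climada/data stated in the
# commit message: where a downloaded dataset file ends up on disk, following the
# <group>/<data_type>/<name>/<version>/<file> layout the test asserts above.
from pathlib import Path

cache_root = Path.home() / "climada" / "data"
litpop_v2 = cache_root / "exposures" / "litpop" / "LitPop_150arcsec_ABW" / "v2"
print(litpop_v2 / "LitPop_150arcsec_ABW.hdf5")  # expected location of the active file
```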
71 changes: 66 additions & 5 deletions climada/util/api_client.py
@@ -23,6 +23,7 @@
import hashlib
import json
import logging
from os.path import commonprefix
from pathlib import Path
from urllib.parse import quote, unquote, urlsplit, urlunsplit
import time
@@ -594,14 +595,20 @@ def _download_file(self, local_path, fileinfo, check=checksize, retries=3):
local_path /= fileinfo.file_name
downloaded = self._tracked_download(remote_url=fileinfo.url, local_path=local_path)
if not downloaded.enddownload:
raise Download.Failed("Download seems to be in progress, please try again later"
" or remove cache entry by calling"
f" `Client.purge_cache(Path('{local_path}'))`!")
raise Download.Failed(f"A download of {fileinfo.url} via the API Client has been"
" requested before. Either it is still in progress or the"
" process got interrupted. In the former case just wait"
" until the download has finished and try again, in the"
f" latter run `Client.purge_cache_db(Path('{local_path}'))`"
" from Python. If unsure, check your internet connection,"
" wait for as long as it takes to download a file of size"
f" {fileinfo.file_size} and try again. If the problem"
" persists, purge the cache db with said call.")
try:
check(local_path, fileinfo)
except Download.Failed as dlf:
local_path.unlink(missing_ok=True)
self.purge_cache(local_path)
self.purge_cache_db(local_path)
raise dlf
return local_path
except Download.Failed as dle:
@@ -663,7 +670,7 @@ def _organize_path(dataset, target_dir):
return target_dir

@staticmethod
def purge_cache(local_path):
def purge_cache_db(local_path):
"""Removes entry from the sqlite database that keeps track of files downloaded by
`cached_download`. This may be necessary in case a previous attempt has failed
        in an uncontrolled way (power outage or the like).
@@ -1009,3 +1016,57 @@ def into_files_df(dataset_infos):
"""
return Client.into_datasets_df(dataset_infos) \
.merge(pd.DataFrame([dsfile for ds in dataset_infos for dsfile in ds.files]))

def purge_cache(self, target_dir=SYSTEM_DIR, keep_testfiles=True):
"""Removes downloaded dataset files from the given directory if they have been downloaded
with the API client, if they are beneath the given directory and if one of the following
is the case:
        - their status is neither 'active' nor 'test_dataset'
- their status is 'test_dataset' and keep_testfiles is set to False
- their status is 'active' and they are outdated, i.e., there is a dataset with the same
          data_type and name but a newer version.

        Parameters
----------
target_dir : Path or str, optional
files downloaded beneath this directory and empty subdirectories will be removed.
default: SYSTEM_DIR
keep_testfiles : bool, optional
if set to True, files from datasets with status 'test_dataset' will not be removed.
default: True
"""

# collect urls from datasets that should not be removed
test_datasets = self.list_dataset_infos(status='test_dataset') if keep_testfiles else []
test_urls = set(
file_info.url for ds_info in test_datasets for file_info in ds_info.files)

active_datasets = self.list_dataset_infos(status='active', version='newest')
active_urls = set(
file_info.url for ds_info in active_datasets for file_info in ds_info.files)

not_to_be_removed = test_urls.union(active_urls)

# make a list of downloaded files that could be removed
to_be_removed = [d for d in Download.select() if d.url not in not_to_be_removed]

        # normalize target_dir for filtering downloads by path prefix
target_dir = Path(target_dir).absolute()

# remove files and sqlite db entries
for obsolete in to_be_removed:
opath = Path(obsolete.path)
if opath.exists() and Path(commonprefix([target_dir, opath])) == target_dir:
opath.unlink()
obsolete.delete_instance()

# clean up: remove all empty directories beneath target_dir
def rm_empty_dirs(directory: Path):
for subdir in directory.iterdir():
if subdir.is_dir():
rm_empty_dirs(subdir)
try:
directory.rmdir()
except OSError: # raised when the directory is not empty
pass
rm_empty_dirs(target_dir)
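The reworked `Download.Failed` message in `_download_file` above directs users to `Client.purge_cache_db` when an earlier download was interrupted. A minimal sketch of that recovery path, using a hypothetical file path in place of the stuck cache entry:

```python
# Sketch of the recovery suggested by the Download.Failed message; `stuck_path`
# is a hypothetical example beneath the default cache directory ~/climada/data.
from pathlib import Path

from climada.util.api_client import Client

stuck_path = Path.home() / "climada" / "data" / "hazard" / "storm_europe" / "some_file.hdf5"

Client.purge_cache_db(stuck_path)   # drop the stale entry from the sqlite tracking db
stuck_path.unlink(missing_ok=True)  # remove the partially downloaded file, if any
```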
12 changes: 12 additions & 0 deletions doc/tutorial/climada_util_api_client.ipynb
@@ -1204,6 +1204,18 @@
"ds_files[0], ds_files[0].is_file()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Local File Cache\n",
"\n",
"By default, the API Client downloads files into the `~/climada/data` directory.\n",
"\n",
"In the course of time obsolete files may be accumulated within this directory, because there is a newer version of these files available from the [CLIMADA data API](https://climada.ethz.ch), or because the according dataset got expired altogether.\\\n",
"To prevent file rot and free disk space, it's possible to remove all outdated files at once, by simply calling `Client().purge_cache()`. This will remove all files that were ever downloaded with the `api_client.Client` and for which a newer version exists, even when the newer version has not been downloaded yet."
]
},
{
"cell_type": "markdown",
"metadata": {
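A usage sketch matching the new tutorial text and the `purge_cache` parameters documented above (not a code cell from this commit):

```python
# Minimal sketch based on the documented Client.purge_cache signature.
from climada.util.api_client import Client

client = Client()
client.purge_cache()                      # clean ~/climada/data, keep 'test_dataset' files
client.purge_cache(keep_testfiles=False)  # also remove test dataset files

# target_dir accepts a Path or str; this alternative cache directory is hypothetical.
client.purge_cache(target_dir="/tmp/climada_cache")
```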
