diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 208d702..01ca457 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,9 +5,9 @@ name: Tests
 # Controls when the action will run.
 on:
   push:
-    branches: [ master ]
+    branches:
+      - "master"
   pull_request:
-    branches: [ master ]
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -20,8 +20,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9]
-        os: [ubuntu-latest, macos-latest]
+        python-version: [3.7, '3.10']
+        os: [ubuntu-latest, macos-latest, windows-latest]
 
     steps:
       - uses: actions/checkout@v2
diff --git a/README.md b/README.md
index 67d7491..29d56dc 100644
--- a/README.md
+++ b/README.md
@@ -2,30 +2,40 @@
 Due to licensing we are not allowed to distribute the IMDB datasets (more info on their license can be found [here](https://help.imdb.com/article/imdb/general-information/can-i-use-imdb-data-in-my-software/G5JTRESSHJBBHTGX?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=3aefe545-f8d3-4562-976a-e5eb47d1bb18&pf_rd_r=2TNAA9FRS3TJWM3AEQ2X&pf_rd_s=center-1&pf_rd_t=60601&pf_rd_i=interfaces&ref_=fea_mn_lk1#)).
 What we can do is let you build the IMDB side of the entity resolution datasets yourself. Please be aware that the mentioned license applies to the IMDB data you produce.
 
-# Dependencies
-The only dependency is `requests`, although with `tqdm` you will have nice progress bars (this is optional).
-
-# Getting the data
-The TMDB and TVDB datasets are already provided in this repo and where created from the public APIs of [TheMovieDB](https://www.themoviedb.org/documentation/api) and [TVDB](https://www.thetvdb.com/api-information). What you have to do is create the IMDB data.
+# Usage
+You can simply install the package via pip:
+```bash
+pip install moviegraphbenchmark
+```
+and then run
+```bash
+moviegraphbenchmark
+```
+which will create the data in the default data path `~/.data/moviegraphbenchmark/data`.
 
-If you love one-liners and trust random people on the internet (that promise to be nice) you can simply run:
+You can also specify a different folder for the data:
 ```bash
-curl -sSL https://raw.githubusercontent.com/ScaDS/MovieGraphBenchmark/master/src/main.py | python3 -
+moviegraphbenchmark --data-path anotherpath
 ```
-This will download this repo, execute `python src/create_graph.py`, which downloads the IMDB data and creates the missing datasets. Furthermore it cleans up and only leaves a `ScaDSMovieGraphBenchmark` in your current directory with the datasets.
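If you want to inspect the raw files yourself: the pair folders contain plain tab-separated triple files without header rows (they are written via `"\t".join` in `create_graph.py`), so an ordinary `pandas` read works. A minimal sketch, assuming the default data path from above and the `imdb-tmdb` pair; the column names are only illustrative:

```python
import csv
import os

import pandas as pd

# Default CLI data path (see above); adjust this if you used --data-path.
data_dir = os.path.expanduser("~/.data/moviegraphbenchmark/data/imdb-tmdb")

# The triple files are tab-separated with no header row; QUOTE_NONE keeps
# quote characters inside literals (e.g. abstracts) untouched.
attr_triples = pd.read_csv(
    os.path.join(data_dir, "attr_triples_1"),
    sep="\t",
    header=None,
    names=["subject", "predicate", "object"],
    quoting=csv.QUOTE_NONE,
)
print(attr_triples.head())
```

The `load_data` helper described next returns the same content as a ready-made dataclass of dataframes, so manual reading is only needed for ad-hoc inspection.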
+For ease of use in your project, you can also use this library to load the data (this will create the data if it's not present):
-You can also specify a specific directory where data should go:
+```python
+from moviegraphbenchmark import load_data
+ds = load_data()
+# by default this will load `imdb-tmdb`
+print(ds.attr_triples_1)
 
-```bash
-curl -sSL https://raw.githubusercontent.com/ScaDS/MovieGraphBenchmark/master/src/main.py | python3 - mypath/benchmarkfolder
-```
+# specify another pair and a specific data path
+ds = load_data(pair="imdb-tmdb", data_path="anotherpath")
 
-If you don't like piping scripts from the internet (or you use windows) you can do the steps by yourself:
-```
-git clone https://github.com/ScaDS/MovieGraphBenchmark.git
-cd MovieGraphBenchmark
-python3 src/create_graph.py
+# the dataclass contains all the files loaded as pandas dataframes
+print(ds.attr_triples_2)
+print(ds.rel_triples_1)
+print(ds.rel_triples_2)
+print(ds.ent_links)
+for fold in ds.folds:
+    print(fold)
 ```
 
 # Dataset structure
diff --git a/data/imdb-tmdb/attr_triples_2 b/data/imdb-tmdb/attr_triples_2
index 1a7a737..e108f33 100644
--- a/data/imdb-tmdb/attr_triples_2
+++ b/data/imdb-tmdb/attr_triples_2
@@ -10925,7 +10925,7 @@ https://www.scads.de/movieBenchmark/resource/TMDB/person1336352 http://xmlns.com
 https://www.scads.de/movieBenchmark/resource/TMDB/episode1282716 http://dbpedia.org/ontology/releaseDate 2017-03-14^^
 https://www.scads.de/movieBenchmark/resource/TMDB/episode311214 http://dbpedia.org/ontology/episodeNumber 3^^
 https://www.scads.de/movieBenchmark/resource/TMDB/tvSeries75891 http://dbpedia.org/ontology/abstract Orphan Remi (13) goes on a incredible journey to find his family.^^
-https://www.scads.de/movieBenchmark/resource/TMDB/movie167366 http://dbpedia.org/ontology/abstract After airing more than 100 MTV Unplugged specials, MTV wanted to bring back the series, in order to expose them to a younger generation. The channel recruited various mainstream and popular artists to perform as part of the series, including Perry, who particularly expressed interest in the idea as it would allow her to showcase herself as an artist and share the stories behind her songs. The extended play includes rearrangements of five songs from Perry's album One of the Boys (2008), a previously unreleased original song and a cover version of a song by Fountains of Wayne. Alongside the audio disc, the album includes a DVD with the video recording of her performance and an exclusive interview.^^
+https://www.scads.de/movieBenchmark/resource/TMDB/movie167366 http://dbpedia.org/ontology/abstract After airing more than 100 MTV Unplugged specials, MTV wanted to bring back the series, in order to expose them to a younger generation. The channel recruited various mainstream and popular artists to perform as part of the series, including Perry, who particularly expressed interest in the idea as it would allow her to showcase herself as an artist and share the stories behind her songs. The extended play includes rearrangements of five songs from Perry's album One of the Boys (2008), a previously unreleased original song and a cover version of a song by Fountains of Wayne.
Alongside the audio disc, the album includes a DVD with the video recording of her performance and an exclusive interview.^^ https://www.scads.de/movieBenchmark/resource/TMDB/tvSeries70115 http://dbpedia.org/ontology/releaseDate 2016-11-18^^ https://www.scads.de/movieBenchmark/resource/TMDB/episode513361 http://dbpedia.org/ontology/episodeNumber 13^^ https://www.scads.de/movieBenchmark/resource/TMDB/episode182364 http://dbpedia.org/ontology/abstract While the Monarch schemes to dispose of Dr. Venture once and for all, Henchman 21 seeks revenge on Brock for the death of 24.^^ diff --git a/data/tmdb-tvdb/attr_triples_1 b/data/tmdb-tvdb/attr_triples_1 index 8a3e8a7..49a9750 100644 --- a/data/tmdb-tvdb/attr_triples_1 +++ b/data/tmdb-tvdb/attr_triples_1 @@ -10925,7 +10925,7 @@ https://www.scads.de/movieBenchmark/resource/TMDB/person1336352 http://xmlns.com https://www.scads.de/movieBenchmark/resource/TMDB/episode311214 http://dbpedia.org/ontology/episodeNumber 3^^ https://www.scads.de/movieBenchmark/resource/TMDB/episode1282716 http://dbpedia.org/ontology/releaseDate 2017-03-14^^ https://www.scads.de/movieBenchmark/resource/TMDB/tvSeries75891 http://dbpedia.org/ontology/abstract Orphan Remi (13) goes on a incredible journey to find his family.^^ -https://www.scads.de/movieBenchmark/resource/TMDB/movie167366 http://dbpedia.org/ontology/abstract After airing more than 100 MTV Unplugged specials, MTV wanted to bring back the series, in order to expose them to a younger generation. The channel recruited various mainstream and popular artists to perform as part of the series, including Perry, who particularly expressed interest in the idea as it would allow her to showcase herself as an artist and share the stories behind her songs. The extended play includes rearrangements of five songs from Perry's album One of the Boys (2008), a previously unreleased original song and a cover version of a song by Fountains of Wayne. Alongside the audio disc, the album includes a DVD with the video recording of her performance and an exclusive interview.^^ +https://www.scads.de/movieBenchmark/resource/TMDB/movie167366 http://dbpedia.org/ontology/abstract After airing more than 100 MTV Unplugged specials, MTV wanted to bring back the series, in order to expose them to a younger generation. The channel recruited various mainstream and popular artists to perform as part of the series, including Perry, who particularly expressed interest in the idea as it would allow her to showcase herself as an artist and share the stories behind her songs. The extended play includes rearrangements of five songs from Perry's album One of the Boys (2008), a previously unreleased original song and a cover version of a song by Fountains of Wayne. Alongside the audio disc, the album includes a DVD with the video recording of her performance and an exclusive interview.^^ https://www.scads.de/movieBenchmark/resource/TMDB/tvSeries70115 http://dbpedia.org/ontology/releaseDate 2016-11-18^^ https://www.scads.de/movieBenchmark/resource/TMDB/episode513361 http://dbpedia.org/ontology/episodeNumber 13^^ https://www.scads.de/movieBenchmark/resource/TMDB/episode182364 http://dbpedia.org/ontology/abstract While the Monarch schemes to dispose of Dr. 
Venture once and for all, Henchman 21 seeks revenge on Brock for the death of 24.^^ diff --git a/noxfile.py b/noxfile.py index 076fe8b..f47ef89 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,16 +1,11 @@ import nox - -@nox.session -def tests_without_tqdm(session): - session.install("pytest") - session.install("requests") - session.run("pytest") - - @nox.session def tests(session): session.install("pytest") + session.install(".") session.install("requests") session.install("tqdm") + session.install("pandas") + session.install("pystow") session.run("pytest") diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..3959dc6 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,752 @@ +[[package]] +name = "appnope" +version = "0.1.3" +description = "Disable App Nap on macOS >= 10.9" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "atomicwrites" +version = "1.4.1" +description = "Atomic file writes." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.4.0" +description = "Classes Without Boilerplate" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit", "cloudpickle"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] + +[[package]] +name = "backcall" +version = "0.2.0" +description = "Specifications for callback functions passed in to an API" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "certifi" +version = "2022.6.15" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "charset-normalizer" +version = "2.1.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" +optional = false +python-versions = ">=3.6.0" + +[package.extras] +unicode_backport = ["unicodedata2"] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + +[[package]] +name = "colorama" +version = "0.4.5" +description = "Cross-platform colored terminal text." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +category = "dev" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "idna" +version = "3.3" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" +optional = false +python-versions = ">=3.5" + +[[package]] +name = "importlib-metadata" +version = "4.12.0" +description = "Read metadata from Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} +zipp = ">=0.5" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +perf = ["ipython"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "ipdb" +version = "0.13.9" +description = "IPython-enabled pdb" +category = "dev" +optional = false +python-versions = ">=2.7" + +[package.dependencies] +decorator = {version = "*", markers = "python_version > \"3.6\""} +ipython = {version = ">=7.17.0", markers = "python_version > \"3.6\""} +toml = {version = ">=0.10.2", markers = "python_version > \"3.6\""} + +[[package]] +name = "ipython" +version = "7.34.0" +description = "IPython: Productive Interactive Computing" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +appnope = {version = "*", markers = "sys_platform == \"darwin\""} +backcall = "*" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +decorator = "*" +jedi = ">=0.16" +matplotlib-inline = "*" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} +pickleshare = "*" +prompt-toolkit = ">=2.0.0,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.1.0" +pygments = "*" +traitlets = ">=4.2" + +[package.extras] +all = ["Sphinx (>=1.3)", "ipykernel", "ipyparallel", "ipywidgets", "nbconvert", "nbformat", "nose (>=0.10.1)", "notebook", "numpy (>=1.17)", "pygments", "qtconsole", "requests", "testpath"] +doc = ["Sphinx (>=1.3)"] +kernel = ["ipykernel"] +nbconvert = ["nbconvert"] +nbformat = ["nbformat"] +notebook = ["notebook", "ipywidgets"] +parallel = ["ipyparallel"] +qtconsole = ["qtconsole"] +test = ["nose (>=0.10.1)", "requests", "testpath", "pygments", "nbformat", "ipykernel", "numpy (>=1.17)"] + +[[package]] +name = "jedi" +version = "0.18.1" +description = "An autocompletion tool for Python that can be used for text editors." +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +parso = ">=0.8.0,<0.9.0" + +[package.extras] +qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] +testing = ["Django (<3.1)", "colorama", "docopt", "pytest (<7.0.0)"] + +[[package]] +name = "matplotlib-inline" +version = "0.1.3" +description = "Inline Matplotlib backend for Jupyter" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +traitlets = "*" + +[[package]] +name = "numpy" +version = "1.21.6" +description = "NumPy is the fundamental package for array computing with Python." 
+category = "main" +optional = false +python-versions = ">=3.7,<3.11" + +[[package]] +name = "numpy" +version = "1.23.1" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" +optional = false +python-versions = ">=3.8" + +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + +[[package]] +name = "pandas" +version = "1.3.5" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.7.1" + +[package.dependencies] +numpy = [ + {version = ">=1.17.3", markers = "platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, + {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, +] +python-dateutil = ">=2.7.3" +pytz = ">=2017.3" + +[package.extras] +test = ["hypothesis (>=3.58)", "pytest (>=6.0)", "pytest-xdist"] + +[[package]] +name = "parso" +version = "0.8.3" +description = "A Python Parser" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] +testing = ["docopt", "pytest (<6.0.0)"] + +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +ptyprocess = ">=0.5" + +[[package]] +name = "pickle5" +version = "0.0.12" +description = "Backport of the pickle 5 protocol (PEP 574) and other pickle changes" +category = "main" +optional = false +python-versions = ">=3.5, <3.8" + +[[package]] +name = "pickleshare" +version = "0.7.5" +description = "Tiny 'shelve'-like database with concurrency support" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "prompt-toolkit" +version = "3.0.30" +description = "Library for building powerful interactive command lines in Python" +category = "dev" +optional = false +python-versions = ">=3.6.2" + +[package.dependencies] +wcwidth = "*" + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "pygments" +version = "2.12.0" +description = "Pygments is a syntax highlighting package written in Python." 
+category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["railroad-diagrams", "jinja2"] + +[[package]] +name = "pystow" +version = "0.4.5" +description = "Easily pick a place to store data for your python package." +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +click = "*" +pickle5 = {version = "*", markers = "python_version < \"3.8\""} +requests = "*" +tqdm = "*" + +[package.extras] +aws = ["boto3"] +docs = ["sphinx", "sphinx-rtd-theme", "sphinx-click", "sphinx-autodoc-typehints", "sphinx-automodapi"] +pandas = ["pandas"] +rdf = ["rdflib"] +tests = ["coverage", "pytest", "requests-file"] +xml = ["lxml"] + +[[package]] +name = "pytest" +version = "7.1.2" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +tomli = ">=1.0.0" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2022.1" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "requests" +version = "2.28.1" +description = "Python HTTP for Humans." 
+category = "main" +optional = false +python-versions = ">=3.7, <4" + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<3" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "tqdm" +version = "4.64.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "traitlets" +version = "5.3.0" +description = "" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pre-commit", "pytest"] + +[[package]] +name = "typing-extensions" +version = "4.3.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "urllib3" +version = "1.26.10" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" + +[package.extras] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "wcwidth" +version = "0.2.5" +description = "Measures the displayed width of unicode strings in a terminal" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "zipp" +version = "3.8.1" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] + +[metadata] +lock-version = "1.1" +python-versions = ">=3.7.1,<3.11" +content-hash = "0da3ec7a2062a9bb73e5f647019d6b65b686411fae4e66560f6c02e2f6d11dfc" + +[metadata.files] +appnope = [ + {file = "appnope-0.1.3-py2.py3-none-any.whl", hash = "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e"}, + {file = "appnope-0.1.3.tar.gz", hash = "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"}, +] +atomicwrites = [ + {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, +] +attrs = [ + {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, + {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, +] +backcall = [ + {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, + {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, +] +certifi = [] +charset-normalizer = [] +click = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] +colorama = [ + {file = "colorama-0.4.5-py2.py3-none-any.whl", hash = "sha256:854bf444933e37f5824ae7bfc1e98d5bce2ebe4160d46b5edf346a89358e99da"}, + {file = "colorama-0.4.5.tar.gz", hash = "sha256:e6c6b4334fc50988a639d9b98aa429a0b57da6e17b9a44f0451f930b6967b7a4"}, +] +decorator = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] +idna = [ + {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, + {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, +] +importlib-metadata = [ + {file = "importlib_metadata-4.12.0-py3-none-any.whl", hash = "sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23"}, + {file = "importlib_metadata-4.12.0.tar.gz", hash = "sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670"}, +] +iniconfig = [ + 
{file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +ipdb = [ + {file = "ipdb-0.13.9.tar.gz", hash = "sha256:951bd9a64731c444fd907a5ce268543020086a697f6be08f7cc2c9a752a278c5"}, +] +ipython = [ + {file = "ipython-7.34.0-py3-none-any.whl", hash = "sha256:c175d2440a1caff76116eb719d40538fbb316e214eda85c5515c303aacbfb23e"}, + {file = "ipython-7.34.0.tar.gz", hash = "sha256:af3bdb46aa292bce5615b1b2ebc76c2080c5f77f54bda2ec72461317273e7cd6"}, +] +jedi = [ + {file = "jedi-0.18.1-py2.py3-none-any.whl", hash = "sha256:637c9635fcf47945ceb91cd7f320234a7be540ded6f3e99a50cb6febdfd1ba8d"}, + {file = "jedi-0.18.1.tar.gz", hash = "sha256:74137626a64a99c8eb6ae5832d99b3bdd7d29a3850fe2aa80a4126b2a7d949ab"}, +] +matplotlib-inline = [ + {file = "matplotlib-inline-0.1.3.tar.gz", hash = "sha256:a04bfba22e0d1395479f866853ec1ee28eea1485c1d69a6faf00dc3e24ff34ee"}, + {file = "matplotlib_inline-0.1.3-py3-none-any.whl", hash = "sha256:aed605ba3b72462d64d475a21a9296f400a19c4f74a31b59103d2a99ffd5aa5c"}, +] +numpy = [ + {file = "numpy-1.21.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8737609c3bbdd48e380d463134a35ffad3b22dc56295eff6f79fd85bd0eeeb25"}, + {file = "numpy-1.21.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fdffbfb6832cd0b300995a2b08b8f6fa9f6e856d562800fea9182316d99c4e8e"}, + {file = "numpy-1.21.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3820724272f9913b597ccd13a467cc492a0da6b05df26ea09e78b171a0bb9da6"}, + {file = "numpy-1.21.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f17e562de9edf691a42ddb1eb4a5541c20dd3f9e65b09ded2beb0799c0cf29bb"}, + {file = "numpy-1.21.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f30427731561ce75d7048ac254dbe47a2ba576229250fb60f0fb74db96501a1"}, + {file = "numpy-1.21.6-cp310-cp310-win32.whl", hash = "sha256:d4bf4d43077db55589ffc9009c0ba0a94fa4908b9586d6ccce2e0b164c86303c"}, + {file = "numpy-1.21.6-cp310-cp310-win_amd64.whl", hash = "sha256:d136337ae3cc69aa5e447e78d8e1514be8c3ec9b54264e680cf0b4bd9011574f"}, + {file = "numpy-1.21.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6aaf96c7f8cebc220cdfc03f1d5a31952f027dda050e5a703a0d1c396075e3e7"}, + {file = "numpy-1.21.6-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:67c261d6c0a9981820c3a149d255a76918278a6b03b6a036800359aba1256d46"}, + {file = "numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a6be4cb0ef3b8c9250c19cc122267263093eee7edd4e3fa75395dfda8c17a8e2"}, + {file = "numpy-1.21.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c4068a8c44014b2d55f3c3f574c376b2494ca9cc73d2f1bd692382b6dffe3db"}, + {file = "numpy-1.21.6-cp37-cp37m-win32.whl", hash = "sha256:7c7e5fa88d9ff656e067876e4736379cc962d185d5cd808014a8a928d529ef4e"}, + {file = "numpy-1.21.6-cp37-cp37m-win_amd64.whl", hash = "sha256:bcb238c9c96c00d3085b264e5c1a1207672577b93fa666c3b14a45240b14123a"}, + {file = "numpy-1.21.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:82691fda7c3f77c90e62da69ae60b5ac08e87e775b09813559f8901a88266552"}, + {file = "numpy-1.21.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:643843bcc1c50526b3a71cd2ee561cf0d8773f062c8cbaf9ffac9fdf573f83ab"}, + {file = "numpy-1.21.6-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:357768c2e4451ac241465157a3e929b265dfac85d9214074985b1786244f2ef3"}, + {file = "numpy-1.21.6-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9f411b2c3f3d76bba0865b35a425157c5dcf54937f82bbeb3d3c180789dd66a6"}, + {file = "numpy-1.21.6-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4aa48afdce4660b0076a00d80afa54e8a97cd49f457d68a4342d188a09451c1a"}, + {file = "numpy-1.21.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a96eef20f639e6a97d23e57dd0c1b1069a7b4fd7027482a4c5c451cd7732f4"}, + {file = "numpy-1.21.6-cp38-cp38-win32.whl", hash = "sha256:5c3c8def4230e1b959671eb959083661b4a0d2e9af93ee339c7dada6759a9470"}, + {file = "numpy-1.21.6-cp38-cp38-win_amd64.whl", hash = "sha256:bf2ec4b75d0e9356edea834d1de42b31fe11f726a81dfb2c2112bc1eaa508fcf"}, + {file = "numpy-1.21.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4391bd07606be175aafd267ef9bea87cf1b8210c787666ce82073b05f202add1"}, + {file = "numpy-1.21.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67f21981ba2f9d7ba9ade60c9e8cbaa8cf8e9ae51673934480e45cf55e953673"}, + {file = "numpy-1.21.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee5ec40fdd06d62fe5d4084bef4fd50fd4bb6bfd2bf519365f569dc470163ab0"}, + {file = "numpy-1.21.6-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1dbe1c91269f880e364526649a52eff93ac30035507ae980d2fed33aaee633ac"}, + {file = "numpy-1.21.6-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d9caa9d5e682102453d96a0ee10c7241b72859b01a941a397fd965f23b3e016b"}, + {file = "numpy-1.21.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58459d3bad03343ac4b1b42ed14d571b8743dc80ccbf27444f266729df1d6f5b"}, + {file = "numpy-1.21.6-cp39-cp39-win32.whl", hash = "sha256:7f5ae4f304257569ef3b948810816bc87c9146e8c446053539947eedeaa32786"}, + {file = "numpy-1.21.6-cp39-cp39-win_amd64.whl", hash = "sha256:e31f0bb5928b793169b87e3d1e070f2342b22d5245c755e2b81caa29756246c3"}, + {file = "numpy-1.21.6-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dd1c8f6bd65d07d3810b90d02eba7997e32abbdf1277a481d698969e921a3be0"}, + {file = "numpy-1.21.6.zip", hash = "sha256:ecb55251139706669fdec2ff073c98ef8e9a84473e51e716211b41aa0f18e656"}, +] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] +pandas = [ + {file = "pandas-1.3.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:62d5b5ce965bae78f12c1c0df0d387899dd4211ec0bdc52822373f13a3a022b9"}, + {file = "pandas-1.3.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:adfeb11be2d54f275142c8ba9bf67acee771b7186a5745249c7d5a06c670136b"}, + {file = "pandas-1.3.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:60a8c055d58873ad81cae290d974d13dd479b82cbb975c3e1fa2cf1920715296"}, + {file = "pandas-1.3.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd541ab09e1f80a2a1760032d665f6e032d8e44055d602d65eeea6e6e85498cb"}, + {file = "pandas-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2651d75b9a167cc8cc572cf787ab512d16e316ae00ba81874b560586fa1325e0"}, + {file = "pandas-1.3.5-cp310-cp310-win_amd64.whl", hash = "sha256:aaf183a615ad790801fa3cf2fa450e5b6d23a54684fe386f7e3208f8b9bfbef6"}, + {file = "pandas-1.3.5-cp37-cp37m-macosx_10_9_x86_64.whl", 
hash = "sha256:344295811e67f8200de2390093aeb3c8309f5648951b684d8db7eee7d1c81fb7"}, + {file = "pandas-1.3.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:552020bf83b7f9033b57cbae65589c01e7ef1544416122da0c79140c93288f56"}, + {file = "pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5cce0c6bbeb266b0e39e35176ee615ce3585233092f685b6a82362523e59e5b4"}, + {file = "pandas-1.3.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d28a3c65463fd0d0ba8bbb7696b23073efee0510783340a44b08f5e96ffce0c"}, + {file = "pandas-1.3.5-cp37-cp37m-win32.whl", hash = "sha256:a62949c626dd0ef7de11de34b44c6475db76995c2064e2d99c6498c3dba7fe58"}, + {file = "pandas-1.3.5-cp37-cp37m-win_amd64.whl", hash = "sha256:8025750767e138320b15ca16d70d5cdc1886e8f9cc56652d89735c016cd8aea6"}, + {file = "pandas-1.3.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fe95bae4e2d579812865db2212bb733144e34d0c6785c0685329e5b60fcb85dd"}, + {file = "pandas-1.3.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f261553a1e9c65b7a310302b9dbac31cf0049a51695c14ebe04e4bfd4a96f02"}, + {file = "pandas-1.3.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b6dbec5f3e6d5dc80dcfee250e0a2a652b3f28663492f7dab9a24416a48ac39"}, + {file = "pandas-1.3.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3bc49af96cd6285030a64779de5b3688633a07eb75c124b0747134a63f4c05f"}, + {file = "pandas-1.3.5-cp38-cp38-win32.whl", hash = "sha256:b6b87b2fb39e6383ca28e2829cddef1d9fc9e27e55ad91ca9c435572cdba51bf"}, + {file = "pandas-1.3.5-cp38-cp38-win_amd64.whl", hash = "sha256:a395692046fd8ce1edb4c6295c35184ae0c2bbe787ecbe384251da609e27edcb"}, + {file = "pandas-1.3.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bd971a3f08b745a75a86c00b97f3007c2ea175951286cdda6abe543e687e5f2f"}, + {file = "pandas-1.3.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37f06b59e5bc05711a518aa10beaec10942188dccb48918bb5ae602ccbc9f1a0"}, + {file = "pandas-1.3.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c21778a688d3712d35710501f8001cdbf96eb70a7c587a3d5613573299fdca6"}, + {file = "pandas-1.3.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3345343206546545bc26a05b4602b6a24385b5ec7c75cb6059599e3d56831da2"}, + {file = "pandas-1.3.5-cp39-cp39-win32.whl", hash = "sha256:c69406a2808ba6cf580c2255bcf260b3f214d2664a3a4197d0e640f573b46fd3"}, + {file = "pandas-1.3.5-cp39-cp39-win_amd64.whl", hash = "sha256:32e1a26d5ade11b547721a72f9bfc4bd113396947606e00d5b4a5b79b3dcb006"}, + {file = "pandas-1.3.5.tar.gz", hash = "sha256:1e4285f5de1012de20ca46b188ccf33521bff61ba5c5ebd78b4fb28e5416a9f1"}, +] +parso = [ + {file = "parso-0.8.3-py2.py3-none-any.whl", hash = "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"}, + {file = "parso-0.8.3.tar.gz", hash = "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0"}, +] +pexpect = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] +pickle5 = [ + {file = "pickle5-0.0.12-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:748a1252c86c9ba18778d2cc2d314ec36eec224acc1b965da4cdf74702086c88"}, + 
{file = "pickle5-0.0.12-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc61c59343ad63745da4855846cfe459cf046ff669aad6373ea6b41f98ed835d"}, + {file = "pickle5-0.0.12-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fbdc106ae5551cd808897d30582e65bd1efa44c3610b2aee37df34397316b4f"}, + {file = "pickle5-0.0.12-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:49a51f75132b76747ad7b54196bca1a6c4c814fec116b2909fccaabf1073d919"}, + {file = "pickle5-0.0.12-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2141be966d40c1e6bbe4d77c4ea38df3be6f35b611941acbb50405a247bcedde"}, + {file = "pickle5-0.0.12-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:5c7eb5a1b759bf0acf2cfb7ee3394b2935ec6de9174f4eb9044349cb06ca0ff7"}, + {file = "pickle5-0.0.12-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:d68731aae66b0727aca64fc4a7b11e637088a33c829db70f6e9749474174eab6"}, + {file = "pickle5-0.0.12-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:fa281642c1fe043fb3ad3cdfbdc9d4531ad47b1e9c0fdeaf76c54a7b84b3eb2d"}, + {file = "pickle5-0.0.12-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:91464df7f22f40d656e6b73ebf042710ebf6ea6a3fb232bd70b1c384626abd69"}, + {file = "pickle5-0.0.12-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:65297f87aef6aff5361a39d9f841437f08295e8dce985f08b97939ced749f3ff"}, + {file = "pickle5-0.0.12-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:8d3c9d8110eb775828a5a5a636eca5e9c63bf3f41c3fd1b05f97a0b46e1c5255"}, + {file = "pickle5-0.0.12-cp36-cp36m-win32.whl", hash = "sha256:167aa4cc2349513819267e8d8e84b1244e69132366830a2574889a3b5d3cacf8"}, + {file = "pickle5-0.0.12-cp36-cp36m-win_amd64.whl", hash = "sha256:b844af6e179163b40020f06ed45cebfb9a227077dc512cc40d9e9bc8b3aa62a6"}, + {file = "pickle5-0.0.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c396073370353391ead10af3100234c9e4fe442e1719f49d469b70e6058589e9"}, + {file = "pickle5-0.0.12-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:251ba2c996ccbc36147ae18d7ed80020c4739f828fd3cd2833291c6f473c7070"}, + {file = "pickle5-0.0.12-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a77e6c9c760abfee6875779b675577e2516206d17d6f4e0e6af76798d84be4db"}, + {file = "pickle5-0.0.12-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9075a22b9ae42bb3f68b751561c152233529502ab91feab34ce87e0673530098"}, + {file = "pickle5-0.0.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17352fbea99a77c179cbdc58406dff84948b7675acfa1adaab1c153949fbbb43"}, + {file = "pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:734ce538fdb441a78a40c333d744d66ca1fbf9803a14756ddad4bf9af8a82890"}, + {file = "pickle5-0.0.12-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:66df777711fe548210cf11685e36be8bbd2ed201cfd18518da261a0375e57fb3"}, + {file = "pickle5-0.0.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:ccda4c39b8beb3a58881c1a139b919e5b1644fa64f1b8596c6e3392b99cd191d"}, + {file = "pickle5-0.0.12-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:d8c0436ce4875133208fc458907466a631ac214d44e4e71b032b384d61599d97"}, + {file = "pickle5-0.0.12-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c533bd89093ebb3e95465fbc45428082b68e1dca4ce039205c1530a4343dc792"}, + {file = "pickle5-0.0.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:403eec60bb72ae4e4a01ff0b092509bf3822fda406363be472e18e70c5397a02"}, + 
{file = "pickle5-0.0.12-cp37-cp37m-win32.whl", hash = "sha256:67c19d88cabce5c0730b6c09b0d0ade163b0881904cfc491790205f7d9a09a2b"}, + {file = "pickle5-0.0.12-cp37-cp37m-win_amd64.whl", hash = "sha256:880883674f0ce0785709bdacec244dcb10b86db2d4cd84770006c1ff2aa852b6"}, + {file = "pickle5-0.0.12.tar.gz", hash = "sha256:80143d4e4ea9d6cd70e841af8745dbc4d50adef5adf99e725d240bcb92e6f1e8"}, +] +pickleshare = [ + {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, + {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +prompt-toolkit = [ + {file = "prompt_toolkit-3.0.30-py3-none-any.whl", hash = "sha256:d8916d3f62a7b67ab353a952ce4ced6a1d2587dfe9ef8ebc30dd7c386751f289"}, + {file = "prompt_toolkit-3.0.30.tar.gz", hash = "sha256:859b283c50bde45f5f97829f77a4674d1c1fcd88539364f1b28a37805cfd89c0"}, +] +ptyprocess = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] +py = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] +pygments = [ + {file = "Pygments-2.12.0-py3-none-any.whl", hash = "sha256:dc9c10fb40944260f6ed4c688ece0cd2048414940f1cea51b8b226318411c519"}, + {file = "Pygments-2.12.0.tar.gz", hash = "sha256:5eb116118f9612ff1ee89ac96437bb6b49e8f04d8a13b514ba26f620208e26eb"}, +] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] +pystow = [ + {file = "pystow-0.4.5-py3-none-any.whl", hash = "sha256:b2ebc9630214d5801039d2cf2bbfd0d2bf249d9aa62b51af4b8f8083db8a296a"}, + {file = "pystow-0.4.5.tar.gz", hash = "sha256:9a5acb705376516ba663c4957d2b1a43a23b3f1df7af52ced4aee7ae2fcd0f1c"}, +] +pytest = [ + {file = "pytest-7.1.2-py3-none-any.whl", hash = "sha256:13d0e3ccfc2b6e26be000cb6568c832ba67ba32e719443bfe725814d3c42433c"}, + {file = "pytest-7.1.2.tar.gz", hash = "sha256:a06a0425453864a270bc45e71f783330a7428defb4230fb5e6a731fde06ecd45"}, +] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] +pytz = [ + {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, + {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, +] +requests = [] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = 
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] +toml = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] +tqdm = [ + {file = "tqdm-4.64.0-py2.py3-none-any.whl", hash = "sha256:74a2cdefe14d11442cedf3ba4e21a3b84ff9a2dbdc6cfae2c34addb2a14a5ea6"}, + {file = "tqdm-4.64.0.tar.gz", hash = "sha256:40be55d30e200777a307a7585aee69e4eabb46b4ec6a4b4a5f2d9f11e7d5408d"}, +] +traitlets = [ + {file = "traitlets-5.3.0-py3-none-any.whl", hash = "sha256:65fa18961659635933100db8ca120ef6220555286949774b9cfc106f941d1c7a"}, + {file = "traitlets-5.3.0.tar.gz", hash = "sha256:0bb9f1f9f017aa8ec187d8b1b2a7a6626a2a1d877116baba52a129bfa124f8e2"}, +] +typing-extensions = [ + {file = "typing_extensions-4.3.0-py3-none-any.whl", hash = "sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02"}, + {file = "typing_extensions-4.3.0.tar.gz", hash = "sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6"}, +] +urllib3 = [] +wcwidth = [ + {file = "wcwidth-0.2.5-py2.py3-none-any.whl", hash = "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784"}, + {file = "wcwidth-0.2.5.tar.gz", hash = "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83"}, +] +zipp = [ + {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, + {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..997c0bf --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.poetry] +name = "MovieGraphBenchmark" +version = "1.0.0" +description = "Benchmark datasets for Entity Resolution on Knowledge Graphs containing information about movies, tv shows and persons from IMDB,TMDB and TheTVDB" +authors = ["Daniel Obraczka "] + +[tool.poetry.dependencies] +python = ">=3.7.1,<3.11" +requests = "*" +tqdm = "*" +pandas = "*" +pystow = "*" + +[tool.poetry.scripts] +moviegraphbenchmark = "moviegraphbenchmark.create_graph:create_graph_data" + +[tool.poetry.dev-dependencies] +ipdb = "^0.13.9" +pytest = "^7.1.2" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/src/create_graph.py b/src/create_graph.py deleted file mode 100644 index 8c43b84..0000000 --- a/src/create_graph.py +++ /dev/null @@ -1,405 +0,0 @@ -import ast -import os - -from get_imdb_data import download_if_needed - -DTYPE_DOUBLE = "" -DTYPE_NON_NEG_INT = "" -DTYPE_US_DOLLER = "" -DTYPE_DATE = "" - -BENCHMARK_RESOURCE_PREFIX = "https://www.scads.de/movieBenchmark/resource/IMDB/" -BENCHMARK_ONTOLOGY_PREFIX = "https://www.scads.de/movieBenchmark/ontology/" - -property_dict = { - "birthYear": "http://dbpedia.org/ontology/birthYear", - "deathYear": "http://dbpedia.org/ontology/deathYear", - "episodeNumber": "http://dbpedia.org/ontology/episodeNumber", - "seasonNumber": "http://dbpedia.org/ontology/seasonNumber", - "endYear": "https://www.scads.de/movieBenchmark/ontology/endYear", - "genres": 
"https://www.scads.de/movieBenchmark/ontology/genre_list", - "isAdult": "https://www.scads.de/movieBenchmark/ontology/isAdult", - "primaryName": "https://www.scads.de/movieBenchmark/ontology/name", - "originalTitle": "https://www.scads.de/movieBenchmark/ontology/originalTitle", - "primaryProfession": "https://www.scads.de/movieBenchmark/ontology/primaryProfessions", - "runtimeMinutes": "https://www.scads.de/movieBenchmark/ontology/runtimeMinutes", - "startYear": "https://www.scads.de/movieBenchmark/ontology/startYear", - "primaryTitle": "https://www.scads.de/movieBenchmark/ontology/title", - "episodeOf": "https://www.scads.de/movieBenchmark/ontology/is_episode_of", - "participatedIn": "https://www.scads.de/movieBenchmark/ontology/participated_in", - "knownForTitles": "https://www.scads.de/movieBenchmark/ontology/participated_in", - "type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", -} - -FILM_TYPE = "http://dbpedia.org/ontology/Film" -TV_EPISODE_TYPE = "http://dbpedia.org/ontology/TelevisionEpisode" -TV_SHOW_TYPE = "http://dbpedia.org/ontology/TelevisionShow" -PERSON_TYPE = "http://xmlns.com/foaf/0.1/Person" - - -def get_allowed(path): - with open(path, "r") as in_file: - return {line.strip() for line in in_file} - - -def get_excluded(path): - with open(path, "r") as in_file: - return { - (line.strip().split("\t")[0], line.strip().split("\t")[1]) - for line in in_file - } - - -def _should_write(s, o, allowed, exclude): - if (s.startswith("nm") or s.startswith("tt")) and ( - o.startswith("nm") or o.startswith("tt") - ): - if (s, o) in exclude: - return False - if s in allowed and o in allowed: - return True - else: - return False - elif s.startswith("nm") or s.startswith("tt"): - if s in allowed: - return True - else: - return False - elif o.startswith("nm") or o.startswith("tt"): - if o in allowed: - return True - else: - return False - return False - - -def _add_dtype(obj, dtype): - if dtype is None: - return obj - return '"' + obj + '"^^' + dtype - - -def _sanity_check(input): - if input is None or input == "": - return False - return True - - -def _normalize_year(year): - return year + "-01-01" - - -def create_trips(s, p, o, multiple_possible, allowed, exclude, dtype=None): - if not (_sanity_check(s) and _sanity_check(p) and _sanity_check(o)): - return [] - if p == "titleType": - if o in {"movie", "short", "tvMovie", "tvShort"} or "video" in o: - o = FILM_TYPE - elif o == "tvEpisode": - o = TV_EPISODE_TYPE - else: - o = TV_SHOW_TYPE - p = property_dict["type"] - else: - try: - p = property_dict[p] - except KeyError: - print(s, p, o) - return [] - - trips = [] - if not (s == "\\N" or o == "\\N"): - if "Year" in p: - o = _normalize_year(o) - if multiple_possible: - if o.startswith("["): - o_list = ast.literal_eval(o) - else: - o_list = [o] - for obj in o_list: - if _should_write(s, obj, allowed, exclude): - if s.startswith("nm") or s.startswith("tt"): - s = BENCHMARK_RESOURCE_PREFIX + s - if obj.startswith("nm") or obj.startswith("tt"): - obj = BENCHMARK_RESOURCE_PREFIX + obj - trips.append([s, p, _add_dtype(obj, dtype)]) - else: - if _should_write(s, o, allowed, exclude): - if s.startswith("nm") or s.startswith("tt"): - s = BENCHMARK_RESOURCE_PREFIX + s - if o.startswith("nm") or o.startswith("tt"): - o = BENCHMARK_RESOURCE_PREFIX + o - trips.append([s, p, _add_dtype(o, dtype)]) - return trips - - -def handle_name_basics(path, allowed, exclude): - attr_trips = [] - rel_trips = [] - with open(path, "r") as in_file: - for line in in_file: - if not line.startswith("nconst\t"): - row 
= line.strip().split("\t") - if row[0] in allowed: - attr_trips.extend( - create_trips( - row[0], "primaryName", row[1], False, allowed, exclude - ) - ) - attr_trips.extend( - create_trips( - row[0], - "birthYear", - row[2], - False, - allowed, - exclude, - DTYPE_DATE, - ) - ) - attr_trips.extend( - create_trips( - row[0], - "deathYear", - row[3], - False, - allowed, - exclude, - DTYPE_DATE, - ) - ) - attr_trips.extend( - create_trips( - row[0], "primaryProfession", row[4], True, allowed, exclude - ) - ) - rel_trips.extend( - create_trips( - row[0], "knownForTitles", row[5], True, allowed, exclude - ) - ) - rel_trips.append( - [ - BENCHMARK_RESOURCE_PREFIX + row[0], - property_dict["type"], - PERSON_TYPE, - ] - ) - return attr_trips, rel_trips - - -def handle_title_basics(path, allowed, exclude): - attr_trips = [] - rel_trips = [] - with open(path, "r") as in_file: - for line in in_file: - if not line.startswith("tconst\t"): - row = line.strip().split("\t") - if row[0] in allowed: - rel_trips.extend( - create_trips( - row[0], "titleType", row[1], False, allowed, exclude - ) - ) - attr_trips.extend( - create_trips( - row[0], "primaryTitle", row[2], False, allowed, exclude - ) - ) - attr_trips.extend( - create_trips( - row[0], "originalTitle", row[3], False, allowed, exclude - ) - ) - attr_trips.extend( - create_trips(row[0], "isAdult", row[4], False, allowed, exclude) - ) - attr_trips.extend( - create_trips( - row[0], - "startYear", - row[5], - False, - allowed, - exclude, - DTYPE_DATE, - ) - ) - attr_trips.extend( - create_trips( - row[0], - "endYear", - row[6], - False, - allowed, - exclude, - DTYPE_DATE, - ) - ) - attr_trips.extend( - create_trips( - row[0], - "runtimeMinutes", - row[7], - False, - allowed, - exclude, - ) - ) - attr_trips.extend( - create_trips(row[0], "genres", row[8], False, allowed, exclude) - ) - return attr_trips, rel_trips - - -def handle_title_crew(path, allowed, exclude): - rel_trips = [] - with open(path, "r") as in_file: - for line in in_file: - if not line.startswith("tconst\t"): - row = line.strip().split("\t") - if row[0] in allowed: - rel_trips.extend( - create_trips( - row[0], "participatedIn", row[1], True, allowed, exclude - ) - ) - rel_trips.extend( - create_trips( - row[0], "participatedIn", row[2], True, allowed, exclude - ) - ) - return [], rel_trips - - -def handle_title_episode(path, allowed, exclude): - attr_trips = [] - rel_trips = [] - with open(path, "r") as in_file: - for line in in_file: - if not line.startswith("tconst\t"): - row = line.strip().split("\t") - if row[1] in allowed: - rel_trips.extend( - create_trips( - row[0], "episodeOf", row[1], False, allowed, exclude - ) - ) - rel_trips.extend( - create_trips( - row[0], "titleType", "tvEpisode", False, allowed, exclude - ) - ) - attr_trips.extend( - create_trips( - row[0], - "seasonNumber", - row[2], - False, - allowed, - DTYPE_NON_NEG_INT, - ) - ) - attr_trips.extend( - create_trips( - row[0], - "episodeNumber", - row[3], - False, - allowed, - DTYPE_NON_NEG_INT, - ) - ) - return attr_trips, rel_trips - - -def handle_title_principals(path, allowed, exclude): - attr_trips = [] - rel_trips = [] - with open(path, "r") as in_file: - for line in in_file: - if not line.startswith("tconst\t"): - row = line.strip().split("\t") - if row[0] in allowed: - rel_trips.extend( - create_trips( - row[2], "participatedIn", row[0], False, allowed, exclude - ) - ) - return attr_trips, rel_trips - - -def _dedup(trips): - d = [] - for t in trips: - if t not in d: - d.append(t) - return d - - -def 
parse_files(imdb_dir, allowed, exclude): - file_handler_dict = { - "name.basics.tsv": handle_name_basics, - "title.basics.tsv": handle_title_basics, - # "title.crew.tsv": handle_title_crew, - "title.episode.tsv": handle_title_episode, - "title.principals.tsv": handle_title_principals, - } - # collect triples - rel_trips = [] - attr_trips = [] - # use tqdm if available - try: - from tqdm import tqdm - - for filename, handle_fun in tqdm( - file_handler_dict.items(), desc="Creating triples" - ): - tmp_a, tmp_r = handle_fun( - os.path.join(imdb_dir, filename), allowed, exclude - ) - attr_trips.extend(tmp_a) - rel_trips.extend(tmp_r) - except ImportError: - for filename, handle_fun in file_handler_dict.items(): - tmp_a, tmp_r = handle_fun( - os.path.join(imdb_dir, filename), allowed, exclude - ) - attr_trips.extend(tmp_a) - rel_trips.extend(tmp_r) - - # ignore attr trips that do not show up in rel trips - rel_ids = set() - for r in rel_trips: - rel_ids.add(r[0]) - rel_ids.add(r[2]) - cleaned_attr = [a for a in attr_trips if a[0] in rel_ids] - return _dedup(cleaned_attr), _dedup(rel_trips) - - -def write_files(cleaned_attr, rel_trips, out_folder): - if not os.path.exists(out_folder): - os.makedirs(out_folder) - with open(os.path.join(out_folder, "attr_triples_1"), "w") as out_writer_attr: - for t in cleaned_attr: - out_writer_attr.write("\t".join(t) + "\n") - with open(os.path.join(out_folder, "rel_triples_1"), "w") as out_writer_rel: - for t in rel_trips: - out_writer_rel.write("\t".join(t) + "\n") - - -def create_graph_data(): - file_path = os.path.abspath(__file__) - repo_path = os.path.split(os.path.split(file_path)[0])[0] - data_path = os.path.join(repo_path, "data") - imdb_path = os.path.join(data_path, "imdb") - download_if_needed(imdb_path) - allowed = get_allowed(os.path.join(data_path, "imdb", "allowed")) - exclude = get_excluded(os.path.join(data_path, "imdb", "exclude")) - cleaned_attr, rel_trips = parse_files(imdb_path, allowed, exclude) - write_files(cleaned_attr, rel_trips, os.path.join(data_path, "imdb-tmdb")) - write_files(cleaned_attr, rel_trips, os.path.join(data_path, "imdb-tvdb")) - - -if __name__ == "__main__": - create_graph_data() diff --git a/src/get_imdb_data.py b/src/get_imdb_data.py deleted file mode 100644 index 44488f4..0000000 --- a/src/get_imdb_data.py +++ /dev/null @@ -1,59 +0,0 @@ -import gzip -import os -import shutil -import sys - -import requests - -uris = { - "https://datasets.imdbws.com/name.basics.tsv.gz": "name.basics.tsv", - "https://datasets.imdbws.com/title.basics.tsv.gz": "title.basics.tsv", - # "https://datasets.imdbws.com/title.crew.tsv.gz": "title.crew.tsv", - "https://datasets.imdbws.com/title.episode.tsv.gz": "title.episode.tsv", - "https://datasets.imdbws.com/title.principals.tsv.gz": "title.principals.tsv", -} - - -def download_file(url, dl_path, chunk_size=1024): - filename = os.path.basename(url) - filesize = int(requests.head(url).headers["Content-Length"]) - try: - from tqdm import tqdm # noqa: autoimport - - with requests.get(url, stream=True) as r, open( - os.path.join(dl_path, filename), "wb" - ) as f, tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - total=filesize, - file=sys.stdout, - desc=filename, - ) as progress: - for chunk in r.iter_content(chunk_size=chunk_size): - datasize = f.write(chunk) - progress.update(datasize) - except ImportError: - with requests.get(url, stream=True) as r, open( - os.path.join(dl_path, filename), "wb" - ) as f: - for chunk in r.iter_content(chunk_size=chunk_size): - f.write(chunk) - - -def 
diff --git a/src/get_imdb_data.py b/src/get_imdb_data.py
deleted file mode 100644
index 44488f4..0000000
--- a/src/get_imdb_data.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import gzip
-import os
-import shutil
-import sys
-
-import requests
-
-uris = {
-    "https://datasets.imdbws.com/name.basics.tsv.gz": "name.basics.tsv",
-    "https://datasets.imdbws.com/title.basics.tsv.gz": "title.basics.tsv",
-    # "https://datasets.imdbws.com/title.crew.tsv.gz": "title.crew.tsv",
-    "https://datasets.imdbws.com/title.episode.tsv.gz": "title.episode.tsv",
-    "https://datasets.imdbws.com/title.principals.tsv.gz": "title.principals.tsv",
-}
-
-
-def download_file(url, dl_path, chunk_size=1024):
-    filename = os.path.basename(url)
-    filesize = int(requests.head(url).headers["Content-Length"])
-    try:
-        from tqdm import tqdm  # noqa: autoimport
-
-        with requests.get(url, stream=True) as r, open(
-            os.path.join(dl_path, filename), "wb"
-        ) as f, tqdm(
-            unit="B",
-            unit_scale=True,
-            unit_divisor=1024,
-            total=filesize,
-            file=sys.stdout,
-            desc=filename,
-        ) as progress:
-            for chunk in r.iter_content(chunk_size=chunk_size):
-                datasize = f.write(chunk)
-                progress.update(datasize)
-    except ImportError:
-        with requests.get(url, stream=True) as r, open(
-            os.path.join(dl_path, filename), "wb"
-        ) as f:
-            for chunk in r.iter_content(chunk_size=chunk_size):
-                f.write(chunk)
-
-
-def unzip(filepath):
-    with gzip.open(filepath + ".gz", "rb") as f_in:
-        with open(filepath, "wb") as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
-
-def download_if_needed(imdb_path):
-    os.makedirs(imdb_path, exist_ok=True)
-    for u, p in uris.items():
-        filepath = os.path.join(imdb_path, p)
-        if not os.path.isfile(filepath):
-            print(f"Did not find {filepath}, therefore downloading")
-            download_file(u, imdb_path)
-            print("Unpacking gz archive")
-            unzip(filepath)
-            os.remove(filepath + ".gz")
diff --git a/src/main.py b/src/main.py
deleted file mode 100644
index c787393..0000000
--- a/src/main.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import os
-import shutil
-import sys
-from pathlib import Path
-
-
-def main(folder_loc: str = "ScaDSMovieGraphBenchmark"):
-    target_loc = Path(folder_loc)
-    # create target paths parents if they don't exist
-    if not target_loc.parent == Path(".") and not os.path.exists(target_loc.parent):
-        os.makedirs(target_loc.parent)
-    print("Downloading repo")
-    os.system("git clone https://github.com/ScaDS/MovieGraphBenchmark.git")
-    os.chdir("MovieGraphBenchmark")
-    print("Creating IMDB data")
-    os.system("python3 src/create_graph.py")
-    print("Cleanup")
-    os.chdir("..")
-    shutil.move("MovieGraphBenchmark/data", target_loc)
-    shutil.rmtree("MovieGraphBenchmark")
-    print("Done!")
-
-
-if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        main(sys.argv[1])
-    else:
-        main()
diff --git a/src/moviegraphbenchmark/__init__.py b/src/moviegraphbenchmark/__init__.py
new file mode 100644
index 0000000..98e9464
--- /dev/null
+++ b/src/moviegraphbenchmark/__init__.py
@@ -0,0 +1,6 @@
+import sys
+import logging
+from .loading import load_data
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+__all__ = ["load_data"]
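Note that the new `__init__.py` calls `logging.basicConfig` at import time, which also configures logging for any application that imports the package. If that is undesired downstream, the package logger can be tuned after import; a small sketch (the logger name `moviegraphbenchmark` is taken from the modules below):

```python
import logging

# Hypothetical downstream usage: keep the package's INFO messages out of
# an application's output without touching the root logger configuration.
logging.getLogger("moviegraphbenchmark").setLevel(logging.WARNING)
```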
"knownForTitles": "https://www.scads.de/movieBenchmark/ontology/participated_in", + "type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", +} + +FILM_TYPE = "http://dbpedia.org/ontology/Film" +TV_EPISODE_TYPE = "http://dbpedia.org/ontology/TelevisionEpisode" +TV_SHOW_TYPE = "http://dbpedia.org/ontology/TelevisionShow" +PERSON_TYPE = "http://xmlns.com/foaf/0.1/Person" + +logger = logging.getLogger("moviegraphbenchmark") + + +def get_allowed(path: str) -> Set[str]: + with open(path, "r", encoding="utf8") as in_file: + return {line.strip() for line in in_file} + + +def get_excluded(path: str) -> Set[Tuple[str, str]]: + with open(path, "r", encoding="utf8") as in_file: + return { + (line.strip().split("\t")[0], line.strip().split("\t")[1]) + for line in in_file + } + + +def _should_write( + s: str, o: str, allowed: Set[str], exclude: Set[Tuple[str, str]] +) -> bool: + if (s.startswith("nm") or s.startswith("tt")) and ( + o.startswith("nm") or o.startswith("tt") + ): + if (s, o) in exclude: + return False + if s in allowed and o in allowed: + return True + else: + return False + elif s.startswith("nm") or s.startswith("tt"): + if s in allowed: + return True + else: + return False + elif o.startswith("nm") or o.startswith("tt"): + if o in allowed: + return True + else: + return False + return False + + +def _add_dtype(obj: str, dtype: str = None) -> str: + if dtype is None: + return obj + return '"' + obj + '"^^' + dtype + + +def _sanity_check(value) -> bool: + if value is None or value == "": + return False + return True + + +def _normalize_year(year: str) -> str: + return year + "-01-01" + + +def create_trips( + s: str, + p: str, + o: str, + multiple_possible: bool, + allowed: Set[str], + exclude: Set[Tuple[str, str]], + dtype: str = None, +) -> List[Tuple[str, str, str]]: + if not (_sanity_check(s) and _sanity_check(p) and _sanity_check(o)): + return [] + if p == "titleType": + if o in {"movie", "short", "tvMovie", "tvShort"} or "video" in o: + o = FILM_TYPE + elif o == "tvEpisode": + o = TV_EPISODE_TYPE + else: + o = TV_SHOW_TYPE + p = property_dict["type"] + else: + try: + p = property_dict[p] + except KeyError: + logger.debug((s, p, o)) + return [] + + trips = [] + if not (s == "\\N" or o == "\\N"): + if "Year" in p: + o = _normalize_year(o) + if multiple_possible: + if o.startswith("["): + o_list = ast.literal_eval(o) + else: + o_list = [o] + for obj in o_list: + if _should_write(s, obj, allowed, exclude): + if s.startswith("nm") or s.startswith("tt"): + s = BENCHMARK_RESOURCE_PREFIX + s + if obj.startswith("nm") or obj.startswith("tt"): + obj = BENCHMARK_RESOURCE_PREFIX + obj + trips.append((s, p, _add_dtype(obj, dtype))) + else: + if _should_write(s, o, allowed, exclude): + if s.startswith("nm") or s.startswith("tt"): + s = BENCHMARK_RESOURCE_PREFIX + s + if o.startswith("nm") or o.startswith("tt"): + o = BENCHMARK_RESOURCE_PREFIX + o + trips.append((s, p, _add_dtype(o, dtype))) + return trips + + +def handle_name_basics( + path: str, allowed: Set[str], exclude: Set[Tuple[str, str]] +) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]: + attr_trips = [] + rel_trips = [] + with open(path, "r", encoding="utf8") as in_file: + for line in in_file: + if not line.startswith("nconst\t"): + row = line.strip().split("\t") + if row[0] in allowed: + attr_trips.extend( + create_trips( + s=row[0], + p="primaryName", + o=row[1], + multiple_possible=False, + allowed=allowed, + exclude=exclude, + ) + ) + attr_trips.extend( + create_trips( + s=row[0], + p="birthYear", + o=row[2], + 
+
+
+def handle_name_basics(
+    path: str, allowed: Set[str], exclude: Set[Tuple[str, str]]
+) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]:
+    attr_trips = []
+    rel_trips = []
+    with open(path, "r", encoding="utf8") as in_file:
+        for line in in_file:
+            if not line.startswith("nconst\t"):
+                row = line.strip().split("\t")
+                if row[0] in allowed:
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="primaryName",
+                            o=row[1],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="birthYear",
+                            o=row[2],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                            dtype=DTYPE_DATE,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="deathYear",
+                            o=row[3],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                            dtype=DTYPE_DATE,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="primaryProfession",
+                            o=row[4],
+                            multiple_possible=True,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    rel_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="knownForTitles",
+                            o=row[5],
+                            multiple_possible=True,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    rel_trips.append(
+                        (
+                            BENCHMARK_RESOURCE_PREFIX + row[0],
+                            property_dict["type"],
+                            PERSON_TYPE,
+                        )
+                    )
+    return attr_trips, rel_trips
+
+
+def handle_title_basics(
+    path: str, allowed: Set[str], exclude: Set[Tuple[str, str]]
+) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]:
+    attr_trips = []
+    rel_trips = []
+    with open(path, "r", encoding="utf8") as in_file:
+        for line in in_file:
+            if not line.startswith("tconst\t"):
+                row = line.strip().split("\t")
+                if row[0] in allowed:
+                    rel_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="titleType",
+                            o=row[1],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="primaryTitle",
+                            o=row[2],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="originalTitle",
+                            o=row[3],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="isAdult",
+                            o=row[4],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="startYear",
+                            o=row[5],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                            dtype=DTYPE_DATE,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="endYear",
+                            o=row[6],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                            dtype=DTYPE_DATE,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="runtimeMinutes",
+                            o=row[7],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="genres",
+                            o=row[8],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+    return attr_trips, rel_trips
+
+
+def handle_title_crew(
+    path: str, allowed: Set[str], exclude: Set[Tuple[str, str]]
+) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]:
+    rel_trips = []
+    with open(path, "r", encoding="utf8") as in_file:
+        for line in in_file:
+            if not line.startswith("tconst\t"):
+                row = line.strip().split("\t")
+                if row[0] in allowed:
+                    rel_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="participatedIn",
+                            o=row[1],
+                            multiple_possible=True,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    rel_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="participatedIn",
+                            o=row[2],
+                            multiple_possible=True,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+    return [], rel_trips
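For readers unfamiliar with the raw IMDB dumps these handlers consume: each `.tsv` file starts with a header row (skipped via the `startswith` check), columns are tab-separated, and the sentinel `\N` marks missing values, which `create_trips` filters out. An illustrative row, with values taken from IMDB's public data:

```python
# Example name.basics.tsv row, after the header line:
line = "nm0000001\tFred Astaire\t1899\t1987\tsoundtrack,actor\ttt0053137,tt0072308\n"
row = line.strip().split("\t")
assert row[0] == "nm0000001"  # nconst, the key checked against `allowed`
assert row[2] == "1899"       # birthYear; a literal \N here would mean unknown
titles = row[5]               # knownForTitles, handled with multiple_possible=True
```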
+
+
+def handle_title_episode(
+    path: str, allowed: Set[str], exclude: Set[Tuple[str, str]]
+) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]:
+    attr_trips = []
+    rel_trips = []
+    with open(path, "r", encoding="utf8") as in_file:
+        for line in in_file:
+            if not line.startswith("tconst\t"):
+                row = line.strip().split("\t")
+                if row[1] in allowed:
+                    rel_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="episodeOf",
+                            o=row[1],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    rel_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="titleType",
+                            o="tvEpisode",
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="seasonNumber",
+                            o=row[2],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                            dtype=DTYPE_NON_NEG_INT,
+                        )
+                    )
+                    attr_trips.extend(
+                        create_trips(
+                            s=row[0],
+                            p="episodeNumber",
+                            o=row[3],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                            dtype=DTYPE_NON_NEG_INT,
+                        )
+                    )
+    return attr_trips, rel_trips
+
+
+def handle_title_principals(
+    path: str, allowed: Set[str], exclude: Set[Tuple[str, str]]
+) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]:
+    attr_trips: List[Tuple[str, str, str]] = []
+    rel_trips = []
+    with open(path, "r", encoding="utf8") as in_file:
+        for line in in_file:
+            if not line.startswith("tconst\t"):
+                row = line.strip().split("\t")
+                if row[0] in allowed:
+                    rel_trips.extend(
+                        create_trips(
+                            s=row[2],
+                            p="participatedIn",
+                            o=row[0],
+                            multiple_possible=False,
+                            allowed=allowed,
+                            exclude=exclude,
+                        )
+                    )
+    return attr_trips, rel_trips
+
+
+def _dedup(trips: List[Tuple[str, str, str]]) -> List[Tuple[str, str, str]]:
+    d = []
+    for t in trips:
+        if t not in d:
+            d.append(t)
+    return d
+
+
+def parse_files(
+    imdb_dir: str, allowed: Set[str], exclude: Set[Tuple[str, str]]
+) -> Tuple[List[Tuple[str, str, str]], List[Tuple[str, str, str]]]:
+    file_handler_dict = {
+        "name.basics.tsv": handle_name_basics,
+        "title.basics.tsv": handle_title_basics,
+        # "title.crew.tsv": handle_title_crew,
+        "title.episode.tsv": handle_title_episode,
+        "title.principals.tsv": handle_title_principals,
+    }
+    # collect triples
+    rel_trips = []
+    attr_trips = []
+    # use tqdm if available
+    try:
+        from tqdm import tqdm
+
+        for filename, handle_fun in tqdm(
+            file_handler_dict.items(), desc="Creating triples"
+        ):
+            tmp_a, tmp_r = handle_fun(
+                os.path.join(imdb_dir, filename), allowed, exclude
+            )
+            attr_trips.extend(tmp_a)
+            rel_trips.extend(tmp_r)
+    except ImportError:
+        for filename, handle_fun in file_handler_dict.items():
+            tmp_a, tmp_r = handle_fun(
+                os.path.join(imdb_dir, filename), allowed, exclude
+            )
+            attr_trips.extend(tmp_a)
+            rel_trips.extend(tmp_r)
+
+    # ignore attr trips that do not show up in rel trips
+    rel_ids = set()
+    for r in rel_trips:
+        rel_ids.add(r[0])
+        rel_ids.add(r[2])
+    cleaned_attr = [a for a in attr_trips if a[0] in rel_ids]
+    return _dedup(cleaned_attr), _dedup(rel_trips)
+
+
+def write_files(
+    cleaned_attr: List[Tuple[str, str, str]],
+    rel_trips: List[Tuple[str, str, str]],
+    out_folder: str,
+):
+    if not os.path.exists(out_folder):
+        os.makedirs(out_folder)
+    with open(
+        os.path.join(out_folder, "attr_triples_1"), "w", encoding="utf8"
+    ) as out_writer_attr:
+        for t in cleaned_attr:
+            out_writer_attr.write("\t".join(t) + "\n")
+    with open(
+        os.path.join(out_folder, "rel_triples_1"), "w", encoding="utf8"
+    ) as out_writer_rel:
+        for t in rel_trips:
+            out_writer_rel.write("\t".join(t) + "\n")
+
+
+def _download(data_path: str):
+    if not os.path.exists(data_path):
+        os.makedirs(data_path)
+    download_file(
+        "https://cloud.scadsai.uni-leipzig.de/index.php/s/SdzWXCarFCFGeN9/download/ScadsMovieGraphBenchmark.zip",
+        data_path,
+    )
+    zip_path = os.path.join(data_path, "ScadsMovieGraphBenchmark.zip")
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(data_path)
+    os.remove(zip_path)
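The duplicated try/except loops in `parse_files` exist only to make `tqdm` optional. A hypothetical refactor that picks the iterator once would shrink this; a minimal sketch under those assumptions (not what the PR implements):

```python
import os


def _progress(iterable, desc):
    # Wrap in tqdm when available, otherwise return the iterable unchanged.
    try:
        from tqdm import tqdm

        return tqdm(iterable, desc=desc)
    except ImportError:
        return iterable


def parse_files_sketch(imdb_dir, file_handler_dict, allowed, exclude):
    attr_trips, rel_trips = [], []
    for filename, handle_fun in _progress(
        file_handler_dict.items(), "Creating triples"
    ):
        tmp_a, tmp_r = handle_fun(os.path.join(imdb_dir, filename), allowed, exclude)
        attr_trips.extend(tmp_a)
        rel_trips.extend(tmp_r)
    return attr_trips, rel_trips
```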
+
+
+def _data_path() -> str:
+    file_path = os.path.abspath(__file__)
+    repo_path = os.path.split(os.path.split(os.path.split(file_path)[0])[0])[0]
+    data_path = os.path.join(repo_path, "data")
+    # if repo was cloned this exists
+    if os.path.exists(data_path):
+        return data_path
+    else:
+        # else we use pystow
+        try:
+            import pystow
+
+            data_path = pystow.join("moviegraphbenchmark", "data")
+        except ImportError:
+            logger.error("Please install pystow: pip install pystow")
+    return data_path
+
+
+def _create_graph_data(data_path: str = None) -> str:
+    """(Download and) create benchmark data on specified path.
+
+    :param data_path: Path where data should be stored.
+    :return: data_path
+    """
+    if data_path is None:
+        data_path = _data_path()
+    # check if data was already created
+    if os.path.exists(os.path.join(data_path, "imdb-tmdb", "rel_triples_1")):
+        logger.info(f"Data already present in {data_path}")
+        return data_path
+    logger.info(f"Using data path: {data_path}")
+    if not os.path.exists(os.path.join(data_path, "imdb-tmdb", "rel_triples_2")):
+        _download(data_path)
+    imdb_path = os.path.join(data_path, "imdb")
+    download_if_needed(imdb_path)
+    allowed = get_allowed(os.path.join(data_path, "imdb", "allowed"))
+    exclude = get_excluded(os.path.join(data_path, "imdb", "exclude"))
+    cleaned_attr, rel_trips = parse_files(imdb_path, allowed, exclude)
+    write_files(cleaned_attr, rel_trips, os.path.join(data_path, "imdb-tmdb"))
+    write_files(cleaned_attr, rel_trips, os.path.join(data_path, "imdb-tvdb"))
+    return data_path
+
+
+@click.command
+@click.option("--data-path", default=None, help="Path where data is stored")
+def create_graph_data(data_path: str = None):
+    """(Download and) create benchmark data on specified path.
+
+    :param data_path: Path where data should be stored.
+    """
+    _create_graph_data(data_path)
+
+
+if __name__ == "__main__":
+    create_graph_data()
diff --git a/src/moviegraphbenchmark/get_imdb_data.py b/src/moviegraphbenchmark/get_imdb_data.py
new file mode 100644
index 0000000..13b065c
--- /dev/null
+++ b/src/moviegraphbenchmark/get_imdb_data.py
@@ -0,0 +1,36 @@
+import gzip
+import logging
+import os
+import shutil
+import sys
+from moviegraphbenchmark.utils import download_file
+
+import requests
+
+uris = {
+    "https://web.archive.org/web/20200717014821/https://datasets.imdbws.com/name.basics.tsv.gz": "name.basics.tsv",
+    "https://web.archive.org/web/20200717014801/https://datasets.imdbws.com/title.basics.tsv.gz": "title.basics.tsv",
+    # "https://datasets.imdbws.com/title.crew.tsv.gz": "title.crew.tsv",
+    "https://web.archive.org/web/20200717014920/https://datasets.imdbws.com/title.episode.tsv.gz": "title.episode.tsv",
+    "https://web.archive.org/web/20200717014706/https://datasets.imdbws.com/title.principals.tsv.gz": "title.principals.tsv",
+}
+
+
+logger = logging.getLogger("moviegraphbenchmark")
+
+def unzip(filepath: str):
+    with gzip.open(filepath + ".gz", "rb") as f_in:
+        with open(filepath, "wb") as f_out:
+            shutil.copyfileobj(f_in, f_out)
+
+
+def download_if_needed(imdb_path: str):
+    os.makedirs(imdb_path, exist_ok=True)
+    for u, p in uris.items():
+        filepath = os.path.join(imdb_path, p)
+        if not os.path.isfile(filepath):
+            logger.info(f"Did not find {filepath}, therefore downloading")
+            download_file(u, imdb_path)
+            logger.info("Unpacking gz archive")
+            unzip(filepath)
+            os.remove(filepath + ".gz")
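When the package is installed rather than cloned, `_data_path` defers to `pystow`, which places module data under `~/.data` unless configured otherwise (for instance via the `PYSTOW_HOME` environment variable). A short sketch of that behavior:

```python
import pystow

# Resolves (and creates) the package's default data directory.
data_path = pystow.join("moviegraphbenchmark", "data")
print(data_path)  # e.g. /home/<user>/.data/moviegraphbenchmark/data
```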
diff --git a/src/moviegraphbenchmark/loading.py b/src/moviegraphbenchmark/loading.py
new file mode 100644
index 0000000..3a7e9e3
--- /dev/null
+++ b/src/moviegraphbenchmark/loading.py
@@ -0,0 +1,84 @@
+import logging
+import os
+import zipfile
+from dataclasses import dataclass
+from typing import List
+import pystow
+
+from moviegraphbenchmark.create_graph import _create_graph_data
+from moviegraphbenchmark.utils import download_file
+
+logger = logging.getLogger("moviegraphbenchmark")
+
+try:
+    import pandas as pd
+except ImportError:
+    logger.error("Please install pandas for loading data: pip install pandas")
+
+@dataclass
+class Fold:
+    train_links: pd.DataFrame
+    test_links: pd.DataFrame
+    valid_links: pd.DataFrame
+
+
+@dataclass
+class ERData:
+    attr_triples_1: pd.DataFrame
+    attr_triples_2: pd.DataFrame
+    rel_triples_1: pd.DataFrame
+    rel_triples_2: pd.DataFrame
+    ent_links: pd.DataFrame
+    folds: List[Fold]
+
+
+
+def _read(path, names):
+    return pd.read_csv(
+        path,
+        header=None,
+        names=names,
+        sep="\t",
+        encoding="utf8",
+        dtype=str,
+    )
+
+
+def load_data(pair: str = "imdb-tmdb", data_path: str = None) -> ERData:
+    if data_path is None:
+        data_path = _create_graph_data(data_path)
+    logger.info(f"Loading from data path: {data_path}")
+    pair_path = os.path.join(data_path, pair)
+    triple_columns = ["head", "relation", "tail"]
+    link_columns = ["left", "right"]
+    attr_1 = _read(os.path.join(pair_path, "attr_triples_1"), triple_columns)
+    attr_2 = _read(os.path.join(pair_path, "attr_triples_2"), triple_columns)
+    rel_1 = _read(os.path.join(pair_path, "rel_triples_1"), triple_columns)
+    rel_2 = _read(os.path.join(pair_path, "rel_triples_2"), triple_columns)
+    ent_links = _read(os.path.join(pair_path, "ent_links"), link_columns)
+    folds = []
+    for fold in range(1, 6):
+        folds.append(
+            Fold(
+                train_links=_read(
+                    os.path.join(pair_path, "721_5fold", str(fold), "train_links"),
+                    link_columns,
+                ),
+                test_links=_read(
+                    os.path.join(pair_path, "721_5fold", str(fold), "test_links"),
+                    link_columns,
+                ),
+                valid_links=_read(
+                    os.path.join(pair_path, "721_5fold", str(fold), "valid_links"),
+                    link_columns,
+                ),
+            )
+        )
+    return ERData(
+        attr_triples_1=attr_1,
+        attr_triples_2=attr_2,
+        rel_triples_1=rel_1,
+        rel_triples_2=rel_2,
+        ent_links=ent_links,
+        folds=folds,
+    )
diff --git a/src/moviegraphbenchmark/utils.py b/src/moviegraphbenchmark/utils.py
new file mode 100644
index 0000000..c32b66a
--- /dev/null
+++ b/src/moviegraphbenchmark/utils.py
@@ -0,0 +1,33 @@
+import os
+import requests
+import sys
+
+def download_file(url: str, dl_path: str, chunk_size: int = 1024):
+    filename = os.path.basename(url)
+    header = requests.head(url)
+    try:
+        filesize = int(header.headers["Content-Length"])
+    except KeyError:
+        filesize = int(header.raw.info()["Content-Length"])
+    try:
+        from tqdm import tqdm  # noqa: autoimport
+
+        with requests.get(url, stream=True) as r, open(
+            os.path.join(dl_path, filename), "wb"
+        ) as f, tqdm(
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            total=filesize,
+            file=sys.stdout,
+            desc=filename,
+        ) as progress:
+            for chunk in r.iter_content(chunk_size=chunk_size):
+                datasize = f.write(chunk)
+                progress.update(datasize)
+    except ImportError:
+        with requests.get(url, stream=True) as r, open(
+            os.path.join(dl_path, filename), "wb"
+        ) as f:
+            for chunk in r.iter_content(chunk_size=chunk_size):
+                f.write(chunk)
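A small usage sketch for the fold structure loaded above; the 70/20/10 proportions mirror the assertions in the removed `tests/test_one_liner.py` (see below) rather than anything this snippet verifies authoritatively:

```python
from moviegraphbenchmark import load_data

ds = load_data("imdb-tvdb")
fold = ds.folds[0]
total = len(fold.test_links) + len(fold.train_links) + len(fold.valid_links)
# Each 721_5fold directory splits ent_links roughly into 70% test,
# 20% train and 10% validation links:
print(len(fold.test_links) / total)   # ~0.7
print(len(fold.train_links) / total)  # ~0.2
print(len(fold.valid_links) / total)  # ~0.1
```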
diff --git a/tests/test_load.py b/tests/test_load.py
new file mode 100644
index 0000000..3d94e80
--- /dev/null
+++ b/tests/test_load.py
@@ -0,0 +1,18 @@
+import pytest
+from moviegraphbenchmark import load_data
+
+@pytest.mark.parametrize("pair", [None, "imdb-tmdb", "imdb-tvdb", "tmdb-tvdb"])
+def test_load(pair):
+    if pair is None:
+        ds = load_data()
+    else:
+        ds = load_data(pair=pair)
+    assert not ds.attr_triples_1.empty
+    assert not ds.attr_triples_2.empty
+    assert not ds.rel_triples_1.empty
+    assert not ds.rel_triples_2.empty
+    for fold in ds.folds:
+        assert not fold.test_links.empty
+        assert not fold.train_links.empty
+        assert not fold.valid_links.empty
+    assert not ds.ent_links.empty
diff --git a/tests/test_one_liner.py b/tests/test_one_liner.py
deleted file mode 100644
index 3faf29a..0000000
--- a/tests/test_one_liner.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import os
-import sys
-
-from main import main
-from pytest import approx
-
-sys.path.append("src")
-
-
-def line_count(path: str) -> int:
-    with open(path, "r") as in_file:
-        return len(in_file.readlines())
-
-
-def check_data(folder_name):
-    for ds_pair in ["imdb-tmdb", "imdb-tvdb", "tmdb-tvdb"]:
-        assert os.path.exists(folder_name)
-        ds_path = os.path.join(folder_name, ds_pair)
-        for fold in range(1, 6):
-            fold_path = os.path.join(ds_path, "721_5fold", str(fold))
-            test_links = line_count(os.path.join(fold_path, "test_links"))
-            train_links = line_count(os.path.join(fold_path, "train_links"))
-            valid_links = line_count(os.path.join(fold_path, "valid_links"))
-            total_links = test_links + train_links + valid_links
-            assert test_links == approx(total_links * 0.7, abs=1)
-            assert train_links == approx(total_links * 0.2, abs=1)
-            assert valid_links == approx(total_links * 0.1, abs=1)
-        # basically just testing if the files are there and not empty
-        for file_name in [
-            "attr_triples_1",
-            "attr_triples_2",
-            "rel_triples_1",
-            "rel_triples_2",
-        ]:
-            assert line_count(os.path.join(ds_path, file_name)) > 10000
-
-
-def test(tmpdir):
-    folder_path = os.path.join(tmpdir, "myfolder")
-    main(folder_path)
-    check_data(folder_path)
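The new test suite can be run with pytest from the repository root; a convenience sketch of a programmatic invocation (plain `pytest tests/` works the same):

```python
import sys

import pytest

# Equivalent to running `pytest tests/ -v` from the repository root.
sys.exit(pytest.main(["tests/", "-v"]))
```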