From 117f67d6e1eac9e780d3bdad7e02a77d46c2b97c Mon Sep 17 00:00:00 2001 From: Linlang Date: Thu, 23 May 2024 06:57:59 +0800 Subject: [PATCH 1/5] fix get data error --- examples/orderbook_data/README.md | 5 +++-- qlib/tests/data.py | 14 +++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/examples/orderbook_data/README.md b/examples/orderbook_data/README.md index 890e11f41e..53fd523d7f 100644 --- a/examples/orderbook_data/README.md +++ b/examples/orderbook_data/README.md @@ -16,7 +16,7 @@ Current version of script with default value tries to connect localhost **via de Run following command to install necessary libraries ``` -pip install pytest coverage +pip install pytest coverage gdown pip install arctic # NOTE: pip may fail to resolve the right package dependency !!! Please make sure the dependency are satisfied. ``` @@ -27,7 +27,8 @@ pip install arctic # NOTE: pip may fail to resolve the right package dependency 2. Please follow following steps to download example data ```bash cd examples/orderbook_data/ -python ../../scripts/get_data.py download_data --target_dir . --file_name highfreq_orderbook_example_data.zip +gdown https://drive.google.com/uc?id=15nZF7tFT_eKVZAcMFL1qPS4jGyJflH7e # Proxies may be necessary here. +python ../../scripts/get_data.py _unzip --file_path highfreq_orderbook_example_data.zip --target_dir . ``` 3. Please import the example data to your mongo db diff --git a/qlib/tests/data.py b/qlib/tests/data.py index f6bd780905..0d53b2dc62 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -12,15 +12,11 @@ from tqdm import tqdm from pathlib import Path from loguru import logger -from cryptography.fernet import Fernet from qlib.utils import exists_qlib_data class GetData: - REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" - # "?" is not included in the token. 
- TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" - KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" + REMOTE_URL = "https://github.com/SunsetWolf/qlib_dataset/releases/download" def __init__(self, delete_zip_file=False): """ @@ -33,9 +29,7 @@ def __init__(self, delete_zip_file=False): self.delete_zip_file = delete_zip_file def merge_remote_url(self, file_name: str): - fernet = Fernet(self.KEY) - token = fernet.decrypt(self.TOKEN).decode() - return f"{self.REMOTE_URL}/{file_name}?{token}" + return f"{self.REMOTE_URL}/{file_name}" def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): """ @@ -99,7 +93,9 @@ def check_dataset(self, file_name: str): return status @staticmethod - def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True): + def _unzip(file_path: [Path, str], target_dir: [Path, str], delete_old: bool = True): + file_path = Path(file_path) + target_dir = Path(target_dir) if delete_old: logger.warning( f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}" From 9bb4259080601e5ab18690d7523bd8516dcb3db3 Mon Sep 17 00:00:00 2001 From: Linlang Date: Thu, 23 May 2024 07:19:04 +0800 Subject: [PATCH 2/5] fix get v0 data error --- qlib/tests/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 0d53b2dc62..ed99e7cdfe 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -29,7 +29,7 @@ def __init__(self, delete_zip_file=False): self.delete_zip_file = delete_zip_file def merge_remote_url(self, file_name: str): - return f"{self.REMOTE_URL}/{file_name}" + return f"{self.REMOTE_URL}/{file_name}" if "/" in file_name else 
f"{self.REMOTE_URL}/v0/{file_name}" def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): """ From fb54d08236db58a48fc4638b792b71cad40dd9bf Mon Sep 17 00:00:00 2001 From: Linlang Date: Thu, 23 May 2024 16:01:57 +0800 Subject: [PATCH 3/5] optimize get_data code --- qlib/tests/data.py | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/qlib/tests/data.py b/qlib/tests/data.py index ed99e7cdfe..d58d58bb06 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -31,6 +31,34 @@ def __init__(self, delete_zip_file=False): def merge_remote_url(self, file_name: str): return f"{self.REMOTE_URL}/{file_name}" if "/" in file_name else f"{self.REMOTE_URL}/v0/{file_name}" + def download(self, url: str, target_path: [Path, str]): + """ + Download a file from the specified url. + + Parameters + ---------- + url: str + The url of the data. + target_path: str + The location where the data is saved, including the file name. + """ + file_name = str(target_path).split("/")[-1] + resp = requests.get(url, stream=True, timeout=60) + resp.raise_for_status() + if resp.status_code != 200: + raise requests.exceptions.HTTPError() + + chunk_size = 1024 + logger.warning( + f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)" + ) + logger.info(f"{os.path.basename(file_name)} downloading......") + with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: + with target_path.open("wb") as fp: + for chunk in resp.iter_content(chunk_size=chunk_size): + fp.write(chunk) + p_bar.update(chunk_size) + def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): """ Download the specified file to the target folder. 
@@ -64,21 +92,7 @@ def download_data(self, file_name: str, target_dir: [Path, str], delete_old: boo target_path = target_dir.joinpath(_target_file_name) url = self.merge_remote_url(file_name) - resp = requests.get(url, stream=True, timeout=60) - resp.raise_for_status() - if resp.status_code != 200: - raise requests.exceptions.HTTPError() - - chunk_size = 1024 - logger.warning( - f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)" - ) - logger.info(f"{os.path.basename(file_name)} downloading......") - with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: - with target_path.open("wb") as fp: - for chunk in resp.iter_content(chunk_size=chunk_size): - fp.write(chunk) - p_bar.update(chunk_size) + self.download(url=url, target_path=target_path) self._unzip(target_path, target_dir, delete_old) if self.delete_zip_file: From d0b84d5696746239e420bfe8b1d5271a887c1733 Mon Sep 17 00:00:00 2001 From: Linlang Date: Thu, 23 May 2024 16:21:54 +0800 Subject: [PATCH 4/5] fix pylint error --- qlib/tests/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/tests/data.py b/qlib/tests/data.py index d58d58bb06..7fedb56d0a 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -42,7 +42,7 @@ def download(self, url: str, target_path: [Path, str]): target_path: str The location where the data is saved, including the file name. 
""" - file_name = str(target_path).split("/")[-1] + file_name = str(target_path).rsplit("/", maxsplit=1)[-1] resp = requests.get(url, stream=True, timeout=60) resp.raise_for_status() if resp.status_code != 200: From 5f9219acf252836d92c47945063116a0a6a06a45 Mon Sep 17 00:00:00 2001 From: Linlang Date: Thu, 23 May 2024 20:47:32 +0800 Subject: [PATCH 5/5] add comments --- qlib/tests/data.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 7fedb56d0a..2fa76855b5 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -29,6 +29,16 @@ def __init__(self, delete_zip_file=False): self.delete_zip_file = delete_zip_file def merge_remote_url(self, file_name: str): + """ + Generate download links. + + Parameters + ---------- + file_name: str + The name of the file to be downloaded. + The file name may carry a version prefix (e.g. v2/qlib_data_simple_cn_1d_latest.zip); + if no version prefix is given, the file is downloaded from v0 by default. + """ return f"{self.REMOTE_URL}/{file_name}" if "/" in file_name else f"{self.REMOTE_URL}/v0/{file_name}" def download(self, url: str, target_path: [Path, str]):