From d8d822b42a9e168bf492b41d7bb657d7b3ecde27 Mon Sep 17 00:00:00 2001
From: Maxim Smolskiy
Date: Tue, 15 Nov 2022 03:49:36 +0300
Subject: [PATCH] Fix the Errors with unexpected indentation when building
 Qlib's documentation (#1352)

* Fix ERROR: Unexpected indentation in qlib/data/dataset/handler.py

* Fix ERROR: Unexpected indentation in qlib/data/dataset/__init__.py

* Fix ERROR: Unexpected indentation in ../qlib/data/cache.py

* Fix ERROR: Unexpected indentation in qlib/model/meta/task.py

* Fix ERROR: Unexpected indentation in qlib/model/meta/dataset.py

* Fix ERROR: Unexpected indentation in qlib/workflow/online/manager.py

* Fix ERROR: Unexpected indentation in qlib/workflow/online/update.py

* Fix ERROR: Unexpected indentation in /qlib/workflow/__init__.py

* Fix ERROR: Unexpected indentation in qlib/data/base.py

* Fix ERROR: Unexpected indentation in qlib/data/dataset/loader.py

* Fix ERROR: Unexpected indentation in qlib/contrib/evaluate.py

* Fix ERROR: Unexpected indentation in qlib/workflow/record_temp.py

* Fix ERROR: Unexpected indentation in qlib/workflow/task/gen.py

* Fix ERROR: Unexpected indentation in qlib/strategy/base.py

* Fix qlib/data/dataset/handler.py

* Retest
---
 qlib/contrib/evaluate.py        |  8 ++++++--
 qlib/data/base.py               |  5 +++++
 qlib/data/cache.py              |  2 +-
 qlib/data/dataset/__init__.py   |  5 +++--
 qlib/data/dataset/handler.py    | 32 +++++++++++++++++++++++++-------
 qlib/data/dataset/loader.py     |  2 ++
 qlib/model/meta/dataset.py      |  4 ++++
 qlib/model/meta/task.py         |  5 ++++-
 qlib/strategy/base.py           |  6 +++++-
 qlib/workflow/__init__.py       |  1 +
 qlib/workflow/online/manager.py |  2 ++
 qlib/workflow/online/update.py  | 27 ++++++++++++++++++---------
 qlib/workflow/record_temp.py    |  2 ++
 qlib/workflow/task/gen.py       |  2 ++
 14 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/qlib/contrib/evaluate.py b/qlib/contrib/evaluate.py
index 2901a40eae..8e5cfd4fb5 100644
--- a/qlib/contrib/evaluate.py
+++ b/qlib/contrib/evaluate.py
@@ -187,9 +187,13 @@ def backtest_daily(
         the benchmark for reporting.
     account : Union[float, int, Position]
         information for describing how to creating the account
+
         For `float` or `int`:
+
             Using Account with only initial cash
+
         For `Position`:
+
             Using Account with a Position
     exchange_kwargs : dict
         the kwargs for initializing Exchange
@@ -283,8 +287,8 @@ def long_short_backtest(
         NOTE: This will be faster with offline qlib.
     :return: The result of backtest, it is represented by a dict.
             { "long": long_returns(excess),
-            "short": short_returns(excess),
-            "long_short": long_short_returns}
+              "short": short_returns(excess),
+              "long_short": long_short_returns}
     """
     if get_level_index(pred, level="datetime") == 1:
         pred = pred.swaplevel().sort_index()
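For context on the `account` parameter documented in the hunk above, the sketch below shows how `backtest_daily` can take plain cash; it assumes an initialized Qlib, daily data, and an existing prediction signal `pred_score`, none of which are part of this patch::

    # Illustrative sketch only -- assumes qlib.init() was called and that
    # `pred_score` is an existing prediction signal; not part of this patch.
    from qlib.contrib.evaluate import backtest_daily
    from qlib.contrib.strategy import TopkDropoutStrategy

    strategy = TopkDropoutStrategy(signal=pred_score, topk=50, n_drop=5)

    # `account` as a float/int: an Account created with only initial cash.
    # Passing a Position object instead would start from existing holdings.
    report, positions = backtest_daily(
        start_time="2017-01-01",
        end_time="2020-08-01",
        strategy=strategy,
        account=100_000_000,
    )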
diff --git a/qlib/data/base.py b/qlib/data/base.py
index cf32d333f7..496ae38ee2 100644
--- a/qlib/data/base.py
+++ b/qlib/data/base.py
@@ -16,8 +16,10 @@ class Expression(abc.ABC):
     Expression is designed to handle the calculation of data with the format below
     data with two dimension for each instrument,
+
     - feature
     - time:  it could be observation time or period time.
+
         - period time is designed for Point-in-time database. For example, the period time maybe 2014Q4, its value can observed for multiple times(different value may be observed at different time due to amendment).
     """
@@ -142,9 +144,12 @@ def load(self, instrument, start_index, end_index, *args):
         This function is responsible for loading feature/expression based on the expression engine.
 
         The concrete implementation will be separated into two parts:
+
         1) caching data, handle errors.
+
            - This part is shared by all the expressions and implemented in Expression
         2) processing and calculating data based on the specific expression.
+
            - This part is different in each expression and implemented in each expression
 
         Expression Engine is shared by different data.
diff --git a/qlib/data/cache.py b/qlib/data/cache.py
index 7c692377ad..e7336e8bed 100644
--- a/qlib/data/cache.py
+++ b/qlib/data/cache.py
@@ -394,7 +394,7 @@ def dataset(
         .. note:: The server use redis_lock to make sure read-write conflicts will not be triggered
-            but client readers are not considered.
+                  but client readers are not considered.
         """
         if disk_cache == 0:
             # skip cache
diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py
index dcc9957ed6..286418bcf8 100644
--- a/qlib/data/dataset/__init__.py
+++ b/qlib/data/dataset/__init__.py
@@ -205,8 +205,9 @@ def prepare(
         col_set : str
             The col_set will be passed to self.handler when fetching data.
             TODO: make it automatic:
-            - select DK_I for test data
-            - select DK_L for training data.
+
+            - select DK_I for test data
+            - select DK_L for training data.
         data_key : str
             The data to fetch:  DK_*
             Default is DK_I, which indicate fetching data for **inference**.
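The DK_I / DK_L distinction referenced in `prepare` above is exercised as in the following sketch; it assumes `dataset` is an existing `DatasetH` built on a `DataHandlerLP`, which is not constructed in this patch::

    # Illustrative sketch only -- `dataset` is assumed to be a configured
    # DatasetH instance; not part of this patch.
    from qlib.data.dataset.handler import DataHandlerLP

    # training segment goes through the learn processors (DK_L)
    df_train = dataset.prepare(
        "train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
    )
    # test segment goes through the inference processors (DK_I, the default)
    df_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)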
diff --git a/qlib/data/dataset/handler.py b/qlib/data/dataset/handler.py
index bb44cd893a..5d73ac6cea 100644
--- a/qlib/data/dataset/handler.py
+++ b/qlib/data/dataset/handler.py
@@ -160,13 +160,17 @@ def fetch(
         selector : Union[pd.Timestamp, slice, str]
             describe how to select data by index
             It can be categories as following
+
             - fetch single index
             - fetch a range of index
+
                 - a slice range
                 - pd.Index for specific indexes
 
             Following conflictions may occurs
-            - Does [20200101", "20210101"] mean selecting this slice or these two days?
+
+            - Does ["20200101", "20210101"] mean selecting this slice or these two days?
+
                 - slice have higher priorities
 
         level : Union[str, int]
@@ -178,7 +182,8 @@
             select a set of meaningful, pd.Index columns.(e.g. features, columns)
 
-            if col_set == CS_RAW:
+            - if col_set == CS_RAW:
+
                 the raw dataset will be returned.
 
             - if isinstance(col_set, List[str]):
@@ -186,8 +191,10 @@
                 select several sets of meaningful columns, the returned data has multiple levels
 
         proc_func: Callable
+
             - Give a hook for processing data before fetching
            - An example to explain the necessity of the hook:
+
                - A Dataset learned some processors to process data which is related to data segmentation
                - It will apply them every time when preparing data.
                - The learned processor require the dataframe remains the same format when fitting and applying
@@ -326,18 +333,23 @@ class DataHandlerLP(DataHandler):
     DataHandler with **(L)earnable (P)rocessor**
 
     This handler will produce three pieces of data in pd.DataFrame format.
+
     - DK_R / self._data: the raw data loaded from the loader
     - DK_I / self._infer: the data processed for inference
     - DK_L / self._learn: the data processed for learning model.
 
     The motivation of using different processor workflows for learning and inference
     Here are some examples.
+
     - The instrument universe for learning and inference may be different.
     - The processing of some samples may rely on label (for example, some samples hit the limit may need extra processing or be dropped).
-      These processors only apply to the learning phase.
+
+      - These processors only apply to the learning phase.
 
     Tips to improve the performance of data handler
+
     - To reduce the memory cost
+
         - `drop_raw=True`: this will modify the data inplace on raw data;
     """
@@ -482,12 +494,18 @@ def process_data(self, with_fit: bool = False):
         Notation: (data)  [processor]
 
         # data processing flow of self.process_type == DataHandlerLP.PTYPE_I
-        (self._data)-[shared_processors]-(_shared_df)-[learn_processors]-(_learn_df)
-                                       \
-                                        -[infer_processors]-(_infer_df)
+
+        .. code-block:: text
+
+            (self._data)-[shared_processors]-(_shared_df)-[learn_processors]-(_learn_df)
+                                           \\
+                                            -[infer_processors]-(_infer_df)
 
         # data processing flow of self.process_type == DataHandlerLP.PTYPE_A
-        (self._data)-[shared_processors]-(_shared_df)-[infer_processors]-(_infer_df)-[learn_processors]-(_learn_df)
+
+        .. code-block:: text
+
+            (self._data)-[shared_processors]-(_shared_df)-[infer_processors]-(_infer_df)-[learn_processors]-(_learn_df)
 
         Parameters
         ----------
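The selector / level / col_set semantics documented in `fetch` above look like this in use; the sketch assumes `handler` is an already-initialized `DataHandler`, which this patch does not construct::

    # Illustrative sketch only -- `handler` is assumed to be a configured
    # DataHandler instance; not part of this patch.
    df = handler.fetch(
        selector=slice("20200101", "20210101"),  # a slice: a range, not two days
        level="datetime",                        # index level the selector applies to
        col_set="feature",                       # one meaningful column set
    )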
diff --git a/qlib/data/dataset/loader.py b/qlib/data/dataset/loader.py
index c80d60bab8..074cfa6084 100644
--- a/qlib/data/dataset/loader.py
+++ b/qlib/data/dataset/loader.py
@@ -278,7 +278,9 @@ class DataLoaderDH(DataLoader):
     - If you just want to load data from single datahandler, you can write them in single data handler
 
     TODO: What make this module not that easy to use.
+
     - For online scenario
+
         - The underlayer data handler should be configured. But data loader doesn't provide such interface & hook.
     """
diff --git a/qlib/model/meta/dataset.py b/qlib/model/meta/dataset.py
index 8238428978..34a9b949b3 100644
--- a/qlib/model/meta/dataset.py
+++ b/qlib/model/meta/dataset.py
@@ -12,11 +12,15 @@ class MetaTaskDataset(Serializable, metaclass=abc.ABCMeta):
     A dataset fetching the data in a meta-level.
 
     A Meta Dataset is responsible for
+
     - input tasks(e.g. Qlib tasks) and prepare meta tasks
+
         - meta task contains more information than normal tasks (e.g. input data for meta model)
 
     The learnt pattern could transfer to other meta dataset. The following cases should be supported
+
     - A meta-model trained on meta-dataset A and then applied to meta-dataset B
+
         - Some pattern are shared between meta-dataset A and B, so meta-input on meta-dataset A are used when meta model are applied on meta-dataset-B
     """
diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py
index f59198830d..3204910010 100644
--- a/qlib/model/meta/task.py
+++ b/qlib/model/meta/task.py
@@ -11,9 +11,11 @@ class MetaTask:
     It serves as a component as in MetaDatasetDS
 
     The data processing is different
+
     - the processed input may be different between training and testing
+
         - When training, the X, y, X_test, y_test in training tasks are necessary (# PROC_MODE_FULL #)
-        but not necessary in test tasks. (# PROC_MODE_TEST #)
+          but not necessary in test tasks. (# PROC_MODE_TEST #)
         - When the meta model can be transferred into other dataset, only meta_info is necessary (# PROC_MODE_TRANSFER #)
     """
@@ -24,6 +26,7 @@ class MetaTask:
     def __init__(self, task: dict, meta_info: object, mode: str = PROC_MODE_FULL):
         """
         The `__init__` func is responsible for
+
         - store the task
         - store the origin input data for
         - process the input data for meta data
diff --git a/qlib/strategy/base.py b/qlib/strategy/base.py
index 532e88452e..a9e138fdbb 100644
--- a/qlib/strategy/base.py
+++ b/qlib/strategy/base.py
@@ -36,6 +36,7 @@ def __init__(
         outer_trade_decision : BaseTradeDecision, optional
             the trade decision of outer strategy which this strategy relies, and it will be traded in
             [start_time, end_time], by default None
+
             - If the strategy is used to split trade decision, it will be used
             - If the strategy is used for portfolio management, it can be ignored
         level_infra : LevelInfrastructure, optional
@@ -45,11 +46,13 @@ def __init__(
         trade_exchange : Exchange
             exchange that provides market info, used to deal order and generate report
+
             - If `trade_exchange` is None, self.trade_exchange will be set with common_infra
            - It allows different trade_exchanges is used in different executions.
            - For example:
+
                - In daily execution, both daily exchange and minutely are usable, but the daily exchange is
-                recommended because it run faster.
+                  recommended because it run faster.
                - In minutely execution, the daily exchange is not usable, only the minutely exchange is recommended.
        """
@@ -137,6 +140,7 @@ def generate_trade_decision(
         ----------
         execute_result : List[object], optional
             the executed result for trade decision, by default None
+
             - When call the generate_trade_decision firstly, `execute_result` could be None
         """
         raise NotImplementedError("generate_trade_decision is not implemented!")
diff --git a/qlib/workflow/__init__.py b/qlib/workflow/__init__.py
index 220949c143..aecf0ac992 100644
--- a/qlib/workflow/__init__.py
+++ b/qlib/workflow/__init__.py
@@ -350,6 +350,7 @@ def set_uri(self, uri: Optional[Text]):
         Method to reset the current uri of current experiment manager.
 
         NOTE:
+
         - When the uri is refer to a file path, please using the absolute path instead of strings like "~/mlruns/"
           The backend don't support strings like this.
         """
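The absolute-path requirement in the `set_uri` NOTE above can be satisfied as in this sketch; the directory name is only an example::

    # Illustrative sketch only -- the path is an example, not from the patch.
    from pathlib import Path
    from qlib.workflow import R

    # Strings like "~/mlruns/" are not supported by the backend;
    # expand to an absolute path first.
    R.set_uri(str(Path("~/mlruns").expanduser().resolve()))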
diff --git a/qlib/workflow/online/manager.py b/qlib/workflow/online/manager.py
index aeeb111b27..9a085ace51 100644
--- a/qlib/workflow/online/manager.py
+++ b/qlib/workflow/online/manager.py
@@ -78,7 +78,9 @@
 # Can we simplify current workflow?
+
 - Can reduce the number of state of tasks?
+
     - For each task, we have three phases (i.e. task, partly trained task, final trained task)
 """
diff --git a/qlib/workflow/online/update.py b/qlib/workflow/online/update.py
index 0360d69b77..5047a1bd25 100644
--- a/qlib/workflow/online/update.py
+++ b/qlib/workflow/online/update.py
@@ -82,19 +82,23 @@ def update(self, *args, **kwargs):
 
 class DSBasedUpdater(RecordUpdater, metaclass=ABCMeta):
     """
     Dataset-Based Updater
+
     - Providing updating feature for Updating data based on Qlib Dataset
 
     Assumption
+
     - Based on Qlib dataset
-    - The data to be updated is a multi-level index pd.DataFrame. For example label , prediction.
-
-                                 LABEL0
-        datetime    instrument
-        2021-05-10  SH600000    0.006965
-                    SH600004    0.003407
-        ...                     ...
-        2021-05-28  SZ300498    0.015748
-                    SZ300676   -0.001321
+    - The data to be updated is a multi-level index pd.DataFrame. For example label, prediction.
+
+        .. code-block::
+
+                                     LABEL0
+            datetime    instrument
+            2021-05-10  SH600000    0.006965
+                        SH600004    0.003407
+            ...                     ...
+            2021-05-28  SZ300498    0.015748
+                        SZ300676   -0.001321
     """
@@ -111,6 +115,7 @@ def __init__(
         Init PredUpdater.
 
         Expected behavior in following cases:
+
         - if `to_date` is greater than the max date in the calendar, the data will be updated to the latest date
         - if there are data before `from_date` or after `to_date`, only the data between `from_date` and `to_date`
           are affected.
@@ -118,11 +123,15 @@
         Args:
             record : Recorder
             to_date :
                 update to prediction to the `to_date`
+
                 if to_date is None:
+
                     data will updated to the latest date.
            from_date :
                the update will start from `from_date`
+
                if from_date is None:
+
                    the updating will occur on the next tick after the latest data in historical data
            hist_ref : int
                Sometimes, the dataset will have historical depends.
diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py
index 5f62e77589..2831482104 100644
--- a/qlib/workflow/record_temp.py
+++ b/qlib/workflow/record_temp.py
@@ -349,7 +349,9 @@ class PortAnaRecord(ACRecordTemp):
     This is the Portfolio Analysis Record class that generates the analysis results such as those of backtest. This class inherits the ``RecordTemp`` class.
 
     The following files will be stored in recorder
+
     - report_normal.pkl & positions_normal.pkl:
+
         - The return report and detailed positions of the backtest, returned by `qlib/contrib/evaluate.py:backtest`
     - port_analysis.pkl : The risk analysis of your portfolio, returned by `qlib/contrib/evaluate.py:risk_analysis`
     """
diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py
index 7ef7b4ed95..77bd2cbc11 100644
--- a/qlib/workflow/task/gen.py
+++ b/qlib/workflow/task/gen.py
@@ -94,7 +94,9 @@ def handler_mod(task: dict, rolling_gen):
     """
     Help to modify the handler end time when using RollingGen
     It try to handle the following case
+
     - Hander's data end_time is earlier than dataset's test_data's segments.
+
         - To handle this, handler's data's end_time is extended.
 
     If the handler's end_time is None, then it is not necessary to change it's end time.
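The `to_date` / `from_date` behavior documented in `DSBasedUpdater.__init__` (qlib/workflow/online/update.py above) can be driven as in this sketch; `rec` is assumed to be an existing `Recorder` with a stored prediction, which is outside this patch::

    # Illustrative sketch only -- `rec` is assumed to be a Recorder that
    # already holds a saved prediction; not part of this patch.
    from qlib.workflow.online.update import PredUpdater

    updater = PredUpdater(record=rec, to_date=None)  # None: up to the latest calendar date
    updater.update()  # only data between from_date and to_date are affected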