From 48f869488e5f01c2b70eee7cba747891c93755cd Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Thu, 1 Jul 2021 17:03:26 +0800 Subject: [PATCH 01/37] Merge data selection to main --- docs/component/meta.rst | 53 ++++ docs/index.rst | 3 +- qlib/contrib/meta/TCTS/model.py | 123 ++++++++++ qlib/contrib/meta/TCTS/net.py | 54 +++++ qlib/contrib/meta/__init__.py | 4 + qlib/contrib/meta/data_selection/__init__.py | 6 + qlib/contrib/meta/data_selection/dataset.py | 240 +++++++++++++++++++ qlib/contrib/meta/data_selection/model.py | 212 ++++++++++++++++ qlib/contrib/meta/data_selection/net.py | 58 +++++ qlib/contrib/meta/data_selection/task.py | 50 ++++ qlib/contrib/meta/data_selection/utils.py | 125 ++++++++++ qlib/contrib/model/catboost_model.py | 15 +- qlib/contrib/model/gbdt.py | 23 +- qlib/contrib/model/pytorch_alstm_ts.py | 43 +++- qlib/contrib/model/pytorch_gru_ts.py | 43 +++- qlib/contrib/model/pytorch_lstm_ts.py | 41 +++- qlib/contrib/model/pytorch_nn.py | 13 +- qlib/contrib/model/xgboost.py | 15 +- qlib/data/dataset/weight.py | 102 ++++++++ qlib/model/base.py | 3 +- qlib/model/meta/__init__.py | 8 + qlib/model/meta/dataset.py | 57 +++++ qlib/model/meta/model.py | 58 +++++ qlib/model/meta/task.py | 42 ++++ qlib/model/utils.py | 15 ++ 25 files changed, 1361 insertions(+), 45 deletions(-) create mode 100644 docs/component/meta.rst create mode 100644 qlib/contrib/meta/TCTS/model.py create mode 100644 qlib/contrib/meta/TCTS/net.py create mode 100644 qlib/contrib/meta/__init__.py create mode 100644 qlib/contrib/meta/data_selection/__init__.py create mode 100644 qlib/contrib/meta/data_selection/dataset.py create mode 100644 qlib/contrib/meta/data_selection/model.py create mode 100644 qlib/contrib/meta/data_selection/net.py create mode 100644 qlib/contrib/meta/data_selection/task.py create mode 100644 qlib/contrib/meta/data_selection/utils.py create mode 100644 qlib/data/dataset/weight.py create mode 100644 qlib/model/meta/__init__.py create mode 100644 
qlib/model/meta/dataset.py create mode 100644 qlib/model/meta/model.py create mode 100644 qlib/model/meta/task.py create mode 100644 qlib/model/utils.py diff --git a/docs/component/meta.rst b/docs/component/meta.rst new file mode 100644 index 0000000000..0d57b4499b --- /dev/null +++ b/docs/component/meta.rst @@ -0,0 +1,53 @@ +.. _meta: + +====================================================== +Meta Controller: Meta-Task & Meta-Dataset & Meta-Model +====================================================== +.. currentmodule:: qlib + + +Introduction +============= +TODO: Add introduction. + +Meta Task +============= + +A `Meta Task` instance is the basic element in the meta-learning framework. It saves the data that can be used for the `Meta Model`. Multiple `Meta Task` instances may share the same `Data Handler`, controlled by `Meta Dataset`. Users should use `prepare_task_data()` to obtain the data that can be directly fed into the `Meta Model`. + +.. autoclass:: qlib.model.meta.task.MetaTask + :members: + +Meta Dataset +============= + +`Meta Dataset` controls the meta-information generating process. It is responsible for providing the data used to train the `Meta Model`. Users should use `prepare_tasks` to retrieve a list of `Meta Task` instances. + +.. autoclass:: qlib.model.meta.dataset.MetaDataset + :members: + +Meta Model +============= + +General Meta Model +------------------ +A `Meta Model` instance is the part that controls the workflow. The usage of the `Meta Model` includes: +1. Users train their `Meta Model` with the `fit` function. +2. The `Meta Model` instance guides the workflow by giving useful information via the `inference` function. + +.. autoclass:: qlib.model.meta.model.MetaModel + :members: + +Meta Task Model +------------------ +This type of meta-model may interact with task definitions directly. Such meta-models should inherit from the `Meta Task Model` class. They guide the base tasks by modifying the base task definitions. 
The function `prepare_tasks` can be used to obtain the modified base task definitions. + +.. autoclass:: qlib.model.meta.model.MetaTaskModel + :members: + +Meta Guide Model +------------------ +This type of meta-model participates in the training process of the base forecasting model. The meta-model may guide the base forecasting models during their training to improve their performances. + +.. autoclass:: qlib.model.meta.model.MetaGuideModel + :members: \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 803aa97d2d..f5606aede0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -36,10 +36,11 @@ Document Structure :caption: COMPONENTS: Workflow: Workflow Management - Data Layer: Data Framework&Usage + Data Layer: Data Framework & Usage Forecast Model: Model Training & Prediction Strategy: Portfolio Management Intraday Trading: Model&Strategy Testing + Meta Controller: Meta-Task & Meta-Dataset & Meta-Model Qlib Recorder: Experiment Management Analysis: Evaluation & Results Analysis Online Serving: Online Management & Strategy & Tool diff --git a/qlib/contrib/meta/TCTS/model.py b/qlib/contrib/meta/TCTS/model.py new file mode 100644 index 0000000000..b270a114be --- /dev/null +++ b/qlib/contrib/meta/TCTS/model.py @@ -0,0 +1,123 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pandas as pd +import numpy as np +import torch +from torch import nn +from torch import optim +import copy +import logging + +from .net import MLPModel + +from ....data.dataset import DatasetH + + +class MetaModelTCTS(MetaGuideModel): + """ + The meta-model for TCTS + """ + + def __init__( + self, + d_feat=6, + hidden_size=64, + num_layers=2, + dropout=0.0, + n_epochs=200, + batch_size=2000, + early_stop=20, + loss="mse", + optimizer="adam", + output_dim=5, + lr=5e-7, + steps=3, + GPU=0, + seed=None, + target_label=0, + **kwargs + ): + # Set logger. 
+ self.logger = get_module_logger("TCTS") + self.logger.info("TCTS pytorch version...") + + # set hyper-parameters. + self.d_feat = d_feat + self.hidden_size = hidden_size + self.num_layers = num_layers + self.dropout = dropout + self.n_epochs = n_epochs + self.batch_size = batch_size + self.early_stop = early_stop + self.loss = loss + self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu") + self.use_gpu = torch.cuda.is_available() + self.seed = seed + self.output_dim = output_dim + self.lr = lr + self.steps = steps + self.target_label = target_label + + self.logger.info( + "TCTS parameters setting:" + "\nd_feat : {}" + "\nhidden_size : {}" + "\nnum_layers : {}" + "\ndropout : {}" + "\nn_epochs : {}" + "\nbatch_size : {}" + "\nearly_stop : {}" + "\nloss_type : {}" + "\nvisible_GPU : {}" + "\nuse_GPU : {}" + "\nseed : {}".format( + d_feat, + hidden_size, + num_layers, + dropout, + n_epochs, + batch_size, + early_stop, + loss, + GPU, + self.use_gpu, + seed, + ) + ) + + if self.seed is not None: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + + self.weight_model = MLPModel( + d_feat=360 + 2 * self.output_dim + 1, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + dropout=self.dropout, + output_dim=self.output_dim, + ) + if optimizer.lower() == "adam": + self.optimizer = optim.Adam(self.weight_model.parameters(), lr=self.lr) + elif optimizer.lower() == "gd": + self.optimizer = optim.SGD(self.weight_model.parameters(), lr=self.lr) + else: + raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) + + self.fitted = False + self.weight_model.to(self.device) + + def loss_fn(self, pred, label, weight): + + loc = torch.argmax(weight, 1) + loss = (pred - label[np.arange(weight.shape[0]), loc]) ** 2 + return torch.mean(loss) + + def fit( + self, + dataset: DatasetH, + evals_result=dict(), + verbose=True, + save_path=None, + ): + pass diff --git a/qlib/contrib/meta/TCTS/net.py 
b/qlib/contrib/meta/TCTS/net.py new file mode 100644 index 0000000000..24128c63b7 --- /dev/null +++ b/qlib/contrib/meta/TCTS/net.py @@ -0,0 +1,54 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +import torch.optim as optim + + +class MLPModel(nn.Module): + def __init__(self, d_feat, hidden_size=256, num_layers=3, dropout=0.0, output_dim=1): + super().__init__() + + self.mlp = nn.Sequential() + self.softmax = nn.Softmax(dim=1) + + for i in range(num_layers): + if i > 0: + self.mlp.add_module("drop_%d" % i, nn.Dropout(dropout)) + self.mlp.add_module("fc_%d" % i, nn.Linear(d_feat if i == 0 else hidden_size, hidden_size)) + self.mlp.add_module("relu_%d" % i, nn.ReLU()) + + self.mlp.add_module("fc_out", nn.Linear(hidden_size, output_dim)) + + def forward(self, x): + # feature + # [N, F] + out = self.mlp(x).squeeze() + out = self.softmax(out) + return out + + +class GRUModel(nn.Module): + def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0): + super().__init__() + + self.rnn = nn.GRU( + input_size=d_feat, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + dropout=dropout, + ) + self.fc_out = nn.Linear(hidden_size, 1) + + self.d_feat = d_feat + + def forward(self, x): + # x: [N, F*T] + x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] + x = x.permute(0, 2, 1) # [N, T, F] + out, _ = self.rnn(x) + return self.fc_out(out[:, -1, :]).squeeze() diff --git a/qlib/contrib/meta/__init__.py b/qlib/contrib/meta/__init__.py new file mode 100644 index 0000000000..06a2ea30be --- /dev/null +++ b/qlib/contrib/meta/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +from .data_selection import MetaTaskDS, MetaDatasetHDS, MetaModelDS diff --git a/qlib/contrib/meta/data_selection/__init__.py b/qlib/contrib/meta/data_selection/__init__.py new file mode 100644 index 0000000000..eaf702c7f6 --- /dev/null +++ b/qlib/contrib/meta/data_selection/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from .task import MetaTaskDS +from .dataset import MetaDatasetHDS +from .model import MetaModelDS diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py new file mode 100644 index 0000000000..da117f1506 --- /dev/null +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -0,0 +1,240 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pandas as pd +import numpy as np +import time +import copy +from typing import Union, List, Tuple, Text + +from ....data.dataset import DatasetH +from ....data.dataset.handler import DataHandlerLP +from ....data import D +from ....utils import init_instance_by_config +from ....workflow.task.utils import TimeAdjuster +from ....model.meta.dataset import MetaDatasetH + +from .utils import fill_diagnal +from .task import MetaTaskDS + + +class MetaDatasetHDS(MetaDatasetH): + """ + The MetaDatasetH for the meta-Learning-based data selection. + """ + + def __init__(self, rolling_dict: dict, sim_mat=None, rolling_len=20, horizon=20, HIST_N=30, *args, **kwargs): + """ + + Parameters + ---------- + rolling_dict: dict + A dict that defines the train, valid (training for meta-model), and test scope. + sim_mat: Union[pd.Dataframe, NoneType] + The similarity matrix. The similarity matrix will be calculated if None is passed in. + rolling_len: int + The length of the test period in each rolling task. + horizon: int + The horizon of the label, the rolling process will create a gap between the training data and test data in order to avoid accessing the future information. 
+ HIST_N: int + The number of periods that the meta-model will use. + """ + super().__init__(*args, **kwargs) + self.rolling_len = rolling_len + self.rolling_dict = rolling_dict + self.horizon = horizon + self.HIST_N = HIST_N + if sim_mat is None: + self._init_sim_mat() + else: + self.sim_mat = sim_mat + self.meta_tasks_l = self._generate_tasks_from_sim_mat() + self.meta_tasks = self._init_meta_task_list() + + def _generate_tasks_from_sim_mat(self): + ta = TimeAdjuster() + cal = ta.cals + dates = list(cal) + meta_tasks_l = [] + rolling_start = self.rolling_dict["dataset"]["kwargs"]["segments"]["valid"][0] + for (start, end) in self.sim_mat.columns: + if start >= pd.Timestamp(rolling_start): + meta_task = copy.deepcopy(self.rolling_dict)["dataset"] # Be careful! + rolling_start_idx = ta.align_idx(start) + train_end = ta.get(rolling_start_idx - self.horizon) + meta_task["kwargs"]["segments"]["train"] = ( + pd.Timestamp(meta_task["kwargs"]["segments"]["train"][0]), + train_end, + ) + meta_task["kwargs"]["segments"]["test"] = (start, end) + meta_task["kwargs"]["segments"].pop("valid") + meta_tasks_l.append(meta_task) + return meta_tasks_l + + def get_sim_mat_from_tasks(self): + """ + Get the similarity matrix from the initialized tasks. + """ + sim_mat = {} + for task in self.sim_tasks: + sim_mean_series = pd.Series(task["sim_mean"]) + sim_mat[task["train_period"]] = sim_mean_series + sim_mat_df = pd.DataFrame(sim_mat) + return sim_mat_df + + def _init_sim_mat(self): + """ + Initialize the similarity matrix. + """ + self._generate_sim_task() + self._calc_sim_mat() + self.sim_mat = self.get_sim_mat_from_tasks() + + def _generate_sim_task(self): + """ + Generate the the definition of the similarity matrix. 
+ """ + ta = TimeAdjuster() + cal = ta.cals + dates = list(cal) + self.sim_tasks = [] + rolling_dict = copy.deepcopy(self.rolling_dict) + train_start, train_end = rolling_dict["dataset"]["kwargs"]["segments"]["train"] + valid_start, valid_end = rolling_dict["dataset"]["kwargs"]["segments"]["valid"] + test_start, test_end = rolling_dict["dataset"]["kwargs"]["segments"]["test"] + train_start_idx, train_end_idx = ta.align_idx(train_start), ta.align_idx(train_end) + valid_start_idx, valid_end_idx = ta.align_idx(valid_start), ta.align_idx(valid_end) + test_start_idx, test_end_idx = ta.align_idx(test_start), ta.align_idx(test_end) + start_idx = train_start_idx + ((test_start_idx - train_start_idx) % self.rolling_len) # To align at test start + + def get_rolling_periods(): + rolling_periods = [] + if start_idx - 1 > train_start_idx: + rolling_periods.append((dates[train_start_idx], dates[start_idx - 1])) + for t_start, t_end in zip( + dates[start_idx : test_end_idx + 1 : self.rolling_len], + dates[start_idx + self.rolling_len - 1 : test_end_idx + 1 : self.rolling_len], + ): + rolling_periods.append((t_start, t_end)) + t_end_idx = ta.align_idx(t_end) + if t_end_idx + 1 < test_end_idx: + rolling_periods.append((dates[t_end_idx + 1], dates[test_end_idx])) + return rolling_periods + + rolling_periods = get_rolling_periods() + for period in rolling_periods: + sim_task = {"train_period": period, "rolling_periods": rolling_periods} + self.sim_tasks.append(sim_task) + + def _calc_sim_mat(self): + """ + Calculate the similarity matrix. 
+ """ + print("Calculating the similarity matrix...") + start_time = time.time() + for index, task in enumerate(self.sim_tasks): + # Prepare the dataset + rolling_dict = copy.deepcopy(self.rolling_dict) + task["dataset"] = rolling_dict["dataset"] + task["dataset"]["kwargs"]["handler"] = self.data_handler + task_seg = { + "train": task["train_period"], + "test": (task["rolling_periods"][0][0], task["rolling_periods"][-1][1]), + } + task["dataset"]["kwargs"]["segments"] = task_seg + task["dataset"] = init_instance_by_config(task["dataset"]) + + # Train & inference the model + task["model"] = init_instance_by_config(rolling_dict["model"]) + task["model"].fit(task["dataset"]) + pred = task["model"].predict(task["dataset"]) + label = task["dataset"].prepare("test", col_set="label", data_key=DataHandlerLP.DK_I).iloc[:, 0] + + # Calculate the similarity + sim_mean = {} + for (rolling_start, rolling_end) in task["rolling_periods"]: + df = pd.DataFrame( + {"pred": pred.loc[rolling_start:rolling_end], "label": label.loc[rolling_start:rolling_end]} + ) + sims = df.groupby("datetime").apply(lambda df: df["pred"].corr(df["label"], method="spearman")) + sim_mean[(rolling_start, rolling_end)] = sims.mean() + task["sim_mean"] = sim_mean + end_time = time.time() + print("The similarity matrix calculating process is finished. Total time: %.2fs." 
% (end_time - start_time)) + + def _init_meta_task_list(self, *args, **kwargs): + meta_tasks = [] + for task in self.meta_tasks_l: + meta_task = self._init_meta_task(task) + if meta_task is not None: + meta_tasks.append(meta_task) + if meta_tasks == []: + raise AssertionError("No meta-task is created!") + return meta_tasks + + def _init_meta_task(self, meta_task: dict, *args, **kwargs) -> MetaTaskDS: + meta_task["kwargs"]["handler"] = self.data_handler + test_date = meta_task["kwargs"]["segments"]["test"] + sim_mat_fill = fill_diagnal(self.sim_mat) # Remove the future information + sim_mat_focus = sim_mat_fill.loc[:test_date, :test_date] + + task_def = { + # Because the last month may leak future information, so -1 is excluded + "insample": list(sim_mat_focus.index[:-2]), + "outsample": test_date, # sim_mat_focus.index[-1], + } + + time_perf = None # For possible spatical extension + task_idx = len(sim_mat_focus) + if task_idx > self.HIST_N: + time_perf = sim_mat_focus.iloc[-self.HIST_N - 1 : -1].loc[:, task_def["insample"]] + if time_perf is None: # Only qualified meta-task will be created + return None + return MetaTaskDS(task_def, time_perf, meta_task) + + def _prepare_seg(self, segment: str, *args, **kwargs): + assert len(self.meta_tasks_l) == len(self.meta_tasks) + meta_tasks = [] + test_start_date = pd.Timestamp(self.rolling_dict["dataset"]["kwargs"]["segments"]["test"][0]) + for index, task_def in enumerate(self.meta_tasks_l): + task_date = pd.Timestamp(task_def["kwargs"]["segments"]["test"][0]) + if (segment == "train" and task_date < test_start_date) or ( + segment == "test" and task_date >= test_start_date + ): + meta_tasks.append(self.meta_tasks[index]) + return meta_tasks + + def prepare_tasks(self, segments: Union[List[Text], Tuple[Text], Text], *args, **kwargs) -> List[tuple]: + """ + Prepare the meta-tasks. 
+ """ + if isinstance(segments, (list, tuple)): + return [self._prepare_seg(seg) for seg in segments] + elif isinstance(segments, str): + return self._prepare_seg(segments) + else: + raise NotImplementedError(f"This type of input is not supported") + + def get_test_period_from_meta_tasks(self): + return [task["kwargs"]["segments"]["test"] for task in self.meta_tasks_l] + + def get_meta_task_by_test_period(self, test_period: Union[list, tuple]): + """ + Get the meta-task by the given key (test period). Return None if the meta-task is not found. + Assume the task instances in meta_tasks and the task definitions in meta_tasks_l are corresponding. + """ + # Find the exact one + period_tuple = tuple([pd.Timestamp(t) for t in test_period]) + periods = self.get_test_period_from_meta_tasks() + for index, key in enumerate(periods): + if key == period_tuple: + return self.meta_tasks[index] + # If there is no exact one, find the nearest one + nearest_idx = None + for index, key in enumerate(periods): + if key[0] <= period_tuple[0]: + if nearest_idx is None or periods[nearest_idx][0] < key[0]: + nearest_idx = index + if nearest_idx is not None: + return self.meta_tasks[nearest_idx] + return None diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py new file mode 100644 index 0000000000..ea4ab0f062 --- /dev/null +++ b/qlib/contrib/meta/data_selection/model.py @@ -0,0 +1,212 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import pandas as pd +import numpy as np +import torch +from torch import nn +from torch import optim +from tqdm.auto import tqdm +import collections +import copy +from typing import Union, List, Tuple, Dict + +from ....data.dataset.weight import SampleReweighter, Reweighter +from ....model.meta.dataset import MetaDataset +from ....model.meta.model import MetaModel, MetaTaskModel +from ....workflow import R + +from .utils import fill_diagnal, convert_data_to_tensor, ICLoss +from .dataset import MetaDatasetHDS +from .net import PredNet + + +class MetaModelDS(MetaTaskModel): + """ + The meta-model for meta-learning-based data selection. + """ + + def __init__( + self, + hist_n=30, + clip_method="tanh", + clip_weight=2.0, + criterion="ic_loss", + lr=0.0001, + max_epoch=150, + ): + self.hist_n = hist_n + self.clip_method = clip_method + self.clip_weight = clip_weight + self.criterion = criterion + self.lr = lr + self.max_epoch = max_epoch + self.fitted = False + + def fit(self, meta_dataset: MetaDatasetHDS): + """ + The meta-learning-based data selection interacts directly with meta-dataset due to the close-form proxy measurement. + + Parameters + ---------- + meta_dataset : MetaDatasetHDS + The meta-model takes the meta-dataset for its training process. 
+ """ + recorder = R.get_recorder() + if not self.fitted: + for k in set(["lr", "hist_n", "clip_method", "clip_weight", "criterion", "max_epoch"]): + recorder.log_params(**{k: getattr(self, k)}) + + # Training begins + meta_tasks = meta_dataset.prepare_tasks(["train", "test"]) + num2phase = {0: "train", 1: "test"} + phase2num = dict(zip(num2phase.values(), num2phase.keys())) + train_step = 0 + self.tn = PredNet(hist_n=self.hist_n, clip_weight=self.clip_weight, clip_method=self.clip_method) + opt = optim.Adam(self.tn.parameters(), lr=self.lr) + loss_l = {} + for epoch in tqdm(range(self.max_epoch), desc="epoch"): + for phase, task_list in enumerate(meta_tasks): + if phase == phase2num["train"]: # phase 0 for training, 1 for inference + self.tn.train() + torch.set_grad_enabled(True) + else: + self.tn.eval() + torch.set_grad_enabled(False) + running_loss = 0.0 + pred_y_all = [] + for task in tqdm(task_list, desc=f"{num2phase[phase]} Task", leave=False): + ( + X, + y, + time_perf, + time_belong, + X_test, + y_test, + test_idx, + train_idx, + test_period, + ) = task.prepare_task_data() + pred, weights = self.tn(X, y, time_perf, time_belong, X_test) + if self.criterion == "mse": + criterion = nn.MSELoss() + loss = criterion(pred, y_test) + elif self.criterion == "ic_loss": + criterion = ICLoss() + loss = criterion(pred, y_test, test_idx) + + if phase == phase2num["train"]: + opt.zero_grad() + norm_loss = nn.MSELoss() + loss.backward() + opt.step() + train_step += 1 + elif phase == phase2num["test"]: + pass # pass, leave the work for the inference function + # self.reweighters[test_period] = SampleReweighter( + # pd.Series(weights.detach().cpu().numpy(), index=train_idx) + # ) + + pred_y_all.append( + pd.DataFrame( + { + "pred": pd.Series(pred.detach().cpu().numpy(), index=test_idx), + "label": pd.Series(y_test.detach().cpu().numpy(), index=test_idx), + } + ) + ) + running_loss += loss.detach().item() + running_loss = running_loss / len(task_list) + 
loss_l.setdefault(phase, []).append(running_loss) + + pred_y_all = pd.concat(pred_y_all) + ic = ( + pred_y_all.groupby("datetime") + .apply(lambda df: df["pred"].corr(df["label"], method="spearman")) + .mean() + ) + + recorder.log_metrics(**{f"loss/{num2phase[phase]}": running_loss, "step": epoch}) + recorder.log_metrics(**{f"ic/{num2phase[phase]}": ic, "step": epoch}) + recorder.save_objects(**{"model.pkl": self.tn}) + self.fitted = True + + def _inference_single_task(self, meta_id: tuple, meta_dataset: MetaDatasetHDS): + meta_task = meta_dataset.get_meta_task_by_test_period(meta_id) + if meta_task is not None: + self.tn.eval() + torch.set_grad_enabled(False) + ( + X, + y, + time_perf, + time_belong, + X_test, + y_test, + test_idx, + train_idx, + test_period, + ) = meta_task.prepare_task_data() + weights = self.tn.get_sample_weights(X, time_perf, time_belong) + reweighter = SampleReweighter(pd.Series(weights.detach().cpu().numpy(), index=train_idx)) + return reweighter + else: + raise ValueError("The current task is not supported!") + + def inference(self, meta_ids: Union[List[tuple], tuple], meta_dataset: MetaDatasetHDS): + """ + Inference a single task with meta-dataset. The meta-model must be fitted. + + Parameters + ---------- + tasks: Union[List[dict], dict] + A list of definitions. + meta_dataset: MetaDatasetHDS + """ + if not self.fitted: + raise ValueError("The meta-model is not fitted yet!") + if isinstance(meta_ids, tuple): + return {meta_ids: self._inference_single_task(meta_ids, meta_dataset)} + + elif isinstance(meta_ids, list): + reweighters = {} + for meta_id in meta_ids: + reweighters[meta_id] = self._inference_single_task(meta_id, meta_dataset) + return reweighters + else: + raise NotImplementedError("This type of task definition is not supported!") + + def prepare_tasks(self, task: Union[List[dict], dict], reweighters: dict): + """ + + Parameters + ---------- + tasks: Union[List[dict], dict] + A list of definitions. 
+ """ + if not self.fitted: + raise ValueError("The meta-model is not fitted yet!") + if isinstance(task, dict): + task_c = copy.deepcopy(task) + test_period = task_c["dataset"]["kwargs"]["segments"]["test"] + if test_period in reweighters: + task_c["reweighter"] = reweighters[test_period] + else: + nearest_key = None + for key in reweighters: + if key[0] <= test_period[0]: + if nearest_key is None or nearest_key[0] < key[0]: + nearest_key = key + if nearest_key is not None: + task_c["reweighter"] = reweighters[nearest_key] + else: + print( + "Warning: The task with test period:", + test_period, + " does not have the corresponding reweighter!", + ) + return task_c + elif isinstance(task, list): + return [self.prepare_tasks(i, reweighters) for i in task] + else: + raise NotImplementedError("This type of task definition is not supported!") diff --git a/qlib/contrib/meta/data_selection/net.py b/qlib/contrib/meta/data_selection/net.py new file mode 100644 index 0000000000..ba78a2ce4c --- /dev/null +++ b/qlib/contrib/meta/data_selection/net.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import pandas as pd +import numpy as np +import torch +from torch import nn + +from .utils import preds_to_weight_with_clamp, SingleMetaBase + + +class TimeWeightMeta(SingleMetaBase): + def __init__(self, hist_n, clip_weight=None, clip_method="clamp"): + # method 可以选 tanh 或者 clamp + super().__init__(hist_n, clip_weight, clip_method) + self.linear = nn.Linear(hist_n, 1) + self.k = nn.Parameter(torch.Tensor([8.0])) + + def forward(self, time_perf, time_belong, return_preds=False): + # time_perf的格式和其他的有一些不一样 + # 需要自己拆出train和test + preds = [] + for i in range(time_perf.shape[1]): + preds.append(self.linear(time_perf[:, i])) + preds = torch.cat(preds) + preds = preds - torch.mean(preds) # 这里注意一下不要引入未来信息 + preds = preds * self.k + if return_preds: + return time_belong @ preds + else: + weights = preds_to_weight_with_clamp(preds, self.clip_weight, self.clip_method) + sample_weights = time_belong @ weights + return sample_weights + + +class PredNet(nn.Module): + def __init__(self, hist_n, clip_weight=None, clip_method="tanh"): + super().__init__() + self.twm = TimeWeightMeta(hist_n=hist_n, clip_weight=clip_weight, clip_method=clip_method) + self.init_paramters(hist_n) + + def get_sample_weights(self, X, time_perf, time_belong, ignore_weight=False): + weights = torch.from_numpy(np.ones(X.shape[0])).float().to(X.device) + if not ignore_weight: + if time_perf is not None: + weights_t = self.twm(time_perf, time_belong) + weights = weights * weights_t + return weights + + def forward(self, X, y, time_perf, time_belong, X_test, ignore_weight=False): + weights = self.get_sample_weights(X, time_perf, time_belong, ignore_weight=ignore_weight) + X_w = X.T * weights.view(1, -1) + theta = torch.inverse(X_w @ X) @ X_w @ y + return X_test @ theta, weights + + def init_paramters(self, hist_n): + self.twm.linear.weight.data = 1.0 / hist_n + self.twm.linear.weight.data * 0.01 + self.twm.linear.bias.data.fill_(0.0) diff --git a/qlib/contrib/meta/data_selection/task.py 
b/qlib/contrib/meta/data_selection/task.py new file mode 100644 index 0000000000..37b35adad7 --- /dev/null +++ b/qlib/contrib/meta/data_selection/task.py @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pandas as pd +import numpy as np + +from ....model.meta.task import MetaTask +from ....data.dataset.handler import DataHandlerLP + +from .utils import fill_diagnal, convert_data_to_tensor + + +class MetaTaskDS(MetaTask): + """ + The MetaTask for the meta-learning-based data selection. + """ + + def __init__(self, task_def: dict, time_perf, *args, **kwargs): + super().__init__(*args, **kwargs) + self.task_def = task_def + self.time_perf = time_perf + self._prepare_meta_task() + + def _prepare_meta_task(self): + self.X, self.X_test = self.dataset.prepare(["train", "test"], col_set="feature", data_key=DataHandlerLP.DK_L) + self.y, self.y_test = self.dataset.prepare(["train", "test"], col_set="label", data_key=DataHandlerLP.DK_L) + self.sample_time_belong = np.zeros((self.y.shape[0], self.time_perf.shape[1])) + for i, col in enumerate(self.time_perf.columns): + slc = slice(*self.y.index.slice_locs(start=col[0], end=col[1])) + self.sample_time_belong[slc, i] = 1.0 + # The last month also belongs to the last time_perf + self.sample_time_belong[self.sample_time_belong.sum(axis=1) != 1, -1] = 1.0 + self.test_idx = self.y_test.index + self.train_idx = self.y.index + self.X, self.y, self.time_perf, self.sample_time_belong, self.X_test, self.y_test = convert_data_to_tensor( + [self.X, self.y, self.time_perf, self.sample_time_belong, self.X_test, self.y_test] + ) + + def prepare_task_data(self): + return ( + self.X, + self.y, + self.time_perf, + self.sample_time_belong, + self.X_test, + self.y_test, + self.test_idx, + self.train_idx, + self.task_def["outsample"], + ) diff --git a/qlib/contrib/meta/data_selection/utils.py b/qlib/contrib/meta/data_selection/utils.py new file mode 100644 index 0000000000..3b29ac1c1b --- 
/dev/null +++ b/qlib/contrib/meta/data_selection/utils.py @@ -0,0 +1,125 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pandas as pd +import numpy as np +import torch +from torch import nn + + +def fill_diagnal(sim_mat): + sim_mat = sim_mat.copy() + # Remove the future information + sim_mat_past = sim_mat.where(sim_mat.index.values.reshape(-1, 1) > sim_mat.columns.values) + sim_mat.values[sim_mat.index.values.reshape(-1, 1) == sim_mat.columns.values] = sim_mat_past.max(axis=1) + sim_mat.iloc[0, 0] = 0.0 + return sim_mat + + +def get_sim_mat_idx(i_sim_mat, outsample_period): + for idx in range(len(i_sim_mat.index)): + if i_sim_mat.index[idx][0] == outsample_period[0]: + return idx + raise AssertionError("Not Found!") + + +def convert_data_to_tensor(data, device="cpu"): + if isinstance(data, torch.Tensor): + if device == "cpu": + return data.cpu() + else: + return data.to(device) + if isinstance(data, pd.DataFrame): + return convert_data_to_tensor(torch.from_numpy(data.values.squeeze()).float(), device) + elif isinstance(data, np.ndarray): + return convert_data_to_tensor(torch.from_numpy(data).float(), device) + elif isinstance(data, (tuple, list)): + return [convert_data_to_tensor(i, device) for i in data] + elif isinstance(data, dict): + return {k: convert_data_to_tensor(v, device) for k, v in data.items()} + else: + print("type:", type(data)) + raise ValueError("Unsupported data type.") + + +class ICLoss(nn.Module): + def forward(self, pred, y, idx): + """forward. 
+ + :param pred: + :param y: + :param idx: 这里假设 idx的level是(date, inst); 这里假设其一定排好序了 + """ + prev = None + diff_point = [] + for i, (date, inst) in enumerate(idx): + if date != prev: + diff_point.append(i) + prev = date + diff_point.append(None) + + ic_all = 0.0 + for start_i, end_i in zip(diff_point, diff_point[1:]): + pred_focus = pred[start_i:end_i] # TODO: just for fake + y_focus = y[start_i:end_i] + ic_day = torch.dot( + (pred_focus - pred_focus.mean()) / np.sqrt(pred_focus.shape[0]) / pred_focus.std(), + (y_focus - y_focus.mean()) / np.sqrt(y_focus.shape[0]) / y_focus.std(), + ) + ic_all += ic_day + ic_mean = ic_all / (len(diff_point) - 1) + return -ic_mean # ic loss + + +def preds_to_weight_with_clamp(preds, clip_weight=None, clip_method="tanh"): + """ + Clip the weights. + + Parameters + ---------- + clip_weight: float + The clip threshold. + clip_method: str + The clip method. Current available: "clamp", "tanh", and "sigmoid". + """ + if clip_weight is not None: + if clip_method == "clamp": + weights = torch.exp(preds) + weights = weights.clamp(1.0 / clip_weight, clip_weight) + elif clip_method == "tanh": + weights = torch.exp(torch.tanh(preds) * np.log(clip_weight)) + elif clip_method == "sigmoid": + # 这里的intuitively感觉是它保证和为1 + if clip_weight == 0.0: + weights = torch.ones_like(preds) + else: + sm = nn.Sigmoid() + weights = sm(preds) * clip_weight # TODO: The clip_weight is useless here. 
+ weights = weights / torch.sum(weights) * weights.numel() + else: + raise ValueError("Unknown clip_method") + else: + weights = torch.exp(preds) + return weights + + +class SingleMetaBase(nn.Module): + def __init__(self, hist_n, clip_weight=None, clip_method="clamp"): + # method 可以选 tanh 或者 clamp + super().__init__() + self.clip_weight = clip_weight + if clip_method in ["tanh", "clamp"]: + if self.clip_weight is not None and self.clip_weight < 1.0: + self.clip_weight = 1 / self.clip_weight + self.clip_method = clip_method + + def is_enabled(self): + if self.clip_weight is None: + return True + if self.clip_method == "sigmoid": + if self.clip_weight > 0.0: + return True + else: + if self.clip_weight > 1.0: + return True + return False diff --git a/qlib/contrib/model/catboost_model.py b/qlib/contrib/model/catboost_model.py index 5138e0e6f0..59f67d6156 100644 --- a/qlib/contrib/model/catboost_model.py +++ b/qlib/contrib/model/catboost_model.py @@ -11,6 +11,7 @@ from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP from ...model.interpret.base import FeatureInt +from ...data.dataset.weight import Reweighter class CatBoostModel(Model, FeatureInt): @@ -31,6 +32,7 @@ def fit( early_stopping_rounds=50, verbose_eval=20, evals_result=dict(), + reweighter=None, **kwargs ): df_train, df_valid = dataset.prepare( @@ -47,8 +49,17 @@ def fit( else: raise ValueError("CatBoost doesn't support multi-label training") - train_pool = Pool(data=x_train, label=y_train_1d) - valid_pool = Pool(data=x_valid, label=y_valid_1d) + if reweighter is None: + w_train = None + w_valid = None + elif isinstance(reweighter, Reweighter): + w_train = reweighter.reweight(df_train).values + w_valid = reweighter.reweight(df_valid).values + else: + raise ValueError("Unsupported reweighter type.") + + train_pool = Pool(data=x_train, label=y_train_1d, weight=w_train) + valid_pool = Pool(data=x_valid, label=y_valid_1d, weight=w_valid) # Initialize the catboost model 
self._params["iterations"] = num_boost_round diff --git a/qlib/contrib/model/gbdt.py b/qlib/contrib/model/gbdt.py index 1a7cf7fba3..fd8441c1ab 100644 --- a/qlib/contrib/model/gbdt.py +++ b/qlib/contrib/model/gbdt.py @@ -9,6 +9,7 @@ from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP from ...model.interpret.base import LightGBMFInt +from ...data.dataset.weight import Reweighter class LGBModel(ModelFT, LightGBMFInt): @@ -21,7 +22,7 @@ def __init__(self, loss="mse", **kwargs): self.params.update(kwargs) self.model = None - def _prepare_data(self, dataset: DatasetH): + def _prepare_data(self, dataset: DatasetH, reweighter=None): df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L ) @@ -34,8 +35,17 @@ def _prepare_data(self, dataset: DatasetH): else: raise ValueError("LightGBM doesn't support multi-label training") - dtrain = lgb.Dataset(x_train, label=y_train) - dvalid = lgb.Dataset(x_valid, label=y_valid) + if reweighter is None: + w_train = None + w_valid = None + elif isinstance(reweighter, Reweighter): + w_train = reweighter.reweight(df_train) + w_valid = reweighter.reweight(df_valid) + else: + raise ValueError("Unsupported reweighter type.") + + dtrain = lgb.Dataset(x_train.values, label=y_train, weight=w_train) + dvalid = lgb.Dataset(x_valid.values, label=y_valid, weight=w_valid) return dtrain, dvalid def fit( @@ -45,9 +55,10 @@ def fit( early_stopping_rounds=50, verbose_eval=20, evals_result=dict(), + reweighter=None, **kwargs ): - dtrain, dvalid = self._prepare_data(dataset) + dtrain, dvalid = self._prepare_data(dataset, reweighter) self.model = lgb.train( self.params, dtrain, @@ -68,7 +79,7 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): x_test = dataset.prepare(segment, col_set="feature", data_key=DataHandlerLP.DK_I) return pd.Series(self.model.predict(x_test.values), index=x_test.index) - def finetune(self, dataset: DatasetH, 
num_boost_round=10, verbose_eval=20): + def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20, reweighter=None): """ finetune model @@ -82,7 +93,7 @@ def finetune(self, dataset: DatasetH, num_boost_round=10, verbose_eval=20): verbose level """ # Based on existing model and finetune by train more rounds - dtrain, _ = self._prepare_data(dataset) + dtrain, _ = self._prepare_data(dataset, reweighter) self.model = lgb.train( self.params, dtrain, diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index f1aa8227cb..8f3a521a9f 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -22,6 +22,8 @@ from ...model.base import Model from ...data.dataset import DatasetH, TSDatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.utils import ConcatDataset +from ...data.dataset.weight import Reweighter class ALSTM(Model): @@ -139,15 +141,18 @@ def __init__( def use_gpu(self): return self.device != torch.device("cpu") - def mse(self, pred, label): - loss = (pred - label) ** 2 + def mse(self, pred, label, weight): + loss = weight * (pred - label) ** 2 return torch.mean(loss) - def loss_fn(self, pred, label): + def loss_fn(self, pred, label, weight=None): mask = ~torch.isnan(label) + if weight is None: + weight = torch.ones_like(label) + if self.loss == "mse": - return self.mse(pred[mask], label[mask]) + return self.mse(pred[mask], label[mask], weight[mask]) raise ValueError("unknown loss `%s`" % self.loss) @@ -164,12 +169,12 @@ def train_epoch(self, data_loader): self.ALSTM_model.train() - for data in data_loader: + for (data, weight) in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) pred = self.ALSTM_model(feature.float()) - loss = self.loss_fn(pred, label) + loss = self.loss_fn(pred, label, weight.to(self.device)) self.train_optimizer.zero_grad() loss.backward() @@ -183,7 +188,7 @@ def test_epoch(self, 
data_loader): scores = [] losses = [] - for data in data_loader: + for (data, weight) in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 @@ -191,7 +196,7 @@ def test_epoch(self, data_loader): with torch.no_grad(): pred = self.ALSTM_model(feature.float()) - loss = self.loss_fn(pred, label) + loss = self.loss_fn(pred, label, weight.to(self.device)) losses.append(loss.item()) score = self.metric_fn(pred, label) @@ -204,6 +209,7 @@ def fit( dataset, evals_result=dict(), save_path=None, + reweighter=None, ): dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) @@ -211,11 +217,28 @@ def fit( dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader + if reweighter is None: + wl_train = np.ones(len(dl_train)) + wl_valid = np.ones(len(dl_valid)) + elif isinstance(reweighter, Reweighter): + wl_train = reweighter.reweight(dl_train) + wl_valid = reweighter.reweight(dl_valid) + else: + raise ValueError("Unsupported reweighter type.") + train_loader = DataLoader( - dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ConcatDataset(dl_train, wl_train), + batch_size=self.batch_size, + shuffle=True, + num_workers=self.n_jobs, + drop_last=True, ) valid_loader = DataLoader( - dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ConcatDataset(dl_valid, wl_valid), + batch_size=self.batch_size, + shuffle=False, + num_workers=self.n_jobs, + drop_last=True, ) save_path = get_or_create_path(save_path) diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py index c094a3e3c5..2e6d0e4fae 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -21,6 +21,8 @@ from 
...model.base import Model from ...data.dataset import DatasetH, TSDatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.utils import ConcatDataset +from ...data.dataset.weight import Reweighter class GRU(Model): @@ -138,15 +140,18 @@ def __init__( def use_gpu(self): return self.device != torch.device("cpu") - def mse(self, pred, label): - loss = (pred - label) ** 2 + def mse(self, pred, label, weight): + loss = weight * (pred - label) ** 2 return torch.mean(loss) - def loss_fn(self, pred, label): + def loss_fn(self, pred, label, weight=None): mask = ~torch.isnan(label) + if weight is None: + weight = torch.ones_like(label) + if self.loss == "mse": - return self.mse(pred[mask], label[mask]) + return self.mse(pred[mask], label[mask], weight[mask]) raise ValueError("unknown loss `%s`" % self.loss) @@ -163,12 +168,12 @@ def train_epoch(self, data_loader): self.GRU_model.train() - for data in data_loader: + for (data, weight) in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) pred = self.GRU_model(feature.float()) - loss = self.loss_fn(pred, label) + loss = self.loss_fn(pred, label, weight.to(self.device)) self.train_optimizer.zero_grad() loss.backward() @@ -182,7 +187,7 @@ def test_epoch(self, data_loader): scores = [] losses = [] - for data in data_loader: + for (data, weight) in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 @@ -190,7 +195,7 @@ def test_epoch(self, data_loader): with torch.no_grad(): pred = self.GRU_model(feature.float()) - loss = self.loss_fn(pred, label) + loss = self.loss_fn(pred, label, weight.to(self.device)) losses.append(loss.item()) score = self.metric_fn(pred, label) @@ -203,6 +208,7 @@ def fit( dataset, evals_result=dict(), save_path=None, + reweighter=None, ): dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], 
data_key=DataHandlerLP.DK_L) @@ -210,11 +216,28 @@ def fit( dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader + if reweighter is None: + wl_train = np.ones(len(dl_train)) + wl_valid = np.ones(len(dl_valid)) + elif isinstance(reweighter, Reweighter): + wl_train = reweighter.reweight(dl_train) + wl_valid = reweighter.reweight(dl_valid) + else: + raise ValueError("Unsupported reweighter type.") + train_loader = DataLoader( - dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ConcatDataset(dl_train, wl_train), + batch_size=self.batch_size, + shuffle=True, + num_workers=self.n_jobs, + drop_last=True, ) valid_loader = DataLoader( - dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ConcatDataset(dl_valid, wl_valid), + batch_size=self.batch_size, + shuffle=False, + num_workers=self.n_jobs, + drop_last=True, ) save_path = get_or_create_path(save_path) diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index 1f97bd5b1a..043f88665f 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -20,6 +20,8 @@ from ...model.base import Model from ...data.dataset import DatasetH, TSDatasetH from ...data.dataset.handler import DataHandlerLP +from ...model.utils import ConcatDataset +from ...data.dataset.weight import Reweighter class LSTM(Model): @@ -134,15 +136,18 @@ def __init__( def use_gpu(self): return self.device != torch.device("cpu") - def mse(self, pred, label): - loss = (pred - label) ** 2 + def mse(self, pred, label, weight): + loss = weight * (pred - label) ** 2 return torch.mean(loss) def loss_fn(self, pred, label): mask = ~torch.isnan(label) + if weight is None: + weight = torch.ones_like(label) + if self.loss == "mse": - return self.mse(pred[mask], label[mask]) + return self.mse(pred[mask], 
label[mask], weight[mask]) raise ValueError("unknown loss `%s`" % self.loss) @@ -159,12 +164,12 @@ def train_epoch(self, data_loader): self.LSTM_model.train() - for data in data_loader: + for (data, weight) in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) pred = self.LSTM_model(feature.float()) - loss = self.loss_fn(pred, label) + loss = self.loss_fn(pred, label, weight.to(self.device)) self.train_optimizer.zero_grad() loss.backward() @@ -178,14 +183,14 @@ def test_epoch(self, data_loader): scores = [] losses = [] - for data in data_loader: + for (data, weight) in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) pred = self.LSTM_model(feature.float()) - loss = self.loss_fn(pred, label) + loss = self.loss_fn(pred, label, weight.to(self.device)) losses.append(loss.item()) score = self.metric_fn(pred, label) @@ -198,6 +203,7 @@ def fit( dataset, evals_result=dict(), save_path=None, + reweighter=None, ): dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) @@ -205,11 +211,28 @@ def fit( dl_train.config(fillna_type="ffill+bfill") # process nan brought by dataloader dl_valid.config(fillna_type="ffill+bfill") # process nan brought by dataloader + if reweighter is None: + wl_train = np.ones(len(dl_train)) + wl_valid = np.ones(len(dl_valid)) + elif isinstance(reweighter, Reweighter): + wl_train = reweighter.reweight(dl_train) + wl_valid = reweighter.reweight(dl_valid) + else: + raise ValueError("Unsupported reweighter type.") + train_loader = DataLoader( - dl_train, batch_size=self.batch_size, shuffle=True, num_workers=self.n_jobs, drop_last=True + ConcatDataset(dl_train, wl_train), + batch_size=self.batch_size, + shuffle=True, + num_workers=self.n_jobs, + drop_last=True, ) valid_loader = DataLoader( - 
dl_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.n_jobs, drop_last=True + ConcatDataset(dl_valid, wl_valid), + batch_size=self.batch_size, + shuffle=False, + num_workers=self.n_jobs, + drop_last=True, ) save_path = get_or_create_path(save_path) diff --git a/qlib/contrib/model/pytorch_nn.py b/qlib/contrib/model/pytorch_nn.py index 868ab15135..ab1405dec9 100644 --- a/qlib/contrib/model/pytorch_nn.py +++ b/qlib/contrib/model/pytorch_nn.py @@ -19,6 +19,7 @@ from ...model.base import Model from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from ...data.dataset.weight import Reweighter from ...utils import unpack_archive_with_buffer, save_multiple_parts_file, get_or_create_path from ...log import get_module_logger from ...workflow import R @@ -166,18 +167,22 @@ def fit( evals_result=dict(), verbose=True, save_path=None, + reweighter=None, ): df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L ) x_train, y_train = df_train["feature"], df_train["label"] x_valid, y_valid = df_valid["feature"], df_valid["label"] - try: - wdf_train, wdf_valid = dataset.prepare(["train", "valid"], col_set=["weight"], data_key=DataHandlerLP.DK_L) - w_train, w_valid = wdf_train["weight"], wdf_valid["weight"] - except KeyError as e: + + if reweighter is None: w_train = pd.DataFrame(np.ones_like(y_train.values), index=y_train.index) w_valid = pd.DataFrame(np.ones_like(y_valid.values), index=y_valid.index) + elif isinstance(reweighter, Reweighter): + w_train = pd.DataFrame(reweighter.reweight(df_train)) + w_valid = pd.DataFrame(reweighter.reweight(df_valid)) + else: + raise ValueError("Unsupported reweighter type.") save_path = get_or_create_path(save_path) stop_steps = 0 diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py index 300326143f..d38655ebdc 100755 --- a/qlib/contrib/model/xgboost.py +++ b/qlib/contrib/model/xgboost.py @@ -9,6 +9,7 @@ from 
...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP from ...model.interpret.base import FeatureInt +from ...data.dataset.weight import Reweighter class XGBModel(Model, FeatureInt): @@ -26,6 +27,7 @@ def fit( early_stopping_rounds=50, verbose_eval=20, evals_result=dict(), + reweighter=None, **kwargs ): @@ -43,8 +45,17 @@ def fit( else: raise ValueError("XGBoost doesn't support multi-label training") - dtrain = xgb.DMatrix(x_train, label=y_train_1d) - dvalid = xgb.DMatrix(x_valid, label=y_valid_1d) + if reweighter is None: + w_train = None + w_valid = None + elif isinstance(reweighter, Reweighter): + w_train = reweighter.reweight(df_train) + w_valid = reweighter.reweight(df_valid) + else: + raise ValueError("Unsupported reweighter type.") + + dtrain = xgb.DMatrix(x_train.values, label=y_train_1d, weight=w_train) + dvalid = xgb.DMatrix(x_valid.values, label=y_valid_1d, weight=w_valid) self.model = xgb.train( self._params, dtrain=dtrain, diff --git a/qlib/data/dataset/weight.py b/qlib/data/dataset/weight.py new file mode 100644 index 0000000000..0ae5c059a2 --- /dev/null +++ b/qlib/data/dataset/weight.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import pandas as pd +import numpy as np +from typing import Union, List, Tuple +from ...data.dataset import TSDataSampler +from ...data.dataset.utils import get_level_index +from ...utils import lazy_sort_index + + +class Reweighter: + def __init__(*args, **kwargs): + """ + To initialize the Reweighter, users should provide specific methods to let reweighter do the reweighting (such as sample-wise, rule-based). + """ + raise NotImplementedError() + + +class WeightSampler: + """ + (T)ime-(S)eries WeightSampler + This is the result of the function prepare_weight. + + It is aligned with the instance of TSDataSampler. 
+ """ + + def __init__(self, weights: pd.Series): + assert get_level_index(weights, "datetime") == 0 + self.weights_s = lazy_sort_index(weights) + + def __getitem__(self, idx: int): + return self.weights_s[idx] + + def __len__(self): + return len(self.weights_s) + + +class SampleReweighter(Reweighter): + """ + The sample-wise reweighter. It aims to reweight by the given weight of each sample. + """ + + def __init__(self, sample_weights: pd.Series, *args, **kwargs): + """ + + Parameters + ---------- + sample_weights : pd.Series + Determine the weight of each sample. + The index of the Series should be exactly the same with each sample's index. + """ + self.weights = sample_weights + + def _sample_reweight_DataFrame(self, samples: Union[pd.Series, pd.DataFrame], *args, **kwargs) -> pd.Series: + """ + This function processes the prepared data with pd.Series or pd.DataFrame type. + + Returns + ------- + pd.Series: + The weights of the prepared data. + """ + weight = pd.Series(data=1.0, index=samples.index, name="weight") + weight.update(self.weights) + return weight + + def _sample_reweight_TSDataSampler(self, sampler: TSDataSampler, *args, **kwargs): + """ + This function processes the prepared data with TSDataSampler type. + + Returns + ------- + WeightSampler: + The weight sampler of the prepared data. + """ + weight = pd.Series(1.0, index=sampler.get_index(), name="weight") + weight.update(self.weights) + return WeightSampler(weight) + + def reweight(self, prepared_data: Union[list, tuple, pd.DataFrame, pd.Series, WeightSampler]): + """ + Reweight the prepared data. + + Parameters + ---------- + prepared_data: Union[list, tuple, pd.DataFrame, pd.Series, WeightSampler] + The prepared data given by the DatasetH. 
+ + Returns + ------- + Union[list, pd.Series, WeightSampler]: + """ + # Handle all kinds of prepared data format + if isinstance(prepared_data, (list, tuple)): + return [self.reweight(data) for data in prepared_data] + elif isinstance(prepared_data, (pd.Series, pd.DataFrame)): + return self._sample_reweight_DataFrame(prepared_data) + elif isinstance(prepared_data, TSDataSampler): + return self._sample_reweight_TSDataSampler(prepared_data) + else: + raise NotImplementedError(f"This type of input is not supported") diff --git a/qlib/model/base.py b/qlib/model/base.py index 493981133c..7047b5f44c 100644 --- a/qlib/model/base.py +++ b/qlib/model/base.py @@ -4,6 +4,7 @@ from typing import Text, Union from ..utils.serial import Serializable from ..data.dataset import Dataset +from ..data.dataset.weight import Reweighter class BaseModel(Serializable, metaclass=abc.ABCMeta): @@ -22,7 +23,7 @@ def __call__(self, *args, **kwargs) -> object: class Model(BaseModel): """Learnable Models""" - def fit(self, dataset: Dataset): + def fit(self, dataset: Dataset, reweighter: Reweighter): """ Learn model from the base model diff --git a/qlib/model/meta/__init__.py b/qlib/model/meta/__init__.py new file mode 100644 index 0000000000..d8d8317d36 --- /dev/null +++ b/qlib/model/meta/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import warnings + + +from .task import MetaTask +from .dataset import MetaDataset, MetaDatasetH diff --git a/qlib/model/meta/dataset.py b/qlib/model/meta/dataset.py new file mode 100644 index 0000000000..68ba50deba --- /dev/null +++ b/qlib/model/meta/dataset.py @@ -0,0 +1,57 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import abc +from typing import Union, List, Tuple, Text +from ...workflow.task.gen import RollingGen, task_generator +from ...data.dataset.handler import DataHandler +from ...utils.serial import Serializable + + +class MetaDataset(Serializable, metaclass=abc.ABCMeta): + """ + A dataset fetching the data in a meta-level. + """ + + def __init__(self, *args, **kwargs): + """ + The meta-dataset maintains a list of meta-tasks when it is initialized. + """ + super().__init__(*args, **kwargs) + self.meta_tasks = [] + + @abc.abstractmethod + def prepare_tasks(self, segments: Union[List[Text], Tuple[Text], Text], *args, **kwargs): + """ + Prepare the data in each meta-task and ready for training. + + The following code example shows how to retrieve a list of meta-tasks from the `meta_dataset`: + + .. code-block:: Python + + # get the train segment and the test segment, both of them are lists + train_meta_tasks, test_meta_tasks = meta_dataset.prepare_tasks(["train", "test"]) + + Returns + ------- + list: + A list of the prepared data of each meta-task for training the meta-model. For multiple segments [seg1, seg2, ... , segN], the returned list will be [[tasks in seg1], [tasks in seg2], ... , [tasks in segN]]. + """ + pass + + +class MetaDatasetH(MetaDataset): + """ + MetaDataset with specified DataHandler. + """ + + def __init__(self, data_handler: DataHandler, *args, **kwargs): + """ + + Parameters + ---------- + data_handler: DataHandler + The shared DataHandler among meta-tasks. + """ + super().__init__(*args, **kwargs) + self.data_handler = data_handler diff --git a/qlib/model/meta/model.py b/qlib/model/meta/model.py new file mode 100644 index 0000000000..90345ee827 --- /dev/null +++ b/qlib/model/meta/model.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import abc +from typing import Union, List, Tuple +from .dataset import MetaDataset + + +class MetaModel(metaclass=abc.ABCMeta): + """ + The meta-model controls the training process. + """ + + @abc.abstractmethod + def fit(self, *args, **kwargs): + """ + The training process of the meta-model. + """ + pass + + @abc.abstractmethod + def inference(self, *args, **kwargs): + """ + The inference process of the meta-model. + """ + pass + + +class MetaTaskModel(MetaModel): + """ + This type of meta-model deals with base task definitions. The meta-model creates tasks for training new base forecasting models after it is trained. `prepare_tasks` directly modifies the task definitions. + """ + + @abc.abstractmethod + def prepare_tasks(self, tasks: List[dict]): + """ + The meta-model modifies the tasks. The function will return the modified task list. + + Parameters + ---------- + tasks: List[dict] + A List of task definitions for the meta-model to modify. + """ + pass + + +class MetaGuideModel(MetaModel): + """ + This type of meta-model aims to guide the training process of the base model. The meta-model interacts with the base forecasting models during their training process. + """ + + @abc.abstractmethod + def fit(self, *args, **kwargs): + pass + + @abc.abstractmethod + def inference(self, *args, **kwargs): + pass diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py new file mode 100644 index 0000000000..078b73811b --- /dev/null +++ b/qlib/model/meta/task.py @@ -0,0 +1,42 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import abc +from typing import Union, List, Tuple +from ...data.dataset import DatasetH, TSDatasetH +from ...utils import init_instance_by_config + + +class MetaTask(metaclass=abc.ABCMeta): + """ + A single meta-task, a meta-dataset contains a list of them. 
+ """ + + def __init__(self, dataset_dict: dict, *args, **kwargs): + """ + + Parameters + ---------- + dataset_dict: dict + The dataset definition for this meta-task instance. + """ + self.dataset_dict = dataset_dict + self.dataset = init_instance_by_config(self.dataset_dict) + + def get_dataset(self) -> Union[DatasetH, TSDatasetH]: + """ + Get the dataset instance defined in the meta-task. + + Returns + ------- + Union[DatasetH, TSDatasetH]: + The instance of the dataset definition. + """ + return self.dataset + + @abc.abstractmethod + def prepare_task_data(self): + """ + Prepare the data for training the meta-model. + """ + pass diff --git a/qlib/model/utils.py b/qlib/model/utils.py new file mode 100644 index 0000000000..590b0aea3e --- /dev/null +++ b/qlib/model/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from torch.utils.data import Dataset + + +class ConcatDataset(Dataset): + def __init__(self, *datasets): + self.datasets = datasets + + def __getitem__(self, i): + return tuple(d[i] for d in self.datasets) + + def __len__(self): + return min(len(d) for d in self.datasets) From 5bb06cdf66113ab72a71c9ecdf7be016281f329a Mon Sep 17 00:00:00 2001 From: Wendi Li Date: Thu, 1 Jul 2021 16:00:12 +0000 Subject: [PATCH 02/37] Update trainer for reweighter --- qlib/model/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index a534a7a3b4..783bf84345 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -23,6 +23,7 @@ from qlib.workflow.record_temp import SignalRecord from qlib.workflow.recorder import Recorder from qlib.workflow.task.manage import TaskManager, run_task +from qlib.data.dataset.weight import Reweighter def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder: @@ -61,8 +62,9 @@ def end_task_train(rec: Recorder, experiment_name: str) -> Recorder: # model & dataset 
initiation model: Model = init_instance_by_config(task_config["model"]) dataset: Dataset = init_instance_by_config(task_config["dataset"]) + reweighter: Reweighter = task_config.get("reweighter", None) # model training - model.fit(dataset) + model.fit(dataset, reweighter=reweighter) R.save_objects(**{"params.pkl": model}) # this dataset is saved for online inference. So the concrete data should not be dumped dataset.config(dump_all=False, recursive=True) From 4f442f58742ed9d31265ba666a8c75bf33778c51 Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Thu, 8 Jul 2021 20:34:17 +0800 Subject: [PATCH 03/37] Typos fixed. --- qlib/contrib/meta/data_selection/utils.py | 6 +++--- qlib/data/dataset/weight.py | 2 +- qlib/model/meta/__init__.py | 3 --- qlib/model/meta/model.py | 9 +++++++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/qlib/contrib/meta/data_selection/utils.py b/qlib/contrib/meta/data_selection/utils.py index 3b29ac1c1b..c6328f282d 100644 --- a/qlib/contrib/meta/data_selection/utils.py +++ b/qlib/contrib/meta/data_selection/utils.py @@ -48,7 +48,7 @@ def forward(self, pred, y, idx): :param pred: :param y: - :param idx: 这里假设 idx的level是(date, inst); 这里假设其一定排好序了 + :param idx: Assume the level of the idx is (date, inst), and it is sorted """ prev = None diff_point = [] @@ -89,7 +89,7 @@ def preds_to_weight_with_clamp(preds, clip_weight=None, clip_method="tanh"): elif clip_method == "tanh": weights = torch.exp(torch.tanh(preds) * np.log(clip_weight)) elif clip_method == "sigmoid": - # 这里的intuitively感觉是它保证和为1 + # intuitively assume its sum is 1 if clip_weight == 0.0: weights = torch.ones_like(preds) else: @@ -105,7 +105,7 @@ def preds_to_weight_with_clamp(preds, clip_weight=None, clip_method="tanh"): class SingleMetaBase(nn.Module): def __init__(self, hist_n, clip_weight=None, clip_method="clamp"): - # method 可以选 tanh 或者 clamp + # method can be tanh or clamp super().__init__() self.clip_weight = clip_weight if clip_method in ["tanh", "clamp"]: diff 
--git a/qlib/data/dataset/weight.py b/qlib/data/dataset/weight.py index 0ae5c059a2..2d5c955ca0 100644 --- a/qlib/data/dataset/weight.py +++ b/qlib/data/dataset/weight.py @@ -10,7 +10,7 @@ class Reweighter: - def __init__(*args, **kwargs): + def __init__(self, *args, **kwargs): """ To initialize the Reweighter, users should provide specific methods to let reweighter do the reweighting (such as sample-wise, rule-based). """ diff --git a/qlib/model/meta/__init__.py b/qlib/model/meta/__init__.py index d8d8317d36..2facbfd656 100644 --- a/qlib/model/meta/__init__.py +++ b/qlib/model/meta/__init__.py @@ -1,8 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import warnings - - from .task import MetaTask from .dataset import MetaDataset, MetaDatasetH diff --git a/qlib/model/meta/model.py b/qlib/model/meta/model.py index 90345ee827..5f8d134117 100644 --- a/qlib/model/meta/model.py +++ b/qlib/model/meta/model.py @@ -32,14 +32,19 @@ class MetaTaskModel(MetaModel): """ @abc.abstractmethod - def prepare_tasks(self, tasks: List[dict]): + def prepare_tasks(self, tasks: List[dict]) -> List[dict]: """ The meta-model modifies the tasks. The function will return the modified task list. Parameters ---------- tasks: List[dict] - A List of task definitions for the meta-model to modify. + A list of task definitions for the meta-model to modify. + + Returns + ------- + List[dict]: + A list of modified task definitions. 
""" pass From 81b4383a6b838b25ffb8ecb589351cfd384fee25 Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 9 Aug 2021 16:07:33 +0000 Subject: [PATCH 04/37] update data selection interface --- qlib/contrib/meta/data_selection/dataset.py | 11 -- qlib/contrib/meta/data_selection/model.py | 80 ++++----- qlib/contrib/meta/data_selection/net.py | 19 +- qlib/contrib/meta/data_selection/task.py | 5 +- qlib/contrib/meta/data_selection/utils.py | 20 +-- qlib/contrib/torch.py | 31 ++++ qlib/data/dataset/__init__.py | 24 ++- qlib/data/dataset/loader.py | 27 ++- qlib/data/dataset/processor.py | 18 +- qlib/model/meta/dataset.py | 45 ++++- qlib/model/meta/task.py | 40 ++--- qlib/model/trainer.py | 4 +- qlib/utils/__init__.py | 184 ++++---------------- qlib/utils/data.py | 23 +++ qlib/utils/file.py | 159 +++++++++++++++++ qlib/utils/serial.py | 95 +++++++--- qlib/workflow/__init__.py | 10 +- qlib/workflow/exp.py | 3 +- qlib/workflow/task/gen.py | 43 ++++- qlib/workflow/task/manage.py | 8 + qlib/workflow/task/utils.py | 26 ++- 21 files changed, 562 insertions(+), 313 deletions(-) create mode 100644 qlib/contrib/torch.py create mode 100644 qlib/utils/data.py create mode 100644 qlib/utils/file.py diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index da117f1506..cd58bf9d8b 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -204,17 +204,6 @@ def _prepare_seg(self, segment: str, *args, **kwargs): meta_tasks.append(self.meta_tasks[index]) return meta_tasks - def prepare_tasks(self, segments: Union[List[Text], Tuple[Text], Text], *args, **kwargs) -> List[tuple]: - """ - Prepare the meta-tasks. 
- """ - if isinstance(segments, (list, tuple)): - return [self._prepare_seg(seg) for seg in segments] - elif isinstance(segments, str): - return self._prepare_seg(segments) - else: - raise NotImplementedError(f"This type of input is not supported") - def get_test_period_from_meta_tasks(self): return [task["kwargs"]["segments"]["test"] for task in self.meta_tasks_l] diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index ea4ab0f062..2f3145c5f9 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -16,9 +16,9 @@ from ....model.meta.model import MetaModel, MetaTaskModel from ....workflow import R -from .utils import fill_diagnal, convert_data_to_tensor, ICLoss +from .utils import fill_diagnal, ICLoss from .dataset import MetaDatasetHDS -from .net import PredNet +from qlib.contrib.meta.data_selection.net import PredNet class MetaModelDS(MetaTaskModel): @@ -28,14 +28,16 @@ class MetaModelDS(MetaTaskModel): def __init__( self, - hist_n=30, + step, + hist_step_n, clip_method="tanh", clip_weight=2.0, criterion="ic_loss", lr=0.0001, max_epoch=150, ): - self.hist_n = hist_n + self.step = step + self.hist_step_n = hist_step_n self.clip_method = clip_method self.clip_weight = clip_weight self.criterion = criterion @@ -52,22 +54,25 @@ def fit(self, meta_dataset: MetaDatasetHDS): meta_dataset : MetaDatasetHDS The meta-model takes the meta-dataset for its training process. 
""" - recorder = R.get_recorder() + if not self.fitted: - for k in set(["lr", "hist_n", "clip_method", "clip_weight", "criterion", "max_epoch"]): - recorder.log_params(**{k: getattr(self, k)}) + for k in set(["lr", "step", "hist_step_n", "clip_method", "clip_weight", "criterion", "max_epoch"]): + R.log_params(**{k: getattr(self, k)}) + + # FIXME: get test tasks for just checking the performance + phases = ["train", "test"] + meta_tasks_l = meta_dataset.prepare_tasks(phases) + + self.tn = PredNet( + step=self.step, hist_step_n=self.hist_step_n, clip_weight=self.clip_weight, clip_method=self.clip_method + ) - # Training begins - meta_tasks = meta_dataset.prepare_tasks(["train", "test"]) - num2phase = {0: "train", 1: "test"} - phase2num = dict(zip(num2phase.values(), num2phase.keys())) train_step = 0 - self.tn = PredNet(hist_n=self.hist_n, clip_weight=self.clip_weight, clip_method=self.clip_method) opt = optim.Adam(self.tn.parameters(), lr=self.lr) loss_l = {} for epoch in tqdm(range(self.max_epoch), desc="epoch"): - for phase, task_list in enumerate(meta_tasks): - if phase == phase2num["train"]: # phase 0 for training, 1 for inference + for phase, task_list in zip(phases, meta_tasks_l): + if phase == "train": # phase 0 for training, 1 for inference self.tn.train() torch.set_grad_enabled(True) else: @@ -75,43 +80,38 @@ def fit(self, meta_dataset: MetaDatasetHDS): torch.set_grad_enabled(False) running_loss = 0.0 pred_y_all = [] - for task in tqdm(task_list, desc=f"{num2phase[phase]} Task", leave=False): - ( - X, - y, - time_perf, - time_belong, - X_test, - y_test, - test_idx, - train_idx, - test_period, - ) = task.prepare_task_data() - pred, weights = self.tn(X, y, time_perf, time_belong, X_test) + for task in tqdm(task_list, desc=f"{phase} Task", leave=False): + meta_input = task.get_meta_input() + pred, weights = self.tn( + meta_input["X"], + meta_input["y"], + meta_input["time_perf"], + meta_input["time_belong"], + meta_input["X_test"], + ) if self.criterion == 
"mse": criterion = nn.MSELoss() - loss = criterion(pred, y_test) + loss = criterion(pred, meta_input["y_test"]) elif self.criterion == "ic_loss": criterion = ICLoss() - loss = criterion(pred, y_test, test_idx) + loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"]) - if phase == phase2num["train"]: + if phase == "train": opt.zero_grad() norm_loss = nn.MSELoss() loss.backward() opt.step() train_step += 1 - elif phase == phase2num["test"]: - pass # pass, leave the work for the inference function - # self.reweighters[test_period] = SampleReweighter( - # pd.Series(weights.detach().cpu().numpy(), index=train_idx) - # ) + elif phase == "test": + pass pred_y_all.append( pd.DataFrame( { - "pred": pd.Series(pred.detach().cpu().numpy(), index=test_idx), - "label": pd.Series(y_test.detach().cpu().numpy(), index=test_idx), + "pred": pd.Series(pred.detach().cpu().numpy(), index=meta_input["test_idx"]), + "label": pd.Series( + meta_input["y_test"].detach().cpu().numpy(), index=meta_input["test_idx"] + ), } ) ) @@ -126,9 +126,9 @@ def fit(self, meta_dataset: MetaDatasetHDS): .mean() ) - recorder.log_metrics(**{f"loss/{num2phase[phase]}": running_loss, "step": epoch}) - recorder.log_metrics(**{f"ic/{num2phase[phase]}": ic, "step": epoch}) - recorder.save_objects(**{"model.pkl": self.tn}) + R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch}) + R.log_metrics(**{f"ic/{phase}": ic, "step": epoch}) + R.save_objects(**{"model.pkl": self.tn}) self.fitted = True def _inference_single_task(self, meta_id: tuple, meta_dataset: MetaDatasetHDS): diff --git a/qlib/contrib/meta/data_selection/net.py b/qlib/contrib/meta/data_selection/net.py index ba78a2ce4c..84a83c9efc 100644 --- a/qlib/contrib/meta/data_selection/net.py +++ b/qlib/contrib/meta/data_selection/net.py @@ -10,10 +10,10 @@ class TimeWeightMeta(SingleMetaBase): - def __init__(self, hist_n, clip_weight=None, clip_method="clamp"): + def __init__(self, hist_step_n, clip_weight=None, clip_method="clamp"): # 
method 可以选 tanh 或者 clamp - super().__init__(hist_n, clip_weight, clip_method) - self.linear = nn.Linear(hist_n, 1) + super().__init__(hist_step_n, clip_weight, clip_method) + self.linear = nn.Linear(hist_step_n, 1) self.k = nn.Parameter(torch.Tensor([8.0])) def forward(self, time_perf, time_belong, return_preds=False): @@ -34,10 +34,11 @@ def forward(self, time_perf, time_belong, return_preds=False): class PredNet(nn.Module): - def __init__(self, hist_n, clip_weight=None, clip_method="tanh"): + def __init__(self, step, hist_step_n, clip_weight=None, clip_method="tanh"): super().__init__() - self.twm = TimeWeightMeta(hist_n=hist_n, clip_weight=clip_weight, clip_method=clip_method) - self.init_paramters(hist_n) + self.step = step + self.twm = TimeWeightMeta(hist_step_n=hist_step_n, clip_weight=clip_weight, clip_method=clip_method) + self.init_paramters(hist_step_n) def get_sample_weights(self, X, time_perf, time_belong, ignore_weight=False): weights = torch.from_numpy(np.ones(X.shape[0])).float().to(X.device) @@ -48,11 +49,13 @@ def get_sample_weights(self, X, time_perf, time_belong, ignore_weight=False): return weights def forward(self, X, y, time_perf, time_belong, X_test, ignore_weight=False): + time_perf = time_perf.reshape(self.step, time_perf.shape[0] // self.step, *time_perf.shape[1:]) + time_perf = torch.mean(time_perf, dim=0, keepdim=False) weights = self.get_sample_weights(X, time_perf, time_belong, ignore_weight=ignore_weight) X_w = X.T * weights.view(1, -1) theta = torch.inverse(X_w @ X) @ X_w @ y return X_test @ theta, weights - def init_paramters(self, hist_n): - self.twm.linear.weight.data = 1.0 / hist_n + self.twm.linear.weight.data * 0.01 + def init_paramters(self, hist_step_n): + self.twm.linear.weight.data = 1.0 / hist_step_n + self.twm.linear.weight.data * 0.01 self.twm.linear.bias.data.fill_(0.0) diff --git a/qlib/contrib/meta/data_selection/task.py b/qlib/contrib/meta/data_selection/task.py index 37b35adad7..6cff4d1d72 100644 --- 
a/qlib/contrib/meta/data_selection/task.py +++ b/qlib/contrib/meta/data_selection/task.py @@ -7,7 +7,8 @@ from ....model.meta.task import MetaTask from ....data.dataset.handler import DataHandlerLP -from .utils import fill_diagnal, convert_data_to_tensor +from qlib.contrib.torch import data_to_tensor +from .utils import fill_diagnal class MetaTaskDS(MetaTask): @@ -32,7 +33,7 @@ def _prepare_meta_task(self): self.sample_time_belong[self.sample_time_belong.sum(axis=1) != 1, -1] = 1.0 self.test_idx = self.y_test.index self.train_idx = self.y.index - self.X, self.y, self.time_perf, self.sample_time_belong, self.X_test, self.y_test = convert_data_to_tensor( + self.X, self.y, self.time_perf, self.sample_time_belong, self.X_test, self.y_test = data_to_tensor( [self.X, self.y, self.time_perf, self.sample_time_belong, self.X_test, self.y_test] ) diff --git a/qlib/contrib/meta/data_selection/utils.py b/qlib/contrib/meta/data_selection/utils.py index c6328f282d..7fc42bab50 100644 --- a/qlib/contrib/meta/data_selection/utils.py +++ b/qlib/contrib/meta/data_selection/utils.py @@ -5,6 +5,7 @@ import numpy as np import torch from torch import nn +from qlib.contrib.torch import data_to_tensor def fill_diagnal(sim_mat): @@ -23,25 +24,6 @@ def get_sim_mat_idx(i_sim_mat, outsample_period): raise AssertionError("Not Found!") -def convert_data_to_tensor(data, device="cpu"): - if isinstance(data, torch.Tensor): - if device == "cpu": - return data.cpu() - else: - return data.to(device) - if isinstance(data, pd.DataFrame): - return convert_data_to_tensor(torch.from_numpy(data.values.squeeze()).float(), device) - elif isinstance(data, np.ndarray): - return convert_data_to_tensor(torch.from_numpy(data).float(), device) - elif isinstance(data, (tuple, list)): - return [convert_data_to_tensor(i, device) for i in data] - elif isinstance(data, dict): - return {k: convert_data_to_tensor(v, device) for k, v in data.items()} - else: - print("type:", type(data)) - raise ValueError("Unsupported data 
type.") - - class ICLoss(nn.Module): def forward(self, pred, y, idx): """forward. diff --git a/qlib/contrib/torch.py b/qlib/contrib/torch.py new file mode 100644 index 0000000000..91deb0b7a8 --- /dev/null +++ b/qlib/contrib/torch.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +""" + This module is not a necessary part of Qlib. + It is just some tools for convenience + It is should not imported into the core part of qlib +""" +import torch +import numpy as np +import pandas as pd + + +def data_to_tensor(data, device="cpu", raise_error=False): + if isinstance(data, torch.Tensor): + if device == "cpu": + return data.cpu() + else: + return data.to(device) + if isinstance(data, (pd.DataFrame, pd.Series)): + return data_to_tensor(torch.from_numpy(data.values).float(), device) + elif isinstance(data, np.ndarray): + return data_to_tensor(torch.from_numpy(data).float(), device) + elif isinstance(data, (tuple, list)): + return [data_to_tensor(i, device) for i in data] + elif isinstance(data, dict): + return {k: data_to_tensor(v, device) for k, v in data.items()} + else: + if raise_error: + raise ValueError(f"Unsupported data type: {type(data)}.") + else: + return data diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index 2acaa77fed..377a28efe5 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -1,5 +1,5 @@ from ...utils.serial import Serializable -from typing import Union, List, Tuple, Dict, Text, Optional +from typing import Callable, Union, List, Tuple, Dict, Text, Optional from ...utils import init_instance_by_config, np_ffill, time_to_slc_point from ...log import get_module_logger from .handler import DataHandler, DataHandlerLP @@ -235,6 +235,28 @@ def prepare( else: raise NotImplementedError(f"This type of input is not supported") + # helper functions + @staticmethod + def get_min_time(segments): + return DatasetH._get_extrema(segments, 0, (lambda a, b: a > b)) + + 
@staticmethod + def get_max_time(segments): + return DatasetH._get_extrema(segments, 1, (lambda a, b: a < b)) + + @staticmethod + def _get_extrema(segments, idx: int, cmp: Callable, key_func=pd.Timestamp): + """it will act like sort and return the max value or None""" + candidate = None + for k, seg in segments.items(): + point = seg[idx] + if point is None: + # None indicates unbounded, return directly + return None + elif candidate is None or cmp(key_func(candidate), key_func(point)): + candidate = point + return candidate + class TSDataSampler: """ diff --git a/qlib/data/dataset/loader.py b/qlib/data/dataset/loader.py index 54c00dc7ee..f8ace8f100 100644 --- a/qlib/data/dataset/loader.py +++ b/qlib/data/dataset/loader.py @@ -3,6 +3,8 @@ import os import abc +import pickle +from pathlib import Path import warnings import numpy as np import pandas as pd @@ -14,6 +16,7 @@ from qlib.data.filter import BaseDFilter from qlib.utils import load_dataset, init_instance_by_config, time_to_slc_point from qlib.log import get_module_logger +from qlib.utils.serial import Serializable class DataLoader(abc.ABC): @@ -181,12 +184,14 @@ def load_group_df(self, instruments, exprs: list, names: list, start_time=None, return df -class StaticDataLoader(DataLoader): +class StaticDataLoader(DataLoader, Serializable): """ DataLoader that supports loading data from file or as provided. 
""" - def __init__(self, config: dict, join="outer"): + include_attr = ["_config"] + + def __init__(self, config: Union[dict, str], join="outer"): """ Parameters ---------- @@ -195,7 +200,7 @@ def __init__(self, config: dict, join="outer"): join : str How to align different dataframes """ - self.config = config + self._config = config # using "_" to avoid confliction with the method `config` of Serializable self.join = join self._data = None @@ -215,12 +220,16 @@ def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame def _maybe_load_raw_data(self): if self._data is not None: return - self._data = pd.concat( - {fields_group: load_dataset(path_or_obj) for fields_group, path_or_obj in self.config.items()}, - axis=1, - join=self.join, - ) - self._data.sort_index(inplace=True) + if isinstance(self._config, dict): + self._data = pd.concat( + {fields_group: load_dataset(path_or_obj) for fields_group, path_or_obj in self._config.items()}, + axis=1, + join=self.join, + ) + self._data.sort_index(inplace=True) + elif isinstance(self._config, (str, Path)): + with Path(self._config).open("rb") as f: + self._data = pickle.load(f) class DataLoaderDH(DataLoader): diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index fce22ddfcf..bb89eca84e 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -7,6 +7,8 @@ import pandas as pd import copy +from qlib.utils.data import robust_zscore + from ...log import TimeInspector from .utils import fetch_df_by_index from ...utils.serial import Serializable @@ -273,14 +275,22 @@ def __call__(self, df): class CSZScoreNorm(Processor): """Cross Sectional ZScore Normalization""" - def __init__(self, fields_group=None): + def __init__(self, fields_group=None, method="zscore"): self.fields_group = fields_group + if method == "zscore": + self.zscore_func = lambda x: (x - x.mean()).div(x.std()) + elif method == "robust": + self.zscore_func = robust_zscore + else: + raise 
NotImplementedError(f"This type of input is not supported") def __call__(self, df): # try not modify original dataframe - cols = get_group_columns(df, self.fields_group) - df[cols] = df[cols].groupby("datetime").apply(lambda x: (x - x.mean()).div(x.std())) - + if not isinstance(self.fields_group, list): + self.fields_group = [self.fields_group] + for g in self.fields_group: + cols = get_group_columns(df, g) + df[cols] = df[cols].groupby("datetime").apply(self.zscore_func) return df diff --git a/qlib/model/meta/dataset.py b/qlib/model/meta/dataset.py index 68ba50deba..743626fdaa 100644 --- a/qlib/model/meta/dataset.py +++ b/qlib/model/meta/dataset.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. import abc -from typing import Union, List, Tuple, Text +from typing import Dict, Union, List, Tuple, Text from ...workflow.task.gen import RollingGen, task_generator from ...data.dataset.handler import DataHandler from ...utils.serial import Serializable @@ -11,17 +11,29 @@ class MetaDataset(Serializable, metaclass=abc.ABCMeta): """ A dataset fetching the data in a meta-level. + + A Meta Dataset is responsible for + - input a specific task and prepare input data (based a given task) for meta model + - prepare underlayer data: + + The learnt pattern could transfer to other meta dataset. The following cases should be supported + - A meta-model trained on meta-dataset A and then applied to meta-dataset B + - Some pattern are shared between meta-dataset A and B, so meta-input on meta-dataset A are used when meta model are applied on meta-dataset-B """ - def __init__(self, *args, **kwargs): + def __init__(self, segments: Union[Dict[Text, Tuple], float], *args, **kwargs): """ The meta-dataset maintains a list of meta-tasks when it is initialized. 
+ + The segments indicates the way to divide the data + + The duty of the `__init__` function of MetaDataset + - initialize the tasks """ super().__init__(*args, **kwargs) - self.meta_tasks = [] + self.segments = segments - @abc.abstractmethod - def prepare_tasks(self, segments: Union[List[Text], Tuple[Text], Text], *args, **kwargs): + def prepare_tasks(self, segments: Union[List[Text], Text], *args, **kwargs) -> List: """ Prepare the data in each meta-task and ready for training. @@ -32,10 +44,33 @@ def prepare_tasks(self, segments: Union[List[Text], Tuple[Text], Text], *args, * # get the train segment and the test segment, both of them are lists train_meta_tasks, test_meta_tasks = meta_dataset.prepare_tasks(["train", "test"]) + Parameters + ---------- + segments: Union[List[Text], Tuple[Text], Text] + the info to select data + Returns ------- list: A list of the prepared data of each meta-task for training the meta-model. For multiple segments [seg1, seg2, ... , segN], the returned list will be [[tasks in seg1], [tasks in seg2], ... , [tasks in segN]]. 
+ Each task is a meta task + """ + if isinstance(segments, (list, tuple)): + return [self._prepare_seg(seg) for seg in segments] + elif isinstance(segments, str): + return self._prepare_seg(segments) + else: + raise NotImplementedError(f"This type of input is not supported") + + @abc.abstractmethod + def _prepare_seg(self, segment: Text): + """ + prepare a single segment of data for training data + + Parameters + ---------- + seg : Text + the name of the segment """ pass diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py index 078b73811b..bcc6694ad2 100644 --- a/qlib/model/meta/task.py +++ b/qlib/model/meta/task.py @@ -3,40 +3,40 @@ import abc from typing import Union, List, Tuple -from ...data.dataset import DatasetH, TSDatasetH + +from qlib.data.dataset import Dataset from ...utils import init_instance_by_config -class MetaTask(metaclass=abc.ABCMeta): +class MetaTask: """ A single meta-task, a meta-dataset contains a list of them. + It is designed for Mea """ - def __init__(self, dataset_dict: dict, *args, **kwargs): + def __init__(self, task: dict, meta_info: object): """ + the `__init__` func is responsible for + - store the task + - store the origin input data for + - process the input data for meta data Parameters ---------- - dataset_dict: dict - The dataset definition for this meta-task instance. - """ - self.dataset_dict = dataset_dict - self.dataset = init_instance_by_config(self.dataset_dict) + task : dict + the task to be enhanced by meta model - def get_dataset(self) -> Union[DatasetH, TSDatasetH]: + meta_info : object + the input for meta model """ - Get the dataset instance defined in the meta-task. + self.task = task + self.meta_info = meta_info # the original meta input information, it will be processed later - Returns - ------- - Union[DatasetH, TSDatasetH]: - The instance of the dataset definition. 
- """ - return self.dataset + def get_dataset(self) -> Dataset: + return init_instance_by_config(self.task["dataset"], accept_types=Dataset) - @abc.abstractmethod - def prepare_task_data(self): + def get_meta_input(elf) -> object: """ - Prepare the data for training the meta-model. + Return the **processed** meta_info """ - pass + return self.meta_info diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index dd3b6e1042..9a7ce51817 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -17,7 +17,7 @@ from qlib.data.dataset import Dataset from qlib.log import get_module_logger from qlib.model.base import Model -from qlib.utils import flatten_dict, get_cls_kwargs, init_instance_by_config +from qlib.utils import flatten_dict, get_cls_kwargs, init_instance_by_config, auto_filter_kwargs from qlib.workflow import R from qlib.workflow.record_temp import SignalRecord from qlib.workflow.recorder import Recorder @@ -63,7 +63,7 @@ def end_task_train(rec: Recorder, experiment_name: str) -> Recorder: dataset: Dataset = init_instance_by_config(task_config["dataset"]) reweighter: Reweighter = task_config.get("reweighter", None) # model training - model.fit(dataset, reweighter=reweighter) + auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter) R.save_objects(**{"params.pkl": model}) # this dataset is saved for online inference. 
So the concrete data should not be dumped dataset.config(dump_all=False, recursive=True) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 2fe9eafed2..fac5148907 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -16,6 +16,7 @@ import bisect import shutil import difflib +import inspect import hashlib import datetime import requests @@ -26,10 +27,10 @@ import numpy as np import pandas as pd from pathlib import Path -from typing import Union, Tuple, Any, Text, Optional +from typing import Callable, Union, Tuple, Any, Text, Optional from types import ModuleType from urllib.parse import urlparse - +from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer from ..config import C from ..log import get_module_logger, set_log_with_config @@ -299,153 +300,6 @@ def default(self, o): return changes -def get_or_create_path(path: Optional[Text] = None, return_dir: bool = False): - """Create or get a file or directory given the path and return_dir. - - Parameters - ---------- - path: a string indicates the path or None indicates creating a temporary path. - return_dir: if True, create and return a directory; otherwise c&r a file. - - """ - if path: - if return_dir and not os.path.exists(path): - os.makedirs(path) - elif not return_dir: # return a file, thus we need to create its parent directory - xpath = os.path.abspath(os.path.join(path, "..")) - if not os.path.exists(xpath): - os.makedirs(xpath) - else: - temp_dir = os.path.expanduser("~/tmp") - if not os.path.exists(temp_dir): - os.makedirs(temp_dir) - if return_dir: - _, path = tempfile.mkdtemp(dir=temp_dir) - else: - _, path = tempfile.mkstemp(dir=temp_dir) - return path - - -@contextlib.contextmanager -def save_multiple_parts_file(filename, format="gztar"): - """Save multiple parts file - - Implementation process: - 1. get the absolute path to 'filename' - 2. create a 'filename' directory - 3. 
user does something with file_path('filename/') - 4. remove 'filename' directory - 5. make_archive 'filename' directory, and rename 'archive file' to filename - - :param filename: result model path - :param format: archive format: one of "zip", "tar", "gztar", "bztar", or "xztar" - :return: real model path - - Usage:: - - >>> # The following code will create an archive file('~/tmp/test_file') containing 'test_doc_i'(i is 0-10) files. - >>> with save_multiple_parts_file('~/tmp/test_file') as filename_dir: - ... for i in range(10): - ... temp_path = os.path.join(filename_dir, 'test_doc_{}'.format(str(i))) - ... with open(temp_path) as fp: - ... fp.write(str(i)) - ... - - """ - - if filename.startswith("~"): - filename = os.path.expanduser(filename) - - file_path = os.path.abspath(filename) - - # Create model dir - if os.path.exists(file_path): - raise FileExistsError("ERROR: file exists: {}, cannot be create the directory.".format(file_path)) - - os.makedirs(file_path) - - # return model dir - yield file_path - - # filename dir to filename.tar.gz file - tar_file = shutil.make_archive(file_path, format=format, root_dir=file_path) - - # Remove filename dir - if os.path.exists(file_path): - shutil.rmtree(file_path) - - # filename.tar.gz rename to filename - os.rename(tar_file, file_path) - - -@contextlib.contextmanager -def unpack_archive_with_buffer(buffer, format="gztar"): - """Unpack archive with archive buffer - After the call is finished, the archive file and directory will be deleted. - - Implementation process: - 1. create 'tempfile' in '~/tmp/' and directory - 2. 'buffer' write to 'tempfile' - 3. unpack archive file('tempfile') - 4. user does something with file_path('tempfile/') - 5. 
remove 'tempfile' and 'tempfile directory' - - :param buffer: bytes - :param format: archive format: one of "zip", "tar", "gztar", "bztar", or "xztar" - :return: unpack archive directory path - - Usage:: - - >>> # The following code is to print all the file names in 'test_unpack.tar.gz' - >>> with open('test_unpack.tar.gz') as fp: - ... buffer = fp.read() - ... - >>> with unpack_archive_with_buffer(buffer) as temp_dir: - ... for f_n in os.listdir(temp_dir): - ... print(f_n) - ... - - """ - temp_dir = os.path.expanduser("~/tmp") - if not os.path.exists(temp_dir): - os.makedirs(temp_dir) - with tempfile.NamedTemporaryFile("wb", delete=False, dir=temp_dir) as fp: - fp.write(buffer) - file_path = fp.name - - try: - tar_file = file_path + ".tar.gz" - os.rename(file_path, tar_file) - # Create dir - os.makedirs(file_path) - shutil.unpack_archive(tar_file, format=format, extract_dir=file_path) - - # Return temp dir - yield file_path - - except Exception as e: - log.error(str(e)) - finally: - # Remove temp tar file - if os.path.exists(tar_file): - os.unlink(tar_file) - - # Remove temp model dir - if os.path.exists(file_path): - shutil.rmtree(file_path) - - -@contextlib.contextmanager -def get_tmp_file_with_buffer(buffer): - temp_dir = os.path.expanduser("~/tmp") - if not os.path.exists(temp_dir): - os.makedirs(temp_dir) - with tempfile.NamedTemporaryFile("wb", delete=True, dir=temp_dir) as fp: - fp.write(buffer) - file_path = fp.name - yield file_path - - def remove_repeat_field(fields): """remove repeat field @@ -780,6 +634,37 @@ def flatten_dict(d, parent_key="", sep=".") -> dict: return dict(items) +def auto_filter_kwargs(func: Callable) -> Callable: + """ + this will work like a decoration function + + The decrated function will ignore and give warning when the parameter is not acceptable + + Parameters + ---------- + func : Callable + The original function + + Returns + ------- + Callable: + the new callable function + """ + + def _func(*args, **kwargs): + spec = 
inspect.getfullargspec(func) + new_kwargs = {} + for k, v in kwargs.items(): + # if `func` don't accept variable keyword arguments like `**kwargs` and have not according named arguments + if spec.varkw is None and k not in spec.args: + log.warn(f"The parameter `{k}` with value `{v}` is ignored.") + else: + new_kwargs[k] = v + return func(*args, **new_kwargs) + + return _func + + #################### Wrapper ##################### class Wrapper: """Wrapper class for anything that needs to set up during qlib.init""" @@ -855,6 +740,7 @@ def fname_to_code(fname: str): ---------- fname: str """ + prefix = "_qlib_" if fname.startswith(prefix): fname = fname.lstrip(prefix) diff --git a/qlib/utils/data.py b/qlib/utils/data.py new file mode 100644 index 0000000000..6e48687cf7 --- /dev/null +++ b/qlib/utils/data.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +import pandas as pd +import numpy as np + + +def robust_zscore(x: pd.Series, zscore=False): + """Robust ZScore Normalization + + Use robust statistics for Z-Score normalization: + mean(x) = median(x) + std(x) = MAD(x) * 1.4826 + + Reference: + https://en.wikipedia.org/wiki/Median_absolute_deviation. + """ + x = x - x.median() + mad = x.abs().median() + x = np.clip(x / mad / 1.4826, -3, 3) + if zscore: + x -= x.mean() + x /= x.std() + return x diff --git a/qlib/utils/file.py b/qlib/utils/file.py new file mode 100644 index 0000000000..64cca31210 --- /dev/null +++ b/qlib/utils/file.py @@ -0,0 +1,159 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import contextlib +import os +import shutil +import tempfile +from typing import Optional, Text + +from qlib.log import get_module_logger + +log = get_module_logger("utils.file") + + +def get_or_create_path(path: Optional[Text] = None, return_dir: bool = False): + """Create or get a file or directory given the path and return_dir. 
+ + Parameters + ---------- + path: a string indicates the path or None indicates creating a temporary path. + return_dir: if True, create and return a directory; otherwise c&r a file. + + """ + if path: + if return_dir and not os.path.exists(path): + os.makedirs(path) + elif not return_dir: # return a file, thus we need to create its parent directory + xpath = os.path.abspath(os.path.join(path, "..")) + if not os.path.exists(xpath): + os.makedirs(xpath) + else: + temp_dir = os.path.expanduser("~/tmp") + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + if return_dir: + _, path = tempfile.mkdtemp(dir=temp_dir) + else: + _, path = tempfile.mkstemp(dir=temp_dir) + return path + + +@contextlib.contextmanager +def save_multiple_parts_file(filename, format="gztar"): + """Save multiple parts file + + Implementation process: + 1. get the absolute path to 'filename' + 2. create a 'filename' directory + 3. user does something with file_path('filename/') + 4. remove 'filename' directory + 5. make_archive 'filename' directory, and rename 'archive file' to filename + + :param filename: result model path + :param format: archive format: one of "zip", "tar", "gztar", "bztar", or "xztar" + :return: real model path + + Usage:: + + >>> # The following code will create an archive file('~/tmp/test_file') containing 'test_doc_i'(i is 0-10) files. + >>> with save_multiple_parts_file('~/tmp/test_file') as filename_dir: + ... for i in range(10): + ... temp_path = os.path.join(filename_dir, 'test_doc_{}'.format(str(i))) + ... with open(temp_path) as fp: + ... fp.write(str(i)) + ... 
+ + """ + + if filename.startswith("~"): + filename = os.path.expanduser(filename) + + file_path = os.path.abspath(filename) + + # Create model dir + if os.path.exists(file_path): + raise FileExistsError("ERROR: file exists: {}, cannot be create the directory.".format(file_path)) + + os.makedirs(file_path) + + # return model dir + yield file_path + + # filename dir to filename.tar.gz file + tar_file = shutil.make_archive(file_path, format=format, root_dir=file_path) + + # Remove filename dir + if os.path.exists(file_path): + shutil.rmtree(file_path) + + # filename.tar.gz rename to filename + os.rename(tar_file, file_path) + + +@contextlib.contextmanager +def unpack_archive_with_buffer(buffer, format="gztar"): + """Unpack archive with archive buffer + After the call is finished, the archive file and directory will be deleted. + + Implementation process: + 1. create 'tempfile' in '~/tmp/' and directory + 2. 'buffer' write to 'tempfile' + 3. unpack archive file('tempfile') + 4. user does something with file_path('tempfile/') + 5. remove 'tempfile' and 'tempfile directory' + + :param buffer: bytes + :param format: archive format: one of "zip", "tar", "gztar", "bztar", or "xztar" + :return: unpack archive directory path + + Usage:: + + >>> # The following code is to print all the file names in 'test_unpack.tar.gz' + >>> with open('test_unpack.tar.gz') as fp: + ... buffer = fp.read() + ... + >>> with unpack_archive_with_buffer(buffer) as temp_dir: + ... for f_n in os.listdir(temp_dir): + ... print(f_n) + ... 
+ + """ + temp_dir = os.path.expanduser("~/tmp") + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + with tempfile.NamedTemporaryFile("wb", delete=False, dir=temp_dir) as fp: + fp.write(buffer) + file_path = fp.name + + try: + tar_file = file_path + ".tar.gz" + os.rename(file_path, tar_file) + # Create dir + os.makedirs(file_path) + shutil.unpack_archive(tar_file, format=format, extract_dir=file_path) + + # Return temp dir + yield file_path + + except Exception as e: + log.error(str(e)) + finally: + # Remove temp tar file + if os.path.exists(tar_file): + os.unlink(tar_file) + + # Remove temp model dir + if os.path.exists(file_path): + shutil.rmtree(file_path) + + +@contextlib.contextmanager +def get_tmp_file_with_buffer(buffer): + temp_dir = os.path.expanduser("~/tmp") + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + with tempfile.NamedTemporaryFile("wb", delete=True, dir=temp_dir) as fp: + fp.write(buffer) + file_path = fp.name + yield file_path diff --git a/qlib/utils/serial.py b/qlib/utils/serial.py index 4189f8e617..25753d7572 100644 --- a/qlib/utils/serial.py +++ b/qlib/utils/serial.py @@ -11,22 +11,40 @@ class Serializable: """ Serializable will change the behaviors of pickle. - - It only saves the state whose name **does not** start with `_` + + The rule to tell if a attribute will be kept or dropped when dumping. + The rule with higher priorities is on the top + - in the config attribute list -> always dropped + - in the include attribute list -> always kept + - in the exclude attribute list -> always dropped + - name not starts with `_` -> kept + - name starts with `_` -> kept if `dump_all` is true else dropped + It provides a syntactic sugar for distinguish the attributes which user doesn't want. - For examples, a learnable Datahandler just wants to save the parameters without data when dumping to disk """ pickle_backend = "pickle" # another optional value is "dill" which can pickle more things of python. 
default_dump_all = False # if dump all things + config_attr = ["_include", "_exclude"] + exclude_attr = [] # exclude_attr has lower priority than `self._exclude` + include_attr = [] # include_attr has lower priority than `self._include` def __init__(self): self._dump_all = self.default_dump_all - self._exclude = [] + self._exclude = None # this attribute has higher priority than `exclude_attr` + + def _is_kept(self, key): + if key in self.config_attr: + return False + if key in self._get_attr_list("include"): + return True + if key in self._get_attr_list("exclude"): + return False + return self.dump_all or not key.startswith("_") def __getstate__(self) -> dict: - return { - k: v for k, v in self.__dict__.items() if k not in self.exclude and (self.dump_all or not k.startswith("_")) - } + return {k: v for k, v in self.__dict__.items() if self._is_kept(k)} def __setstate__(self, state: dict): self.__dict__.update(state) @@ -38,54 +56,79 @@ def dump_all(self): """ return getattr(self, "_dump_all", False) - @property - def exclude(self): + def _get_attr_list(self, attr_type: str) -> list: """ - What attribute will not be dumped + What attributes are in the specific list + + Parameters + ---------- + attr_type : str + "include" or "exclude" + + Returns + ------- + list: """ - return getattr(self, "_exclude", []) + if hasattr(self, f"_{attr_type}"): + res = getattr(self, f"_{attr_type}", []) + else: + res = getattr(self.__class__, f"{attr_type}_attr", []) + if res is None: + return [] + return res FLAG_KEY = "_qlib_serial_flag" - def config(self, dump_all: bool = None, exclude: list = None, recursive=False): + def config(self, recursive=False, **kwargs): """ configure the serializable object Parameters ---------- - dump_all : bool - will the object dump all object - exclude : list - What attribute will not be dumped + kwargs may include following keys + + dump_all : bool + will the object dump all object + exclude : list + What attribute will not be dumped +
include : list + What attribute will be dumped + recursive : bool will the configuration be recursive """ - - params = {"dump_all": dump_all, "exclude": exclude} - - for k, v in params.items(): - if v is not None: + keys = {"dump_all", "exclude", "include"} + for k, v in kwargs.items(): + if k in keys: attr_name = f"_{k}" setattr(self, attr_name, v) + else: + raise KeyError(f"Unknown parameter: {k}") if recursive: for obj in self.__dict__.values(): # set flag to prevent endless loop self.__dict__[self.FLAG_KEY] = True if isinstance(obj, Serializable) and self.FLAG_KEY not in obj.__dict__: - obj.config(**params, recursive=True) + obj.config(recursive=True, **kwargs) del self.__dict__[self.FLAG_KEY] - def to_pickle(self, path: Union[Path, str], dump_all: bool = None, exclude: list = None): + def to_pickle(self, path: Union[Path, str], **kwargs): """ Dump self to a pickle file. - Args: - path (Union[Path, str]): the path to dump - dump_all (bool, optional): if need to dump all things. Defaults to None. - exclude (list, optional): will exclude the attributes in this list when dumping. Defaults to None. + path (Union[Path, str]): the path to dump + + kwargs may include following keys + + dump_all : bool + will the object dump all object + exclude : list + What attribute will not be dumped + include : list + What attribute will be dumped """ - self.config(dump_all=dump_all, exclude=exclude) + self.config(**kwargs) with Path(path).open("wb") as f: self.get_backend().dump(self, f) diff --git a/qlib/workflow/__init__.py b/qlib/workflow/__init__.py index 51a6ed553e..f172b5cb75 100644 --- a/qlib/workflow/__init__.py +++ b/qlib/workflow/__init__.py @@ -446,13 +446,13 @@ def save_objects(self, local_path=None, artifact_path=None, **kwargs): artifact_path : str the relative path for the artifact to be stored in the URI. 
""" - self.get_exp().get_recorder().save_objects(local_path, artifact_path, **kwargs) + self.get_exp().get_recorder(start=True).save_objects(local_path, artifact_path, **kwargs) def load_object(self, name: Text): """ Method for loading an object from artifacts in the experiment in the uri. """ - return self.get_exp().get_recorder().load_object(name) + return self.get_exp().get_recorder(start=True).load_object(name) def log_params(self, **kwargs): """ @@ -477,7 +477,7 @@ def log_params(self, **kwargs): keyword argument: name1=value1, name2=value2, ... """ - self.get_exp().get_recorder().log_params(**kwargs) + self.get_exp().get_recorder(start=True).log_params(**kwargs) def log_metrics(self, step=None, **kwargs): """ @@ -502,7 +502,7 @@ def log_metrics(self, step=None, **kwargs): keyword argument: name1=value1, name2=value2, ... """ - self.get_exp().get_recorder().log_metrics(step, **kwargs) + self.get_exp().get_recorder(start=True).log_metrics(step, **kwargs) def set_tags(self, **kwargs): """ @@ -527,7 +527,7 @@ def set_tags(self, **kwargs): keyword argument: name1=value1, name2=value2, ... """ - self.get_exp().get_recorder().set_tags(**kwargs) + self.get_exp().get_recorder(start=True).set_tags(**kwargs) class RecorderWrapper(Wrapper): diff --git a/qlib/workflow/exp.py b/qlib/workflow/exp.py index fcf6cd8d16..9bdfdefc13 100644 --- a/qlib/workflow/exp.py +++ b/qlib/workflow/exp.py @@ -22,6 +22,7 @@ def __init__(self, id, name): self.id = id self.name = name self.active_recorder = None # only one recorder can running each time + self._default_rec_name = "abstract_recorder" def __repr__(self): return "{name}(id={id}, info={info})".format(name=self.__class__.__name__, id=self.id, info=self.info) @@ -150,7 +151,7 @@ def get_recorder(self, recorder_id=None, recorder_name=None, create: bool = True create : boolean create the recorder if it hasn't been created before. start : boolean - start the new recorder if one is created. + start the new recorder if one is **created**. 
Returns ------- diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py index e60fa4755b..32f37427e9 100644 --- a/qlib/workflow/task/gen.py +++ b/qlib/workflow/task/gen.py @@ -6,7 +6,7 @@ import abc import copy import pandas as pd -from typing import List, Union, Callable +from typing import Dict, List, Union, Callable from qlib.utils import transform_end_date from .utils import TimeAdjuster @@ -112,13 +112,24 @@ def handler_mod(task: dict, rolling_gen): except KeyError: # Maybe dataset do not have handler, then do nothing. pass + except TypeError: + # if handler a dumped file like file://// + pass class RollingGen(TaskGen): ROLL_EX = TimeAdjuster.SHIFT_EX # fixed start date, expanding end date ROLL_SD = TimeAdjuster.SHIFT_SD # fixed segments size, slide it from start date - def __init__(self, step: int = 40, rtype: str = ROLL_EX, ds_extra_mod_func: Union[None, Callable] = handler_mod): + def __init__( + self, + step: int = 40, + rtype: str = ROLL_EX, + ds_extra_mod_func: Union[None, Callable] = handler_mod, + test_key="test", + train_key="train", + trunc_days: int = None, + ): """ Generate tasks for rolling @@ -131,14 +142,17 @@ def __init__(self, step: int = 40, rtype: str = ROLL_EX, ds_extra_mod_func: Unio ds_extra_mod_func: Callable A method like: handler_mod(task: dict, rg: RollingGen) Do some extra action after generating a task. For example, use ``handler_mod`` to modify the end time of the handler of a dataset. 
+ trunc_days: int + trunc some data to avoid future information leakage """ self.step = step self.rtype = rtype self.ds_extra_mod_func = ds_extra_mod_func self.ta = TimeAdjuster(future=True) - self.test_key = "test" - self.train_key = "train" + self.test_key = test_key + self.train_key = train_key + self.trunc_days = trunc_days def _update_task_segs(self, task, segs): # update segments of this task @@ -162,8 +176,7 @@ def gen_following_tasks(self, task: dict, test_end: pd.Timestamp) -> List[dict]: List[dict]: the following tasks of `task`(`task` itself is excluded) """ - t = copy.deepcopy(task) - prev_seg = t["dataset"]["kwargs"]["segments"] + prev_seg = task["dataset"]["kwargs"]["segments"] while True: segments = {} try: @@ -184,6 +197,7 @@ def gen_following_tasks(self, task: dict, test_end: pd.Timestamp) -> List[dict]: break prev_seg = segments + t = copy.deepcopy(task) # deepcopy is necessary to avoid replace task inplace self._update_task_segs(t, segments) yield t @@ -250,6 +264,8 @@ def generate(self, task: dict) -> List[dict]: # 2) and init test segments test_start_idx = self.ta.align_idx(segments[self.test_key][0]) segments[self.test_key] = (self.ta.get(test_start_idx), self.ta.get(test_start_idx + self.step - 1)) + if self.trunc_days is not None: + self.trunc_segments(self.ta, segments, self.trunc_days, self.test_key) # update segments of this task self._update_task_segs(t, segments) @@ -259,3 +275,18 @@ def generate(self, task: dict) -> List[dict]: # Update the following rolling res.extend(self.gen_following_tasks(t, test_end)) return res + + # helper function + @staticmethod + def trunc_segments(ta: TimeAdjuster, segments: Dict[str, pd.Timestamp], days, test_key="test"): + """ + To avoid the leakage of future information, the segments should be truncated according to the test start_time + + NOTE: + This function will change segments inplace + """ + # adjust segment + test_start = min(segments[test_key]) + for k in list(segments.keys()): + if k != 
test_key: + segments[k] = ta.truncate(segments[k], test_start, days) diff --git a/qlib/workflow/task/manage.py b/qlib/workflow/task/manage.py index 41e243b435..922da23512 100644 --- a/qlib/workflow/task/manage.py +++ b/qlib/workflow/task/manage.py @@ -47,6 +47,14 @@ class TaskManager: The tasks manager assumes that you will only update the tasks you fetched. The mongo fetch one and update will make it date updating secure. + This class can be used as a tool from the command line. Here are several examples + + .. code-block:: shell + + python -m qlib.workflow.task.manage -t wait + python -m qlib.workflow.task.manage -t task_stat + + .. note:: Assumption: the data in MongoDB was encoded and the data out of MongoDB was decoded diff --git a/qlib/workflow/task/utils.py b/qlib/workflow/task/utils.py index 174b4b9bfc..e79444d1ec 100644 --- a/qlib/workflow/task/utils.py +++ b/qlib/workflow/task/utils.py @@ -100,7 +100,7 @@ def get(self, idx: int): idx : int index of the calendar """ - if idx >= len(self.cals): + if idx is None or idx >= len(self.cals): return None return self.cals[idx] @@ -123,6 +123,9 @@ def align_idx(self, time_point, tp_type="start") -> int: ------- index : int """ + if time_point is None: + # `None` indicates unbounded index/border + return None time_point = pd.Timestamp(time_point) if tp_type == "start": idx = bisect.bisect_left(self.cals, time_point) @@ -158,6 +161,8 @@ def align_time(self, time_point, tp_type="start") -> pd.Timestamp: Returns: pd.Timestamp """ + if time_point is None: + return None return self.cals[self.align_idx(time_point, tp_type=tp_type)] def align_seg(self, segment: Union[dict, tuple]) -> Union[dict, tuple]: @@ -201,6 +206,10 @@ def truncate(self, segment: tuple, test_start, days: int) -> tuple: days : int The trading days to be truncated the data in this segment may need 'days' data + `days` are based on the `test_start`.
+ For example, if the label contains the information of 2 days in the near future, the prediction horizon 1 day. + (e.g. the prediction target is `Ref($close, -2)/Ref($close, -1) - 1`) + the days should be 2 + 1 == 3 days. Returns --------- @@ -220,10 +229,17 @@ def truncate(self, segment: tuple, test_start, days: int) -> tuple: SHIFT_SD = "sliding" SHIFT_EX = "expanding" + def _add_step(self, index, step): + if index is None: + return None + return index + step + def shift(self, seg: tuple, step: int, rtype=SHIFT_SD) -> tuple: """ Shift the datatime of segment + If there are None (which indicates unbounded index) in the segment, this method will return None. + Parameters ---------- seg : @@ -245,13 +261,13 @@ def shift(self, seg: tuple, step: int, rtype=SHIFT_SD) -> tuple: if isinstance(seg, tuple): start_idx, end_idx = self.align_idx(seg[0], tp_type="start"), self.align_idx(seg[1], tp_type="end") if rtype == self.SHIFT_SD: - start_idx += step - end_idx += step + start_idx = self._add_step(start_idx, step) + end_idx = self._add_step(end_idx, step) elif rtype == self.SHIFT_EX: - end_idx += step + end_idx = self._add_step(end_idx, step) else: raise NotImplementedError(f"This type of input is not supported") - if start_idx > len(self.cals): + if start_idx is not None and start_idx > len(self.cals): raise KeyError("The segment is out of valid calendar") return self.get(start_idx), self.get(end_idx) else: From aa2699f3f44888d6a29f3eed0cba1bb490de44e5 Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 13 Aug 2021 01:31:38 +0000 Subject: [PATCH 05/37] successfully run exp after refactor some interface --- qlib/contrib/meta/data_selection/model.py | 26 +++++++++++++++++ qlib/contrib/meta/data_selection/net.py | 19 +++++++++---- qlib/contrib/model/linear.py | 20 +++++++++---- qlib/data/dataset/weight.py | 17 ++++++++++++ qlib/model/meta/model.py | 34 +++++++++++++++++------ qlib/model/meta/task.py | 18 +++++++++--- qlib/model/trainer.py | 6 ++-- 7 files changed, 114 
insertions(+), 26 deletions(-) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 2f3145c5f9..579a2171cd 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -3,6 +3,7 @@ import pandas as pd import numpy as np +from qlib.model.meta.task import MetaTask import torch from torch import nn from torch import optim @@ -19,6 +20,19 @@ from .utils import fill_diagnal, ICLoss from .dataset import MetaDatasetHDS from qlib.contrib.meta.data_selection.net import PredNet +from qlib.data.dataset.weight import Reweighter + + +class TimeReweighter(Reweighter): + + def __init__(self, time_weight: pd.Series): + self.time_weight = time_weight + + def reweight(self, data: Union[pd.DataFrame, pd.Series]): + w_s = pd.Series(1., index=data.index) + for k, w in self.time_weight.items(): + w_s.loc[slice(*k)] = w + return w_s class MetaModelDS(MetaTaskModel): @@ -131,6 +145,7 @@ def fit(self, meta_dataset: MetaDatasetHDS): R.save_objects(**{"model.pkl": self.tn}) self.fitted = True + # TODO: refactor def _inference_single_task(self, meta_id: tuple, meta_dataset: MetaDatasetHDS): meta_task = meta_dataset.get_meta_task_by_test_period(meta_id) if meta_task is not None: @@ -153,6 +168,7 @@ def _inference_single_task(self, meta_id: tuple, meta_dataset: MetaDatasetHDS): else: raise ValueError("The current task is not supported!") + # TODO: refactor def inference(self, meta_ids: Union[List[tuple], tuple], meta_dataset: MetaDatasetHDS): """ Inference a single task with meta-dataset. The meta-model must be fitted. 
@@ -176,6 +192,7 @@ def inference(self, meta_ids: Union[List[tuple], tuple], meta_dataset: MetaDatas else: raise NotImplementedError("This type of task definition is not supported!") + # TODO: refactor def prepare_tasks(self, task: Union[List[dict], dict], reweighters: dict): """ @@ -210,3 +227,12 @@ def prepare_tasks(self, task: Union[List[dict], dict], reweighters: dict): return [self.prepare_tasks(i, reweighters) for i in task] else: raise NotImplementedError("This type of task definition is not supported!") + + def prepare_task(self, task: MetaTask) -> dict: + meta_ipt = task.get_meta_input() + weights = self.tn.twm(meta_ipt["time_perf"]) + + weight_s = pd.Series(weights.detach().cpu().numpy(), index=task.meta_info.columns) + task = copy.copy(task.task) # NOTE: this is a shallow copy. + task["reweighter"] = TimeReweighter(weight_s) + return task diff --git a/qlib/contrib/meta/data_selection/net.py b/qlib/contrib/meta/data_selection/net.py index 84a83c9efc..4d96a12d71 100644 --- a/qlib/contrib/meta/data_selection/net.py +++ b/qlib/contrib/meta/data_selection/net.py @@ -16,7 +16,11 @@ def __init__(self, hist_step_n, clip_weight=None, clip_method="clamp"): self.linear = nn.Linear(hist_step_n, 1) self.k = nn.Parameter(torch.Tensor([8.0])) - def forward(self, time_perf, time_belong, return_preds=False): + def forward(self, time_perf, time_belong=None, return_preds=False): + hist_step_n = self.linear.in_features + time_perf = time_perf.reshape(time_perf.shape[0] // hist_step_n, hist_step_n, *time_perf.shape[1:]) + time_perf = torch.mean(time_perf, dim=0, keepdim=False) + # time_perf的格式和其他的有一些不一样 # 需要自己拆出train和test preds = [] @@ -26,11 +30,16 @@ def forward(self, time_perf, time_belong, return_preds=False): preds = preds - torch.mean(preds) # 这里注意一下不要引入未来信息 preds = preds * self.k if return_preds: - return time_belong @ preds + if time_belong is None: + return preds + else: + return time_belong @ preds else: weights = preds_to_weight_with_clamp(preds, self.clip_weight, 
self.clip_method) - sample_weights = time_belong @ weights - return sample_weights + if time_belong is None: + return weights + else: + return time_belong @ weights class PredNet(nn.Module): @@ -49,8 +58,6 @@ def get_sample_weights(self, X, time_perf, time_belong, ignore_weight=False): return weights def forward(self, X, y, time_perf, time_belong, X_test, ignore_weight=False): - time_perf = time_perf.reshape(self.step, time_perf.shape[0] // self.step, *time_perf.shape[1:]) - time_perf = torch.mean(time_perf, dim=0, keepdim=False) weights = self.get_sample_weights(X, time_perf, time_belong, ignore_weight=ignore_weight) X_w = X.T * weights.view(1, -1) theta = torch.inverse(X_w @ X) @ X_w @ y diff --git a/qlib/contrib/model/linear.py b/qlib/contrib/model/linear.py index f16acc1eca..c357033157 100644 --- a/qlib/contrib/model/linear.py +++ b/qlib/contrib/model/linear.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd from typing import Text, Union +from qlib.data.dataset.weight import Reweighter from scipy.optimize import nnls from sklearn.linear_model import LinearRegression, Ridge, Lasso @@ -49,31 +50,38 @@ def __init__(self, estimator="ols", alpha=0.0, fit_intercept=False): self.coef_ = None - def fit(self, dataset: DatasetH): + def fit(self, dataset: DatasetH, reweighter: Reweighter = None): df_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + if reweighter is not None: + w: pd.Series = reweighter.reweight(df_train) + w = w.values + else: + w = None X, y = df_train["feature"].values, np.squeeze(df_train["label"].values) if self.estimator in [self.OLS, self.RIDGE, self.LASSO]: - self._fit(X, y) + self._fit(X, y, w) elif self.estimator == self.NNLS: - self._fit_nnls(X, y) + self._fit_nnls(X, y, w) else: raise ValueError(f"unknown estimator `{self.estimator}`") return self - def _fit(self, X, y): + def _fit(self, X, y, w): if self.estimator == self.OLS: model = LinearRegression(fit_intercept=self.fit_intercept, 
copy_X=False) else: model = {self.RIDGE: Ridge, self.LASSO: Lasso}[self.estimator]( alpha=self.alpha, fit_intercept=self.fit_intercept, copy_X=False ) - model.fit(X, y) + model.fit(X, y, sample_weight=w) self.coef_ = model.coef_ self.intercept_ = model.intercept_ - def _fit_nnls(self, X, y): + def _fit_nnls(self, X, y, w=None): + if w is not None: + raise NotImplementedError("TODO: support nnls with weight") # TODO if self.fit_intercept: X = np.c_[X, np.ones(len(X))] # NOTE: mem copy coef = nnls(X, y)[0] diff --git a/qlib/data/dataset/weight.py b/qlib/data/dataset/weight.py index 2d5c955ca0..ba4ed61cc2 100644 --- a/qlib/data/dataset/weight.py +++ b/qlib/data/dataset/weight.py @@ -16,6 +16,23 @@ def __init__(self, *args, **kwargs): """ raise NotImplementedError() + def reweight(self, data: object) -> object: + """ + Get weights for data + + Parameters + ---------- + data : object + The input data. + The first dimension is the index of samples + + Returns + ------- + object: + the weights info for the data + """ + raise NotImplementedError(f"This type of input is not supported") + class WeightSampler: """ diff --git a/qlib/model/meta/model.py b/qlib/model/meta/model.py index 5f8d134117..59a8a417a0 100644 --- a/qlib/model/meta/model.py +++ b/qlib/model/meta/model.py @@ -3,6 +3,8 @@ import abc from typing import Union, List, Tuple + +from qlib.model.meta.task import MetaTask from .dataset import MetaDataset @@ -30,23 +32,39 @@ class MetaTaskModel(MetaModel): """ This type of meta-model deals with base task definitions. The meta-model creates tasks for training new base forecasting models after it is trained. `prepare_tasks` directly modifies the task definitions. """ - @abc.abstractmethod - def prepare_tasks(self, tasks: List[dict]) -> List[dict]: + def prepare_task(self, task: MetaTask) -> dict: """ - The meta-model modifies the tasks. The function will return the modified task list. 
+ Input a meta task and output a task with qlib format Parameters ---------- - tasks: List[dict] - A list of task definitions for the meta-model to modify. + task : MetaTask + meta task to inference Returns ------- - List[dict]: - A list of modified task definitions. + dict: + A task with Qlib format """ - pass + + # NOTE: factor; Please justify the necessity of this method + # @abc.abstractmethod + # def prepare_tasks(self, tasks: List[dict]) -> List[dict]: + # """ + # The meta-model modifies the tasks. The function will return the modified task list. + # + # Parameters + # ---------- + # tasks: List[dict] + # A list of task definitions for the meta-model to modify. + # + # Returns + # ------- + # List[dict]: + # A list of modified task definitions. + # """ + # pass class MetaGuideModel(MetaModel): diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py index bcc6694ad2..2a000291df 100644 --- a/qlib/model/meta/task.py +++ b/qlib/model/meta/task.py @@ -11,12 +11,21 @@ class MetaTask: """ A single meta-task, a meta-dataset contains a list of them. - It is designed for Mea + It serves as a component as in MetaDatasetDS + + The data processing is different + - the processed input may be different between training and testing + - When training, the X, y, X_test, y_test in training tasks are necessary (# PROC_MODE_FULL #) + but not necessary in test tasks. 
(# PROC_MODE_TEST #) + - When the meta model can be transferred into other dataset, maybe only meta_info is necessary (# PROC_MODE_TRANSFER #) """ + PROC_MODE_FULL = "full" + PROC_MODE_TEST = "test" + PROC_MODE_TRANSFER = "transfer" - def __init__(self, task: dict, meta_info: object): + def __init__(self, task: dict, meta_info: object, mode: str = PROC_MODE_FULL): """ - the `__init__` func is responsible for + The `__init__` func is responsible for - store the task - store the origin input data for - process the input data for meta data @@ -31,11 +40,12 @@ def __init__(self, task: dict, meta_info: object): """ self.task = task self.meta_info = meta_info # the original meta input information, it will be processed later + self.mode = mode def get_dataset(self) -> Dataset: return init_instance_by_config(self.task["dataset"], accept_types=Dataset) - def get_meta_input(elf) -> object: + def get_meta_input(self) -> object: """ Return the **processed** meta_info """ diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 9a7ce51817..19c5548138 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -84,7 +84,7 @@ def end_task_train(rec: Recorder, experiment_name: str) -> Recorder: return rec -def task_train(task_config: dict, experiment_name: str) -> Recorder: +def task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder: """ Task based training, will be divided into two steps. @@ -94,12 +94,14 @@ def task_train(task_config: dict, experiment_name: str) -> Recorder: The config of a task. 
experiment_name: str The name of experiment + recorder_name: str + The name of recorder Returns ---------- Recorder: The instance of the recorder """ - recorder = begin_task_train(task_config, experiment_name) + recorder = begin_task_train(task_config, experiment_name, recorder_name=recorder_name) recorder = end_task_train(recorder, experiment_name) return recorder From d17aaac659ca91c445b42c5cc6b460568263ff7d Mon Sep 17 00:00:00 2001 From: Young Date: Fri, 20 Aug 2021 01:31:12 +0000 Subject: [PATCH 06/37] data selection share handler & trainer --- qlib/contrib/meta/data_selection/model.py | 125 ++++++++++++---------- qlib/contrib/meta/data_selection/utils.py | 9 +- qlib/model/trainer.py | 70 ++++++------ qlib/utils/__init__.py | 4 +- qlib/workflow/__init__.py | 21 +++- qlib/workflow/expm.py | 5 +- qlib/workflow/task/gen.py | 8 +- 7 files changed, 144 insertions(+), 98 deletions(-) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 579a2171cd..1ae3e0581d 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -59,6 +59,66 @@ def __init__( self.max_epoch = max_epoch self.fitted = False + def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): + if phase == "train": # phase 0 for training, 1 for inference + self.tn.train() + torch.set_grad_enabled(True) + else: + self.tn.eval() + torch.set_grad_enabled(False) + running_loss = 0.0 + pred_y_all = [] + for task in tqdm(task_list, desc=f"{phase} Task", leave=False): + meta_input = task.get_meta_input() + pred, weights = self.tn( + meta_input["X"], + meta_input["y"], + meta_input["time_perf"], + meta_input["time_belong"], + meta_input["X_test"], + ignore_weight=ignore_weight + ) # 这里可能因为如下原因导致pred为None; + if self.criterion == "mse": + criterion = nn.MSELoss() + loss = criterion(pred, meta_input["y_test"]) + elif self.criterion == "ic_loss": + criterion = ICLoss() + loss = criterion(pred, 
meta_input["y_test"], meta_input["test_idx"], skip_size=50) + + if np.isnan(loss.detach().item()): __import__('ipdb').set_trace() + + if phase == "train": + opt.zero_grad() + norm_loss = nn.MSELoss() + loss.backward() + opt.step() + elif phase == "test": + pass + + pred_y_all.append( + pd.DataFrame( + { + "pred": pd.Series(pred.detach().cpu().numpy(), index=meta_input["test_idx"]), + "label": pd.Series( + meta_input["y_test"].detach().cpu().numpy(), index=meta_input["test_idx"] + ), + } + ) + ) + running_loss += loss.detach().item() + running_loss = running_loss / len(task_list) + loss_l.setdefault(phase, []).append(running_loss) + + pred_y_all = pd.concat(pred_y_all) + ic = ( + pred_y_all.groupby("datetime") + .apply(lambda df: df["pred"].corr(df["label"], method="spearman")) + .mean() + ) + + R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch}) + R.log_metrics(**{f"ic/{phase}": ic, "step": epoch}) + def fit(self, meta_dataset: MetaDatasetHDS): """ The meta-learning-based data selection interacts directly with meta-dataset due to the close-form proxy measurement. 
@@ -81,67 +141,18 @@ def fit(self, meta_dataset: MetaDatasetHDS): step=self.step, hist_step_n=self.hist_step_n, clip_weight=self.clip_weight, clip_method=self.clip_method ) - train_step = 0 opt = optim.Adam(self.tn.parameters(), lr=self.lr) + + # run weight with no weight + for phase, task_list in zip(phases, meta_tasks_l): + self.run_epoch(f"{phase}_noweight", task_list, 0, opt, {}, ignore_weight=True) + self.run_epoch(f"{phase}_init", task_list, 0, opt, {}) + + # run training loss_l = {} for epoch in tqdm(range(self.max_epoch), desc="epoch"): for phase, task_list in zip(phases, meta_tasks_l): - if phase == "train": # phase 0 for training, 1 for inference - self.tn.train() - torch.set_grad_enabled(True) - else: - self.tn.eval() - torch.set_grad_enabled(False) - running_loss = 0.0 - pred_y_all = [] - for task in tqdm(task_list, desc=f"{phase} Task", leave=False): - meta_input = task.get_meta_input() - pred, weights = self.tn( - meta_input["X"], - meta_input["y"], - meta_input["time_perf"], - meta_input["time_belong"], - meta_input["X_test"], - ) - if self.criterion == "mse": - criterion = nn.MSELoss() - loss = criterion(pred, meta_input["y_test"]) - elif self.criterion == "ic_loss": - criterion = ICLoss() - loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"]) - - if phase == "train": - opt.zero_grad() - norm_loss = nn.MSELoss() - loss.backward() - opt.step() - train_step += 1 - elif phase == "test": - pass - - pred_y_all.append( - pd.DataFrame( - { - "pred": pd.Series(pred.detach().cpu().numpy(), index=meta_input["test_idx"]), - "label": pd.Series( - meta_input["y_test"].detach().cpu().numpy(), index=meta_input["test_idx"] - ), - } - ) - ) - running_loss += loss.detach().item() - running_loss = running_loss / len(task_list) - loss_l.setdefault(phase, []).append(running_loss) - - pred_y_all = pd.concat(pred_y_all) - ic = ( - pred_y_all.groupby("datetime") - .apply(lambda df: df["pred"].corr(df["label"], method="spearman")) - .mean() - ) - - 
R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch}) - R.log_metrics(**{f"ic/{phase}": ic, "step": epoch}) + self.run_epoch(phase, task_list, epoch, opt, loss_l) R.save_objects(**{"model.pkl": self.tn}) self.fitted = True diff --git a/qlib/contrib/meta/data_selection/utils.py b/qlib/contrib/meta/data_selection/utils.py index 7fc42bab50..011e472397 100644 --- a/qlib/contrib/meta/data_selection/utils.py +++ b/qlib/contrib/meta/data_selection/utils.py @@ -25,7 +25,7 @@ def get_sim_mat_idx(i_sim_mat, outsample_period): class ICLoss(nn.Module): - def forward(self, pred, y, idx): + def forward(self, pred, y, idx, skip_size=50): """forward. :param pred: @@ -41,15 +41,20 @@ def forward(self, pred, y, idx): diff_point.append(None) ic_all = 0.0 + skip_n = 0 for start_i, end_i in zip(diff_point, diff_point[1:]): pred_focus = pred[start_i:end_i] # TODO: just for fake + if pred_focus.shape[0] < skip_size: + # skip some days which have very small amount of stock. + skip_n += 1 + continue y_focus = y[start_i:end_i] ic_day = torch.dot( (pred_focus - pred_focus.mean()) / np.sqrt(pred_focus.shape[0]) / pred_focus.std(), (y_focus - y_focus.mean()) / np.sqrt(y_focus.shape[0]) / y_focus.std(), ) ic_all += ic_day - ic_mean = ic_all / (len(diff_point) - 1) + ic_mean = ic_all / (len(diff_point) - 1 - skip_n) return -ic_mean # ic loss diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 19c5548138..52608d50c5 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -25,6 +25,38 @@ from qlib.data.dataset.weight import Reweighter +def _log_task_info(task_config: dict): + R.log_params(**flatten_dict(task_config)) + R.save_objects(**{"task": task_config}) # keep the original format and datatype + R.set_tags(**{"hostname": socket.gethostname()}) + + +def _exe_task(task_config: dict): + rec = R.get_recorder() + # model & dataset initiation + model: Model = init_instance_by_config(task_config["model"]) + dataset: Dataset = 
init_instance_by_config(task_config["dataset"]) + reweighter: Reweighter = task_config.get("reweighter", None) + # model training + auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter) + R.save_objects(**{"params.pkl": model}) + # this dataset is saved for online inference. So the concrete data should not be dumped + dataset.config(dump_all=False, recursive=True) + R.save_objects(**{"dataset": dataset}) + # generate records: prediction, backtest, and analysis + records = task_config.get("record", []) + if isinstance(records, dict): # prevent only one dict + records = [records] + for record in records: + cls, kwargs = get_cls_kwargs(record, default_module="qlib.workflow.record_temp") + if cls is SignalRecord: + rconf = {"model": model, "dataset": dataset, "recorder": rec} + else: + rconf = {"recorder": rec} + r = cls(**kwargs, **rconf) + r.generate() + + def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str = None) -> Recorder: """ Begin task training to start a recorder and save the task config. 
@@ -38,11 +70,8 @@ def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str Recorder: the model recorder """ with R.start(experiment_name=experiment_name, recorder_name=recorder_name): - R.log_params(**flatten_dict(task_config)) - R.save_objects(**{"task": task_config}) # keep the original format and datatype - R.set_tags(**{"hostname": socket.gethostname()}) - recorder: Recorder = R.get_recorder() - return recorder + _log_task_info(task_config) + return R.get_recorder() def end_task_train(rec: Recorder, experiment_name: str) -> Recorder: @@ -58,29 +87,7 @@ def end_task_train(rec: Recorder, experiment_name: str) -> Recorder: """ with R.start(experiment_name=experiment_name, recorder_id=rec.info["id"], resume=True): task_config = R.load_object("task") - # model & dataset initiation - model: Model = init_instance_by_config(task_config["model"]) - dataset: Dataset = init_instance_by_config(task_config["dataset"]) - reweighter: Reweighter = task_config.get("reweighter", None) - # model training - auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter) - R.save_objects(**{"params.pkl": model}) - # this dataset is saved for online inference. 
So the concrete data should not be dumped - dataset.config(dump_all=False, recursive=True) - R.save_objects(**{"dataset": dataset}) - # generate records: prediction, backtest, and analysis - records = task_config.get("record", []) - if isinstance(records, dict): # prevent only one dict - records = [records] - for record in records: - cls, kwargs = get_cls_kwargs(record, default_module="qlib.workflow.record_temp") - if cls is SignalRecord: - rconf = {"model": model, "dataset": dataset, "recorder": rec} - else: - rconf = {"recorder": rec} - r = cls(**kwargs, **rconf) - r.generate() - + _exe_task(task_config) return rec @@ -101,9 +108,10 @@ def task_train(task_config: dict, experiment_name: str, recorder_name: str = Non ---------- Recorder: The instance of the recorder """ - recorder = begin_task_train(task_config, experiment_name, recorder_name=recorder_name) - recorder = end_task_train(recorder, experiment_name) - return recorder + with R.start(experiment_name=experiment_name, recorder_name=recorder_name): + _log_task_info(task_config) + _exe_task(task_config) + return R.get_recorder() class Trainer: diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index fac5148907..02f17e053b 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -594,7 +594,7 @@ def lazy_sort_index(df: pd.DataFrame, axis=0) -> pd.DataFrame: sorted dataframe """ idx = df.index if axis == 0 else df.columns - if idx.is_monotonic_increasing: + if idx.is_monotonic_increasing and (not isinstance(idx, pd.MultiIndex) or not idx.is_lexsorted()): return df else: return df.sort_index(axis=axis) @@ -657,7 +657,7 @@ def _func(*args, **kwargs): for k, v in kwargs.items(): # if `func` don't accept variable keyword arguments like `**kwargs` and have not according named arguments if spec.varkw is None and k not in spec.args: - log.warn(f"The parameter `{k}` with value `{v}` is ignored.") + log.warning(f"The parameter `{k}` with value `{v}` is ignored.") else: new_kwargs[k] = v return 
func(*args, **new_kwargs) diff --git a/qlib/workflow/__init__.py b/qlib/workflow/__init__.py index f172b5cb75..def84eb7a0 100644 --- a/qlib/workflow/__init__.py +++ b/qlib/workflow/__init__.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from typing import Text, Optional -from .expm import MLflowExpManager +from .expm import ExpManager from .exp import Experiment from .recorder import Recorder from ..utils import Wrapper @@ -15,7 +15,7 @@ class QlibRecorder: A global system that helps to manage the experiments. """ - def __init__(self, exp_manager): + def __init__(self, exp_manager: ExpManager): self.exp_manager = exp_manager def __repr__(self): @@ -334,6 +334,23 @@ def set_uri(self, uri: Optional[Text]): """ self.exp_manager.set_uri(uri) + @contextmanager + def uri_context(self, uri: Text): + """ + Temporarily set the exp_manager's uri to uri + + Parameters + ---------- + uri : Text + the temporal uri + """ + prev_uri = self.exp_manager._current_uri + self.exp_manager.set_uri(uri) + try: + yield + finally: + self.exp_manager._current_uri = prev_uri + def get_recorder( self, *, recorder_id=None, recorder_name=None, experiment_id=None, experiment_name=None ) -> Recorder: diff --git a/qlib/workflow/expm.py b/qlib/workflow/expm.py index 84cc6a13a5..355eb00e1c 100644 --- a/qlib/workflow/expm.py +++ b/qlib/workflow/expm.py @@ -14,7 +14,7 @@ from .recorder import Recorder from ..log import get_module_logger -logger = get_module_logger("workflow", logging.INFO) +logger = get_module_logger("workflow") class ExpManager: @@ -258,7 +258,7 @@ def set_uri(self, uri: Optional[Text] = None): """ if uri is None: - logger.info("No tracking URI is provided. Use the default tracking URI.") + logger.debug("No tracking URI is provided. Use the default tracking URI.") self._current_uri = self.default_uri else: # Temporarily re-set the current uri as the uri argument. 
@@ -269,6 +269,7 @@ def set_uri(self, uri: Optional[Text] = None): def _set_uri(self): """ Customized features for subclasses' set_uri function. + This method is designed for the underlying experiment backend storage. """ raise NotImplementedError(f"Please implement the `_set_uri` method.") diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py index 32f37427e9..2cd806ec2d 100644 --- a/qlib/workflow/task/gen.py +++ b/qlib/workflow/task/gen.py @@ -129,6 +129,7 @@ def __init__( test_key="test", train_key="train", trunc_days: int = None, + task_copy_func: Callable = copy.deepcopy ): """ Generate tasks for rolling @@ -144,6 +145,8 @@ def __init__( Do some extra action after generating a task. For example, use ``handler_mod`` to modify the end time of the handler of a dataset. trunc_days: int trunc some data to avoid future information leakage + task_copy_func: Callable + the function to copy entire task. This is very useful when user want to share something between tasks """ self.step = step self.rtype = rtype @@ -153,6 +156,7 @@ def __init__( self.test_key = test_key self.train_key = train_key self.trunc_days = trunc_days + self.task_copy_func = task_copy_func def _update_task_segs(self, task, segs): # update segments of this task @@ -197,7 +201,7 @@ def gen_following_tasks(self, task: dict, test_end: pd.Timestamp) -> List[dict]: break prev_seg = segments - t = copy.deepcopy(task) # deepcopy is necessary to avoid replace task inplace + t = self.task_copy_func(task) # deepcopy is necessary to avoid replace task inplace self._update_task_segs(t, segments) yield t @@ -253,7 +257,7 @@ def generate(self, task: dict) -> List[dict]: """ res = [] - t = copy.deepcopy(task) + t = self.task_copy_func(task) # calculate segments From 82b41156290e63fabf50714cbb7ed073de423210 Mon Sep 17 00:00:00 2001 From: Young Date: Sun, 22 Aug 2021 11:38:25 +0000 Subject: [PATCH 07/37] fix meta model time series bug --- qlib/contrib/meta/data_selection/model.py | 27 
+++++++++---------- qlib/contrib/meta/data_selection/net.py | 6 +++-- qlib/contrib/meta/data_selection/utils.py | 2 ++ qlib/model/meta/model.py | 1 + qlib/model/meta/task.py | 1 + qlib/model/trainer.py | 33 ++++++++++++++++++++++- qlib/workflow/record_temp.py | 5 ++-- qlib/workflow/task/gen.py | 2 +- 8 files changed, 57 insertions(+), 20 deletions(-) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 1ae3e0581d..33127f0d2a 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from libs.qlib.qlib.log import get_module_logger import pandas as pd import numpy as np from qlib.model.meta.task import MetaTask @@ -24,12 +25,11 @@ class TimeReweighter(Reweighter): - def __init__(self, time_weight: pd.Series): self.time_weight = time_weight def reweight(self, data: Union[pd.DataFrame, pd.Series]): - w_s = pd.Series(1., index=data.index) + w_s = pd.Series(1.0, index=data.index) for k, w in self.time_weight.items(): w_s.loc[slice(*k)] = w return w_s @@ -76,16 +76,21 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): meta_input["time_perf"], meta_input["time_belong"], meta_input["X_test"], - ignore_weight=ignore_weight - ) # 这里可能因为如下原因导致pred为None; + ignore_weight=ignore_weight, + ) if self.criterion == "mse": criterion = nn.MSELoss() loss = criterion(pred, meta_input["y_test"]) elif self.criterion == "ic_loss": criterion = ICLoss() - loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"], skip_size=50) + try: + loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"], skip_size=50) + except ValueError as e: + get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss") + continue - if np.isnan(loss.detach().item()): __import__('ipdb').set_trace() + if np.isnan(loss.detach().item()): + 
__import__("ipdb").set_trace() if phase == "train": opt.zero_grad() @@ -99,9 +104,7 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): pd.DataFrame( { "pred": pd.Series(pred.detach().cpu().numpy(), index=meta_input["test_idx"]), - "label": pd.Series( - meta_input["y_test"].detach().cpu().numpy(), index=meta_input["test_idx"] - ), + "label": pd.Series(meta_input["y_test"].detach().cpu().numpy(), index=meta_input["test_idx"]), } ) ) @@ -110,11 +113,7 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): loss_l.setdefault(phase, []).append(running_loss) pred_y_all = pd.concat(pred_y_all) - ic = ( - pred_y_all.groupby("datetime") - .apply(lambda df: df["pred"].corr(df["label"], method="spearman")) - .mean() - ) + ic = pred_y_all.groupby("datetime").apply(lambda df: df["pred"].corr(df["label"], method="spearman")).mean() R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch}) R.log_metrics(**{f"ic/{phase}": ic, "step": epoch}) diff --git a/qlib/contrib/meta/data_selection/net.py b/qlib/contrib/meta/data_selection/net.py index 4d96a12d71..8c57069818 100644 --- a/qlib/contrib/meta/data_selection/net.py +++ b/qlib/contrib/meta/data_selection/net.py @@ -18,8 +18,9 @@ def __init__(self, hist_step_n, clip_weight=None, clip_method="clamp"): def forward(self, time_perf, time_belong=None, return_preds=False): hist_step_n = self.linear.in_features - time_perf = time_perf.reshape(time_perf.shape[0] // hist_step_n, hist_step_n, *time_perf.shape[1:]) - time_perf = torch.mean(time_perf, dim=0, keepdim=False) + # NOTE: the reshape order is very important + time_perf = time_perf.reshape(hist_step_n, time_perf.shape[0] // hist_step_n, *time_perf.shape[1:]) + time_perf = torch.mean(time_perf, dim=1, keepdim=False) # time_perf的格式和其他的有一些不一样 # 需要自己拆出train和test @@ -58,6 +59,7 @@ def get_sample_weights(self, X, time_perf, time_belong, ignore_weight=False): return weights def forward(self, X, y, time_perf, time_belong, 
X_test, ignore_weight=False): + """ Please refer to the docs of MetaTaskDS for the description of the variables""" weights = self.get_sample_weights(X, time_perf, time_belong, ignore_weight=ignore_weight) X_w = X.T * weights.view(1, -1) theta = torch.inverse(X_w @ X) @ X_w @ y diff --git a/qlib/contrib/meta/data_selection/utils.py b/qlib/contrib/meta/data_selection/utils.py index 011e472397..bb080747ef 100644 --- a/qlib/contrib/meta/data_selection/utils.py +++ b/qlib/contrib/meta/data_selection/utils.py @@ -54,6 +54,8 @@ def forward(self, pred, y, idx, skip_size=50): (y_focus - y_focus.mean()) / np.sqrt(y_focus.shape[0]) / y_focus.std(), ) ic_all += ic_day + if len(diff_point) - 1 - skip_n <= 0: + raise ValueError("No enough data for calculating iC") ic_mean = ic_all / (len(diff_point) - 1 - skip_n) return -ic_mean # ic loss diff --git a/qlib/model/meta/model.py b/qlib/model/meta/model.py index 59a8a417a0..2bcefe0382 100644 --- a/qlib/model/meta/model.py +++ b/qlib/model/meta/model.py @@ -32,6 +32,7 @@ class MetaTaskModel(MetaModel): """ This type of meta-model deals with base task definitions. The meta-model creates tasks for training new base forecasting models after it is trained. `prepare_tasks` directly modifies the task definitions. """ + @abc.abstractmethod def prepare_task(self, task: MetaTask) -> dict: """ diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py index 2a000291df..7f35daf73e 100644 --- a/qlib/model/meta/task.py +++ b/qlib/model/meta/task.py @@ -19,6 +19,7 @@ class MetaTask: but not necessary in test tasks. 
(# PROC_MODE_TEST #) - When the meta model can be transferred into other dataset, maybe only meta_info is necessary (# PROC_MODE_TRANSFER #) """ + PROC_MODE_FULL = "full" PROC_MODE_TEST = "test" PROC_MODE_TRANSFER = "transfer" diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 52608d50c5..597b04d21b 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -14,6 +14,7 @@ import socket from typing import Callable, List +from tqdm.auto import tqdm from qlib.data.dataset import Dataset from qlib.log import get_module_logger from qlib.model.base import Model @@ -167,6 +168,30 @@ def is_delay(self) -> bool: def __call__(self, *args, **kwargs) -> list: return self.end_train(self.train(*args, **kwargs)) + def has_worker(self) -> bool: + """ + Some trainer has backend worker to support parallel training + This method can tell if the worker is enabled. + + Returns + ------- + bool: + if the worker is enabled + + """ + return False + + def worker(self): + """ + start the worker + + Raises + ------ + NotImplementedError: + If the worker is not supported + """ + raise NotImplementedError(f"Please implement the `worker` method") + class TrainerR(Trainer): """ @@ -215,7 +240,7 @@ def train(self, tasks: list, train_func: Callable = None, experiment_name: str = if experiment_name is None: experiment_name = self.experiment_name recs = [] - for task in tasks: + for task in tqdm(tasks): rec = train_func(task, experiment_name, **kwargs) rec.set_tags(**{self.STATUS_KEY: self.STATUS_BEGIN}) recs.append(rec) @@ -420,6 +445,9 @@ def worker( task_pool = experiment_name run_task(train_func, task_pool=task_pool, experiment_name=experiment_name) + def has_worker(self) -> bool: + return True + class DelayTrainerRM(TrainerRM): """ @@ -542,3 +570,6 @@ def worker(self, end_train_func=None, experiment_name: str = None): experiment_name=experiment_name, before_status=TaskManager.STATUS_PART_DONE, ) + + def has_worker(self) -> bool: + return True diff --git 
a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index cf30bfad52..0e32f1f9b1 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -151,9 +151,10 @@ def generate(self, **kwargs): del params["data_key"] # The backend handler should be DataHandler raw_label = self.dataset.prepare(**params) - except AttributeError: + except AttributeError as e: # The data handler is initialize with `drop_raw=True`... # So raw_label is not available + logger.warning(f"Exception: {e}") raw_label = None self.recorder.save_objects(**{"label.pkl": raw_label}) @@ -242,7 +243,7 @@ def generate(self, **kwargs): pred = self.load("pred.pkl") label = self.load("label.pkl") if label is None or not isinstance(label, pd.DataFrame) or label.empty: - logger.warn(f"Empty label.") + logger.warning(f"Empty label.") return ic, ric = calc_ic(pred.iloc[:, 0], label.iloc[:, self.label_col]) metrics = { diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py index 2cd806ec2d..06ada57a40 100644 --- a/qlib/workflow/task/gen.py +++ b/qlib/workflow/task/gen.py @@ -129,7 +129,7 @@ def __init__( test_key="test", train_key="train", trunc_days: int = None, - task_copy_func: Callable = copy.deepcopy + task_copy_func: Callable = copy.deepcopy, ): """ Generate tasks for rolling From 5b118c404e92e6817375a4d2712aa0a36d56229e Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 13 Sep 2021 06:14:20 +0000 Subject: [PATCH 08/37] fix online workflow set_uri bug --- qlib/workflow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/workflow/__init__.py b/qlib/workflow/__init__.py index def84eb7a0..b7a0d5404f 100644 --- a/qlib/workflow/__init__.py +++ b/qlib/workflow/__init__.py @@ -349,7 +349,7 @@ def uri_context(self, uri: Text): try: yield finally: - self.exp_manager._current_uri = prev_uri + self.exp_manager.set_uri(prev_uri) def get_recorder( self, *, recorder_id=None, recorder_name=None, experiment_id=None, experiment_name=None From 
3b073f7f59bf471fa470d5c32587a9ead818040c Mon Sep 17 00:00:00 2001 From: Young Date: Sun, 26 Sep 2021 08:24:35 +0000 Subject: [PATCH 09/37] fix set_uri bug --- qlib/workflow/__init__.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/qlib/workflow/__init__.py b/qlib/workflow/__init__.py index def84eb7a0..ef6985ad5d 100644 --- a/qlib/workflow/__init__.py +++ b/qlib/workflow/__init__.py @@ -331,6 +331,10 @@ def get_uri(self): def set_uri(self, uri: Optional[Text]): """ Method to reset the current uri of current experiment manager. + + NOTE: + - When the uri is refer to a file path, please using the absolute path instead of strings like "~/mlruns/" + The backend don't support strings like this. """ self.exp_manager.set_uri(uri) @@ -339,6 +343,9 @@ def uri_context(self, uri: Text): """ Temporarily set the exp_manager's uri to uri + NOTE: + - Please refer to the NOTE in the `set_uri` + Parameters ---------- uri : Text @@ -349,7 +356,7 @@ def uri_context(self, uri: Text): try: yield finally: - self.exp_manager._current_uri = prev_uri + self.exp_manager.set_uri(prev_uri) def get_recorder( self, *, recorder_id=None, recorder_name=None, experiment_id=None, experiment_name=None From b0850b07051f1c0fa81dda62b2421fff90d0391b Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 27 Sep 2021 03:32:43 +0000 Subject: [PATCH 10/37] updawte ds docs and delay trainer bug --- qlib/data/dataset/weight.py | 3 ++- qlib/model/meta/model.py | 16 ++++++++++++++-- qlib/model/trainer.py | 6 +++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/qlib/data/dataset/weight.py b/qlib/data/dataset/weight.py index ba4ed61cc2..09570956dd 100644 --- a/qlib/data/dataset/weight.py +++ b/qlib/data/dataset/weight.py @@ -56,6 +56,7 @@ def __len__(self): class SampleReweighter(Reweighter): """ The sample-wise reweighter. It aims to reweight by the given weight of each sample. 
+ The samples are indexed in a pandas way """ def __init__(self, sample_weights: pd.Series, *args, **kwargs): @@ -82,7 +83,7 @@ def _sample_reweight_DataFrame(self, samples: Union[pd.Series, pd.DataFrame], *a weight.update(self.weights) return weight - def _sample_reweight_TSDataSampler(self, sampler: TSDataSampler, *args, **kwargs): + def _sample_reweight_TSDataSampler(self, sampler: TSDataSampler, *args, **kwargs) -> WeightSampler: """ This function processes the prepared data with TSDataSampler type. diff --git a/qlib/model/meta/model.py b/qlib/model/meta/model.py index 2bcefe0382..e38def0109 100644 --- a/qlib/model/meta/model.py +++ b/qlib/model/meta/model.py @@ -10,7 +10,11 @@ class MetaModel(metaclass=abc.ABCMeta): """ - The meta-model controls the training process. + The meta-model guiding the model learning. + + The word `Guiding` can be categorized into two types based on the stage of model learning + - The definition of learning tasks: Please refer to docs of `MetaTaskModel` + - Controlling the learning process of models: Please refer to the docs of `MetaGuideModel` """ @abc.abstractmethod @@ -21,9 +25,14 @@ def fit(self, *args, **kwargs): pass @abc.abstractmethod - def inference(self, *args, **kwargs): + def inference(self, *args, **kwargs) -> object: """ The inference process of the meta-model. + + Returns + ------- + object: + Some information to guide the model learning """ pass @@ -38,6 +47,9 @@ def prepare_task(self, task: MetaTask) -> dict: """ Input a meta task and output a task with qlib format + When modifying the model tasks, the meta model will leverage `self.inference` to get some necessary + information. 
+ Parameters ---------- task : MetaTask diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 4776ac65ab..ef37dac055 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -497,13 +497,17 @@ def train(self, tasks: list, train_func=None, experiment_name: str = None, **kwa tasks = [tasks] if len(tasks) == 0: return [] - return super().train( + _skip_run_task = self.skip_run_task + self.skip_run_task = False # The task preparation can't be skipped + res = super().train( tasks, train_func=train_func, experiment_name=experiment_name, after_status=TaskManager.STATUS_PART_DONE, **kwargs, ) + self.skip_run_task = _skip_run_task + return res def end_train(self, recs, end_train_func=None, experiment_name: str = None, **kwargs) -> List[Recorder]: """ From f32a7ad22517a31cbfb20c111fbd5ad4050cf483 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 16 Nov 2021 13:33:03 +0000 Subject: [PATCH 11/37] docs --- qlib/model/meta/task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py index 7f35daf73e..f6c2f26f48 100644 --- a/qlib/model/meta/task.py +++ b/qlib/model/meta/task.py @@ -17,7 +17,7 @@ class MetaTask: - the processed input may be different between training and testing - When training, the X, y, X_test, y_test in training tasks are necessary (# PROC_MODE_FULL #) but not necessary in test tasks. 
(# PROC_MODE_TEST #) - - When the meta model can be transferred into other dataset, maybe only meta_info is necessary (# PROC_MODE_TRANSFER #) + - When the meta model can be transferred into other dataset, only meta_info is necessary (# PROC_MODE_TRANSFER #) """ PROC_MODE_FULL = "full" From 8fb37b6a5f461fae729240dfff8f78fd314f0309 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 16 Nov 2021 22:15:39 +0800 Subject: [PATCH 12/37] resume reweighter --- qlib/model/trainer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 86300cab47..9950aca94b 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -73,11 +73,9 @@ def _exe_task(task_config: dict): # model & dataset initiation model: Model = init_instance_by_config(task_config["model"]) dataset: Dataset = init_instance_by_config(task_config["dataset"]) - # FIXME: resume reweighter after merging data selection - # reweighter: Reweighter = task_config.get("reweighter", None) + reweighter: Reweighter = task_config.get("reweighter", None) # model training - # auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter) - model.fit(dataset) + auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter) R.save_objects(**{"params.pkl": model}) # this dataset is saved for online inference. 
So the concrete data should not be dumped dataset.config(dump_all=False, recursive=True) From 21baeadd079cc19bacbf60e2727c92595f47da46 Mon Sep 17 00:00:00 2001 From: Young Date: Tue, 16 Nov 2021 23:43:41 +0800 Subject: [PATCH 13/37] add reweighting result --- qlib/contrib/meta/data_selection/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 33127f0d2a..0c1380a827 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -22,6 +22,8 @@ from .dataset import MetaDatasetHDS from qlib.contrib.meta.data_selection.net import PredNet from qlib.data.dataset.weight import Reweighter +from qlib.log import get_module_logger +logger = get_module_logger("data selection") class TimeReweighter(Reweighter): @@ -32,6 +34,7 @@ def reweight(self, data: Union[pd.DataFrame, pd.Series]): w_s = pd.Series(1.0, index=data.index) for k, w in self.time_weight.items(): w_s.loc[slice(*k)] = w + logger.info(f"Reweighting result: {w_s}") return w_s From 12afe61dba3b0f5842f5a3350059ccf336da0890 Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 17 Nov 2021 09:02:31 +0800 Subject: [PATCH 14/37] fix qlib model import --- qlib/contrib/meta/data_selection/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 0c1380a827..dcdcd8e83d 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from libs.qlib.qlib.log import get_module_logger +from qlib.log import get_module_logger import pandas as pd import numpy as np from qlib.model.meta.task import MetaTask From 1d9732bd28c1731b63d4ed4246c1831b279ab9b3 Mon Sep 17 00:00:00 2001 From: Young Date: Wed, 17 Nov 2021 08:57:23 +0000 Subject: [PATCH 15/37] make recorder more friendly --- qlib/contrib/meta/data_selection/model.py | 1 + qlib/model/trainer.py | 3 ++- qlib/utils/__init__.py | 2 +- qlib/workflow/task/collect.py | 19 +++++++++++++++++-- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index dcdcd8e83d..71c603b96d 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -23,6 +23,7 @@ from qlib.contrib.meta.data_selection.net import PredNet from qlib.data.dataset.weight import Reweighter from qlib.log import get_module_logger + logger = get_module_logger("data selection") diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 9950aca94b..ce97c79164 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -20,7 +20,7 @@ from qlib.data.dataset import Dataset from qlib.log import get_module_logger from qlib.model.base import Model -from qlib.utils import flatten_dict, get_callable_kwargs, init_instance_by_config, auto_filter_kwargs +from qlib.utils import flatten_dict, get_callable_kwargs, init_instance_by_config, auto_filter_kwargs from qlib.workflow import R from qlib.workflow.record_temp import SignalRecord from qlib.workflow.recorder import Recorder @@ -59,6 +59,7 @@ def _exe_task(task_config: dict): r = cls(**kwargs, **rconf) r.generate() + # from qlib.data.dataset.weight import Reweighter diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 23808e6a50..3f3049d33e 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -28,7 +28,7 @@ import numpy as np import pandas as pd from pathlib import Path -from 
typing import Dict, Union, Tuple, Any, Text, Optional, Callable +from typing import Dict, Union, Tuple, Any, Text, Optional, Callable from types import ModuleType from urllib.parse import urlparse from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer diff --git a/qlib/workflow/task/collect.py b/qlib/workflow/task/collect.py index b5b63bba6c..8d7b6a71c7 100644 --- a/qlib/workflow/task/collect.py +++ b/qlib/workflow/task/collect.py @@ -5,12 +5,14 @@ Collector module can collect objects from everywhere and process them such as merging, grouping, averaging and so on. """ +from collections import defaultdict from qlib.log import TimeInspector -from typing import Callable, Dict, List +from typing import Callable, Dict, Iterable, List from qlib.log import get_module_logger from qlib.utils.serial import Serializable from qlib.workflow import R from qlib.workflow.exp import Experiment +from qlib.workflow.recorder import Recorder class Collector(Serializable): @@ -142,6 +144,7 @@ def __init__( artifacts_path={"pred": "pred.pkl"}, artifacts_key=None, list_kwargs={}, + status: Iterable = {Recorder.STATUS_FI}, ): """ Init RecorderCollector. @@ -156,6 +159,7 @@ def __init__( artifacts_path (dict, optional): The artifacts name and its path in Recorder. Defaults to {"pred": "pred.pkl", "IC": "sig_analysis/ic.pkl"}. artifacts_key (str or List, optional): the artifacts key you want to get. If None, get all artifacts. list_kwargs (str): arguments for list_recorders function. + status (Iterable): only collect recorders with specific status. 
None indicating collecting all the recorders """ super().__init__(process_list=process_list) if isinstance(experiment, str): @@ -171,6 +175,7 @@ def __init__( self.artifacts_key = artifacts_key self.rec_filter_func = rec_filter_func self.list_kwargs = list_kwargs + self.status = status def collect(self, artifacts_key=None, rec_filter_func=None, only_exist=True) -> dict: """ @@ -202,9 +207,19 @@ def collect(self, artifacts_key=None, rec_filter_func=None, only_exist=True) -> elif isinstance(self.experiment, Callable): recs = self.experiment() - recs = [rec for rec in recs if rec_filter_func is None or rec_filter_func(rec)] + recs = [ + rec + for rec in recs + if ( + (self.status is None or rec.status in self.status) and (rec_filter_func is None or rec_filter_func(rec)) + ) + ] logger = get_module_logger("RecorderCollector") + status_stat = defaultdict(int) + for r in recs: + status_stat[r.status] += 1 + logger.info(f"Nubmer of recorders after filter: {status_stat}") for rec in recs: rec_key = self.rec_key_func(rec) for key in artifacts_key: From 20a8fe57bd8838fa6b25ea8655ef519ecb79fc8f Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 18 Nov 2021 13:20:46 +0000 Subject: [PATCH 16/37] fix experiment workflow bug --- qlib/workflow/expm.py | 7 +++---- qlib/workflow/recorder.py | 5 +++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/qlib/workflow/expm.py b/qlib/workflow/expm.py index e1aece17b8..004f76f6bc 100644 --- a/qlib/workflow/expm.py +++ b/qlib/workflow/expm.py @@ -279,8 +279,9 @@ def set_uri(self, uri: Optional[Text] = None): """ if uri is None: - logger.debug("No tracking URI is provided. Use the default tracking URI.") - self._current_uri = self.default_uri + if self._current_uri is None: + logger.debug("No tracking URI is provided. Use the default tracking URI.") + self._current_uri = self.default_uri else: # Temporarily re-set the current uri as the uri argument. 
self._current_uri = uri @@ -352,8 +353,6 @@ def end_exp(self, recorder_status: Text = Recorder.STATUS_S): if self.active_experiment is not None: self.active_experiment.end(recorder_status) self.active_experiment = None - # When an experiment end, we will release the current uri. - self._current_uri = None def create_exp(self, experiment_name: Optional[Text] = None): assert experiment_name is not None diff --git a/qlib/workflow/recorder.py b/qlib/workflow/recorder.py index 056d75be14..deb3335090 100644 --- a/qlib/workflow/recorder.py +++ b/qlib/workflow/recorder.py @@ -306,8 +306,9 @@ def end_run(self, status: str = Recorder.STATUS_S): self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") if self.status != Recorder.STATUS_S: self.status = status - with TimeInspector.logt("waiting `async_log`"): - self.async_log.wait() + if self.async_log is not None: + with TimeInspector.logt("waiting `async_log`"): + self.async_log.wait() self.async_log = None def save_objects(self, local_path=None, artifact_path=None, **kwargs): From faf3e0361839e2c28bd4e3d12f0d97ddd7825530 Mon Sep 17 00:00:00 2001 From: Young Date: Thu, 9 Dec 2021 18:17:43 +0800 Subject: [PATCH 17/37] commit for merging master incase of conflictions --- examples/benchmarks_dynamic/DDG-DA/README.md | 1 + examples/benchmarks_dynamic/DDG-DA/meta_ds.py | 307 +++++++++++ .../benchmarks_dynamic/DDG-DA/workflow.py | 78 +++ examples/benchmarks_dynamic/REAMDE.md | 3 + .../baseline/rolling_benchmark.py | 77 +++ qlib/contrib/meta/data_selection/dataset.py | 502 ++++++++++-------- qlib/contrib/meta/data_selection/task.py | 51 -- qlib/data/dataset/processor.py | 4 +- qlib/data/dataset/utils.py | 29 +- qlib/model/ens/group.py | 6 +- qlib/model/trainer.py | 37 +- qlib/tests/config.py | 40 +- qlib/utils/data.py | 33 ++ 13 files changed, 861 insertions(+), 307 deletions(-) create mode 100644 examples/benchmarks_dynamic/DDG-DA/README.md create mode 100644 examples/benchmarks_dynamic/DDG-DA/meta_ds.py create mode 100644 
examples/benchmarks_dynamic/DDG-DA/workflow.py create mode 100644 examples/benchmarks_dynamic/REAMDE.md create mode 100644 examples/benchmarks_dynamic/baseline/rolling_benchmark.py delete mode 100644 qlib/contrib/meta/data_selection/task.py diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/examples/benchmarks_dynamic/DDG-DA/README.md @@ -0,0 +1 @@ + diff --git a/examples/benchmarks_dynamic/DDG-DA/meta_ds.py b/examples/benchmarks_dynamic/DDG-DA/meta_ds.py new file mode 100644 index 0000000000..31d8c0e68f --- /dev/null +++ b/examples/benchmarks_dynamic/DDG-DA/meta_ds.py @@ -0,0 +1,307 @@ +from copy import deepcopy +from qlib.data.dataset.utils import init_task_handler +from qlib.utils.data import deepcopy_basic_type +from qlib.contrib.torch import data_to_tensor +from qlib.workflow.task.utils import TimeAdjuster +from qlib.model.meta.task import MetaTask +from typing import Dict, List, Union, Text, Tuple +from qlib.data.dataset.handler import DataHandler +from qlib.log import get_module_logger +from qlib.utils import auto_filter_kwargs, get_date_by_shift, init_instance_by_config +from qlib.workflow import R +from qlib.workflow.task.gen import RollingGen, task_generator +from joblib import Parallel, delayed +from qlib.model.meta.dataset import MetaDataset +from qlib.model.trainer import task_train, TrainerR +from qlib.data.dataset import DatasetH +from tqdm.auto import tqdm +import pandas as pd +import numpy as np + + +class InternalData: + def __init__(self, task_tpl: dict, step: int, exp_name: str): + self.task_tpl = task_tpl + self.step = step + self.exp_name = exp_name + + def setup(self, trainer=TrainerR, trainer_kwargs={}): + """ + after running this function `self.data_ic_df` will become set. + Each col represents a data. + Each row represents the Timestamp of performance of that data. + For example, + + .. 
code-block:: python + + 2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08 ... + 2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19 ... + datetime ... + 2018-01-02 0.079782 0.115975 0.070866 0.028849 -0.081170 0.140380 0.063864 0.110987 ... + 2018-01-03 0.123386 0.107789 0.071037 0.045278 -0.060782 0.167446 0.089779 0.124476 ... + 2018-01-04 0.140775 0.097206 0.063702 0.042415 -0.078164 0.173218 0.098914 0.114389 ... + 2018-01-05 0.030320 -0.037209 -0.044536 -0.047267 -0.081888 0.045648 0.059947 0.047652 ... + 2018-01-08 0.107201 0.009219 -0.015995 -0.036594 -0.086633 0.108965 0.122164 0.108508 ... + ... ... ... ... ... ... ... ... ... ... + + """ + + # 1) prepare the prediction of proxy models + perf_task_tpl = deepcopy(self.task_tpl) # this task is supposed to contains no complicated objects + + trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs) + # NOTE: + # The handler is initialized for only once. + if not trainer.has_worker(): + self.dh = init_task_handler(perf_task_tpl) + else: + self.dh = init_instance_by_config(perf_task_tpl["dataset"]["kwargs"]["handler"]) + + seg = perf_task_tpl["dataset"]["kwargs"]["segments"] + + # We want to split the training time period into small segments. 
+ perf_task_tpl["dataset"]["kwargs"]["segments"] = { + "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)), + "test": (None, None) + } + + # NOTE: + # we play a trick here + # treat the training segments as test to create the rolling tasks + rg = RollingGen(step=self.step, test_key="train", train_key=None, task_copy_func=deepcopy_basic_type) + gen_task = task_generator(perf_task_tpl, [rg]) + + recorders = R.list_recorders(experiment_name=self.exp_name) + if len(gen_task) == len(recorders): + get_module_logger("Internal Data").info("the data has been initialized") + else: + # train new models + assert 0 == len(recorders), "An empty experiment is required for setup `InternalData``" + trainer.train(gen_task) + + # 2) extract the similarity matrix + label_df = self.dh.fetch(col_set="label") + # for + recorders = R.list_recorders(experiment_name=self.exp_name) + + key_l = [] + ic_l = [] + for _, rec in tqdm(recorders.items(), desc="calc"): + pred = rec.load_object("pred.pkl") + task = rec.load_object("task") + data_key = task["dataset"]["kwargs"]["segments"]["train"] + key_l.append(data_key) + ic_l.append(delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0])) + + ic_l = Parallel(n_jobs=-1)(ic_l) + self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l))) + self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1) + + del self.dh # handler is not useful now + + def _calc_perf(self, pred, label): + df = pd.DataFrame({'pred': pred, 'label': label}) + df = df.groupby("datetime").corr(method="spearman") + corr = df.loc(axis=0)[:, "pred"]["label"].droplevel(axis=0, level=-1) + return corr + + def update(self): + """update the data for online trading""" + # TODO: + # when new data are totally(including label) available + # - update the prediction + # - update the data similarity map(if applied) + + +class MetaTaskDS(MetaTask): + """ Meta Task for Data Selection """ + def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = 
MetaTask.PROC_MODE_FULL, fill_method="max"): + """ + The description of the processed data + + time_perf: A array with shape -> data piece performance + + time_belong: A array with shape -> belong or not (1. or 0.) + array([[1., 0., 0., ..., 0., 0., 0.], + [1., 0., 0., ..., 0., 0., 0.], + [1., 0., 0., ..., 0., 0., 0.], + ..., + [0., 0., 0., ..., 0., 0., 1.], + [0., 0., 0., ..., 0., 0., 1.], + [0., 0., 0., ..., 0., 0., 1.]]) + + """ + super().__init__(task, meta_info) + self.fill_method = fill_method + + time_perf = self._get_processed_meta_info() + self.processed_meta_input = {"time_perf": time_perf} + # FIXME: memory issue in this step + if mode == MetaTask.PROC_MODE_FULL: + # process metainfo_ + ds = self.get_dataset() + + # these three lines occupied 70% of the time of initializing MetaTaskDS + d_train, d_test = ds.prepare(["train", "test"], col_set=["feature", "label"]) + prev_size = d_test.shape[0] + # print(d_test.groupby("datetime").size()) + d_train = d_train.dropna(axis=0) + d_test = d_test.dropna(axis=0) + # print(d_test.groupby("datetime").size()) + if prev_size == 0 or d_test.shape[0] / prev_size <= 0.1: + __import__('ipdb').set_trace() + raise ValueError(f"Most of samples are dropped. 
Skip this task: {task}") + + if globals().get("YX_CONFIRM_XXX") is None: + if d_test.groupby("datetime").size().shape[0] < 5: + __import__('ipdb').set_trace() + # globals()["YX_CONFIRM_XXX"] = True + + sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1])) + for i, col in enumerate(time_perf.columns): + # these two lines of code occupied 20% of the time of initializing MetaTaskDS + slc = slice(*d_train.index.slice_locs(start=col[0], end=col[1])) + sample_time_belong[slc, i] = 1.0 + + # If you want that last month also belongs to the last time_perf + # Assumptions: the latest data has similar performance like the last month + sample_time_belong[sample_time_belong.sum(axis=1) != 1, -1] = 1.0 + + self.processed_meta_input.update(dict( + X=d_train["feature"], + y=d_train["label"].iloc[:, 0], + X_test=d_test["feature"], + y_test=d_test["label"].iloc[:, 0], + time_belong=sample_time_belong, + test_idx=d_test["label"].index, + )) + # TODO: set device: I think this is not necessary to converting data format. + self.processed_meta_input = data_to_tensor(self.processed_meta_input) + + def _get_processed_meta_info(self): + # __import__('ipdb').set_trace() + meta_info_norm = self.meta_info.sub(self.meta_info.mean(axis=1), axis=0) #.fillna(0.) + if self.fill_method == "max": + meta_info_norm = meta_info_norm.T.fillna(meta_info_norm.max(axis=1)).T # fill it with row max to align with previous implementation + elif self.fill_method == "zero": + pass + else: + raise NotImplementedError(f"This type of input is not supported") + meta_info_norm = meta_info_norm.fillna(0.) 
# always fill zero in case of NaN + return meta_info_norm + + def get_meta_input(self): + return self.processed_meta_input + + +class MetaDatasetDS(MetaDataset): + def __init__(self, + *, + task_tpl: Union[dict, list], + step: int, + trunc_days: int = None, + exp_name: Union[str, InternalData], + segments: Union[Dict[Text, Tuple], float], + hist_step_n: int = 10, + task_mode: str = MetaTask.PROC_MODE_FULL, + fill_method: str = "max"): + """ + A dataset for meta model. + + Parameters + ---------- + task_tpl : Union[dict, list] + Decide what tasks are used. + - dict : the task template, the prepared task is generated with `step`, `trunc_days` and `RollingGen` + - list : when list, use the list of tasks directly + the list is supposed to be sorted according timeline + step : int + the rolling step + trunc_days: int + days to be truncated based on the test start + + exp_name : Union[str, InternalData] + Decide what meta_info are used for prediction. + - str: the name of the experiment to store the performance of data + - InternalData: a prepared internal data + segments: Union[Dict[Text, Tuple], float] + the segments to divide data + both left and right + if segments is a float: + the float represents the percentage of data for training + hist_step_n: int + length of historical steps for the meta infomation + task_mode : str + Please refer to the docs of MetaTask + """ + super().__init__(segments=segments) + if isinstance(exp_name, InternalData): + self.internal_data = exp_name + else: + self.internal_data = InternalData(task_tpl, step=step, exp_name=exp_name) + self.internal_data.setup() + self.task_tpl = deepcopy(task_tpl) # FIXME: if the handler is shared, how to avoid the explosion of the memroy. + self.trunc_days = trunc_days + self.hist_step_n = hist_step_n + self.step = step + + if isinstance(task_tpl, dict): + rg = RollingGen(step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type) # NOTE: trunc_days is very important !!!! 
+ task_iter = rg(task_tpl) + if task_mode == MetaTask.PROC_MODE_FULL: + # Only pre initializing the task when full task is req + # initializing handler and share it. + init_task_handler(task_tpl) + else: + assert isinstance(task_tpl, list) + task_iter = task_tpl + + self.task_list = [] + self.meta_task_l = [] + logger = get_module_logger("MetaDatasetDS") + for t in tqdm(task_iter, desc="creating meta tasks"): + try: + self.meta_task_l.append(MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method)) + self.task_list.append(t) + except ValueError as e: + logger.warning(f"ValueError: {e}") + if globals().get("YX_CONFIRM_XXX") is None: + if len(self.meta_task_l) <= 0: + __import__('ipdb').set_trace() + # globals()["YX_CONFIRM_XXX"] = True + + def _prepare_meta_ipt(self, task): + ic_df = self.internal_data.data_ic_df + + segs = task["dataset"]["kwargs"]["segments"] + end = max([segs[k][1] for k in ("train", "valid") if k in segs]) + ic_df_avail = ic_df.loc[:end, pd.IndexSlice[:, :end]] + + # meta data set focus on the **information** instead of preprocess + # 1) filter the future info + def mask_future(s): + """mask future information""" + # from qlib.utils import get_date_by_shift + start, end = s.name + end = get_date_by_shift(trading_date=end, shift=self.trunc_days - 1, future=True) + return s.mask((s.index >= start) & (s.index <= end)) + ic_df_avail = ic_df_avail.apply(mask_future) # apply to each col + + # 2) filter the info with too long periods + total_len = self.step * self.hist_step_n + if ic_df_avail.shape[0] >= total_len: + return ic_df_avail.iloc[-total_len:] + else: + raise ValueError("the history of distribution data is not long enough.") + + def _prepare_seg(self, segment: Text) -> List[MetaTask]: + if isinstance(self.segments, float): + train_task_n = int(len(self.meta_task_l) * self.segments) + if segment == "train": + return self.meta_task_l[:train_task_n] + elif segment == "test": + return 
self.meta_task_l[train_task_n:] + else: + raise NotImplementedError(f"This type of input is not supported") + else: + raise NotImplementedError(f"This type of input is not supported") diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py new file mode 100644 index 0000000000..ef69079272 --- /dev/null +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -0,0 +1,78 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from pathlib import Path +from qlib.data.dataset.handler import DataHandlerLP + +import pandas as pd +import fire +import sys +from tqdm.auto import tqdm +import yaml +from qlib import auto_init +from qlib.model.trainer import task_train +from qlib.utils import init_instance_by_config +from qlib.workflow.task.gen import RollingGen, task_generator + +DIRNAME = Path(__file__).absolute().resolve().parent +sys.path.append(str(DIRNAME.parent / "baseline")) +from rolling_benchmark import RollingBenchmark # NOTE: sys.path is changed for import RollingBenchmark + + +class DDGDA: + def __init__(self) -> None: + self.step = 20 + + def get_feature_importance(self): + rb = RollingBenchmark() + task = rb.basic_task() + + model = init_instance_by_config(task["model"]) + dataset = init_instance_by_config(task["dataset"]) + model.fit(dataset) + + fi = model.get_feature_importance() + + # Because the model use numpy instead of dataframe for training lightgbm + # So the we must use following extra steps to get the right feature importance + df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R) + cols = df.columns + fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()} + + return pd.Series(fi_named) + + def dump_data_for_proxy_model(self): + """ + Dump data for training meta model. + The meta model will be trained upon the proxy forecasting model. 
+ """ + topk = 30 + fi = self.get_feature_importance() + col_selected = fi.nlargest(topk) + + rb = RollingBenchmark() + task = rb.basic_task() + dataset = init_instance_by_config(task["dataset"]) + prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + + feature_df = prep_ds["feature"] + label_df = prep_ds["label"] + + feature_selected = feature_df.loc[:, col_selected.index] + + feature_selected = feature_selected.groupby("datetime").apply(lambda df: (df - df.mean()).div(df.std())) + feature_selected = feature_selected.fillna(0.) + + df_all = { + "label": label_df.reindex(feature_selected.index), + "feature": feature_selected, + } + df_all = pd.concat(df_all, axis=1) + df_all.to_pickle(DIRNAME / f"fea_label_df.pkl") + + def run_all(self): + self.dump_data_for_proxy_model() + + +if __name__ == "__main__": + auto_init() + fire.Fire(DDGDA) diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md new file mode 100644 index 0000000000..b0c255b64c --- /dev/null +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -0,0 +1,3 @@ +# Introduction + +Modeling the dynamic of market is a very important problem in Quant research. diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py new file mode 100644 index 0000000000..d07d32d455 --- /dev/null +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -0,0 +1,77 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+from qlib.model.ens.ensemble import RollingEnsemble +from qlib.utils import init_instance_by_config +import fire +import yaml +from qlib import auto_init +from pathlib import Path +from tqdm.auto import tqdm +from qlib.model.trainer import TrainerR +from qlib.workflow import R + +DIRNAME = Path(__file__).absolute().resolve().parent +from qlib.workflow.task.gen import task_generator, RollingGen +from qlib.workflow.task.collect import RecorderCollector + + +class RollingBenchmark: + """ + before running the example, please clean your previous results with following command + - `rm -r mlruns` + + """ + def __init__(self) -> None: + self.step = 20 + + def basic_task(self): + """For fast training rolling""" + conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml" + with conf_path.open("r") as f: + conf = yaml.safe_load(f) + task = conf["task"] + + # dump the processed data on to disk for later loading to speed up the processing + h_path = DIRNAME / "lightgbm_alpha158_handler.pkl" + + if not h_path.exists(): + h_conf = task["dataset"]["kwargs"]["handler"] + h = init_instance_by_config(h_conf) + h.to_pickle(h_path, dump_all=True) + + task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" + task["record"] = ["qlib.workflow.record_temp.SignalRecord"] + return task + + def create_rolling_tasks(self): + task = self.basic_task() + task_l = task_generator(task, RollingGen( + step=self.step, trunc_days=2)) # the last two days should be truncated to avoid information leakage + return task_l + + def run_rolling_tasks(self): + task_l = self.create_rolling_tasks() + trainer = TrainerR(experiment_name="rolling_models") + trainer(task_l) + + def ens_rolling(self): + comb_key = "rolling" + rc = RecorderCollector(experiment="rolling_models", + artifacts_key=["pred", "label"], + process_list=[RollingEnsemble()], + # rec_key_func=lambda rec: (comb_key, rec.info["id"]), + artifacts_path={ + "pred": "pred.pkl", + "label": "label.pkl" + 
}) + res = rc() + with R.start(experiment_name=comb_key): + R.save_objects(**{"pred.pkl": res["pred"], "label.pkl": res["label"]}) + + def update_rolling_rec(self): + pass + + +if __name__ == "__main__": + auto_init() + fire.Fire(RollingBenchmark) diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index cd58bf9d8b..b620b51634 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -1,229 +1,309 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - +from copy import deepcopy +from qlib.data.dataset.utils import init_task_handler +from qlib.utils.data import deepcopy_basic_type +from qlib.contrib.torch import data_to_tensor +from qlib.workflow.task.utils import TimeAdjuster +from qlib.model.meta.task import MetaTask +from typing import Dict, List, Union, Text, Tuple +from qlib.data.dataset.handler import DataHandler +from qlib.log import get_module_logger +from qlib.utils import auto_filter_kwargs, get_date_by_shift, init_instance_by_config +from qlib.workflow import R +from qlib.workflow.task.gen import RollingGen, task_generator +from joblib import Parallel, delayed +from qlib.model.meta.dataset import MetaDataset +from qlib.model.trainer import task_train, TrainerR +from qlib.data.dataset import DatasetH +from tqdm.auto import tqdm import pandas as pd import numpy as np -import time -import copy -from typing import Union, List, Tuple, Text -from ....data.dataset import DatasetH -from ....data.dataset.handler import DataHandlerLP -from ....data import D -from ....utils import init_instance_by_config -from ....workflow.task.utils import TimeAdjuster -from ....model.meta.dataset import MetaDatasetH -from .utils import fill_diagnal -from .task import MetaTaskDS +class InternalData: + def __init__(self, task_tpl: dict, step: int, exp_name: str): + self.task_tpl = task_tpl + self.step = step + self.exp_name = exp_name + def setup(self, 
trainer=TrainerR, trainer_kwargs={}): + """ + after running this function `self.data_ic_df` will become set. + Each col represents a data. + Each row represents the Timestamp of performance of that data. + For example, -class MetaDatasetHDS(MetaDatasetH): - """ - The MetaDatasetH for the meta-Learning-based data selection. - """ + .. code-block:: python - def __init__(self, rolling_dict: dict, sim_mat=None, rolling_len=20, horizon=20, HIST_N=30, *args, **kwargs): - """ + 2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08 ... + 2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19 ... + datetime ... + 2018-01-02 0.079782 0.115975 0.070866 0.028849 -0.081170 0.140380 0.063864 0.110987 ... + 2018-01-03 0.123386 0.107789 0.071037 0.045278 -0.060782 0.167446 0.089779 0.124476 ... + 2018-01-04 0.140775 0.097206 0.063702 0.042415 -0.078164 0.173218 0.098914 0.114389 ... + 2018-01-05 0.030320 -0.037209 -0.044536 -0.047267 -0.081888 0.045648 0.059947 0.047652 ... + 2018-01-08 0.107201 0.009219 -0.015995 -0.036594 -0.086633 0.108965 0.122164 0.108508 ... + ... ... ... ... ... ... ... ... ... ... - Parameters - ---------- - rolling_dict: dict - A dict that defines the train, valid (training for meta-model), and test scope. - sim_mat: Union[pd.Dataframe, NoneType] - The similarity matrix. The similarity matrix will be calculated if None is passed in. - rolling_len: int - The length of the test period in each rolling task. - horizon: int - The horizon of the label, the rolling process will create a gap between the training data and test data in order to avoid accessing the future information. - HIST_N: int - The number of periods that the meta-model will use. 
""" - super().__init__(*args, **kwargs) - self.rolling_len = rolling_len - self.rolling_dict = rolling_dict - self.horizon = horizon - self.HIST_N = HIST_N - if sim_mat is None: - self._init_sim_mat() + + # 1) prepare the prediction of proxy models + perf_task_tpl = deepcopy(self.task_tpl) # this task is supposed to contains no complicated objects + + trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs) + # NOTE: + # The handler is initialized for only once. + if not trainer.has_worker(): + self.dh = init_task_handler(perf_task_tpl) else: - self.sim_mat = sim_mat - self.meta_tasks_l = self._generate_tasks_from_sim_mat() - self.meta_tasks = self._init_meta_task_list() - - def _generate_tasks_from_sim_mat(self): - ta = TimeAdjuster() - cal = ta.cals - dates = list(cal) - meta_tasks_l = [] - rolling_start = self.rolling_dict["dataset"]["kwargs"]["segments"]["valid"][0] - for (start, end) in self.sim_mat.columns: - if start >= pd.Timestamp(rolling_start): - meta_task = copy.deepcopy(self.rolling_dict)["dataset"] # Be careful! - rolling_start_idx = ta.align_idx(start) - train_end = ta.get(rolling_start_idx - self.horizon) - meta_task["kwargs"]["segments"]["train"] = ( - pd.Timestamp(meta_task["kwargs"]["segments"]["train"][0]), - train_end, - ) - meta_task["kwargs"]["segments"]["test"] = (start, end) - meta_task["kwargs"]["segments"].pop("valid") - meta_tasks_l.append(meta_task) - return meta_tasks_l - - def get_sim_mat_from_tasks(self): - """ - Get the similarity matrix from the initialized tasks. - """ - sim_mat = {} - for task in self.sim_tasks: - sim_mean_series = pd.Series(task["sim_mean"]) - sim_mat[task["train_period"]] = sim_mean_series - sim_mat_df = pd.DataFrame(sim_mat) - return sim_mat_df - - def _init_sim_mat(self): - """ - Initialize the similarity matrix. 
- """ - self._generate_sim_task() - self._calc_sim_mat() - self.sim_mat = self.get_sim_mat_from_tasks() + self.dh = init_instance_by_config(perf_task_tpl["dataset"]["kwargs"]["handler"]) - def _generate_sim_task(self): - """ - Generate the the definition of the similarity matrix. - """ - ta = TimeAdjuster() - cal = ta.cals - dates = list(cal) - self.sim_tasks = [] - rolling_dict = copy.deepcopy(self.rolling_dict) - train_start, train_end = rolling_dict["dataset"]["kwargs"]["segments"]["train"] - valid_start, valid_end = rolling_dict["dataset"]["kwargs"]["segments"]["valid"] - test_start, test_end = rolling_dict["dataset"]["kwargs"]["segments"]["test"] - train_start_idx, train_end_idx = ta.align_idx(train_start), ta.align_idx(train_end) - valid_start_idx, valid_end_idx = ta.align_idx(valid_start), ta.align_idx(valid_end) - test_start_idx, test_end_idx = ta.align_idx(test_start), ta.align_idx(test_end) - start_idx = train_start_idx + ((test_start_idx - train_start_idx) % self.rolling_len) # To align at test start - - def get_rolling_periods(): - rolling_periods = [] - if start_idx - 1 > train_start_idx: - rolling_periods.append((dates[train_start_idx], dates[start_idx - 1])) - for t_start, t_end in zip( - dates[start_idx : test_end_idx + 1 : self.rolling_len], - dates[start_idx + self.rolling_len - 1 : test_end_idx + 1 : self.rolling_len], - ): - rolling_periods.append((t_start, t_end)) - t_end_idx = ta.align_idx(t_end) - if t_end_idx + 1 < test_end_idx: - rolling_periods.append((dates[t_end_idx + 1], dates[test_end_idx])) - return rolling_periods - - rolling_periods = get_rolling_periods() - for period in rolling_periods: - sim_task = {"train_period": period, "rolling_periods": rolling_periods} - self.sim_tasks.append(sim_task) - - def _calc_sim_mat(self): + seg = perf_task_tpl["dataset"]["kwargs"]["segments"] + + # We want to split the training time period into small segments. 
+ perf_task_tpl["dataset"]["kwargs"]["segments"] = { + "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)), + "test": (None, None) + } + + # NOTE: + # we play a trick here + # treat the training segments as test to create the rolling tasks + rg = RollingGen(step=self.step, test_key="train", train_key=None, task_copy_func=deepcopy_basic_type) + gen_task = task_generator(perf_task_tpl, [rg]) + + recorders = R.list_recorders(experiment_name=self.exp_name) + if len(gen_task) == len(recorders): + get_module_logger("Internal Data").info("the data has been initialized") + else: + # train new models + assert 0 == len(recorders), "An empty experiment is required for setup `InternalData``" + trainer.train(gen_task) + + # 2) extract the similarity matrix + label_df = self.dh.fetch(col_set="label") + # for + recorders = R.list_recorders(experiment_name=self.exp_name) + + key_l = [] + ic_l = [] + for _, rec in tqdm(recorders.items(), desc="calc"): + pred = rec.load_object("pred.pkl") + task = rec.load_object("task") + data_key = task["dataset"]["kwargs"]["segments"]["train"] + key_l.append(data_key) + ic_l.append(delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0])) + + ic_l = Parallel(n_jobs=-1)(ic_l) + self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l))) + self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1) + + del self.dh # handler is not useful now + + def _calc_perf(self, pred, label): + df = pd.DataFrame({'pred': pred, 'label': label}) + df = df.groupby("datetime").corr(method="spearman") + corr = df.loc(axis=0)[:, "pred"]["label"].droplevel(axis=0, level=-1) + return corr + + def update(self): + """update the data for online trading""" + # TODO: + # when new data are totally(including label) available + # - update the prediction + # - update the data similarity map(if applied) + + +class MetaTaskDS(MetaTask): + """ Meta Task for Data Selection """ + def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = 
MetaTask.PROC_MODE_FULL, fill_method="max"): """ - Calculate the similarity matrix. + The description of the processed data + + time_perf: A array with shape -> data piece performance + + time_belong: A array with shape -> belong or not (1. or 0.) + array([[1., 0., 0., ..., 0., 0., 0.], + [1., 0., 0., ..., 0., 0., 0.], + [1., 0., 0., ..., 0., 0., 0.], + ..., + [0., 0., 0., ..., 0., 0., 1.], + [0., 0., 0., ..., 0., 0., 1.], + [0., 0., 0., ..., 0., 0., 1.]]) + """ - print("Calculating the similarity matrix...") - start_time = time.time() - for index, task in enumerate(self.sim_tasks): - # Prepare the dataset - rolling_dict = copy.deepcopy(self.rolling_dict) - task["dataset"] = rolling_dict["dataset"] - task["dataset"]["kwargs"]["handler"] = self.data_handler - task_seg = { - "train": task["train_period"], - "test": (task["rolling_periods"][0][0], task["rolling_periods"][-1][1]), - } - task["dataset"]["kwargs"]["segments"] = task_seg - task["dataset"] = init_instance_by_config(task["dataset"]) - - # Train & inference the model - task["model"] = init_instance_by_config(rolling_dict["model"]) - task["model"].fit(task["dataset"]) - pred = task["model"].predict(task["dataset"]) - label = task["dataset"].prepare("test", col_set="label", data_key=DataHandlerLP.DK_I).iloc[:, 0] - - # Calculate the similarity - sim_mean = {} - for (rolling_start, rolling_end) in task["rolling_periods"]: - df = pd.DataFrame( - {"pred": pred.loc[rolling_start:rolling_end], "label": label.loc[rolling_start:rolling_end]} - ) - sims = df.groupby("datetime").apply(lambda df: df["pred"].corr(df["label"], method="spearman")) - sim_mean[(rolling_start, rolling_end)] = sims.mean() - task["sim_mean"] = sim_mean - end_time = time.time() - print("The similarity matrix calculating process is finished. Total time: %.2fs." 
% (end_time - start_time)) - - def _init_meta_task_list(self, *args, **kwargs): - meta_tasks = [] - for task in self.meta_tasks_l: - meta_task = self._init_meta_task(task) - if meta_task is not None: - meta_tasks.append(meta_task) - if meta_tasks == []: - raise AssertionError("No meta-task is created!") - return meta_tasks - - def _init_meta_task(self, meta_task: dict, *args, **kwargs) -> MetaTaskDS: - meta_task["kwargs"]["handler"] = self.data_handler - test_date = meta_task["kwargs"]["segments"]["test"] - sim_mat_fill = fill_diagnal(self.sim_mat) # Remove the future information - sim_mat_focus = sim_mat_fill.loc[:test_date, :test_date] - - task_def = { - # Because the last month may leak future information, so -1 is excluded - "insample": list(sim_mat_focus.index[:-2]), - "outsample": test_date, # sim_mat_focus.index[-1], - } + super().__init__(task, meta_info) + self.fill_method = fill_method + + time_perf = self._get_processed_meta_info() + self.processed_meta_input = {"time_perf": time_perf} + # FIXME: memory issue in this step + if mode == MetaTask.PROC_MODE_FULL: + # process metainfo_ + ds = self.get_dataset() + + # these three lines occupied 70% of the time of initializing MetaTaskDS + d_train, d_test = ds.prepare(["train", "test"], col_set=["feature", "label"]) + prev_size = d_test.shape[0] + # print(d_test.groupby("datetime").size()) + d_train = d_train.dropna(axis=0) + d_test = d_test.dropna(axis=0) + # print(d_test.groupby("datetime").size()) + if prev_size == 0 or d_test.shape[0] / prev_size <= 0.1: + __import__('ipdb').set_trace() + raise ValueError(f"Most of samples are dropped. 
Skip this task: {task}") + + if globals().get("YX_CONFIRM_XXX") is None: + if d_test.groupby("datetime").size().shape[0] < 5: + __import__('ipdb').set_trace() + # globals()["YX_CONFIRM_XXX"] = True + + sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1])) + for i, col in enumerate(time_perf.columns): + # these two lines of code occupied 20% of the time of initializing MetaTaskDS + slc = slice(*d_train.index.slice_locs(start=col[0], end=col[1])) + sample_time_belong[slc, i] = 1.0 + + # If you want that last month also belongs to the last time_perf + # Assumptions: the latest data has similar performance like the last month + sample_time_belong[sample_time_belong.sum(axis=1) != 1, -1] = 1.0 - time_perf = None # For possible spatical extension - task_idx = len(sim_mat_focus) - if task_idx > self.HIST_N: - time_perf = sim_mat_focus.iloc[-self.HIST_N - 1 : -1].loc[:, task_def["insample"]] - if time_perf is None: # Only qualified meta-task will be created - return None - return MetaTaskDS(task_def, time_perf, meta_task) - - def _prepare_seg(self, segment: str, *args, **kwargs): - assert len(self.meta_tasks_l) == len(self.meta_tasks) - meta_tasks = [] - test_start_date = pd.Timestamp(self.rolling_dict["dataset"]["kwargs"]["segments"]["test"][0]) - for index, task_def in enumerate(self.meta_tasks_l): - task_date = pd.Timestamp(task_def["kwargs"]["segments"]["test"][0]) - if (segment == "train" and task_date < test_start_date) or ( - segment == "test" and task_date >= test_start_date - ): - meta_tasks.append(self.meta_tasks[index]) - return meta_tasks - - def get_test_period_from_meta_tasks(self): - return [task["kwargs"]["segments"]["test"] for task in self.meta_tasks_l] - - def get_meta_task_by_test_period(self, test_period: Union[list, tuple]): + self.processed_meta_input.update(dict( + X=d_train["feature"], + y=d_train["label"].iloc[:, 0], + X_test=d_test["feature"], + y_test=d_test["label"].iloc[:, 0], + time_belong=sample_time_belong, + 
test_idx=d_test["label"].index, + )) + # TODO: set device: I think this is not necessary to converting data format. + self.processed_meta_input = data_to_tensor(self.processed_meta_input) + + def _get_processed_meta_info(self): + # __import__('ipdb').set_trace() + meta_info_norm = self.meta_info.sub(self.meta_info.mean(axis=1), axis=0) #.fillna(0.) + if self.fill_method == "max": + meta_info_norm = meta_info_norm.T.fillna(meta_info_norm.max(axis=1)).T # fill it with row max to align with previous implementation + elif self.fill_method == "zero": + pass + else: + raise NotImplementedError(f"This type of input is not supported") + meta_info_norm = meta_info_norm.fillna(0.) # always fill zero in case of NaN + return meta_info_norm + + def get_meta_input(self): + return self.processed_meta_input + + +class MetaDatasetDS(MetaDataset): + def __init__(self, + *, + task_tpl: Union[dict, list], + step: int, + trunc_days: int = None, + exp_name: Union[str, InternalData], + segments: Union[Dict[Text, Tuple], float], + hist_step_n: int = 10, + task_mode: str = MetaTask.PROC_MODE_FULL, + fill_method: str = "max"): """ - Get the meta-task by the given key (test period). Return None if the meta-task is not found. - Assume the task instances in meta_tasks and the task definitions in meta_tasks_l are corresponding. + A dataset for meta model. + + Parameters + ---------- + task_tpl : Union[dict, list] + Decide what tasks are used. + - dict : the task template, the prepared task is generated with `step`, `trunc_days` and `RollingGen` + - list : when list, use the list of tasks directly + the list is supposed to be sorted according timeline + step : int + the rolling step + trunc_days: int + days to be truncated based on the test start + + exp_name : Union[str, InternalData] + Decide what meta_info are used for prediction. 
+ - str: the name of the experiment to store the performance of data + - InternalData: a prepared internal data + segments: Union[Dict[Text, Tuple], float] + the segments to divide data + both left and right + if segments is a float: + the float represents the percentage of data for training + hist_step_n: int + length of historical steps for the meta infomation + task_mode : str + Please refer to the docs of MetaTask """ - # Find the exact one - period_tuple = tuple([pd.Timestamp(t) for t in test_period]) - periods = self.get_test_period_from_meta_tasks() - for index, key in enumerate(periods): - if key == period_tuple: - return self.meta_tasks[index] - # If there is no exact one, find the nearest one - nearest_idx = None - for index, key in enumerate(periods): - if key[0] <= period_tuple[0]: - if nearest_idx is None or periods[nearest_idx][0] < key[0]: - nearest_idx = index - if nearest_idx is not None: - return self.meta_tasks[nearest_idx] - return None + super().__init__(segments=segments) + if isinstance(exp_name, InternalData): + self.internal_data = exp_name + else: + self.internal_data = InternalData(task_tpl, step=step, exp_name=exp_name) + self.internal_data.setup() + self.task_tpl = deepcopy(task_tpl) # FIXME: if the handler is shared, how to avoid the explosion of the memroy. + self.trunc_days = trunc_days + self.hist_step_n = hist_step_n + self.step = step + + if isinstance(task_tpl, dict): + rg = RollingGen(step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type) # NOTE: trunc_days is very important !!!! + task_iter = rg(task_tpl) + if task_mode == MetaTask.PROC_MODE_FULL: + # Only pre initializing the task when full task is req + # initializing handler and share it. 
+ init_task_handler(task_tpl) + else: + assert isinstance(task_tpl, list) + task_iter = task_tpl + + self.task_list = [] + self.meta_task_l = [] + logger = get_module_logger("MetaDatasetDS") + for t in tqdm(task_iter, desc="creating meta tasks"): + try: + self.meta_task_l.append(MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method)) + self.task_list.append(t) + except ValueError as e: + logger.warning(f"ValueError: {e}") + if globals().get("YX_CONFIRM_XXX") is None: + if len(self.meta_task_l) <= 0: + __import__('ipdb').set_trace() + # globals()["YX_CONFIRM_XXX"] = True + + def _prepare_meta_ipt(self, task): + ic_df = self.internal_data.data_ic_df + + segs = task["dataset"]["kwargs"]["segments"] + end = max([segs[k][1] for k in ("train", "valid") if k in segs]) + ic_df_avail = ic_df.loc[:end, pd.IndexSlice[:, :end]] + + # meta data set focus on the **information** instead of preprocess + # 1) filter the future info + def mask_future(s): + """mask future information""" + # from qlib.utils import get_date_by_shift + start, end = s.name + end = get_date_by_shift(trading_date=end, shift=self.trunc_days - 1, future=True) + return s.mask((s.index >= start) & (s.index <= end)) + ic_df_avail = ic_df_avail.apply(mask_future) # apply to each col + + # 2) filter the info with too long periods + total_len = self.step * self.hist_step_n + if ic_df_avail.shape[0] >= total_len: + return ic_df_avail.iloc[-total_len:] + else: + raise ValueError("the history of distribution data is not long enough.") + + def _prepare_seg(self, segment: Text) -> List[MetaTask]: + if isinstance(self.segments, float): + train_task_n = int(len(self.meta_task_l) * self.segments) + if segment == "train": + return self.meta_task_l[:train_task_n] + elif segment == "test": + return self.meta_task_l[train_task_n:] + else: + raise NotImplementedError(f"This type of input is not supported") + else: + raise NotImplementedError(f"This type of input is not supported") diff 
--git a/qlib/contrib/meta/data_selection/task.py b/qlib/contrib/meta/data_selection/task.py deleted file mode 100644 index 6cff4d1d72..0000000000 --- a/qlib/contrib/meta/data_selection/task.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import pandas as pd -import numpy as np - -from ....model.meta.task import MetaTask -from ....data.dataset.handler import DataHandlerLP - -from qlib.contrib.torch import data_to_tensor -from .utils import fill_diagnal - - -class MetaTaskDS(MetaTask): - """ - The MetaTask for the meta-learning-based data selection. - """ - - def __init__(self, task_def: dict, time_perf, *args, **kwargs): - super().__init__(*args, **kwargs) - self.task_def = task_def - self.time_perf = time_perf - self._prepare_meta_task() - - def _prepare_meta_task(self): - self.X, self.X_test = self.dataset.prepare(["train", "test"], col_set="feature", data_key=DataHandlerLP.DK_L) - self.y, self.y_test = self.dataset.prepare(["train", "test"], col_set="label", data_key=DataHandlerLP.DK_L) - self.sample_time_belong = np.zeros((self.y.shape[0], self.time_perf.shape[1])) - for i, col in enumerate(self.time_perf.columns): - slc = slice(*self.y.index.slice_locs(start=col[0], end=col[1])) - self.sample_time_belong[slc, i] = 1.0 - # The last month also belongs to the last time_perf - self.sample_time_belong[self.sample_time_belong.sum(axis=1) != 1, -1] = 1.0 - self.test_idx = self.y_test.index - self.train_idx = self.y.index - self.X, self.y, self.time_perf, self.sample_time_belong, self.X_test, self.y_test = data_to_tensor( - [self.X, self.y, self.time_perf, self.sample_time_belong, self.X_test, self.y_test] - ) - - def prepare_task_data(self): - return ( - self.X, - self.y, - self.time_perf, - self.sample_time_belong, - self.X_test, - self.y_test, - self.test_idx, - self.train_idx, - self.task_def["outsample"], - ) diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index 
4d16660671..206b8b1af2 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -7,7 +7,7 @@ import pandas as pd import copy -from qlib.utils.data import robust_zscore +from qlib.utils.data import robust_zscore, zscore from ...log import TimeInspector from .utils import fetch_df_by_index @@ -295,7 +295,7 @@ class CSZScoreNorm(Processor): def __init__(self, fields_group=None, method="zscore"): self.fields_group = fields_group if method == "zscore": - self.zscore_func = lambda x: (x - x.mean()).div(x.std()) + self.zscore_func = zscore elif method == "robust": self.zscore_func = robust_zscore else: diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index c6b3d97b62..f34db749b2 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -1,8 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - +from __future__ import annotations import pandas as pd from typing import Union, List +from qlib.utils import init_instance_by_config +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from qlib.data.dataset import DataHandler def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int: @@ -111,3 +115,26 @@ def convert_index_format(df: Union[pd.DataFrame, pd.Series], level: str = "datet if get_level_index(df, level=level) == 1: df = df.swaplevel().sort_index() return df + + +def init_task_handler(task: dict) -> Union[DataHandler, None]: + """ + initialize the handler part of the task **inplace** + + Parameters + ---------- + task : dict + the task to be handled + + Returns + ------- + Union[DataHandler, None]: + returns + """ + + h_conf = task["dataset"]["kwargs"].get("handler") + if h_conf is not None: + handler = init_instance_by_config(h_conf, accept_types=DataHandler) + task["dataset"]["kwargs"]["handler"] = handler + + return handler diff --git a/qlib/model/ens/group.py b/qlib/model/ens/group.py index 7f45b06a5c..aa8ee8758f 100644 --- a/qlib/model/ens/group.py +++ 
b/qlib/model/ens/group.py @@ -3,9 +3,9 @@ """ Group can group a set of objects based on `group_func` and change them to a dict. -After group, we provide a method to reduce them. +After group, we provide a method to reduce them. -For example: +For example: group: {(A,B,C1): object, (A,B,C2): object} -> {(A,B): {C1: object, C2: object}} reduce: {(A,B): {C1: object, C2: object}} -> {(A,B): object} @@ -107,6 +107,8 @@ def group(self, rolling_dict: dict) -> dict: for key, values in rolling_dict.items(): if isinstance(key, tuple): grouped_dict.setdefault(key[:-1], {})[key[-1]] = values + else: + raise TypeError(f"Expected `tuple` type, but got a value `{key}`") return grouped_dict def __init__(self): diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index ce97c79164..738fef0aeb 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -28,41 +28,6 @@ from qlib.data.dataset.weight import Reweighter -def _log_task_info(task_config: dict): - R.log_params(**flatten_dict(task_config)) - R.save_objects(**{"task": task_config}) # keep the original format and datatype - R.set_tags(**{"hostname": socket.gethostname()}) - - -def _exe_task(task_config: dict): - rec = R.get_recorder() - # model & dataset initiation - model: Model = init_instance_by_config(task_config["model"]) - dataset: Dataset = init_instance_by_config(task_config["dataset"]) - reweighter: Reweighter = task_config.get("reweighter", None) - # model training - auto_filter_kwargs(model.fit)(dataset, reweighter=reweighter) - R.save_objects(**{"params.pkl": model}) - # this dataset is saved for online inference. 
So the concrete data should not be dumped - dataset.config(dump_all=False, recursive=True) - R.save_objects(**{"dataset": dataset}) - # generate records: prediction, backtest, and analysis - records = task_config.get("record", []) - if isinstance(records, dict): # prevent only one dict - records = [records] - for record in records: - cls, kwargs = get_callable_kwargs(record, default_module="qlib.workflow.record_temp") - if cls is SignalRecord: - rconf = {"model": model, "dataset": dataset, "recorder": rec} - else: - rconf = {"recorder": rec} - r = cls(**kwargs, **rconf) - r.generate() - - -# from qlib.data.dataset.weight import Reweighter - - def _log_task_info(task_config: dict): R.log_params(**flatten_dict(task_config)) R.save_objects(**{"task": task_config}) # keep the original format and datatype @@ -324,7 +289,7 @@ def train(self, tasks: list, train_func: Callable = None, experiment_name: str = if experiment_name is None: experiment_name = self.experiment_name recs = [] - for task in tqdm(tasks): + for task in tqdm(tasks, desc="train tasks"): rec = train_func(task, experiment_name, **kwargs) rec.set_tags(**{self.STATUS_KEY: self.STATUS_BEGIN}) recs.append(rec) diff --git a/qlib/tests/config.py b/qlib/tests/config.py index f01715992e..a7bb8d0576 100644 --- a/qlib/tests/config.py +++ b/qlib/tests/config.py @@ -30,6 +30,41 @@ }, } +SA_RC = { + "class": "SigAnaRecord", + "module_path": "qlib.workflow.record_temp", +} + +PORT_CONFIG = { + 'class': 'PortAnaRecord', + 'module_path': 'qlib.workflow.record_temp', + 'kwargs': { + 'config': { + 'strategy': { + 'class': 'TopkDropoutStrategy', + 'module_path': 'qlib.contrib.strategy', + 'kwargs': { + 'signal': ['', ''], + 'topk': 50, + 'n_drop': 5 + } + }, + 'backtest': { + 'start_time': "2017-01-01", + 'end_time': "2020-08-01", + 'account': 100000000, + 'benchmark': 'SH000300', + 'exchange_kwargs': { + 'limit_threshold': 0.095, + 'deal_price': 'close', + 'open_cost': 0.0005, + 'close_cost': 0.0015, + 'min_cost': 5 + } + } 
+ } + } +} RECORD_CONFIG = [ { @@ -40,10 +75,7 @@ "model": "", }, }, - { - "class": "SigAnaRecord", - "module_path": "qlib.workflow.record_temp", - }, + SA_RC ] diff --git a/qlib/utils/data.py b/qlib/utils/data.py index 6e48687cf7..82b69127b4 100644 --- a/qlib/utils/data.py +++ b/qlib/utils/data.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +from typing import Union import pandas as pd import numpy as np @@ -21,3 +22,35 @@ def robust_zscore(x: pd.Series, zscore=False): x -= x.mean() x /= x.std() return x + + +def zscore(x: Union[pd.Series, pd.DataFrame]): + return (x - x.mean()).div(x.std()) + + +def deepcopy_basic_type(obj: object) -> object: + """ + deepcopy an object without copy the complicated objects. + This is useful when you want to generate Qlib tasks and share the handler + + NOTE: + - This function can't handle recursive objects!!!!! + + Parameters + ---------- + obj : object + the object to be copied + + Returns + ------- + object: + The copied object + """ + if isinstance(obj, tuple): + return tuple(deepcopy_basic_type(i) for i in obj) + elif isinstance(obj, list): + return list(deepcopy_basic_type(i) for i in obj) + elif isinstance(obj, dict): + return {k: deepcopy_basic_type(v) for k, v in obj.items()} + else: + return obj From 3bc4030d2e7ab47e4fa25b3c372e3b518b47f9eb Mon Sep 17 00:00:00 2001 From: Young Date: Sat, 11 Dec 2021 22:25:44 +0800 Subject: [PATCH 18/37] Successful run DDG-DA with a single command --- docs/component/meta.rst | 4 +- examples/benchmarks_dynamic/DDG-DA/meta_ds.py | 307 ------------------ .../DDG-DA/requirements.txt | 1 + .../benchmarks_dynamic/DDG-DA/workflow.py | 176 +++++++++- .../baseline/rolling_benchmark.py | 54 ++- qlib/contrib/meta/__init__.py | 2 +- qlib/contrib/meta/data_selection/__init__.py | 3 +- qlib/contrib/meta/data_selection/dataset.py | 91 ++++-- qlib/contrib/meta/data_selection/model.py | 99 +----- qlib/contrib/meta/data_selection/net.py | 2 +- 
qlib/contrib/model/gbdt.py | 73 +++-- qlib/contrib/strategy/signal_strategy.py | 4 + qlib/data/dataset/utils.py | 3 + qlib/model/meta/__init__.py | 2 +- qlib/model/meta/dataset.py | 28 +- qlib/model/meta/model.py | 47 +-- qlib/model/trainer.py | 43 +-- qlib/tests/config.py | 33 +- qlib/utils/__init__.py | 80 ++++- qlib/workflow/exp.py | 37 ++- qlib/workflow/record_temp.py | 102 ++++-- 21 files changed, 534 insertions(+), 657 deletions(-) delete mode 100644 examples/benchmarks_dynamic/DDG-DA/meta_ds.py create mode 100644 examples/benchmarks_dynamic/DDG-DA/requirements.txt diff --git a/docs/component/meta.rst b/docs/component/meta.rst index 0d57b4499b..98eace5b42 100644 --- a/docs/component/meta.rst +++ b/docs/component/meta.rst @@ -23,7 +23,7 @@ Meta Dataset `Meta Dataset` controls the meta-information generating process. It is on the duty of providing data for training the `Meta Model`. Users should use `prepare_tasks` to retrieve a list of `Meta Task` instances. -.. autoclass:: qlib.model.meta.dataset.MetaDataset +.. autoclass:: qlib.model.meta.dataset.MetaTaskDataset :members: Meta Model @@ -50,4 +50,4 @@ Meta Guide Model This type of meta-model participates in the training process of the base forecasting model. The meta-model may guide the base forecasting models during their training to improve their performances. .. 
autoclass:: qlib.model.meta.model.MetaGuideModel - :members: \ No newline at end of file + :members: diff --git a/examples/benchmarks_dynamic/DDG-DA/meta_ds.py b/examples/benchmarks_dynamic/DDG-DA/meta_ds.py deleted file mode 100644 index 31d8c0e68f..0000000000 --- a/examples/benchmarks_dynamic/DDG-DA/meta_ds.py +++ /dev/null @@ -1,307 +0,0 @@ -from copy import deepcopy -from qlib.data.dataset.utils import init_task_handler -from qlib.utils.data import deepcopy_basic_type -from qlib.contrib.torch import data_to_tensor -from qlib.workflow.task.utils import TimeAdjuster -from qlib.model.meta.task import MetaTask -from typing import Dict, List, Union, Text, Tuple -from qlib.data.dataset.handler import DataHandler -from qlib.log import get_module_logger -from qlib.utils import auto_filter_kwargs, get_date_by_shift, init_instance_by_config -from qlib.workflow import R -from qlib.workflow.task.gen import RollingGen, task_generator -from joblib import Parallel, delayed -from qlib.model.meta.dataset import MetaDataset -from qlib.model.trainer import task_train, TrainerR -from qlib.data.dataset import DatasetH -from tqdm.auto import tqdm -import pandas as pd -import numpy as np - - -class InternalData: - def __init__(self, task_tpl: dict, step: int, exp_name: str): - self.task_tpl = task_tpl - self.step = step - self.exp_name = exp_name - - def setup(self, trainer=TrainerR, trainer_kwargs={}): - """ - after running this function `self.data_ic_df` will become set. - Each col represents a data. - Each row represents the Timestamp of performance of that data. - For example, - - .. code-block:: python - - 2021-06-21 2021-06-04 2021-05-21 2021-05-07 2021-04-20 2021-04-06 2021-03-22 2021-03-08 ... - 2021-07-02 2021-06-18 2021-06-03 2021-05-20 2021-05-06 2021-04-19 2021-04-02 2021-03-19 ... - datetime ... - 2018-01-02 0.079782 0.115975 0.070866 0.028849 -0.081170 0.140380 0.063864 0.110987 ... 
- 2018-01-03 0.123386 0.107789 0.071037 0.045278 -0.060782 0.167446 0.089779 0.124476 ... - 2018-01-04 0.140775 0.097206 0.063702 0.042415 -0.078164 0.173218 0.098914 0.114389 ... - 2018-01-05 0.030320 -0.037209 -0.044536 -0.047267 -0.081888 0.045648 0.059947 0.047652 ... - 2018-01-08 0.107201 0.009219 -0.015995 -0.036594 -0.086633 0.108965 0.122164 0.108508 ... - ... ... ... ... ... ... ... ... ... ... - - """ - - # 1) prepare the prediction of proxy models - perf_task_tpl = deepcopy(self.task_tpl) # this task is supposed to contains no complicated objects - - trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs) - # NOTE: - # The handler is initialized for only once. - if not trainer.has_worker(): - self.dh = init_task_handler(perf_task_tpl) - else: - self.dh = init_instance_by_config(perf_task_tpl["dataset"]["kwargs"]["handler"]) - - seg = perf_task_tpl["dataset"]["kwargs"]["segments"] - - # We want to split the training time period into small segments. 
- perf_task_tpl["dataset"]["kwargs"]["segments"] = { - "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)), - "test": (None, None) - } - - # NOTE: - # we play a trick here - # treat the training segments as test to create the rolling tasks - rg = RollingGen(step=self.step, test_key="train", train_key=None, task_copy_func=deepcopy_basic_type) - gen_task = task_generator(perf_task_tpl, [rg]) - - recorders = R.list_recorders(experiment_name=self.exp_name) - if len(gen_task) == len(recorders): - get_module_logger("Internal Data").info("the data has been initialized") - else: - # train new models - assert 0 == len(recorders), "An empty experiment is required for setup `InternalData``" - trainer.train(gen_task) - - # 2) extract the similarity matrix - label_df = self.dh.fetch(col_set="label") - # for - recorders = R.list_recorders(experiment_name=self.exp_name) - - key_l = [] - ic_l = [] - for _, rec in tqdm(recorders.items(), desc="calc"): - pred = rec.load_object("pred.pkl") - task = rec.load_object("task") - data_key = task["dataset"]["kwargs"]["segments"]["train"] - key_l.append(data_key) - ic_l.append(delayed(self._calc_perf)(pred.iloc[:, 0], label_df.iloc[:, 0])) - - ic_l = Parallel(n_jobs=-1)(ic_l) - self.data_ic_df = pd.DataFrame(dict(zip(key_l, ic_l))) - self.data_ic_df = self.data_ic_df.sort_index().sort_index(axis=1) - - del self.dh # handler is not useful now - - def _calc_perf(self, pred, label): - df = pd.DataFrame({'pred': pred, 'label': label}) - df = df.groupby("datetime").corr(method="spearman") - corr = df.loc(axis=0)[:, "pred"]["label"].droplevel(axis=0, level=-1) - return corr - - def update(self): - """update the data for online trading""" - # TODO: - # when new data are totally(including label) available - # - update the prediction - # - update the data similarity map(if applied) - - -class MetaTaskDS(MetaTask): - """ Meta Task for Data Selection """ - def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = 
MetaTask.PROC_MODE_FULL, fill_method="max"): - """ - The description of the processed data - - time_perf: A array with shape -> data piece performance - - time_belong: A array with shape -> belong or not (1. or 0.) - array([[1., 0., 0., ..., 0., 0., 0.], - [1., 0., 0., ..., 0., 0., 0.], - [1., 0., 0., ..., 0., 0., 0.], - ..., - [0., 0., 0., ..., 0., 0., 1.], - [0., 0., 0., ..., 0., 0., 1.], - [0., 0., 0., ..., 0., 0., 1.]]) - - """ - super().__init__(task, meta_info) - self.fill_method = fill_method - - time_perf = self._get_processed_meta_info() - self.processed_meta_input = {"time_perf": time_perf} - # FIXME: memory issue in this step - if mode == MetaTask.PROC_MODE_FULL: - # process metainfo_ - ds = self.get_dataset() - - # these three lines occupied 70% of the time of initializing MetaTaskDS - d_train, d_test = ds.prepare(["train", "test"], col_set=["feature", "label"]) - prev_size = d_test.shape[0] - # print(d_test.groupby("datetime").size()) - d_train = d_train.dropna(axis=0) - d_test = d_test.dropna(axis=0) - # print(d_test.groupby("datetime").size()) - if prev_size == 0 or d_test.shape[0] / prev_size <= 0.1: - __import__('ipdb').set_trace() - raise ValueError(f"Most of samples are dropped. 
Skip this task: {task}") - - if globals().get("YX_CONFIRM_XXX") is None: - if d_test.groupby("datetime").size().shape[0] < 5: - __import__('ipdb').set_trace() - # globals()["YX_CONFIRM_XXX"] = True - - sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1])) - for i, col in enumerate(time_perf.columns): - # these two lines of code occupied 20% of the time of initializing MetaTaskDS - slc = slice(*d_train.index.slice_locs(start=col[0], end=col[1])) - sample_time_belong[slc, i] = 1.0 - - # If you want that last month also belongs to the last time_perf - # Assumptions: the latest data has similar performance like the last month - sample_time_belong[sample_time_belong.sum(axis=1) != 1, -1] = 1.0 - - self.processed_meta_input.update(dict( - X=d_train["feature"], - y=d_train["label"].iloc[:, 0], - X_test=d_test["feature"], - y_test=d_test["label"].iloc[:, 0], - time_belong=sample_time_belong, - test_idx=d_test["label"].index, - )) - # TODO: set device: I think this is not necessary to converting data format. - self.processed_meta_input = data_to_tensor(self.processed_meta_input) - - def _get_processed_meta_info(self): - # __import__('ipdb').set_trace() - meta_info_norm = self.meta_info.sub(self.meta_info.mean(axis=1), axis=0) #.fillna(0.) - if self.fill_method == "max": - meta_info_norm = meta_info_norm.T.fillna(meta_info_norm.max(axis=1)).T # fill it with row max to align with previous implementation - elif self.fill_method == "zero": - pass - else: - raise NotImplementedError(f"This type of input is not supported") - meta_info_norm = meta_info_norm.fillna(0.) 
# always fill zero in case of NaN - return meta_info_norm - - def get_meta_input(self): - return self.processed_meta_input - - -class MetaDatasetDS(MetaDataset): - def __init__(self, - *, - task_tpl: Union[dict, list], - step: int, - trunc_days: int = None, - exp_name: Union[str, InternalData], - segments: Union[Dict[Text, Tuple], float], - hist_step_n: int = 10, - task_mode: str = MetaTask.PROC_MODE_FULL, - fill_method: str = "max"): - """ - A dataset for meta model. - - Parameters - ---------- - task_tpl : Union[dict, list] - Decide what tasks are used. - - dict : the task template, the prepared task is generated with `step`, `trunc_days` and `RollingGen` - - list : when list, use the list of tasks directly - the list is supposed to be sorted according timeline - step : int - the rolling step - trunc_days: int - days to be truncated based on the test start - - exp_name : Union[str, InternalData] - Decide what meta_info are used for prediction. - - str: the name of the experiment to store the performance of data - - InternalData: a prepared internal data - segments: Union[Dict[Text, Tuple], float] - the segments to divide data - both left and right - if segments is a float: - the float represents the percentage of data for training - hist_step_n: int - length of historical steps for the meta infomation - task_mode : str - Please refer to the docs of MetaTask - """ - super().__init__(segments=segments) - if isinstance(exp_name, InternalData): - self.internal_data = exp_name - else: - self.internal_data = InternalData(task_tpl, step=step, exp_name=exp_name) - self.internal_data.setup() - self.task_tpl = deepcopy(task_tpl) # FIXME: if the handler is shared, how to avoid the explosion of the memroy. - self.trunc_days = trunc_days - self.hist_step_n = hist_step_n - self.step = step - - if isinstance(task_tpl, dict): - rg = RollingGen(step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type) # NOTE: trunc_days is very important !!!! 
- task_iter = rg(task_tpl) - if task_mode == MetaTask.PROC_MODE_FULL: - # Only pre initializing the task when full task is req - # initializing handler and share it. - init_task_handler(task_tpl) - else: - assert isinstance(task_tpl, list) - task_iter = task_tpl - - self.task_list = [] - self.meta_task_l = [] - logger = get_module_logger("MetaDatasetDS") - for t in tqdm(task_iter, desc="creating meta tasks"): - try: - self.meta_task_l.append(MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method)) - self.task_list.append(t) - except ValueError as e: - logger.warning(f"ValueError: {e}") - if globals().get("YX_CONFIRM_XXX") is None: - if len(self.meta_task_l) <= 0: - __import__('ipdb').set_trace() - # globals()["YX_CONFIRM_XXX"] = True - - def _prepare_meta_ipt(self, task): - ic_df = self.internal_data.data_ic_df - - segs = task["dataset"]["kwargs"]["segments"] - end = max([segs[k][1] for k in ("train", "valid") if k in segs]) - ic_df_avail = ic_df.loc[:end, pd.IndexSlice[:, :end]] - - # meta data set focus on the **information** instead of preprocess - # 1) filter the future info - def mask_future(s): - """mask future information""" - # from qlib.utils import get_date_by_shift - start, end = s.name - end = get_date_by_shift(trading_date=end, shift=self.trunc_days - 1, future=True) - return s.mask((s.index >= start) & (s.index <= end)) - ic_df_avail = ic_df_avail.apply(mask_future) # apply to each col - - # 2) filter the info with too long periods - total_len = self.step * self.hist_step_n - if ic_df_avail.shape[0] >= total_len: - return ic_df_avail.iloc[-total_len:] - else: - raise ValueError("the history of distribution data is not long enough.") - - def _prepare_seg(self, segment: Text) -> List[MetaTask]: - if isinstance(self.segments, float): - train_task_n = int(len(self.meta_task_l) * self.segments) - if segment == "train": - return self.meta_task_l[:train_task_n] - elif segment == "test": - return 
self.meta_task_l[train_task_n:] - else: - raise NotImplementedError(f"This type of input is not supported") - else: - raise NotImplementedError(f"This type of input is not supported") diff --git a/examples/benchmarks_dynamic/DDG-DA/requirements.txt b/examples/benchmarks_dynamic/DDG-DA/requirements.txt new file mode 100644 index 0000000000..6e10c5798f --- /dev/null +++ b/examples/benchmarks_dynamic/DDG-DA/requirements.txt @@ -0,0 +1 @@ +torch==1.10.0 diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index ef69079272..0a5370a688 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -1,6 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. from pathlib import Path +from qlib.model.meta.task import MetaTask +from qlib.contrib.meta.data_selection.model import MetaModelDS +from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS from qlib.data.dataset.handler import DataHandlerLP import pandas as pd @@ -8,10 +11,12 @@ import sys from tqdm.auto import tqdm import yaml +import pickle from qlib import auto_init -from qlib.model.trainer import task_train +from qlib.model.trainer import TrainerR, task_train from qlib.utils import init_instance_by_config from qlib.workflow.task.gen import RollingGen, task_generator +from qlib.workflow import R DIRNAME = Path(__file__).absolute().resolve().parent sys.path.append(str(DIRNAME.parent / "baseline")) @@ -19,8 +24,20 @@ class DDGDA: + """ + please run `python workflow.py run_all` to run the full workflow of the experiment + + **NOTE** + before running the example, please clean your previous results with following command + - `rm -r mlruns` + """ + def __init__(self) -> None: self.step = 20 + # NOTE: + # the horizon must match the meaning in the base task template + self.horizon = 1 + self.meta_exp_name = "DDG-DA" def get_feature_importance(self): rb = 
RollingBenchmark() @@ -42,8 +59,9 @@ def get_feature_importance(self): def dump_data_for_proxy_model(self): """ - Dump data for training meta model. - The meta model will be trained upon the proxy forecasting model. + Dump data for training meta model. + The meta model will be trained upon the proxy forecasting model. + This dataset is for the proxy forecasting model. """ topk = 30 fi = self.get_feature_importance() @@ -60,17 +78,165 @@ def dump_data_for_proxy_model(self): feature_selected = feature_df.loc[:, col_selected.index] feature_selected = feature_selected.groupby("datetime").apply(lambda df: (df - df.mean()).div(df.std())) - feature_selected = feature_selected.fillna(0.) + feature_selected = feature_selected.fillna(0.0) df_all = { "label": label_df.reindex(feature_selected.index), "feature": feature_selected, } df_all = pd.concat(df_all, axis=1) - df_all.to_pickle(DIRNAME / f"fea_label_df.pkl") + df_all.to_pickle(DIRNAME / "fea_label_df.pkl") + + # dump data in handler format for aligning the interface + handler = DataHandlerLP( + data_loader={ + "class": "qlib.data.dataset.loader.StaticDataLoader", + "kwargs": {"config": DIRNAME / "fea_label_df.pkl"}, + } + ) + handler.to_pickle(DIRNAME / "handler_proxy.pkl", dump_all=True) + + @property + def _internal_data_path(self): + return DIRNAME / f"internal_data_s{self.step}.pkl" + + def dump_meta_ipt(self): + """ + Dump data for training meta model. 
+ This function will dump the input data for meta model + """ + rb = RollingBenchmark() + sim_task = rb.basic_task() + + sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 350}) + + exp_name_sim = f"data_sim_s{self.step}" + + internal_data = InternalData(sim_task, self.step, exp_name=exp_name_sim) + internal_data.setup(trainer=TrainerR) + + with self._internal_data_path.open("wb") as f: + pickle.dump(internal_data, f) + + def train_meta_model(self): + """ + training a meta model based on a simplified linear proxy model; + """ + + # 1) leverage the simplified proxy forecasting model to train meta model. + # - Only the dataset part is important, in current version of meta model will integrate the + rb = RollingBenchmark() + sim_task = rb.basic_task() + proxy_forecast_model_task = { + # "model": "qlib.contrib.model.linear.LinearModel", + "dataset": { + "class": "qlib.data.dataset.DatasetH", + "kwargs": { + "handler": f"file://{(DIRNAME / 'handler_proxy.pkl').absolute()}", + "segments": { + "train": ("2008-01-01", "2010-12-31"), + "test": ("2011-01-01", sim_task["dataset"]["kwargs"]["segments"]["test"][1]), + }, + }, + }, + # "record": ["qlib.workflow.record_temp.SignalRecord"] + } + + # 2) preparing meta dataset + kwargs = dict( + task_tpl=proxy_forecast_model_task, + step=self.step, + segments=0.5, + trunc_days=1 + self.horizon, + hist_step_n=30, + fill_method="max", + rolling_ext_days=0, + ) + # NOTE: + # the input of meta model (internal data) are shared between proxy model and final forecasting model + # but their task test segment are not aligned! It worked in my previous experiment. + # So the misalignment will not affect the effectiveness of the method. 
+ with self._internal_data_path.open("rb") as f: + internal_data = pickle.load(f) + md = MetaDatasetDS(exp_name=internal_data, **kwargs) + + # 3) train and logging meta model + with R.start(experiment_name=self.meta_exp_name): + R.log_params(**kwargs) + mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.0001) + mm.fit(md) + R.save_objects(model=mm) + + @property + def _task_path(self): + return DIRNAME / f"tasks_s{self.step}.pkl" + + def meta_inference(self): + """ + Leverage meta-model for inference: + - Given + - baseline tasks + - input for meta model(internal data) + - meta model (its learnt knowledge on proxy forecasting model is expected to transfer to normal forecasting model) + """ + # 1) get meta model + exp = R.get_exp(experiment_name=self.meta_exp_name) + rec = exp.list_recorders(rtype=exp.RT_L)[0] + meta_model: MetaModelDS = rec.load_object("model") + + # 2) + # we are transfer to knowledge of meta model to final forecasting tasks. + # Create MetaTaskDataset for the final forecasting tasks + # Aligning the setting of it to the MetaTaskDataset when training Meta model is necessary + + # 2.1) get previous config + param = rec.list_params() + trunc_days = int(param["trunc_days"]) + step = int(param["step"]) + hist_step_n = int(param["hist_step_n"]) + fill_method = param.get("fill_method", "max") + + rb = RollingBenchmark() + task_l = rb.create_rolling_tasks() + + # 2.2) create meta dataset for final dataset + kwargs = dict( + task_tpl=task_l, + step=step, + segments=0.0, # all the tasks are for testing + trunc_days=trunc_days, + hist_step_n=hist_step_n, + fill_method=fill_method, + task_mode=MetaTask.PROC_MODE_TRANSFER, + ) + + with self._internal_data_path.open("rb") as f: + internal_data = pickle.load(f) + mds = MetaDatasetDS(exp_name=internal_data, **kwargs) + + # 3) meta model make inference and get new qlib task + new_tasks = meta_model.inference(mds) + with self._task_path.open("wb") as f: + pickle.dump(new_tasks, f) + + def 
train_and_eval_tasks(self): + """ + Training the tasks generated by meta model + Then evaluate it + """ + with self._task_path.open("rb") as f: + tasks = pickle.load(f) + rb = RollingBenchmark(rolling_exp="rolling_ds") + rb.train_rolling_tasks(tasks) + rb.ens_rolling() + rb.update_rolling_rec() def run_all(self): self.dump_data_for_proxy_model() + self.dump_meta_ipt() + self.train_meta_model() + self.meta_inference() + self.train_and_eval_tasks() if __name__ == "__main__": diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index d07d32d455..b57f351691 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -13,16 +13,21 @@ DIRNAME = Path(__file__).absolute().resolve().parent from qlib.workflow.task.gen import task_generator, RollingGen from qlib.workflow.task.collect import RecorderCollector +from qlib.workflow.record_temp import PortAnaRecord, SigAnaRecord class RollingBenchmark: """ + **NOTE** before running the example, please clean your previous results with following command - `rm -r mlruns` """ - def __init__(self) -> None: + + def __init__(self, rolling_exp="rolling_models") -> None: self.step = 20 + self.horizon = 1 + self.rolling_exp = rolling_exp def basic_task(self): """For fast training rolling""" @@ -45,31 +50,46 @@ def basic_task(self): def create_rolling_tasks(self): task = self.basic_task() - task_l = task_generator(task, RollingGen( - step=self.step, trunc_days=2)) # the last two days should be truncated to avoid information leakage + task_l = task_generator( + task, RollingGen(step=self.step, trunc_days=self.horizon + 1) + ) # the last two days should be truncated to avoid information leakage return task_l - def run_rolling_tasks(self): - task_l = self.create_rolling_tasks() - trainer = TrainerR(experiment_name="rolling_models") + def train_rolling_tasks(self, task_l=None): + if task_l is 
None: + task_l = self.create_rolling_tasks() + trainer = TrainerR(experiment_name=self.rolling_exp) trainer(task_l) + COMB_EXP = "rolling" + def ens_rolling(self): - comb_key = "rolling" - rc = RecorderCollector(experiment="rolling_models", - artifacts_key=["pred", "label"], - process_list=[RollingEnsemble()], - # rec_key_func=lambda rec: (comb_key, rec.info["id"]), - artifacts_path={ - "pred": "pred.pkl", - "label": "label.pkl" - }) + rc = RecorderCollector( + experiment=self.rolling_exp, + artifacts_key=["pred", "label"], + process_list=[RollingEnsemble()], + # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]), + artifacts_path={"pred": "pred.pkl", "label": "label.pkl"}, + ) res = rc() - with R.start(experiment_name=comb_key): + with R.start(experiment_name=self.COMB_EXP): + R.log_params(exp_name=self.rolling_exp) R.save_objects(**{"pred.pkl": res["pred"], "label.pkl": res["label"]}) def update_rolling_rec(self): - pass + """ + Evaluate the combined rolling results + """ + for rid, rec in R.list_recorders(experiment_name=self.COMB_EXP).items(): + for rt_cls in SigAnaRecord, PortAnaRecord: + rt = rt_cls(recorder=rec, skip_existing=True) + rt.generate() + print(f"Your evaluation results can be found in the experiment named `{self.COMB_EXP}`.") + + def run_all(self): + self.train_rolling_tasks() + self.ens_rolling() + self.update_rolling_rec() if __name__ == "__main__": diff --git a/qlib/contrib/meta/__init__.py b/qlib/contrib/meta/__init__.py index 06a2ea30be..1422cd4f93 100644 --- a/qlib/contrib/meta/__init__.py +++ b/qlib/contrib/meta/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from .data_selection import MetaTaskDS, MetaDatasetHDS, MetaModelDS +from .data_selection import MetaTaskDS, MetaDatasetDS, MetaModelDS diff --git a/qlib/contrib/meta/data_selection/__init__.py b/qlib/contrib/meta/data_selection/__init__.py index eaf702c7f6..cc533bc4aa 100644 --- a/qlib/contrib/meta/data_selection/__init__.py +++ b/qlib/contrib/meta/data_selection/__init__.py @@ -1,6 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from .task import MetaTaskDS -from .dataset import MetaDatasetHDS +from .dataset import MetaDatasetDS, MetaTaskDS from .model import MetaModelDS diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index b620b51634..c1c1811e3d 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -13,7 +13,7 @@ from qlib.workflow import R from qlib.workflow.task.gen import RollingGen, task_generator from joblib import Parallel, delayed -from qlib.model.meta.dataset import MetaDataset +from qlib.model.meta.dataset import MetaTaskDataset from qlib.model.trainer import task_train, TrainerR from qlib.data.dataset import DatasetH from tqdm.auto import tqdm @@ -64,7 +64,7 @@ def setup(self, trainer=TrainerR, trainer_kwargs={}): # We want to split the training time period into small segments. 
perf_task_tpl["dataset"]["kwargs"]["segments"] = { "train": (DatasetH.get_min_time(seg), DatasetH.get_max_time(seg)), - "test": (None, None) + "test": (None, None), } # NOTE: @@ -74,7 +74,7 @@ def setup(self, trainer=TrainerR, trainer_kwargs={}): gen_task = task_generator(perf_task_tpl, [rg]) recorders = R.list_recorders(experiment_name=self.exp_name) - if len(gen_task) == len(recorders): + if len(gen_task) == len(recorders): get_module_logger("Internal Data").info("the data has been initialized") else: # train new models @@ -102,7 +102,7 @@ def setup(self, trainer=TrainerR, trainer_kwargs={}): del self.dh # handler is not useful now def _calc_perf(self, pred, label): - df = pd.DataFrame({'pred': pred, 'label': label}) + df = pd.DataFrame({"pred": pred, "label": label}) df = df.groupby("datetime").corr(method="spearman") corr = df.loc(axis=0)[:, "pred"]["label"].droplevel(axis=0, level=-1) return corr @@ -116,7 +116,8 @@ def update(self): class MetaTaskDS(MetaTask): - """ Meta Task for Data Selection """ + """Meta Task for Data Selection""" + def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PROC_MODE_FULL, fill_method="max"): """ The description of the processed data @@ -151,12 +152,12 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO d_test = d_test.dropna(axis=0) # print(d_test.groupby("datetime").size()) if prev_size == 0 or d_test.shape[0] / prev_size <= 0.1: - __import__('ipdb').set_trace() + __import__("ipdb").set_trace() raise ValueError(f"Most of samples are dropped. 
Skip this task: {task}") if globals().get("YX_CONFIRM_XXX") is None: if d_test.groupby("datetime").size().shape[0] < 5: - __import__('ipdb').set_trace() + __import__("ipdb").set_trace() # globals()["YX_CONFIRM_XXX"] = True sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1])) @@ -169,44 +170,51 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO # Assumptions: the latest data has similar performance like the last month sample_time_belong[sample_time_belong.sum(axis=1) != 1, -1] = 1.0 - self.processed_meta_input.update(dict( - X=d_train["feature"], - y=d_train["label"].iloc[:, 0], - X_test=d_test["feature"], - y_test=d_test["label"].iloc[:, 0], - time_belong=sample_time_belong, - test_idx=d_test["label"].index, - )) + self.processed_meta_input.update( + dict( + X=d_train["feature"], + y=d_train["label"].iloc[:, 0], + X_test=d_test["feature"], + y_test=d_test["label"].iloc[:, 0], + time_belong=sample_time_belong, + test_idx=d_test["label"].index, + ) + ) # TODO: set device: I think this is not necessary to converting data format. self.processed_meta_input = data_to_tensor(self.processed_meta_input) def _get_processed_meta_info(self): # __import__('ipdb').set_trace() - meta_info_norm = self.meta_info.sub(self.meta_info.mean(axis=1), axis=0) #.fillna(0.) + meta_info_norm = self.meta_info.sub(self.meta_info.mean(axis=1), axis=0) # .fillna(0.) if self.fill_method == "max": - meta_info_norm = meta_info_norm.T.fillna(meta_info_norm.max(axis=1)).T # fill it with row max to align with previous implementation + meta_info_norm = meta_info_norm.T.fillna( + meta_info_norm.max(axis=1) + ).T # fill it with row max to align with previous implementation elif self.fill_method == "zero": pass else: raise NotImplementedError(f"This type of input is not supported") - meta_info_norm = meta_info_norm.fillna(0.) 
# always fill zero in case of NaN + meta_info_norm = meta_info_norm.fillna(0.0) # always fill zero in case of NaN return meta_info_norm def get_meta_input(self): return self.processed_meta_input -class MetaDatasetDS(MetaDataset): - def __init__(self, - *, - task_tpl: Union[dict, list], - step: int, - trunc_days: int = None, - exp_name: Union[str, InternalData], - segments: Union[Dict[Text, Tuple], float], - hist_step_n: int = 10, - task_mode: str = MetaTask.PROC_MODE_FULL, - fill_method: str = "max"): +class MetaDatasetDS(MetaTaskDataset): + def __init__( + self, + *, + task_tpl: Union[dict, list], + step: int, + trunc_days: int = None, + rolling_ext_days: int = 0, + exp_name: Union[str, InternalData], + segments: Union[Dict[Text, Tuple], float], + hist_step_n: int = 10, + task_mode: str = MetaTask.PROC_MODE_FULL, + fill_method: str = "max", + ): """ A dataset for meta model. @@ -221,6 +229,9 @@ def __init__(self, the rolling step trunc_days: int days to be truncated based on the test start + rolling_ext_days: int + sometimes users want to train meta models for a longer test period but with smaller rolling steps for more task samples. + the total length of test periods will be `step + rolling_ext_days` exp_name : Union[str, InternalData] Decide what meta_info are used for prediction. @@ -248,9 +259,17 @@ def __init__(self, self.step = step if isinstance(task_tpl, dict): - rg = RollingGen(step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type) # NOTE: trunc_days is very important !!!! + rg = RollingGen( + step=step, trunc_days=trunc_days, task_copy_func=deepcopy_basic_type + ) # NOTE: trunc_days is very important !!!! 
task_iter = rg(task_tpl) - if task_mode == MetaTask.PROC_MODE_FULL: + if rolling_ext_days > 0: + self.ta = TimeAdjuster(future=True) + for t in task_iter: + t["dataset"]["kwargs"]["segments"]["test"] = self.ta.shift( + t["dataset"]["kwargs"]["segments"]["test"], step=rolling_ext_days, rtype=RollingGen.ROLL_EX + ) + if task_mode == MetaTask.PROC_MODE_FULL: # Only pre initializing the task when full task is req # initializing handler and share it. init_task_handler(task_tpl) @@ -261,15 +280,18 @@ def __init__(self, self.task_list = [] self.meta_task_l = [] logger = get_module_logger("MetaDatasetDS") + logger.info(f"Example task for training meta model: {task_iter[0]}") for t in tqdm(task_iter, desc="creating meta tasks"): try: - self.meta_task_l.append(MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method)) + self.meta_task_l.append( + MetaTaskDS(t, meta_info=self._prepare_meta_ipt(t), mode=task_mode, fill_method=fill_method) + ) self.task_list.append(t) except ValueError as e: logger.warning(f"ValueError: {e}") if globals().get("YX_CONFIRM_XXX") is None: - if len(self.meta_task_l) <= 0: - __import__('ipdb').set_trace() + if len(self.meta_task_l) <= 0: + __import__("ipdb").set_trace() # globals()["YX_CONFIRM_XXX"] = True def _prepare_meta_ipt(self, task): @@ -287,6 +309,7 @@ def mask_future(s): start, end = s.name end = get_date_by_shift(trading_date=end, shift=self.trunc_days - 1, future=True) return s.mask((s.index >= start) & (s.index <= end)) + ic_df_avail = ic_df_avail.apply(mask_future) # apply to each col # 2) filter the info with too long periods diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 71c603b96d..6e113a01c8 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -14,12 +14,12 @@ from typing import Union, List, Tuple, Dict from ....data.dataset.weight import SampleReweighter, Reweighter -from 
....model.meta.dataset import MetaDataset +from ....model.meta.dataset import MetaTaskDataset from ....model.meta.model import MetaModel, MetaTaskModel from ....workflow import R from .utils import fill_diagnal, ICLoss -from .dataset import MetaDatasetHDS +from .dataset import MetaDatasetDS from qlib.contrib.meta.data_selection.net import PredNet from qlib.data.dataset.weight import Reweighter from qlib.log import get_module_logger @@ -122,13 +122,13 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch}) R.log_metrics(**{f"ic/{phase}": ic, "step": epoch}) - def fit(self, meta_dataset: MetaDatasetHDS): + def fit(self, meta_dataset: MetaDatasetDS): """ The meta-learning-based data selection interacts directly with meta-dataset due to the close-form proxy measurement. Parameters ---------- - meta_dataset : MetaDatasetHDS + meta_dataset : MetaDatasetDS The meta-model takes the meta-dataset for its training process. """ @@ -159,90 +159,7 @@ def fit(self, meta_dataset: MetaDatasetHDS): R.save_objects(**{"model.pkl": self.tn}) self.fitted = True - # TODO: refactor - def _inference_single_task(self, meta_id: tuple, meta_dataset: MetaDatasetHDS): - meta_task = meta_dataset.get_meta_task_by_test_period(meta_id) - if meta_task is not None: - self.tn.eval() - torch.set_grad_enabled(False) - ( - X, - y, - time_perf, - time_belong, - X_test, - y_test, - test_idx, - train_idx, - test_period, - ) = meta_task.prepare_task_data() - weights = self.tn.get_sample_weights(X, time_perf, time_belong) - reweighter = SampleReweighter(pd.Series(weights.detach().cpu().numpy(), index=train_idx)) - return reweighter - else: - raise ValueError("The current task is not supported!") - - # TODO: refactor - def inference(self, meta_ids: Union[List[tuple], tuple], meta_dataset: MetaDatasetHDS): - """ - Inference a single task with meta-dataset. The meta-model must be fitted. 
- - Parameters - ---------- - tasks: Union[List[dict], dict] - A list of definitions. - meta_dataset: MetaDatasetHDS - """ - if not self.fitted: - raise ValueError("The meta-model is not fitted yet!") - if isinstance(meta_ids, tuple): - return {meta_ids: self._inference_single_task(meta_ids, meta_dataset)} - - elif isinstance(meta_ids, list): - reweighters = {} - for meta_id in meta_ids: - reweighters[meta_id] = self._inference_single_task(meta_id, meta_dataset) - return reweighters - else: - raise NotImplementedError("This type of task definition is not supported!") - - # TODO: refactor - def prepare_tasks(self, task: Union[List[dict], dict], reweighters: dict): - """ - - Parameters - ---------- - tasks: Union[List[dict], dict] - A list of definitions. - """ - if not self.fitted: - raise ValueError("The meta-model is not fitted yet!") - if isinstance(task, dict): - task_c = copy.deepcopy(task) - test_period = task_c["dataset"]["kwargs"]["segments"]["test"] - if test_period in reweighters: - task_c["reweighter"] = reweighters[test_period] - else: - nearest_key = None - for key in reweighters: - if key[0] <= test_period[0]: - if nearest_key is None or nearest_key[0] < key[0]: - nearest_key = key - if nearest_key is not None: - task_c["reweighter"] = reweighters[nearest_key] - else: - print( - "Warning: The task with test period:", - test_period, - " does not have the corresponding reweighter!", - ) - return task_c - elif isinstance(task, list): - return [self.prepare_tasks(i, reweighters) for i in task] - else: - raise NotImplementedError("This type of task definition is not supported!") - - def prepare_task(self, task: MetaTask) -> dict: + def _prepare_task(self, task: MetaTask) -> dict: meta_ipt = task.get_meta_input() weights = self.tn.twm(meta_ipt["time_perf"]) @@ -250,3 +167,9 @@ def prepare_task(self, task: MetaTask) -> dict: task = copy.copy(task.task) # NOTE: this is a shallow copy. 
task["reweighter"] = TimeReweighter(weight_s) return task + + def inference(self, meta_dataset: MetaTaskDataset) -> List[dict]: + res = [] + for mt in meta_dataset.prepare_tasks("test"): + res.append(self._prepare_task(mt)) + return res diff --git a/qlib/contrib/meta/data_selection/net.py b/qlib/contrib/meta/data_selection/net.py index 8c57069818..73fd12061d 100644 --- a/qlib/contrib/meta/data_selection/net.py +++ b/qlib/contrib/meta/data_selection/net.py @@ -59,7 +59,7 @@ def get_sample_weights(self, X, time_perf, time_belong, ignore_weight=False): return weights def forward(self, X, y, time_perf, time_belong, X_test, ignore_weight=False): - """ Please refer to the docs of MetaTaskDS for the description of the variables""" + """Please refer to the docs of MetaTaskDS for the description of the variables""" weights = self.get_sample_weights(X, time_perf, time_belong, ignore_weight=ignore_weight) X_w = X.T * weights.view(1, -1) theta = torch.inverse(X_w @ X) @ X_w @ y diff --git a/qlib/contrib/model/gbdt.py b/qlib/contrib/model/gbdt.py index 2630ced67e..d2a093d2ab 100644 --- a/qlib/contrib/model/gbdt.py +++ b/qlib/contrib/model/gbdt.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd import lightgbm as lgb -from typing import Text, Union +from typing import List, Text, Tuple, Union from ...model.base import ModelFT from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP @@ -15,59 +15,62 @@ class LGBModel(ModelFT, LightGBMFInt): """LightGBM Model""" - def __init__(self, loss="mse", early_stopping_rounds=50, **kwargs): + def __init__(self, loss="mse", early_stopping_rounds=50, num_boost_round=1000, **kwargs): if loss not in {"mse", "binary"}: raise NotImplementedError self.params = {"objective": loss, "verbosity": -1} self.params.update(kwargs) self.early_stopping_rounds = early_stopping_rounds + self.num_boost_round = num_boost_round self.model = None - def _prepare_data(self, dataset: DatasetH, reweighter=None): - df_train, 
df_valid = dataset.prepare( - ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L - ) - if df_train.empty or df_valid.empty: - raise ValueError("Empty data from dataset, please check your dataset config.") - x_train, y_train = df_train["feature"], df_train["label"] - x_valid, y_valid = df_valid["feature"], df_valid["label"] - - # Lightgbm need 1D array as its label - if y_train.values.ndim == 2 and y_train.values.shape[1] == 1: - y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values) - else: - raise ValueError("LightGBM doesn't support multi-label training") + def _prepare_data(self, dataset: DatasetH, reweighter=None) -> List[Tuple[lgb.Dataset, str]]: + """ + The motivation of current version is to make validation optional + - train segment is necessary; + """ + ds_l = [] + assert "train" in dataset.segments + for key in ["train", "valid"]: + if key in dataset.segments: + df = dataset.prepare(key, col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) + if df.empty: + raise ValueError("Empty data from dataset, please check your dataset config.") + x, y = df["feature"], df["label"] - if reweighter is None: - w_train = None - w_valid = None - elif isinstance(reweighter, Reweighter): - w_train = reweighter.reweight(df_train) - w_valid = reweighter.reweight(df_valid) - else: - raise ValueError("Unsupported reweighter type.") + # Lightgbm need 1D array as its label + if y.values.ndim == 2 and y.values.shape[1] == 1: + y = np.squeeze(y.values) + else: + raise ValueError("LightGBM doesn't support multi-label training") - dtrain = lgb.Dataset(x_train.values, label=y_train, weight=w_train) - dvalid = lgb.Dataset(x_valid.values, label=y_valid, weight=w_valid) - return dtrain, dvalid + if reweighter is None: + w = None + elif isinstance(reweighter, Reweighter): + w = reweighter.reweight(df) + else: + raise ValueError("Unsupported reweighter type.") + ds_l.append((lgb.Dataset(x.values, label=y, weight=w), key)) + return ds_l 
def fit( self, dataset: DatasetH, - num_boost_round=1000, + num_boost_round=None, early_stopping_rounds=None, verbose_eval=20, evals_result=dict(), reweighter=None, **kwargs ): - dtrain, dvalid = self._prepare_data(dataset, reweighter) + ds_l = self._prepare_data(dataset, reweighter) + ds, names = list(zip(*ds_l)) self.model = lgb.train( self.params, - dtrain, - num_boost_round=num_boost_round, - valid_sets=[dtrain, dvalid], - valid_names=["train", "valid"], + ds[0], # training dataset + num_boost_round=self.num_boost_round if num_boost_round is None else num_boost_round, + valid_sets=ds, + valid_names=names, early_stopping_rounds=( self.early_stopping_rounds if early_stopping_rounds is None else early_stopping_rounds ), @@ -75,8 +78,8 @@ def fit( evals_result=evals_result, **kwargs ) - evals_result["train"] = list(evals_result["train"].values())[0] - evals_result["valid"] = list(evals_result["valid"].values())[0] + for k in names: + evals_result[k] = list(evals_result[k].values())[0] def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): if self.model is None: diff --git a/qlib/contrib/strategy/signal_strategy.py b/qlib/contrib/strategy/signal_strategy.py index ae69b4bb69..738bf5f2e8 100644 --- a/qlib/contrib/strategy/signal_strategy.py +++ b/qlib/contrib/strategy/signal_strategy.py @@ -106,6 +106,10 @@ def generate_trade_decision(self, execute_result=None): trade_start_time, trade_end_time = self.trade_calendar.get_step_time(trade_step) pred_start_time, pred_end_time = self.trade_calendar.get_step_time(trade_step, shift=1) pred_score = self.signal.get_signal(start_time=pred_start_time, end_time=pred_end_time) + # NOTE: the current version of topk dropout strategy can't handle pd.DataFrame(multiple signal) + # So it only leverage the first col of signal + if isinstance(pred_score, pd.DataFrame): + pred_score = pred_score.iloc[:, 0] if pred_score is None: return TradeDecisionWO([], self) if self.only_tradable: diff --git 
a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index f34db749b2..15946f3dc0 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -5,6 +5,7 @@ from typing import Union, List from qlib.utils import init_instance_by_config from typing import TYPE_CHECKING + if TYPE_CHECKING: from qlib.data.dataset import DataHandler @@ -131,6 +132,8 @@ def init_task_handler(task: dict) -> Union[DataHandler, None]: Union[DataHandler, None]: returns """ + # avoid recursive import + from .handler import DataHandler h_conf = task["dataset"]["kwargs"].get("handler") if h_conf is not None: diff --git a/qlib/model/meta/__init__.py b/qlib/model/meta/__init__.py index 2facbfd656..4421c8d19a 100644 --- a/qlib/model/meta/__init__.py +++ b/qlib/model/meta/__init__.py @@ -2,4 +2,4 @@ # Licensed under the MIT License. from .task import MetaTask -from .dataset import MetaDataset, MetaDatasetH +from .dataset import MetaTaskDataset diff --git a/qlib/model/meta/dataset.py b/qlib/model/meta/dataset.py index 743626fdaa..4b56dd1ba5 100644 --- a/qlib/model/meta/dataset.py +++ b/qlib/model/meta/dataset.py @@ -2,19 +2,20 @@ # Licensed under the MIT License. import abc +from qlib.model.meta.task import MetaTask from typing import Dict, Union, List, Tuple, Text from ...workflow.task.gen import RollingGen, task_generator from ...data.dataset.handler import DataHandler from ...utils.serial import Serializable -class MetaDataset(Serializable, metaclass=abc.ABCMeta): +class MetaTaskDataset(Serializable, metaclass=abc.ABCMeta): """ A dataset fetching the data in a meta-level. A Meta Dataset is responsible for - - input a specific task and prepare input data (based a given task) for meta model - - prepare underlayer data: + - input tasks(e.g. Qlib tasks) and prepare meta tasks + - meta task contains more information than normal tasks (e.g. input data for meta model) The learnt pattern could transfer to other meta dataset. 
The following cases should be supported - A meta-model trained on meta-dataset A and then applied to meta-dataset B @@ -27,13 +28,13 @@ def __init__(self, segments: Union[Dict[Text, Tuple], float], *args, **kwargs): The segments indicates the way to divide the data - The duty of the `__init__` function of MetaDataset + The duty of the `__init__` function of MetaTaskDataset - initialize the tasks """ super().__init__(*args, **kwargs) self.segments = segments - def prepare_tasks(self, segments: Union[List[Text], Text], *args, **kwargs) -> List: + def prepare_tasks(self, segments: Union[List[Text], Text], *args, **kwargs) -> List[MetaTask]: """ Prepare the data in each meta-task and ready for training. @@ -73,20 +74,3 @@ def _prepare_seg(self, segment: Text): the name of the segment """ pass - - -class MetaDatasetH(MetaDataset): - """ - MetaDataset with specified DataHandler. - """ - - def __init__(self, data_handler: DataHandler, *args, **kwargs): - """ - - Parameters - ---------- - data_handler: DataHandler - The shared DataHandler among meta-tasks. - """ - super().__init__(*args, **kwargs) - self.data_handler = data_handler diff --git a/qlib/model/meta/model.py b/qlib/model/meta/model.py index e38def0109..224600daa4 100644 --- a/qlib/model/meta/model.py +++ b/qlib/model/meta/model.py @@ -2,10 +2,11 @@ # Licensed under the MIT License. import abc +from qlib.contrib.meta.data_selection.dataset import MetaDatasetDS from typing import Union, List, Tuple from qlib.model.meta.task import MetaTask -from .dataset import MetaDataset +from .dataset import MetaTaskDataset class MetaModel(metaclass=abc.ABCMeta): @@ -42,42 +43,26 @@ class MetaTaskModel(MetaModel): This type of meta-model deals with base task definitions. The meta-model creates tasks for training new base forecasting models after it is trained. `prepare_tasks` directly modifies the task definitions. 
""" - @abc.abstractmethod - def prepare_task(self, task: MetaTask) -> dict: + def fit(self, meta_dataset: MetaTaskDataset): """ - Input a meta task and output a task with qlib format - - When modifying the model tasks, the meta model will leverage `self.inference` to get some necessary - information. + The MetaTaskModel is expected to get prepared MetaTask from meta_dataset. + And then it will learn knowledge from the meta tasks + """ + raise NotImplementedError(f"Please implement the `fit` method") - Parameters - ---------- - task : MetaTask - meta task to inference + def inference(self, meta_dataset: MetaTaskDataset) -> List[dict]: + """ + MetaTaskModel will make inference on the meta_dataset + The MetaTaskModel is expected to get prepared MetaTask from meta_dataset. + Then it will create modified task with Qlib format which can be executed by Qlib trainer. Returns ------- - dict: - A task with Qlib format - """ + List[dict]: + A list of modified task definitions. - # NOTE: factor; Please justify the necessity of this method - # @abc.abstractmethod - # def prepare_tasks(self, tasks: List[dict]) -> List[dict]: - # """ - # The meta-model modifies the tasks. The function will return the modified task list. - # - # Parameters - # ---------- - # tasks: List[dict] - # A list of task definitions for the meta-model to modify. - # - # Returns - # ------- - # List[dict]: - # A list of modified task definitions. 
- # """ - # pass + """ + raise NotImplementedError(f"Please implement the `inference` method") class MetaGuideModel(MetaModel): diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 738fef0aeb..9b8bf2726c 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -20,7 +20,7 @@ from qlib.data.dataset import Dataset from qlib.log import get_module_logger from qlib.model.base import Model -from qlib.utils import flatten_dict, get_callable_kwargs, init_instance_by_config, auto_filter_kwargs +from qlib.utils import flatten_dict, get_callable_kwargs, init_instance_by_config, auto_filter_kwargs, fill_placeholder from qlib.workflow import R from qlib.workflow.record_temp import SignalRecord from qlib.workflow.recorder import Recorder @@ -83,47 +83,6 @@ def begin_task_train(task_config: dict, experiment_name: str, recorder_name: str return R.get_recorder() -def fill_placeholder(config: dict, config_extend: dict): - """ - Detect placeholder in config and fill them with config_extend. - The item of dict must be single item(int, str, etc), dict and list. Tuples are not supported. 
- - Parameters - ---------- - config : dict - the parameter dict will be filled - config_extend : dict - the value of all placeholders - - Returns - ------- - dict - the parameter dict - """ - # check the format of config_extend - for placeholder in config_extend.keys(): - assert re.match(r"<[^<>]+>", placeholder) - - # bfs - top = 0 - tail = 1 - item_queue = [config] - while top < tail: - now_item = item_queue[top] - top += 1 - if isinstance(now_item, list): - item_keys = range(len(now_item)) - elif isinstance(now_item, dict): - item_keys = now_item.keys() - for key in item_keys: - if isinstance(now_item[key], list) or isinstance(now_item[key], dict): - item_queue.append(now_item[key]) - tail += 1 - elif isinstance(now_item[key], str) and now_item[key] in config_extend.keys(): - now_item[key] = config_extend[now_item[key]] - return config - - def end_task_train(rec: Recorder, experiment_name: str) -> Recorder: """ Finish task training with real model fitting and saving. diff --git a/qlib/tests/config.py b/qlib/tests/config.py index a7bb8d0576..e1cb1208a0 100644 --- a/qlib/tests/config.py +++ b/qlib/tests/config.py @@ -30,41 +30,12 @@ }, } + SA_RC = { "class": "SigAnaRecord", "module_path": "qlib.workflow.record_temp", } -PORT_CONFIG = { - 'class': 'PortAnaRecord', - 'module_path': 'qlib.workflow.record_temp', - 'kwargs': { - 'config': { - 'strategy': { - 'class': 'TopkDropoutStrategy', - 'module_path': 'qlib.contrib.strategy', - 'kwargs': { - 'signal': ['', ''], - 'topk': 50, - 'n_drop': 5 - } - }, - 'backtest': { - 'start_time': "2017-01-01", - 'end_time': "2020-08-01", - 'account': 100000000, - 'benchmark': 'SH000300', - 'exchange_kwargs': { - 'limit_threshold': 0.095, - 'deal_price': 'close', - 'open_cost': 0.0005, - 'close_cost': 0.0015, - 'min_cost': 5 - } - } - } - } -} RECORD_CONFIG = [ { @@ -75,7 +46,7 @@ "model": "", }, }, - SA_RC + SA_RC, ] diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 2ed74369a3..909bbbfbc2 100644 --- 
a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -192,6 +192,24 @@ def get_module_by_module_path(module_path: Union[str, ModuleType]): return module +def split_module_path(module_path: str) -> Tuple[str, str]: + """ + + Parameters + ---------- + module_path : str + e.g. "a.b.c.ClassName" + + Returns + ------- + Tuple[str, str] + e.g. ("a.b.c", "ClassName") + """ + *m_path, cls = module_path.split(".") + m_path = ".".join(m_path) + return m_path, cls + + def get_callable_kwargs(config: Union[dict, str], default_module: Union[str, ModuleType] = None) -> (type, dict): """ extract class/func and kwargs from config info @@ -213,17 +231,24 @@ def get_callable_kwargs(config: Union[dict, str], default_module: Union[str, Mod the class/func object and it's arguments. """ if isinstance(config, dict): - if isinstance(config["class"], str): - module = get_module_by_module_path(config.get("module_path", default_module)) - # raise AttributeError - _callable = getattr(module, config["class" if "class" in config else "func"]) + key = "class" if "class" in config else "func" + if isinstance(config[key], str): + # 1) get module and class + # - case 1): "a.b.c.ClassName" + # - case 2): {"class": "ClassName", "module_path": "a.b.c"} + m_path, cls = split_module_path(config[key]) + if m_path == "": + m_path = config.get("module_path", default_module) + module = get_module_by_module_path(m_path) + + # 2) get callable + _callable = getattr(module, cls) # may raise AttributeError else: - _callable = config["class"] # the class type itself is passed in + _callable = config[key] # the class type itself is passed in kwargs = config.get("kwargs", {}) elif isinstance(config, str): # a.b.c.ClassName - *m_path, cls = config.split(".") - m_path = ".".join(m_path) + m_path, cls = split_module_path(config) module = get_module_by_module_path(default_module if m_path == "" else m_path) _callable = getattr(module, cls) @@ -699,6 +724,47 @@ def flatten_dict(d, parent_key="", sep=".") -> dict: 
return dict(items) +def fill_placeholder(config: dict, config_extend: dict): + """ + Detect placeholder in config and fill them with config_extend. + The item of dict must be single item(int, str, etc), dict and list. Tuples are not supported. + + Parameters + ---------- + config : dict + the parameter dict will be filled + config_extend : dict + the value of all placeholders + + Returns + ------- + dict + the parameter dict + """ + # check the format of config_extend + for placeholder in config_extend.keys(): + assert re.match(r"<[^<>]+>", placeholder) + + # bfs + top = 0 + tail = 1 + item_queue = [config] + while top < tail: + now_item = item_queue[top] + top += 1 + if isinstance(now_item, list): + item_keys = range(len(now_item)) + elif isinstance(now_item, dict): + item_keys = now_item.keys() + for key in item_keys: + if isinstance(now_item[key], list) or isinstance(now_item[key], dict): + item_queue.append(now_item[key]) + tail += 1 + elif isinstance(now_item[key], str) and now_item[key] in config_extend.keys(): + now_item[key] = config_extend[now_item[key]] + return config + + def auto_filter_kwargs(func: Callable) -> Callable: """ this will work like a decoration function diff --git a/qlib/workflow/exp.py b/qlib/workflow/exp.py index 9bdfdefc13..d5cd555a2c 100644 --- a/qlib/workflow/exp.py +++ b/qlib/workflow/exp.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-from typing import Union +from typing import Dict, List, Union import mlflow, logging from mlflow.entities import ViewType from mlflow.exceptions import MlflowException @@ -215,7 +215,10 @@ def _get_recorder(self, recorder_id=None, recorder_name=None): """ raise NotImplementedError(f"Please implement the `_get_recorder` method") - def list_recorders(self, **flt_kwargs): + RT_D = "dict" # return type dict + RT_L = "list" # return type list + + def list_recorders(self, rtype: str = RT_D, **flt_kwargs) -> Union[List[Recorder], Dict[str, Recorder]]: """ List all the existing recorders of this experiment. Please first get the experiment instance before calling this method. If user want to use the method `R.list_recorders()`, please refer to the related API document in `QlibRecorder`. @@ -226,7 +229,11 @@ def list_recorders(self, **flt_kwargs): Returns ------- - A dictionary (id -> recorder) of recorder information that being stored. + The return type depent on `rtype` + if `rtype` == "dict": + A dictionary (id -> recorder) of recorder information that being stored. + elif `rtype` == "list": + A list of Recorder. """ raise NotImplementedError(f"Please implement the `list_recorders` method.") @@ -326,8 +333,17 @@ def delete_recorder(self, recorder_id=None, recorder_name=None): UNLIMITED = 50000 # FIXME: Mlflow can only list 50000 records at most!!!!!!! - def list_recorders(self, max_results: int = UNLIMITED, status: Union[str, None] = None, filter_string: str = ""): + def list_recorders( + self, + rtype=Experiment.RT_D, + max_results: int = UNLIMITED, + status: Union[str, None] = None, + filter_string: str = "", + ): """ + Quoting docs of search_runs + > The default ordering is to sort by start_time DESC, then run_id. 
+ Parameters ---------- max_results : int @@ -341,10 +357,17 @@ def list_recorders(self, max_results: int = UNLIMITED, status: Union[str, None] runs = self._client.search_runs( self.id, run_view_type=ViewType.ACTIVE_ONLY, max_results=max_results, filter_string=filter_string ) - recorders = dict() + rids = [] + recorders = [] for i in range(len(runs)): recorder = MLflowRecorder(self.id, self._uri, mlflow_run=runs[i]) if status is None or recorder.status == status: - recorders[runs[i].info.run_id] = recorder + rids.append(runs[i].info.run_id) + recorders.append(recorder) - return recorders + if rtype == Experiment.RT_D: + return dict(zip(rids, recorders)) + elif rtype == Experiment.RT_L: + return recorders + else: + raise NotImplementedError(f"This type of input is not supported") diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py index 40fd53e67d..52425e0edc 100644 --- a/qlib/workflow/record_temp.py +++ b/qlib/workflow/record_temp.py @@ -14,8 +14,9 @@ from ..data.dataset.handler import DataHandlerLP from ..backtest import backtest as normal_backtest from ..log import get_module_logger -from ..utils import flatten_dict, class_casting +from ..utils import fill_placeholder, flatten_dict, class_casting, get_date_by_shift from ..utils.time import Freq +from ..utils.data import deepcopy_basic_type from ..contrib.eva.alpha import calc_ic, calc_long_short_return, calc_long_short_prec @@ -204,6 +205,35 @@ def list(self): return ["pred.pkl", "label.pkl"] +class ACRecordTemp(RecordTemp): + """Automatically checking record template""" + + def __init__(self, recorder, skip_existing=False): + self.skip_existing = skip_existing + super().__init__(recorder=recorder) + + def generate(self, *args, **kwargs): + """automatically checking the files and then run the concrete generating task""" + if self.skip_existing: + try: + self.check(include_self=True, parents=False) + except FileNotFoundError: + pass # continue to generating metrics + else: + logger.info("The 
results has previously generated, Generation skipped.") + return + + try: + self.check() + except FileNotFoundError: + logger.warning("The dependent data does not exists. Generation skipped.") + return + return self._generate(*args, **kwargs) + + def _generate(self, *args, **kwargs): + raise NotImplementedError(f"Please implement the `_generate` method") + + class HFSignalRecord(SignalRecord): """ This is the Signal Analysis Record class that generates the analysis results such as IC and IR. This class inherits the ``RecordTemp`` class. @@ -251,7 +281,7 @@ def list(self): return ["ic.pkl", "ric.pkl", "long_pre.pkl", "short_pre.pkl", "long_short_r.pkl", "long_avg_r.pkl"] -class SigAnaRecord(RecordTemp): +class SigAnaRecord(ACRecordTemp): """ This is the Signal Analysis Record class that generates the analysis results such as IC and IR. This class inherits the ``RecordTemp`` class. """ @@ -260,34 +290,18 @@ class SigAnaRecord(RecordTemp): depend_cls = SignalRecord def __init__(self, recorder, ana_long_short=False, ann_scaler=252, label_col=0, skip_existing=False): - super().__init__(recorder=recorder) + super().__init__(recorder=recorder, skip_existing=skip_existing) self.ana_long_short = ana_long_short self.ann_scaler = ann_scaler self.label_col = label_col - self.skip_existing = skip_existing - def generate(self, label: Optional[pd.DataFrame] = None, **kwargs): + def _generate(self, label: Optional[pd.DataFrame] = None, **kwargs): """ Parameters ---------- label : Optional[pd.DataFrame] Label should be a dataframe. """ - if self.skip_existing: - try: - self.check(include_self=True, parents=False) - except FileNotFoundError: - pass # continue to generating metrics - else: - logger.info("The results has previously generated, Generation skipped.") - return - - try: - self.check() - except FileNotFoundError: - logger.warning("The dependent data does not exists. 
Generation skipped.") - return - pred = self.load("pred.pkl") if label is None: label = self.load("label.pkl") @@ -329,7 +343,7 @@ def list(self): return paths -class PortAnaRecord(RecordTemp): +class PortAnaRecord(ACRecordTemp): """ This is the Portfolio Analysis Record class that generates the analysis results such as those of backtest. This class inherits the ``RecordTemp`` class. @@ -340,14 +354,35 @@ class PortAnaRecord(RecordTemp): """ artifact_path = "portfolio_analysis" + depend_cls = SignalRecord def __init__( self, recorder, - config, + config: dict = { # Default config for daily trading + "strategy": { + "class": "TopkDropoutStrategy", + "module_path": "qlib.contrib.strategy", + "kwargs": {"signal": "", "topk": 50, "n_drop": 5}, + }, + "backtest": { + "start_time": None, + "end_time": None, + "account": 100000000, + "benchmark": "SH000300", + "exchange_kwargs": { + "limit_threshold": 0.095, + "deal_price": "close", + "open_cost": 0.0005, + "close_cost": 0.0015, + "min_cost": 5, + }, + }, + }, risk_analysis_freq: Union[List, str] = None, indicator_analysis_freq: Union[List, str] = None, indicator_analysis_method=None, + skip_existing=False, **kwargs, ): """ @@ -364,7 +399,12 @@ def __init__( indicator_analysis_method : str, optional, default by None the candidated values include 'mean', 'amount_weighted', 'value_weighted' """ - super().__init__(recorder=recorder, **kwargs) + super().__init__(recorder=recorder, skip_existing=skip_existing, **kwargs) + + # We only deepcopy_basic_type because + # - We don't want to affect the config outside. 
+ # - We don't want to deepcopy complex object to avoid overhead + config = deepcopy_basic_type(config) self.strategy_config = config["strategy"] _default_executor_config = { @@ -406,7 +446,21 @@ def _get_report_freq(self, executor_config): ret_freq.extend(self._get_report_freq(executor_config["kwargs"]["inner_executor"])) return ret_freq - def generate(self, **kwargs): + def _generate(self, **kwargs): + pred = self.load("pred.pkl") + + # replace the "" with prediction saved before + placehorder_value = {"": pred} + for k in "executor_config", "strategy_config": + setattr(self, k, fill_placeholder(getattr(self, k), placehorder_value)) + + # if the backtesting time range is not set, it will automatically extract time range from the prediction file + dt_values = pred.index.get_level_values("datetime") + if self.backtest_config["start_time"] is None: + self.backtest_config["start_time"] = dt_values.min() + if self.backtest_config["end_time"] is None: + self.backtest_config["end_time"] = get_date_by_shift(dt_values.max(), 1) + # custom strategy and get backtest portfolio_metric_dict, indicator_dict = normal_backtest( executor=self.executor_config, strategy=self.strategy_config, **self.backtest_config From 49c40746e14831438957780e3f622286eadfe4ff Mon Sep 17 00:00:00 2001 From: Young Date: Sat, 11 Dec 2021 23:20:14 +0800 Subject: [PATCH 19/37] remove unused code --- qlib/contrib/meta/data_selection/model.py | 3 +- qlib/data/dataset/weight.py | 86 ----------------------- 2 files changed, 2 insertions(+), 87 deletions(-) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 6e113a01c8..a3de4023f8 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -13,7 +13,7 @@ import copy from typing import Union, List, Tuple, Dict -from ....data.dataset.weight import SampleReweighter, Reweighter +from ....data.dataset.weight import Reweighter from ....model.meta.dataset import 
MetaTaskDataset from ....model.meta.model import MetaModel, MetaTaskModel from ....workflow import R @@ -32,6 +32,7 @@ def __init__(self, time_weight: pd.Series): self.time_weight = time_weight def reweight(self, data: Union[pd.DataFrame, pd.Series]): + # TODO: handling TSDataSampler w_s = pd.Series(1.0, index=data.index) for k, w in self.time_weight.items(): w_s.loc[slice(*k)] = w diff --git a/qlib/data/dataset/weight.py b/qlib/data/dataset/weight.py index 09570956dd..8e2a6b9598 100644 --- a/qlib/data/dataset/weight.py +++ b/qlib/data/dataset/weight.py @@ -32,89 +32,3 @@ def reweight(self, data: object) -> object: the weights info for the data """ raise NotImplementedError(f"This type of input is not supported") - - -class WeightSampler: - """ - (T)ime-(S)eries WeightSampler - This is the result of the function prepare_weight. - - It is aligned with the instance of TSDataSampler. - """ - - def __init__(self, weights: pd.Series): - assert get_level_index(weights, "datetime") == 0 - self.weights_s = lazy_sort_index(weights) - - def __getitem__(self, idx: int): - return self.weights_s[idx] - - def __len__(self): - return len(self.weights_s) - - -class SampleReweighter(Reweighter): - """ - The sample-wise reweighter. It aims to reweight by the given weight of each sample. - The samples are indexed in a pandas way - """ - - def __init__(self, sample_weights: pd.Series, *args, **kwargs): - """ - - Parameters - ---------- - sample_weights : pd.Series - Determine the weight of each sample. - The index of the Series should be exactly the same with each sample's index. - """ - self.weights = sample_weights - - def _sample_reweight_DataFrame(self, samples: Union[pd.Series, pd.DataFrame], *args, **kwargs) -> pd.Series: - """ - This function processes the prepared data with pd.Series or pd.DataFrame type. - - Returns - ------- - pd.Series: - The weights of the prepared data. 
- """ - weight = pd.Series(data=1.0, index=samples.index, name="weight") - weight.update(self.weights) - return weight - - def _sample_reweight_TSDataSampler(self, sampler: TSDataSampler, *args, **kwargs) -> WeightSampler: - """ - This function processes the prepared data with TSDataSampler type. - - Returns - ------- - WeightSampler: - The weight sampler of the prepared data. - """ - weight = pd.Series(1.0, index=sampler.get_index(), name="weight") - weight.update(self.weights) - return WeightSampler(weight) - - def reweight(self, prepared_data: Union[list, tuple, pd.DataFrame, pd.Series, WeightSampler]): - """ - Reweight the prepared data. - - Parameters - ---------- - prepared_data: Union[list, tuple, pd.DataFrame, pd.Series, WeightSampler] - The prepared data given by the DatasetH. - - Returns - ------- - Union[list, pd.Series, WeightSampler]: - """ - # Handle all kinds of prepared data format - if isinstance(prepared_data, (list, tuple)): - return [self.reweight(data) for data in prepared_data] - elif isinstance(prepared_data, (pd.Series, pd.DataFrame)): - return self._sample_reweight_DataFrame(prepared_data) - elif isinstance(prepared_data, TSDataSampler): - return self._sample_reweight_TSDataSampler(prepared_data) - else: - raise NotImplementedError(f"This type of input is not supported") From ce66d9aa4d5937f9b293b76277c51266845deb12 Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 13 Dec 2021 10:35:45 +0800 Subject: [PATCH 20/37] asdd more docs --- examples/benchmarks_dynamic/DDG-DA/README.md | 7 +++++++ examples/benchmarks_dynamic/DDG-DA/workflow.py | 7 +++++++ examples/benchmarks_dynamic/REAMDE.md | 7 +++++++ examples/benchmarks_dynamic/baseline/rolling_benchmark.py | 3 +++ 4 files changed, 24 insertions(+) diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md index 8b13789179..08922ac2f6 100644 --- a/examples/benchmarks_dynamic/DDG-DA/README.md +++ b/examples/benchmarks_dynamic/DDG-DA/README.md @@ -1 +1,8 
@@ + + +# Introduction + + +The data in the paper is private. So we conduct the experiments on Qlib's public dataset. +Thought the dataset is different, but the conclusion is the same. diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index 0a5370a688..9f07c2e626 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -232,10 +232,17 @@ def train_and_eval_tasks(self): rb.update_rolling_rec() def run_all(self): + # 1) file: handler_proxy.pkl self.dump_data_for_proxy_model() + # 2) + # file: internal_data_s20.pkl + # mlflow: data_sim_s20, models for calculating meta_ipt self.dump_meta_ipt() + # 3) meta model will be stored in `DDG-DA` self.train_meta_model() + # 4) new_tasks are saved in "tasks_s20.pkl" (reweighter is added) self.meta_inference() + # 5) load the saved tasks and train model self.train_and_eval_tasks() diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md index b0c255b64c..dbdd9b4adf 100644 --- a/examples/benchmarks_dynamic/REAMDE.md +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -1,3 +1,10 @@ # Introduction Modeling the dynamic of market is a very important problem in Quant research. 
+ + + +| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | +|------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------| +| RR[LightGBM] | | | | | | | | | +| DDG-DA[LightGBM] | | | | | | | | | diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index b57f351691..04436db757 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -87,7 +87,10 @@ def update_rolling_rec(self): print(f"Your evaluation results can be found in the experiment named `{self.COMB_EXP}`.") def run_all(self): + # the results will be save in mlruns. + # 1) each rolling task is saved in rolling_models self.train_rolling_tasks() + # 2) combined rolling tasks and evaluation results are saved in rolling self.ens_rolling() self.update_rolling_rec() From cea134d42a5df9dc7e514b4832702737f2b204ba Mon Sep 17 00:00:00 2001 From: you-n-g Date: Mon, 13 Dec 2021 14:46:18 +0800 Subject: [PATCH 21/37] Update README.md --- examples/benchmarks_dynamic/DDG-DA/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md index 08922ac2f6..63012f74ef 100644 --- a/examples/benchmarks_dynamic/DDG-DA/README.md +++ b/examples/benchmarks_dynamic/DDG-DA/README.md @@ -4,5 +4,5 @@ # Introduction -The data in the paper is private. So we conduct the experiments on Qlib's public dataset. -Thought the dataset is different, but the conclusion is the same. +The data in the paper are private. So we conduct experiments on Qlib's public dataset. +Though the dataset is different, the conclusions remains same. 
From a4a2b32a317d2211fe9d6e7643927a4dbe302d6a Mon Sep 17 00:00:00 2001 From: demon143 <785696300@qq.com> Date: Sat, 8 Jan 2022 11:34:50 +0000 Subject: [PATCH 22/37] Update & fix some bugs. --- .../workflow_config_lightgbm_Alpha158.yaml | 1 + .../workflow_config_linear_Alpha158.yaml | 2 +- .../benchmarks_dynamic/DDG-DA/workflow.py | 40 ++++++++++++++----- .../baseline/rolling_benchmark.py | 27 ++++++++----- qlib/contrib/data/handler.py | 4 +- qlib/contrib/meta/data_selection/dataset.py | 6 +++ qlib/contrib/meta/data_selection/model.py | 9 ++++- 7 files changed, 63 insertions(+), 26 deletions(-) diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml index 2d441dea92..1b48150532 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml @@ -9,6 +9,7 @@ data_handler_config: &data_handler_config fit_start_time: 2008-01-01 fit_end_time: 2014-12-31 instruments: *market + label: ["Ref($close, -21) / Ref($close, -1) - 1"] port_analysis_config: &port_analysis_config strategy: class: TopkDropoutStrategy diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml index c4e4d8e21b..1a29f4b49f 100644 --- a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml +++ b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml @@ -22,7 +22,7 @@ data_handler_config: &data_handler_config - class: CSRankNorm kwargs: fields_group: label - label: ["Ref($close, -2) / Ref($close, -1) - 1"] + label: ["Ref($close, -21) / Ref($close, -1) - 1"] port_analysis_config: &port_analysis_config strategy: class: TopkDropoutStrategy diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index 9f07c2e626..4076a6db84 100644 --- 
a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -21,6 +21,9 @@ DIRNAME = Path(__file__).absolute().resolve().parent sys.path.append(str(DIRNAME.parent / "baseline")) from rolling_benchmark import RollingBenchmark # NOTE: sys.path is changed for import RollingBenchmark +import torch + +torch.manual_seed(43) class DDGDA: @@ -32,15 +35,17 @@ class DDGDA: - `rm -r mlruns` """ - def __init__(self) -> None: + def __init__(self, model_types=["linear", "linear"]) -> None: self.step = 20 # NOTE: # the horizon must match the meaning in the base task template - self.horizon = 1 + self.horizon = 20 self.meta_exp_name = "DDG-DA" + self.model_types = model_types # first for calculate IC, second for forecasting models' type def get_feature_importance(self): - rb = RollingBenchmark() + # this must be lightGBM, because it needs to get the feature importance + rb = RollingBenchmark(model_type="gbdt") task = rb.basic_task() model = init_instance_by_config(task["model"]) @@ -67,7 +72,7 @@ def dump_data_for_proxy_model(self): fi = self.get_feature_importance() col_selected = fi.nlargest(topk) - rb = RollingBenchmark() + rb = RollingBenchmark(model_type=self.model_types[0]) task = rb.basic_task() dataset = init_instance_by_config(task["dataset"]) prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) @@ -105,10 +110,12 @@ def dump_meta_ipt(self): Dump data for training meta model. 
This function will dump the input data for meta model """ - rb = RollingBenchmark() + # According to the experiments, the choice of the model type is very important for achieving good results + rb = RollingBenchmark(model_type=self.model_types[0]) sim_task = rb.basic_task() - sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 350}) + if self.model_types[0] == "gbdt": + sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150}) exp_name_sim = f"data_sim_s{self.step}" @@ -125,7 +132,7 @@ def train_meta_model(self): # 1) leverage the simplified proxy forecasting model to train meta model. # - Only the dataset part is important, in current version of meta model will integrate the - rb = RollingBenchmark() + rb = RollingBenchmark(model_type=self.model_types[0]) sim_task = rb.basic_task() proxy_forecast_model_task = { # "model": "qlib.contrib.model.linear.LinearModel", @@ -146,7 +153,7 @@ def train_meta_model(self): kwargs = dict( task_tpl=proxy_forecast_model_task, step=self.step, - segments=0.5, + segments=0.62, trunc_days=1 + self.horizon, hist_step_n=30, fill_method="max", @@ -163,7 +170,7 @@ def train_meta_model(self): # 3) train and logging meta model with R.start(experiment_name=self.meta_exp_name): R.log_params(**kwargs) - mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.0001) + mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=200) mm.fit(md) R.save_objects(model=mm) @@ -196,7 +203,7 @@ def meta_inference(self): hist_step_n = int(param["hist_step_n"]) fill_method = param.get("fill_method", "max") - rb = RollingBenchmark() + rb = RollingBenchmark(model_type=self.model_types[1]) task_l = rb.create_rolling_tasks() # 2.2) create meta dataset for final dataset @@ -226,7 +233,7 @@ def train_and_eval_tasks(self): """ with self._task_path.open("rb") as f: tasks = pickle.load(f) - rb = 
RollingBenchmark(rolling_exp="rolling_ds") + rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.model_types[1]) rb.train_rolling_tasks(tasks) rb.ens_rolling() rb.update_rolling_rec() @@ -245,6 +252,17 @@ def run_all(self): # 5) load the saved tasks and train model self.train_and_eval_tasks() + def from_meta(self): + # debug only + self.train_meta_model() + self.meta_inference() + self.train_and_eval_tasks() + + def from_inference(self): + # debug only + self.meta_inference() + self.train_and_eval_tasks() + if __name__ == "__main__": auto_init() diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index 04436db757..d84f5e7d9a 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -24,25 +24,32 @@ class RollingBenchmark: """ - def __init__(self, rolling_exp="rolling_models") -> None: + def __init__(self, rolling_exp="rolling_models", model_type="linear") -> None: self.step = 20 - self.horizon = 1 + self.horizon = 20 self.rolling_exp = rolling_exp + self.model_type = model_type def basic_task(self): """For fast training rolling""" - conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml" + if self.model_type == "gbdt": + conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml" + # dump the processed data on to disk for later loading to speed up the processing + h_path = DIRNAME / "lightgbm_alpha158_handler.pkl" + elif self.model_type == "linear": + conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml" + h_path = DIRNAME / "linear_alpha158_handler.pkl" + else: + raise AssertionError("Model type is not supported!") with conf_path.open("r") as f: conf = yaml.safe_load(f) - task = conf["task"] - # dump the processed data on to disk for later loading to speed 
up the processing - h_path = DIRNAME / "lightgbm_alpha158_handler.pkl" + task = conf["task"] - if not h_path.exists(): - h_conf = task["dataset"]["kwargs"]["handler"] - h = init_instance_by_config(h_conf) - h.to_pickle(h_path, dump_all=True) + # if not h_path.exists(): + h_conf = task["dataset"]["kwargs"]["handler"] + h = init_instance_by_config(h_conf) + h.to_pickle(h_path, dump_all=True) task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" task["record"] = ["qlib.workflow.record_temp.SignalRecord"] diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index ccd753006d..1fe5d2b850 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -130,7 +130,7 @@ def get_feature_config(self): class Alpha360vwap(Alpha360): def get_label_config(self): - return (["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"]) + return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) class Alpha158(DataHandlerLP): @@ -371,4 +371,4 @@ def parse_config_to_fields(config): class Alpha158vwap(Alpha158): def get_label_config(self): - return (["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"]) + return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index c1c1811e3d..94238671a4 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -180,6 +180,12 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO test_idx=d_test["label"].index, ) ) + + # debug: record the test period of the current meta-task instance + self.test_period = ( + d_test["feature"].index.get_level_values("datetime")[0], + d_test["feature"].index.get_level_values("datetime")[-1], + ) # TODO: set device: I think this is not necessary to converting data format. 
self.processed_meta_input = data_to_tensor(self.processed_meta_input) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index a3de4023f8..982339e399 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -53,7 +53,7 @@ def __init__( clip_weight=2.0, criterion="ic_loss", lr=0.0001, - max_epoch=150, + max_epoch=100, ): self.step = step self.hist_step_n = hist_step_n @@ -65,7 +65,7 @@ def __init__( self.fitted = False def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): - if phase == "train": # phase 0 for training, 1 for inference + if phase == "train": self.tn.train() torch.set_grad_enabled(True) else: @@ -141,6 +141,11 @@ def fit(self, meta_dataset: MetaDatasetDS): phases = ["train", "test"] meta_tasks_l = meta_dataset.prepare_tasks(phases) + if len(meta_tasks_l[1]): + R.log_params( + **dict(proxy_test_begin=meta_tasks_l[1][0].test_period) + ) # debug: record when the test phase starts + self.tn = PredNet( step=self.step, hist_step_n=self.hist_step_n, clip_weight=self.clip_weight, clip_method=self.clip_method ) From 82418320123556210b0ce0a7716b7a1f03c76e37 Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Sat, 8 Jan 2022 13:25:09 +0000 Subject: [PATCH 23/37] Update configuration & remove debug functions --- examples/benchmarks_dynamic/DDG-DA/workflow.py | 11 ----------- qlib/contrib/data/handler.py | 4 ++-- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index 4076a6db84..ff636edb15 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -252,17 +252,6 @@ def run_all(self): # 5) load the saved tasks and train model self.train_and_eval_tasks() - def from_meta(self): - # debug only - self.train_meta_model() - self.meta_inference() - self.train_and_eval_tasks() - - def 
from_inference(self): - # debug only - self.meta_inference() - self.train_and_eval_tasks() - if __name__ == "__main__": auto_init() diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index 1fe5d2b850..ccd753006d 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -130,7 +130,7 @@ def get_feature_config(self): class Alpha360vwap(Alpha360): def get_label_config(self): - return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + return (["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"]) class Alpha158(DataHandlerLP): @@ -371,4 +371,4 @@ def parse_config_to_fields(config): class Alpha158vwap(Alpha158): def get_label_config(self): - return (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + return (["Ref($vwap, -2)/Ref($vwap, -1) - 1"], ["LABEL0"]) From e1b079d1a477e87223d7bedc568e12ae6ab31a19 Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Sun, 9 Jan 2022 01:44:42 +0000 Subject: [PATCH 24/37] Update README.md --- examples/benchmarks_dynamic/REAMDE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md index dbdd9b4adf..c6fac9d2ed 100644 --- a/examples/benchmarks_dynamic/REAMDE.md +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -6,5 +6,5 @@ Modeling the dynamic of market is a very important problem in Quant research. 
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------| -| RR[LightGBM] | | | | | | | | | -| DDG-DA[LightGBM] | | | | | | | | | +| RR[Linear] |Alpha158 |0.088|0.570|0.102 |0.622 |0.077 |1.175 |-0.086 | +| DDG-DA[Linear] |Alpha158 |0.093|0.622|0.106 |0.670 |0.085 |1.213 |-0.093 | \ No newline at end of file From 6a3f4713046b310cc12de78cf047eaf3f6e4792a Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Sun, 9 Jan 2022 09:33:20 +0000 Subject: [PATCH 25/37] Modify horizon from code rather than yaml --- .../LightGBM/workflow_config_lightgbm_Alpha158.yaml | 1 - .../benchmarks/Linear/workflow_config_linear_Alpha158.yaml | 1 - examples/benchmarks_dynamic/baseline/rolling_benchmark.py | 5 +++++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml index 1b48150532..2d441dea92 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml @@ -9,7 +9,6 @@ data_handler_config: &data_handler_config fit_start_time: 2008-01-01 fit_end_time: 2014-12-31 instruments: *market - label: ["Ref($close, -21) / Ref($close, -1) - 1"] port_analysis_config: &port_analysis_config strategy: class: TopkDropoutStrategy diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml index 1a29f4b49f..290a8bc42d 100644 --- a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml +++ b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml @@ -22,7 +22,6 @@ data_handler_config: &data_handler_config - class: CSRankNorm kwargs: fields_group: label - label: ["Ref($close, -21) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config strategy: class: TopkDropoutStrategy diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index d84f5e7d9a..18f6a8a1d7 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -44,6 +44,11 @@ def basic_task(self): with conf_path.open("r") as f: conf = yaml.safe_load(f) + # modify dataset horizon + conf["task"]["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [ + "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1) + ] + task = conf["task"] # if not h_path.exists(): From c3364cdc11108008960d98bc62a7bb1dc152813d Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Sun, 9 Jan 2022 11:09:22 +0000 Subject: [PATCH 26/37] Update performance in README.md --- examples/benchmarks_dynamic/REAMDE.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md index c6fac9d2ed..17ac8d4c2b 100644 --- a/examples/benchmarks_dynamic/REAMDE.md +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -6,5 +6,7 @@ Modeling the dynamic of market is a very important problem in Quant research. 
| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------| -| RR[Linear] |Alpha158 |0.088|0.570|0.102 |0.622 |0.077 |1.175 |-0.086 | -| DDG-DA[Linear] |Alpha158 |0.093|0.622|0.106 |0.670 |0.085 |1.213 |-0.093 | \ No newline at end of file +| RR[Linear] |Alpha158 |0.088|0.570|0.102 |0.622 |0.077 |1.175 |-0.086 | +| DDG-DA[Linear] |Alpha158 |0.093|0.622|0.106 |0.670 |0.085 |1.213 |-0.093 | +| RR[LightGBM] |Alpha158 |0.079|0.566|0.088 |0.592 |0.075 |1.226 |-0.096 | +| DDG-DA[LightGBM] |Alpha158 |0.084|0.639|0.093 |0.664 |0.099 |1.442 |-0.071 | \ No newline at end of file From fa2d047fff1232b9091d526e739b099bc2bf8e37 Mon Sep 17 00:00:00 2001 From: Young Date: Sun, 9 Jan 2022 20:38:50 +0800 Subject: [PATCH 27/37] fix part comments --- README.md | 12 ++++++---- .../benchmarks_dynamic/DDG-DA/workflow.py | 22 +++++++++---------- examples/benchmarks_dynamic/REAMDE.md | 12 ++++++++-- qlib/contrib/meta/data_selection/dataset.py | 18 +++++---------- qlib/contrib/meta/data_selection/model.py | 7 +++--- qlib/contrib/meta/data_selection/utils.py | 16 -------------- qlib/contrib/torch.py | 2 +- 7 files changed, 38 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 48873ed2e1..032ae06dfd 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Recent released features | Feature | Status | | -- | ------ | +| Meta-Learning-based framework & DDG-DA | [Released](https://github.com/microsoft/qlib/pull/743) on Jan 10, 2022 | | Planning-based portfolio optimization | [Released](https://github.com/microsoft/qlib/pull/754) on Dec 28, 2021 | | Release Qlib v0.8.0 | [Released](https://github.com/microsoft/qlib/releases/tag/v0.8.0) on Dec 8, 2021 | | ADD model | [Released](https://github.com/microsoft/qlib/pull/704) on Nov 22, 2021 | @@ -50,9 +51,13 @@ For more details, please refer to our paper 
["Qlib: An AI-oriented Quantitative - [Data Preparation](#data-preparation) - [Auto Quant Research Workflow](#auto-quant-research-workflow) - [Building Customized Quant Research Workflow by Code](#building-customized-quant-research-workflow-by-code) -- [**Quant Model(Paper) Zoo**](#quant-model-paper-zoo) - - [Run a single model](#run-a-single-model) - - [Run multiple models](#run-multiple-models) +- [ Main Challenges & Solution in Quant Research ] + - [Forecasting: Finding valuable signals/patterns] + - [**Quant Model(Paper) Zoo**](#quant-model-paper-zoo) + - [Run a single model](#run-a-single-model) + - [Run multiple models](#run-multiple-models) + - [Adapting to market dynamics] + - RR & DDG-DA - [**Quant Dataset Zoo**](#quant-dataset-zoo) - [More About Qlib](#more-about-qlib) - [Offline Mode and Online Mode](#offline-mode-and-online-mode) @@ -69,7 +74,6 @@ Your feedbacks about the features are very important. | -- | ------ | | Point-in-Time database | Under review: https://github.com/microsoft/qlib/pull/343 | | Orderbook database | Under review: https://github.com/microsoft/qlib/pull/744 | -| Meta-Learning-based data selection | Under review: https://github.com/microsoft/qlib/pull/743 | # Framework of Qlib diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index ff636edb15..4479450925 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -21,9 +21,6 @@ DIRNAME = Path(__file__).absolute().resolve().parent sys.path.append(str(DIRNAME.parent / "baseline")) from rolling_benchmark import RollingBenchmark # NOTE: sys.path is changed for import RollingBenchmark -import torch - -torch.manual_seed(43) class DDGDA: @@ -35,13 +32,14 @@ class DDGDA: - `rm -r mlruns` """ - def __init__(self, model_types=["linear", "linear"]) -> None: + def __init__(self, sim_task_model="linear", forecast_model="linear"): self.step = 20 # NOTE: # the horizon must match the 
meaning in the base task template self.horizon = 20 self.meta_exp_name = "DDG-DA" - self.model_types = model_types # first for calculate IC, second for forecasting models' type + self.sim_task_model = sim_task_model # The model to capture the distribution of data. + self.forecast_model = forecast_model # downstream forecasting models' type def get_feature_importance(self): # this must be lightGBM, because it needs to get the feature importance @@ -72,7 +70,7 @@ def dump_data_for_proxy_model(self): fi = self.get_feature_importance() col_selected = fi.nlargest(topk) - rb = RollingBenchmark(model_type=self.model_types[0]) + rb = RollingBenchmark(model_type=self.sim_task_model) task = rb.basic_task() dataset = init_instance_by_config(task["dataset"]) prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) @@ -111,10 +109,10 @@ def dump_meta_ipt(self): This function will dump the input data for meta model """ # According to the experiments, the choice of the model type is very important for achieving good results - rb = RollingBenchmark(model_type=self.model_types[0]) + rb = RollingBenchmark(model_type=self.sim_task_model) sim_task = rb.basic_task() - if self.model_types[0] == "gbdt": + if self.sim_task_model == "gbdt": sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150}) exp_name_sim = f"data_sim_s{self.step}" @@ -132,7 +130,7 @@ def train_meta_model(self): # 1) leverage the simplified proxy forecasting model to train meta model. 
# - Only the dataset part is important, in current version of meta model will integrate the - rb = RollingBenchmark(model_type=self.model_types[0]) + rb = RollingBenchmark(model_type=self.sim_task_model) sim_task = rb.basic_task() proxy_forecast_model_task = { # "model": "qlib.contrib.model.linear.LinearModel", @@ -170,7 +168,7 @@ def train_meta_model(self): # 3) train and logging meta model with R.start(experiment_name=self.meta_exp_name): R.log_params(**kwargs) - mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=200) + mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=200, seed=43) mm.fit(md) R.save_objects(model=mm) @@ -203,7 +201,7 @@ def meta_inference(self): hist_step_n = int(param["hist_step_n"]) fill_method = param.get("fill_method", "max") - rb = RollingBenchmark(model_type=self.model_types[1]) + rb = RollingBenchmark(model_type=self.forecast_model) task_l = rb.create_rolling_tasks() # 2.2) create meta dataset for final dataset @@ -233,7 +231,7 @@ def train_and_eval_tasks(self): """ with self._task_path.open("rb") as f: tasks = pickle.load(f) - rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.model_types[1]) + rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.forecast_model) rb.train_rolling_tasks(tasks) rb.ens_rolling() rb.update_rolling_rec() diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md index 17ac8d4c2b..05d3ce2c9f 100644 --- a/examples/benchmarks_dynamic/REAMDE.md +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -1,6 +1,14 @@ # Introduction -Modeling the dynamic of market is a very important problem in Quant research. +General problem. +Quant problem. +- Modeling the dynamic of market is a very important problem in Quant research. + +setting:.... +- horizon + +Target ..... + @@ -9,4 +17,4 @@ Modeling the dynamic of market is a very important problem in Quant research. 
| RR[Linear] |Alpha158 |0.088|0.570|0.102 |0.622 |0.077 |1.175 |-0.086 | | DDG-DA[Linear] |Alpha158 |0.093|0.622|0.106 |0.670 |0.085 |1.213 |-0.093 | | RR[LightGBM] |Alpha158 |0.079|0.566|0.088 |0.592 |0.075 |1.226 |-0.096 | -| DDG-DA[LightGBM] |Alpha158 |0.084|0.639|0.093 |0.664 |0.099 |1.442 |-0.071 | \ No newline at end of file +| DDG-DA[LightGBM] |Alpha158 |0.084|0.639|0.093 |0.664 |0.099 |1.442 |-0.071 | diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index 94238671a4..da8bb69876 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -147,18 +147,14 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO # these three lines occupied 70% of the time of initializing MetaTaskDS d_train, d_test = ds.prepare(["train", "test"], col_set=["feature", "label"]) prev_size = d_test.shape[0] - # print(d_test.groupby("datetime").size()) d_train = d_train.dropna(axis=0) d_test = d_test.dropna(axis=0) - # print(d_test.groupby("datetime").size()) if prev_size == 0 or d_test.shape[0] / prev_size <= 0.1: - __import__("ipdb").set_trace() - raise ValueError(f"Most of samples are dropped. Skip this task: {task}") + raise ValueError(f"Most of samples are dropped. Please check this task: {task}") - if globals().get("YX_CONFIRM_XXX") is None: - if d_test.groupby("datetime").size().shape[0] < 5: - __import__("ipdb").set_trace() - # globals()["YX_CONFIRM_XXX"] = True + assert ( + d_test.groupby("datetime").size().shape[0] >= 5 + ), "In this segment, this trading dates is less than 5, you'd better check the data." 
sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1])) for i, col in enumerate(time_perf.columns): @@ -190,7 +186,6 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO self.processed_meta_input = data_to_tensor(self.processed_meta_input) def _get_processed_meta_info(self): - # __import__('ipdb').set_trace() meta_info_norm = self.meta_info.sub(self.meta_info.mean(axis=1), axis=0) # .fillna(0.) if self.fill_method == "max": meta_info_norm = meta_info_norm.T.fillna( @@ -295,10 +290,7 @@ def __init__( self.task_list.append(t) except ValueError as e: logger.warning(f"ValueError: {e}") - if globals().get("YX_CONFIRM_XXX") is None: - if len(self.meta_task_l) <= 0: - __import__("ipdb").set_trace() - # globals()["YX_CONFIRM_XXX"] = True + assert len(self.meta_task_l) > 0, "No meta tasks found. Please check the data and setting" def _prepare_meta_ipt(self, task): ic_df = self.internal_data.data_ic_df diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 982339e399..5a216a1861 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -18,7 +18,7 @@ from ....model.meta.model import MetaModel, MetaTaskModel from ....workflow import R -from .utils import fill_diagnal, ICLoss +from .utils import ICLoss from .dataset import MetaDatasetDS from qlib.contrib.meta.data_selection.net import PredNet from qlib.data.dataset.weight import Reweighter @@ -54,6 +54,7 @@ def __init__( criterion="ic_loss", lr=0.0001, max_epoch=100, + seed=43, ): self.step = step self.hist_step_n = hist_step_n @@ -63,6 +64,7 @@ def __init__( self.lr = lr self.max_epoch = max_epoch self.fitted = False + torch.manual_seed(43) def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): if phase == "train": @@ -94,8 +96,7 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): 
get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss") continue - if np.isnan(loss.detach().item()): - __import__("ipdb").set_trace() + assert not np.isnan(loss.detach().item()), "NaN loss!" if phase == "train": opt.zero_grad() diff --git a/qlib/contrib/meta/data_selection/utils.py b/qlib/contrib/meta/data_selection/utils.py index bb080747ef..8d7dcf2e4a 100644 --- a/qlib/contrib/meta/data_selection/utils.py +++ b/qlib/contrib/meta/data_selection/utils.py @@ -8,22 +8,6 @@ from qlib.contrib.torch import data_to_tensor -def fill_diagnal(sim_mat): - sim_mat = sim_mat.copy() - # Remove the future information - sim_mat_past = sim_mat.where(sim_mat.index.values.reshape(-1, 1) > sim_mat.columns.values) - sim_mat.values[sim_mat.index.values.reshape(-1, 1) == sim_mat.columns.values] = sim_mat_past.max(axis=1) - sim_mat.iloc[0, 0] = 0.0 - return sim_mat - - -def get_sim_mat_idx(i_sim_mat, outsample_period): - for idx in range(len(i_sim_mat.index)): - if i_sim_mat.index[idx][0] == outsample_period[0]: - return idx - raise AssertionError("Not Found!") - - class ICLoss(nn.Module): def forward(self, pred, y, idx, skip_size=50): """forward. diff --git a/qlib/contrib/torch.py b/qlib/contrib/torch.py index 91deb0b7a8..293785c559 100644 --- a/qlib/contrib/torch.py +++ b/qlib/contrib/torch.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. """ This module is not a necessary part of Qlib. - It is just some tools for convenience + They are just some tools for convenience It is should not imported into the core part of qlib """ import torch From efab5cbb2870c8b8172714a2d58956f230aa70ee Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 00:49:36 +0000 Subject: [PATCH 28/37] Remove unfinished TCTS. 
--- qlib/contrib/meta/TCTS/model.py | 123 -------------------------------- qlib/contrib/meta/TCTS/net.py | 54 -------------- 2 files changed, 177 deletions(-) delete mode 100644 qlib/contrib/meta/TCTS/model.py delete mode 100644 qlib/contrib/meta/TCTS/net.py diff --git a/qlib/contrib/meta/TCTS/model.py b/qlib/contrib/meta/TCTS/model.py deleted file mode 100644 index b270a114be..0000000000 --- a/qlib/contrib/meta/TCTS/model.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import pandas as pd -import numpy as np -import torch -from torch import nn -from torch import optim -import copy -import logging - -from .net import MLPModel - -from ....data.dataset import DatasetH - - -class MetaModelTCTS(MetaGuideModel): - """ - The meta-model for TCTS - """ - - def __init__( - self, - d_feat=6, - hidden_size=64, - num_layers=2, - dropout=0.0, - n_epochs=200, - batch_size=2000, - early_stop=20, - loss="mse", - optimizer="adam", - output_dim=5, - lr=5e-7, - steps=3, - GPU=0, - seed=None, - target_label=0, - **kwargs - ): - # Set logger. - self.logger = get_module_logger("TCTS") - self.logger.info("TCTS pytorch version...") - - # set hyper-parameters. 
- self.d_feat = d_feat - self.hidden_size = hidden_size - self.num_layers = num_layers - self.dropout = dropout - self.n_epochs = n_epochs - self.batch_size = batch_size - self.early_stop = early_stop - self.loss = loss - self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() else "cpu") - self.use_gpu = torch.cuda.is_available() - self.seed = seed - self.output_dim = output_dim - self.lr = lr - self.steps = steps - self.target_label = target_label - - self.logger.info( - "TCTS parameters setting:" - "\nd_feat : {}" - "\nhidden_size : {}" - "\nnum_layers : {}" - "\ndropout : {}" - "\nn_epochs : {}" - "\nbatch_size : {}" - "\nearly_stop : {}" - "\nloss_type : {}" - "\nvisible_GPU : {}" - "\nuse_GPU : {}" - "\nseed : {}".format( - d_feat, - hidden_size, - num_layers, - dropout, - n_epochs, - batch_size, - early_stop, - loss, - GPU, - self.use_gpu, - seed, - ) - ) - - if self.seed is not None: - np.random.seed(self.seed) - torch.manual_seed(self.seed) - - self.weight_model = MLPModel( - d_feat=360 + 2 * self.output_dim + 1, - hidden_size=self.hidden_size, - num_layers=self.num_layers, - dropout=self.dropout, - output_dim=self.output_dim, - ) - if optimizer.lower() == "adam": - self.optimizer = optim.Adam(self.weight_model.parameters(), lr=self.lr) - elif optimizer.lower() == "gd": - self.optimizer = optim.SGD(self.weight_model.parameters(), lr=self.lr) - else: - raise NotImplementedError("optimizer {} is not supported!".format(optimizer)) - - self.fitted = False - self.weight_model.to(self.device) - - def loss_fn(self, pred, label, weight): - - loc = torch.argmax(weight, 1) - loss = (pred - label[np.arange(weight.shape[0]), loc]) ** 2 - return torch.mean(loss) - - def fit( - self, - dataset: DatasetH, - evals_result=dict(), - verbose=True, - save_path=None, - ): - pass diff --git a/qlib/contrib/meta/TCTS/net.py b/qlib/contrib/meta/TCTS/net.py deleted file mode 100644 index 24128c63b7..0000000000 --- a/qlib/contrib/meta/TCTS/net.py +++ /dev/null @@ 
-1,54 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -import numpy as np -import pandas as pd -import torch -import torch.nn as nn -import torch.optim as optim - - -class MLPModel(nn.Module): - def __init__(self, d_feat, hidden_size=256, num_layers=3, dropout=0.0, output_dim=1): - super().__init__() - - self.mlp = nn.Sequential() - self.softmax = nn.Softmax(dim=1) - - for i in range(num_layers): - if i > 0: - self.mlp.add_module("drop_%d" % i, nn.Dropout(dropout)) - self.mlp.add_module("fc_%d" % i, nn.Linear(d_feat if i == 0 else hidden_size, hidden_size)) - self.mlp.add_module("relu_%d" % i, nn.ReLU()) - - self.mlp.add_module("fc_out", nn.Linear(hidden_size, output_dim)) - - def forward(self, x): - # feature - # [N, F] - out = self.mlp(x).squeeze() - out = self.softmax(out) - return out - - -class GRUModel(nn.Module): - def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0): - super().__init__() - - self.rnn = nn.GRU( - input_size=d_feat, - hidden_size=hidden_size, - num_layers=num_layers, - batch_first=True, - dropout=dropout, - ) - self.fc_out = nn.Linear(hidden_size, 1) - - self.d_feat = d_feat - - def forward(self, x): - # x: [N, F*T] - x = x.reshape(len(x), self.d_feat, -1) # [N, F, T] - x = x.permute(0, 2, 1) # [N, T, F] - out, _ = self.rnn(x) - return self.fc_out(out[:, -1, :]).squeeze() From 5a184ebd85edae995e9fc972d3473f2c1b83ddbe Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 02:48:43 +0000 Subject: [PATCH 29/37] Fix some details. 
--- examples/benchmarks_dynamic/DDG-DA/workflow.py | 2 +- .../benchmarks_dynamic/baseline/rolling_benchmark.py | 12 ++++++------ qlib/contrib/meta/data_selection/dataset.py | 5 ----- qlib/contrib/meta/data_selection/model.py | 4 ++-- qlib/contrib/meta/data_selection/net.py | 6 ++---- 5 files changed, 11 insertions(+), 18 deletions(-) diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index 4479450925..010a759476 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -151,7 +151,7 @@ def train_meta_model(self): kwargs = dict( task_tpl=proxy_forecast_model_task, step=self.step, - segments=0.62, + segments=0.62, # keep test period consistent with the dataset yaml trunc_days=1 + self.horizon, hist_step_n=30, fill_method="max", diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index 18f6a8a1d7..0459fa26be 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -35,10 +35,10 @@ def basic_task(self): if self.model_type == "gbdt": conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml" # dump the processed data on to disk for later loading to speed up the processing - h_path = DIRNAME / "lightgbm_alpha158_handler.pkl" + h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon) elif self.model_type == "linear": conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml" - h_path = DIRNAME / "linear_alpha158_handler.pkl" + h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon) else: raise AssertionError("Model type is not supported!") with conf_path.open("r") as f: @@ -51,10 +51,10 @@ def basic_task(self): task = conf["task"] - # if not h_path.exists(): - h_conf 
= task["dataset"]["kwargs"]["handler"] - h = init_instance_by_config(h_conf) - h.to_pickle(h_path, dump_all=True) + if not h_path.exists(): + h_conf = task["dataset"]["kwargs"]["handler"] + h = init_instance_by_config(h_conf) + h.to_pickle(h_path, dump_all=True) task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" task["record"] = ["qlib.workflow.record_temp.SignalRecord"] diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index da8bb69876..f907af5359 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -177,11 +177,6 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO ) ) - # debug: record the test period of the current meta-task instance - self.test_period = ( - d_test["feature"].index.get_level_values("datetime")[0], - d_test["feature"].index.get_level_values("datetime")[-1], - ) # TODO: set device: I think this is not necessary to converting data format. 
self.processed_meta_input = data_to_tensor(self.processed_meta_input) diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 5a216a1861..c2106348a5 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -64,7 +64,7 @@ def __init__( self.lr = lr self.max_epoch = max_epoch self.fitted = False - torch.manual_seed(43) + torch.manual_seed(seed) def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): if phase == "train": @@ -144,7 +144,7 @@ def fit(self, meta_dataset: MetaDatasetDS): if len(meta_tasks_l[1]): R.log_params( - **dict(proxy_test_begin=meta_tasks_l[1][0].test_period) + **dict(proxy_test_begin=meta_tasks_l[1][0].task["dataset"]["kwargs"]["segments"]["test"]) ) # debug: record when the test phase starts self.tn = PredNet( diff --git a/qlib/contrib/meta/data_selection/net.py b/qlib/contrib/meta/data_selection/net.py index 73fd12061d..c8b15d7500 100644 --- a/qlib/contrib/meta/data_selection/net.py +++ b/qlib/contrib/meta/data_selection/net.py @@ -11,7 +11,7 @@ class TimeWeightMeta(SingleMetaBase): def __init__(self, hist_step_n, clip_weight=None, clip_method="clamp"): - # method 可以选 tanh 或者 clamp + # clip_method includes "tanh" or "clamp" super().__init__(hist_step_n, clip_weight, clip_method) self.linear = nn.Linear(hist_step_n, 1) self.k = nn.Parameter(torch.Tensor([8.0])) @@ -22,13 +22,11 @@ def forward(self, time_perf, time_belong=None, return_preds=False): time_perf = time_perf.reshape(hist_step_n, time_perf.shape[0] // hist_step_n, *time_perf.shape[1:]) time_perf = torch.mean(time_perf, dim=1, keepdim=False) - # time_perf的格式和其他的有一些不一样 - # 需要自己拆出train和test preds = [] for i in range(time_perf.shape[1]): preds.append(self.linear(time_perf[:, i])) preds = torch.cat(preds) - preds = preds - torch.mean(preds) # 这里注意一下不要引入未来信息 + preds = preds - torch.mean(preds) # avoid using future information preds = preds * self.k if return_preds: if 
time_belong is None: From 8fee1b41cb7ea733852a4f083a26bd6fd1ae377a Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 04:41:55 +0000 Subject: [PATCH 30/37] Update meta docs --- docs/component/meta.rst | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/component/meta.rst b/docs/component/meta.rst index 98eace5b42..b91c51e47f 100644 --- a/docs/component/meta.rst +++ b/docs/component/meta.rst @@ -8,7 +8,7 @@ Meta Controller: Meta-Task & Meta-Dataset & Meta-Model Introduction ============= -TODO: Add introduction. +``Meta Controller`` provides guidance to ``Forecast Model``, which aims to learn regular patterns among a series of forecasting tasks and use learned patterns to guide forthcoming forecasting tasks. Users can implement their own meta-model instance based on ``Meta Controller`` module. Meta Task ============= @@ -51,3 +51,16 @@ This type of meta-model participates in the training process of the base forecas .. autoclass:: qlib.model.meta.model.MetaGuideModel :members: + + +Example +============= +``Qlib`` provides an implementation of ``Meta Model`` module, ``DDG-DA``, +which adapts to the market dynamics. + +``DDG-DA`` includes four steps: + +1. Calculate meta-information and encapsulate it into ``Meta Task`` instances. All the meta-tasks form a ``Meta Dataset`` instance. +2. Train ``DDG-DA`` based on the training data of the meta-dataset. +3. Do the inference of the ``DDG-DA`` to get guide information. +4. Apply guide information to the forecasting models to improve their performances. 
From a31a4d58299420773854fede79b264241f36aab8 Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 05:11:13 +0000 Subject: [PATCH 31/37] Update README.md of the benchmarks_dynamic --- docs/component/meta.rst | 2 ++ examples/benchmarks_dynamic/REAMDE.md | 14 ++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/component/meta.rst b/docs/component/meta.rst index b91c51e47f..da69d6e26b 100644 --- a/docs/component/meta.rst +++ b/docs/component/meta.rst @@ -64,3 +64,5 @@ which adapts to the market dynamics. 2. Train ``DDG-DA`` based on the training data of the meta-dataset. 3. Do the inference of the ``DDG-DA`` to get guide information. 4. Apply guide information to the forecasting models to improve their performances. + +The above example has been given in ``examples/benchmarks_dynamic/DDG-DA/workflow.py``. diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md index 05d3ce2c9f..a84fbc5381 100644 --- a/examples/benchmarks_dynamic/REAMDE.md +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -1,16 +1,12 @@ # Introduction -General problem. -Quant problem. -- Modeling the dynamic of market is a very important problem in Quant research. - -setting:.... -- horizon - -Target ..... +Due to the non-stationary nature of the environment, the data distribution may change in different periods. However, there are still many cases that some underlying factors of environment evolution are predictable, making it possible to model the future trend of the streaming data. +Modeling the dynamics of the market is a very important problem in Quant research. On this page, we first provide the framework of periodically Rolling Retrain (RR) forecasting models so that forecasting models can learn up-to-date distributions when doing the forecasting process. 
Moreover, we implement a `Meta Model` module, `DDG-DA`, which effectively forecasts the evolution of data distribution and improves the performance of the RR forecasting models. +The table below shows the performances of the original RR and `DDG-DA` on different forecasting models. +## Alpha158 dataset | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------| @@ -18,3 +14,5 @@ Target ..... | DDG-DA[Linear] |Alpha158 |0.093|0.622|0.106 |0.670 |0.085 |1.213 |-0.093 | | RR[LightGBM] |Alpha158 |0.079|0.566|0.088 |0.592 |0.075 |1.226 |-0.096 | | DDG-DA[LightGBM] |Alpha158 |0.084|0.639|0.093 |0.664 |0.099 |1.442 |-0.071 | + +- The label horizon of the `Alpha158` dataset is set to 20. \ No newline at end of file From 97f61d582e014d7ece23cdb40fdcfbd3e38d0bd5 Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 06:38:14 +0000 Subject: [PATCH 32/37] Update README.md files --- README.md | 26 ++++++++++++-------- examples/benchmarks_dynamic/DDG-DA/README.md | 20 ++++++++++++--- examples/benchmarks_dynamic/REAMDE.md | 4 ++- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 032ae06dfd..6430b9be60 100644 --- a/README.md +++ b/README.md @@ -51,13 +51,12 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative - [Data Preparation](#data-preparation) - [Auto Quant Research Workflow](#auto-quant-research-workflow) - [Building Customized Quant Research Workflow by Code](#building-customized-quant-research-workflow-by-code) -- [ Main Challenges & Solution in Quant Research ] - - [Forecasting: Finding valuable signals/patterns] - - [**Quant Model(Paper) Zoo**](#quant-model-paper-zoo) - - [Run a single model](#run-a-single-model) - - [Run multiple models](#run-multiple-models) - - [Adapting to market dynamics] - - RR & DDG-DA +- [Main 
Challenges & Solutions in Quant Research](#main-challenges--solutions-in-quant-research) + - [Forecasting: Finding Valuable Signals/Patterns](#forecasting-finding-valuable-signalspatterns) + - [**Quant Model (Paper) Zoo**](#quant-model-paper-zoo) + - [Run a Single Model](#run-a-single-model) + - [Run Multiple Models](#run-multiple-models) + - [Adapting to Market Dynamics](#adapting-to-market-dynamics) - [**Quant Dataset Zoo**](#quant-dataset-zoo) - [More About Qlib](#more-about-qlib) - [Offline Mode and Online Mode](#offline-mode-and-online-mode) @@ -284,8 +283,13 @@ Qlib provides a tool named `qrun` to run the whole workflow automatically (inclu ## Building Customized Quant Research Workflow by Code The automatic workflow may not suit the research workflow of all Quant researchers. To support a flexible Quant research workflow, Qlib also provides a modularized interface to allow researchers to build their own workflow by code. [Here](examples/workflow_by_code.ipynb) is a demo for customized Quant research workflow by code. +# Main Challenges & Solutions in Quant Research +TODO (describe challenges). The solutions can be split into two aspects, which are doing modification on the forecasting process and doing higher-level adaptation in dynamic environments. -# [Quant Model (Paper) Zoo](examples/benchmarks) +## Forecasting: Finding Valuable Signals/Patterns +By finding more valuable signals or patterns, the forecasting process can be improved. This goal can be achieved by switching to better forecasting models or applying model ensembles. `Qlib` has implemented multiple forecasting models to fit various scenarios. + +### [Quant Model (Paper) Zoo](examples/benchmarks) Here is a list of models built on `Qlib`. - [GBDT based on XGBoost (Tianqi Chen, et al. KDD 2016)](examples/benchmarks/XGBoost/) @@ -312,7 +316,7 @@ Your PR of new Quant models is highly welcomed.
The performance of each model on the `Alpha158` and `Alpha360` dataset can be found [here](examples/benchmarks/README.md). -## Run a single model +### Run a single model All the models listed above are runnable with ``Qlib``. Users can find the config files we provide and some details about the model through the [benchmarks](examples/benchmarks) folder. More information can be retrieved at the model files listed above. `Qlib` provides three different ways to run a single model, users can pick the one that fits their cases best: @@ -322,7 +326,7 @@ All the models listed above are runnable with ``Qlib``. Users can find the confi - Users can use the script [`run_all_model.py`](examples/run_all_model.py) listed in the `examples` folder to run a model. Here is an example of the specific shell command to be used: `python run_all_model.py run --models=lightgbm`, where the `--models` arguments can take any number of models listed above(the available models can be found in [benchmarks](examples/benchmarks/)). For more use cases, please refer to the file's [docstrings](examples/run_all_model.py). - **NOTE**: Each baseline has different environment dependencies, please make sure that your python version aligns with the requirements(e.g. TFT only supports Python 3.6~3.7 due to the limitation of `tensorflow==1.15.0`) -## Run multiple models +### Run multiple models `Qlib` also provides a script [`run_all_model.py`](examples/run_all_model.py) which can run multiple models for several iterations. (**Note**: the script only support *Linux* for now. Other OS will be supported in the future. Besides, it doesn't support parallel running the same model for multiple times as well, and this will be fixed in the future development too.) The script will create a unique virtual environment for each model, and delete the environments after training. Thus, only experiment results such as `IC` and `backtest` results will be generated and stored. 
@@ -334,6 +338,8 @@ python run_all_model.py run 10 It also provides the API to run specific models at once. For more use cases, please refer to the file's [docstrings](examples/run_all_model.py). +## [Adapting to Market Dynamics](examples/benchmarks_dynamic) +Modeling the dynamics of the market is also a solution to improve forecasting performance. `Qlib` has provided examples to adapt to market dynamics. By modeling the future trend of the streaming data, the gap between the training data distribution and test data distribution can be narrowed, so that forecasting models can achieve better results. # Quant Dataset Zoo Dataset plays a very important role in Quant. Here is a list of the datasets built on `Qlib`: diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md index 63012f74ef..9eb26ffc00 100644 --- a/examples/benchmarks_dynamic/DDG-DA/README.md +++ b/examples/benchmarks_dynamic/DDG-DA/README.md @@ -1,8 +1,22 @@ +# Introduction +This is the implementation of `DDG-DA` based on `Meta Controller` component provided by `Qlib`. +## Background +In many real-world scenarios, we often deal with streaming data that is sequentially collected over time. Due to the non-stationary nature of the environment, the streaming data distribution may change in unpredictable ways, which is known as concept drift. To handle concept drift, previous methods first detect when/where the concept drift happens and then adapt models to fit the distribution of the latest data. However, there are still many cases that some underlying factors of environment evolution are predictable, making it possible to model the future concept drift trend of the streaming data, while such cases are not fully explored in previous work. +Therefore, we propose a novel method `DDG-DA`, that can effectively forecast the evolution of data distribution and improve the performance of models. 
Specifically, we first train a predictor to estimate the future data distribution, then leverage it to generate training samples, and finally train models on the generated data. -# Introduction +## Dataset +The data in the paper are private. So we conduct experiments on Qlib's public dataset. +Though the dataset is different, the conclusion remains the same. By applying `DDG-DA`, users can see rising trends at the test phase both in the proxy models' ICs and the performances of the forecasting models. +## Run the Code +Users can try `DDG-DA` by running the following command: +```bash + python workflow.py run_all +``` -The data in the paper are private. So we conduct experiments on Qlib's public dataset. -Though the dataset is different, the conclusions remains same. +The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `forecast_model` parameter when `DDG-DA` initializes. For example, users can try `LightGBM` forecasting models by running the following command: +```bash + python workflow.py --forecast_model="gbdt" run_all +``` \ No newline at end of file diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md index a84fbc5381..fb821d39b9 100644 --- a/examples/benchmarks_dynamic/REAMDE.md +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -15,4 +15,6 @@ The table below shows the performances of the original RR and `DDG-DA` on differ | RR[LightGBM] |Alpha158 |0.079|0.566|0.088 |0.592 |0.075 |1.226 |-0.096 | | DDG-DA[LightGBM] |Alpha158 |0.084|0.639|0.093 |0.664 |0.099 |1.442 |-0.071 | -- The label horizon of the `Alpha158` dataset is set to 20. \ No newline at end of file +- The label horizon of the `Alpha158` dataset is set to 20. +- The rolling time intervals are set to 20 trading days. +- The test rolling periods are from January 2017 to August 2020. 
\ No newline at end of file From da68103335b7a1d37b8b0070a00fd7900d0eb0ce Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 08:14:09 +0000 Subject: [PATCH 33/37] Add README.md to the rolling_benchmark baseline. --- examples/benchmarks_dynamic/baseline/REAMDE.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 examples/benchmarks_dynamic/baseline/REAMDE.md diff --git a/examples/benchmarks_dynamic/baseline/REAMDE.md b/examples/benchmarks_dynamic/baseline/REAMDE.md new file mode 100644 index 0000000000..17e10482db --- /dev/null +++ b/examples/benchmarks_dynamic/baseline/REAMDE.md @@ -0,0 +1,15 @@ +# Introduction + +This is the framework of periodically Rolling Retrain (RR) forecasting models. RR adapts to market dynamics by utilizing the up-to-date data periodically. + +## Run the Code +Users can try RR by running the following command: +```bash + python rolling_benchmark.py run_all +``` + +The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `model_type` parameter. 
+For example, users can try `LightGBM` forecasting models by running the following command: +```bash + python rolling_benchmark.py --model_type="gbdt" run_all +``` \ No newline at end of file From 7e1183b4a390af5dd08e2578973bdf5c896b0fc4 Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 10 Jan 2022 16:24:27 +0800 Subject: [PATCH 34/37] Refine the docs and link --- README.md | 29 ++++++++++++++------ examples/benchmarks_dynamic/DDG-DA/README.md | 7 ++++- examples/benchmarks_dynamic/REAMDE.md | 10 +++---- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 09f8ecd79a..eb8f21c854 100644 --- a/README.md +++ b/README.md @@ -51,12 +51,12 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative - [Data Preparation](#data-preparation) - [Auto Quant Research Workflow](#auto-quant-research-workflow) - [Building Customized Quant Research Workflow by Code](#building-customized-quant-research-workflow-by-code) -- [Main Challenges & Solutions in Quant Research](#Main-Challenges-&-Solutions-in-Quant-Research) - - [Forecasting: Finding Valuable Signals/Patterns](##Forecasting:-Finding-valuable-signals/patterns) - - [**Quant Model (Paper) Zoo**](###Quant-Model-(Paper)-Zoo) - - [Run a Single Model](###run-a-single-model) - - [Run Multiple Models](###run-multiple-models) - - [Adapting to Market Dynamics](##Adapting-to-Market-Dynamics) +- [Main Challenges & Solutions in Quant Research](#main-challenges--solutions-in-quant-research) + - [Forecasting: Finding Valuable Signals/Patterns](#forecasting-finding-valuable-signalspatterns) + - [**Quant Model (Paper) Zoo**](#quant-model-paper-zoo) + - [Run a Single Model](#run-a-single-model) + - [Run Multiple Models](#run-multiple-models) + - [Adapting to Market Dynamics](#adapting-to-market-dynamics) - [**Quant Dataset Zoo**](#quant-dataset-zoo) - [More About Qlib](#more-about-qlib) - [Offline Mode and Online Mode](#offline-mode-and-online-mode) @@ -284,10 +284,15 @@ Qlib provides 
a tool named `qrun` to run the whole workflow automatically (inclu The automatic workflow may not suit the research workflow of all Quant researchers. To support a flexible Quant research workflow, Qlib also provides a modularized interface to allow researchers to build their own workflow by code. [Here](examples/workflow_by_code.ipynb) is a demo for customized Quant research workflow by code. # Main Challenges & Solutions in Quant Research -TODO (describe challenges). The solutions can be split into two aspects, which are doing modification on the forecasting process and doing higher-level adaptation in dynamic environments. +Quant investment is a very unique scenario with lots of key challenges to be solved. +Currently, Qlib provides some solutions for several of them. ## Forecasting: Finding Valuable Signals/Patterns -By finding more valuable signals or patterns, the forecasting process can be improved. This goal can be achieved by switching to better forecasting models or applying model ensembles. `Qlib` has implemented multiple forecasting models to fit various scenarios. +Accurate forecasting of the stock price trend is a very important part of constructing profitable portfolios. +However, the huge amount of data with various formats in the financial market makes it challenging to build forecasting models. + +An increasing number of SOTA Quant research works/papers, which focus on building forecasting models to mine valuable signals/patterns in complex financial data, are released in `Qlib`. + ### [Quant Model (Paper) Zoo](examples/benchmarks) @@ -339,7 +344,13 @@ python run_all_model.py run 10 It also provides the API to run specific models at once. For more use cases, please refer to the file's [docstrings](examples/run_all_model.py). ## [Adapting to Market Dynamics](examples/benchmarks_dynamic) -Modeling the dynamics of the market is also a solution to improve forecasting performance. `Qlib` has provided examples to adapt to market dynamics.
By modeling the future trend of the streaming data, the gap between the training data distribution and test data distribution can be narrowed, so that forecasting models can achieve better results. + +Due to the non-stationary nature of the environment of the financial market, the data distribution may change in different periods, which makes the performance of models built on training data decay on future test data. +So adapting the forecasting models/strategies to market dynamics is very important to the model/strategies' performance. + +Here is a list of solutions built on `Qlib`. +- [Rolling Retraining](examples/benchmarks_dynamic/baseline/) +- [DDG-DA on pytorch (Wendi, et al. AAAI 2022)](examples/benchmarks_dynamic/DDG-DA/) # Quant Dataset Zoo Dataset plays a very important role in Quant. Here is a list of the datasets built on `Qlib`: diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md index 9eb26ffc00..741f715ce0 100644 --- a/examples/benchmarks_dynamic/DDG-DA/README.md +++ b/examples/benchmarks_dynamic/DDG-DA/README.md @@ -19,4 +19,9 @@ Users can try `DDG-DA` by running the following command: The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `forecast_model` parameter when `DDG-DA` initializes.
For example, users can try `LightGBM` forecasting models by running the following command: ```bash python workflow.py --forecast_model="gbdt" run_all -``` \ No newline at end of file +``` + + +## Results + +The results of other methods on Qlib's public dataset can be found [here](../) diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md index fb821d39b9..e6d09902a4 100644 --- a/examples/benchmarks_dynamic/REAMDE.md +++ b/examples/benchmarks_dynamic/REAMDE.md @@ -1,10 +1,8 @@ # Introduction +Due to the non-stationary nature of the environment of the financial market, the data distribution may change in different periods, which makes the performance of models built on training data decay on future test data. +So adapting the forecasting models/strategies to market dynamics is very important to the model/strategies' performance. -Due to the non-stationary nature of the environment, the data distribution may change in different periods. However, there are still many cases that some underlying factors of environment evolution are predictable, making it possible to model the future trend of the streaming data. - -Modeling the dynamics of the market is a very important problem in Quant research. On this page, we first provide the framework of periodically Rolling Retrain (RR) forecasting models so that forecasting models can learn up-to-date distributions when doing the forecasting process. Moreover, we implement a `Meta Model` module, `DDG-DA`, which effectively forecasts the evolution of data distribution and improves the performance of the RR forecasting models. - -The table below shows the performances of the original RR and `DDG-DA` on different forecasting models. +The table below shows the performances of different solutions on different forecasting models.
## Alpha158 dataset @@ -17,4 +15,4 @@ The table below shows the performances of the original RR and `DDG-DA` on differ - The label horizon of the `Alpha158` dataset is set to 20. - The rolling time intervals are set to 20 trading days. -- The test rolling periods are from January 2017 to August 2020. \ No newline at end of file +- The test rolling periods are from January 2017 to August 2020. From b0857c2bf265d7a8455b9c53fef1d512951e0a5c Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 08:28:16 +0000 Subject: [PATCH 35/37] Rename README.md in benchmarks_dynamic. --- examples/benchmarks_dynamic/baseline/{REAMDE.md => README.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/benchmarks_dynamic/baseline/{REAMDE.md => README.md} (100%) diff --git a/examples/benchmarks_dynamic/baseline/REAMDE.md b/examples/benchmarks_dynamic/baseline/README.md similarity index 100% rename from examples/benchmarks_dynamic/baseline/REAMDE.md rename to examples/benchmarks_dynamic/baseline/README.md From 38b83dd549c290d9336748775fc1f2b091c8a14b Mon Sep 17 00:00:00 2001 From: wendili-cs Date: Mon, 10 Jan 2022 08:44:12 +0000 Subject: [PATCH 36/37] Remove comments. 
--- examples/benchmarks/TFT/tft.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/benchmarks/TFT/tft.py b/examples/benchmarks/TFT/tft.py index 8b5c70190a..e6dd27e83e 100644 --- a/examples/benchmarks/TFT/tft.py +++ b/examples/benchmarks/TFT/tft.py @@ -209,7 +209,6 @@ def fit(self, dataset: DatasetH, MODEL_FOLDER="qlib_tft_model", USE_GPU_ID=0, ** fixed_params = self.data_formatter.get_experiment_params() params = self.data_formatter.get_default_model_params() - # Wendi: 合并调优的参数和非调优的参数 params = {**params, **fixed_params} if not os.path.exists(self.model_folder): From 34f5bd24fe01375dabb6996a7eb44423ffd84c95 Mon Sep 17 00:00:00 2001 From: Young Date: Mon, 10 Jan 2022 16:51:58 +0800 Subject: [PATCH 37/37] auto download data --- docs/component/meta.rst | 2 +- examples/benchmarks_dynamic/DDG-DA/workflow.py | 2 ++ examples/benchmarks_dynamic/baseline/rolling_benchmark.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/component/meta.rst b/docs/component/meta.rst index da69d6e26b..fa5061099b 100644 --- a/docs/component/meta.rst +++ b/docs/component/meta.rst @@ -65,4 +65,4 @@ which adapts to the market dynamics. 3. Do the inference of the ``DDG-DA`` to get guide information. 4. Apply guide information to the forecasting models to improve their performances. -The above example has been given in ``examples/benchmarks_dynamic/DDG-DA/workflow.py``. +The `above example `_ can be found in ``examples/benchmarks_dynamic/DDG-DA/workflow.py``. 
diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index 010a759476..e6f5df46d2 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -17,6 +17,7 @@ from qlib.utils import init_instance_by_config from qlib.workflow.task.gen import RollingGen, task_generator from qlib.workflow import R +from qlib.tests.data import GetData DIRNAME = Path(__file__).absolute().resolve().parent sys.path.append(str(DIRNAME.parent / "baseline")) @@ -252,5 +253,6 @@ def run_all(self): if __name__ == "__main__": + GetData().qlib_data(exists_skip=True) auto_init() fire.Fire(DDGDA) diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index 0459fa26be..c192cd4cd0 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -9,6 +9,7 @@ from tqdm.auto import tqdm from qlib.model.trainer import TrainerR from qlib.workflow import R +from qlib.tests.data import GetData DIRNAME = Path(__file__).absolute().resolve().parent from qlib.workflow.task.gen import task_generator, RollingGen @@ -108,5 +109,6 @@ def run_all(self): if __name__ == "__main__": + GetData().qlib_data(exists_skip=True) auto_init() fire.Fire(RollingBenchmark)