microsoft · you-n-g · Jan 10, 2022 · Jul 1, 2021 · Jul 1, 2021 · Jul 8, 2021
diff --git a/docs/component/meta.rst b/docs/component/meta.rst
@@ -0,0 +1,53 @@
+.. _meta:
+
+======================================================
+Meta Controller: Meta-Task & Meta-Dataset & Meta-Model
+======================================================
+.. currentmodule:: qlib
+
+
+Introduction
+=============
+TODO: Add introduction.
+
+Meta Task
+=============
+
+A `Meta Task` instance is the basic element in the meta-learning framework. It saves the data that can be used for the `Meta Model`. Multiple `Meta Task` instances may share the same `Data Handler`, controlled by `Meta Dataset`. Users should use `prepare_task_data()` to obtain the data that can be directly fed into the `Meta Model`.
+
+.. autoclass:: qlib.model.meta.task.MetaTask
+    :members:
+
+Meta Dataset
+=============
+
+`Meta Dataset` controls the meta-information generating process. It is responsible for providing the data used to train the `Meta Model`. Users should use `prepare_tasks` to retrieve a list of `Meta Task` instances.
+
+.. autoclass:: qlib.model.meta.dataset.MetaTaskDataset
+    :members:
+
+Meta Model
+=============
+
+General Meta Model
+------------------
+A `Meta Model` instance is the part that controls the workflow. The usage of the `Meta Model` includes:
+
+1. Users train their `Meta Model` with the `fit` function.
+2. The `Meta Model` instance guides the workflow by giving useful information via the `inference` function.
+
+.. autoclass:: qlib.model.meta.model.MetaModel
+    :members:
+
+Meta Task Model
+------------------
+This type of meta-model interacts with task definitions directly; such models should inherit from the `Meta Task Model` class. They guide the base tasks by modifying the base task definitions. The function `prepare_tasks` can be used to obtain the modified base task definitions.
+
+.. autoclass:: qlib.model.meta.model.MetaTaskModel
+    :members:
+
+Meta Guide Model
+------------------
+This type of meta-model participates in the training process of the base forecasting model. The meta-model may guide the base forecasting models during their training to improve their performance.
+
+.. autoclass:: qlib.model.meta.model.MetaGuideModel
+    :members:
diff --git a/docs/index.rst b/docs/index.rst
@@ -36,10 +36,11 @@ Document Structure
    :caption: COMPONENTS:
 
    Workflow: Workflow Management <component/workflow.rst>
-   Data Layer: Data Framework&Usage <component/data.rst>
+   Data Layer: Data Framework & Usage <component/data.rst>
    Forecast Model: Model Training & Prediction <component/model.rst>
    Portfolio Management and Backtest <component/strategy.rst>
    Nested Decision Execution: High-Frequency Trading <component/highfreq.rst>
+   Meta Controller: Meta-Task & Meta-Dataset & Meta-Model <component/meta.rst>
    Qlib Recorder: Experiment Management <component/recorder.rst>
    Analysis: Evaluation & Results Analysis <component/report.rst>
    Online Serving: Online Management & Strategy & Tool <component/online.rst>

diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml
@@ -22,7 +22,6 @@ data_handler_config: &data_handler_config
         - class: CSRankNorm
           kwargs:
               fields_group: label
-    label: ["Ref($close, -2) / Ref($close, -1) - 1"]
 port_analysis_config: &port_analysis_config
     strategy:
         class: TopkDropoutStrategy

diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md
@@ -0,0 +1,8 @@
+
+
+
+# Introduction
+
+
+The data used in the paper are private, so we conduct the experiments on Qlib's public dataset.
+Though the dataset is different, the conclusions remain the same.
diff --git a/examples/benchmarks_dynamic/DDG-DA/requirements.txt b/examples/benchmarks_dynamic/DDG-DA/requirements.txt
@@ -0,0 +1 @@
+torch==1.10.0 
diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py
@@ -0,0 +1,258 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from pathlib import Path
+from qlib.model.meta.task import MetaTask
+from qlib.contrib.meta.data_selection.model import MetaModelDS
+from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS
+from qlib.data.dataset.handler import DataHandlerLP
+
+import pandas as pd
+import fire
+import sys
+from tqdm.auto import tqdm
+import yaml
+import pickle
+from qlib import auto_init
+from qlib.model.trainer import TrainerR, task_train
+from qlib.utils import init_instance_by_config
+from qlib.workflow.task.gen import RollingGen, task_generator
+from qlib.workflow import R
+
+# Directory that contains this script; the pickle files produced by the workflow are written here.
+DIRNAME = Path(__file__).absolute().resolve().parent
+# Make the sibling "baseline" directory importable so that `rolling_benchmark` can be found below.
+sys.path.append(str(DIRNAME.parent / "baseline"))
+from rolling_benchmark import RollingBenchmark  # NOTE: sys.path is changed for import RollingBenchmark
+import torch
+
+# Seed torch's global random number generator (presumably to keep the meta-model training reproducible).
+torch.manual_seed(43)
+
+
+class DDGDA:
+    """
+    please run `python workflow.py run_all` to run the full workflow of the experiment
+
+    **NOTE**
+    before running the example, please clean your previous results with following command
+    - `rm -r mlruns`
+    """
+
+    def __init__(self, model_types=["linear", "linear"]) -> None:
+        self.step = 20
+        # NOTE:
+        # the horizon must match the meaning in the base task template
+        self.horizon = 20
+        self.meta_exp_name = "DDG-DA"
+        self.model_types = model_types  # first for calculate IC, second for forecasting models' type
+
+    def get_feature_importance(self):
+        # this must be lightGBM, because it needs to get the feature importance
+        rb = RollingBenchmark(model_type="gbdt")
+        task = rb.basic_task()
+
+        model = init_instance_by_config(task["model"])
+        dataset = init_instance_by_config(task["dataset"])
+        model.fit(dataset)
+
+        fi = model.get_feature_importance()
+
+        # Because the model use numpy instead of dataframe for training lightgbm
+        # So the we must use following extra steps to get the right feature importance
+        df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R)
+        cols = df.columns
+        fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()}
+
+        return pd.Series(fi_named)
+
+    def dump_data_for_proxy_model(self):
+        """
+        Dump data for training meta model.
+        The meta model will be trained upon the proxy forecasting model.
+        This dataset is for the proxy forecasting model.
+        """
+        topk = 30
+        fi = self.get_feature_importance()
+        col_selected = fi.nlargest(topk)
+
+        rb = RollingBenchmark(model_type=self.model_types[0])
+        task = rb.basic_task()
+        dataset = init_instance_by_config(task["dataset"])
+        prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+
+        feature_df = prep_ds["feature"]
+        label_df = prep_ds["label"]
+
+        feature_selected = feature_df.loc[:, col_selected.index]
+
+        feature_selected = feature_selected.groupby("datetime").apply(lambda df: (df - df.mean()).div(df.std()))
+        feature_selected = feature_selected.fillna(0.0)
+
+        df_all = {
+            "label": label_df.reindex(feature_selected.index),
+            "feature": feature_selected,
+        }
+        df_all = pd.concat(df_all, axis=1)
+        df_all.to_pickle(DIRNAME / "fea_label_df.pkl")
+
+        # dump data in handler format for aligning the interface
+        handler = DataHandlerLP(
+            data_loader={
+                "class": "qlib.data.dataset.loader.StaticDataLoader",
+                "kwargs": {"config": DIRNAME / "fea_label_df.pkl"},
+            }
+        )
+        handler.to_pickle(DIRNAME / "handler_proxy.pkl", dump_all=True)
+
+    @property
+    def _internal_data_path(self):
+        return DIRNAME / f"internal_data_s{self.step}.pkl"
+
+    def dump_meta_ipt(self):
+        """
+        Dump data for training meta model.
+        This function will dump the input data for meta model
+        """
+        # According to the experiments, the choice of the model type is very important for achieving good results
+        rb = RollingBenchmark(model_type=self.model_types[0])
+        sim_task = rb.basic_task()
+
+        if self.model_types[0] == "gbdt":
+            sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150})
+
+        exp_name_sim = f"data_sim_s{self.step}"
+
+        internal_data = InternalData(sim_task, self.step, exp_name=exp_name_sim)
+        internal_data.setup(trainer=TrainerR)
+
+        with self._internal_data_path.open("wb") as f:
+            pickle.dump(internal_data, f)
+
+    def train_meta_model(self):
+        """
+        training a meta model based on a simplified linear proxy model;
+        """
+
+        # 1) leverage the simplified proxy forecasting model to train meta model.
+        # - Only the dataset part is important, in current version of meta model will integrate the
+        rb = RollingBenchmark(model_type=self.model_types[0])
+        sim_task = rb.basic_task()
+        proxy_forecast_model_task = {
+            # "model": "qlib.contrib.model.linear.LinearModel",
+            "dataset": {
+                "class": "qlib.data.dataset.DatasetH",
+                "kwargs": {
+                    "handler": f"file://{(DIRNAME / 'handler_proxy.pkl').absolute()}",
+                    "segments": {
+                        "train": ("2008-01-01", "2010-12-31"),
+                        "test": ("2011-01-01", sim_task["dataset"]["kwargs"]["segments"]["test"][1]),
+                    },
+                },
+            },
+            # "record": ["qlib.workflow.record_temp.SignalRecord"]
+        }
+
+        # 2) preparing meta dataset
+        kwargs = dict(
+            task_tpl=proxy_forecast_model_task,
+            step=self.step,
+            segments=0.62,
+            trunc_days=1 + self.horizon,
+            hist_step_n=30,
+            fill_method="max",
+            rolling_ext_days=0,
+        )
+        # NOTE:
+        # the input of meta model (internal data) are shared between proxy model and final forecasting model
+        # but their task test segment are not aligned! It worked in my previous experiment.
+        # So the misalignment will not affect the effectiveness of the method.
+        with self._internal_data_path.open("rb") as f:
+            internal_data = pickle.load(f)
+        md = MetaDatasetDS(exp_name=internal_data, **kwargs)
+
+        # 3) train and logging meta model
+        with R.start(experiment_name=self.meta_exp_name):
+            R.log_params(**kwargs)
+            mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=200)
+            mm.fit(md)
+            R.save_objects(model=mm)
+
+    @property
+    def _task_path(self):
+        return DIRNAME / f"tasks_s{self.step}.pkl"
+
+    def meta_inference(self):
+        """
+        Leverage meta-model for inference:
+        - Given
+            - baseline tasks
+            - input for meta model(internal data)
+            - meta model (its learnt knowledge on proxy forecasting model is expected to transfer to normal forecasting model)
+        """
+        # 1) get meta model
+        exp = R.get_exp(experiment_name=self.meta_exp_name)
+        rec = exp.list_recorders(rtype=exp.RT_L)[0]
+        meta_model: MetaModelDS = rec.load_object("model")
+
+        # 2)
+        # we are transfer to knowledge of meta model to final forecasting tasks.
+        # Create MetaTaskDataset for the final forecasting tasks
+        # Aligning the setting of it to the MetaTaskDataset when training Meta model is necessary
+
+        # 2.1) get previous config
+        param = rec.list_params()
+        trunc_days = int(param["trunc_days"])
+        step = int(param["step"])
+        hist_step_n = int(param["hist_step_n"])
+        fill_method = param.get("fill_method", "max")
+
+        rb = RollingBenchmark(model_type=self.model_types[1])
+        task_l = rb.create_rolling_tasks()
+
+        # 2.2) create meta dataset for final dataset
+        kwargs = dict(
+            task_tpl=task_l,
+            step=step,
+            segments=0.0,  # all the tasks are for testing
+            trunc_days=trunc_days,
+            hist_step_n=hist_step_n,
+            fill_method=fill_method,
+            task_mode=MetaTask.PROC_MODE_TRANSFER,
+        )
+
+        with self._internal_data_path.open("rb") as f:
+            internal_data = pickle.load(f)
+        mds = MetaDatasetDS(exp_name=internal_data, **kwargs)
+
+        # 3) meta model make inference and get new qlib task
+        new_tasks = meta_model.inference(mds)
+        with self._task_path.open("wb") as f:
+            pickle.dump(new_tasks, f)
+
+    def train_and_eval_tasks(self):
+        """
+        Training the tasks generated by meta model
+        Then evaluate it
+        """
+        with self._task_path.open("rb") as f:
+            tasks = pickle.load(f)
+        rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.model_types[1])
+        rb.train_rolling_tasks(tasks)
+        rb.ens_rolling()
+        rb.update_rolling_rec()
+
+    def run_all(self):
+        # 1) file: handler_proxy.pkl
+        self.dump_data_for_proxy_model()
+        # 2)
+        # file: internal_data_s20.pkl
+        # mlflow: data_sim_s20, models for calculating meta_ipt
+        self.dump_meta_ipt()
+        # 3) meta model will be stored in `DDG-DA`
+        self.train_meta_model()
+        # 4) new_tasks are saved in "tasks_s20.pkl" (reweighter is added)
+        self.meta_inference()
+        # 5) load the saved tasks and train model
+        self.train_and_eval_tasks()
+
+
+if __name__ == "__main__":
+    # Call qlib's `auto_init` first, then let `fire` expose the methods of `DDGDA` on the command line
+    # (e.g. `python workflow.py run_all`).
+    auto_init()
+    fire.Fire(DDGDA)
diff --git a/examples/benchmarks_dynamic/REAMDE.md b/examples/benchmarks_dynamic/REAMDE.md
@@ -0,0 +1,12 @@
+# Introduction
+
+Modeling the dynamics of the market is a very important problem in quantitative research.
+
+
+
+| Model Name       | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
+|------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
+| RR[Linear]       |Alpha158 |0.088|0.570|0.102    |0.622      |0.077              |1.175              |-0.086        |
+| DDG-DA[Linear]   |Alpha158 |0.093|0.622|0.106    |0.670      |0.085              |1.213              |-0.093        |
+| RR[LightGBM]     |Alpha158 |0.079|0.566|0.088    |0.592      |0.075              |1.226              |-0.096        |
+| DDG-DA[LightGBM] |Alpha158 |0.084|0.639|0.093    |0.664      |0.099              |1.442              |-0.071        |