Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DDG-DA paper code #743

Merged
merged 47 commits into from
Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
48f8694
Merge data selection to main
wendili-cs Jul 1, 2021
5bb06cd
Update trainer for reweighter
wendili-cs Jul 1, 2021
4f442f5
Typos fixed.
wendili-cs Jul 8, 2021
da013fd
Merge branch 'main' into ds
you-n-g Jul 30, 2021
81b4383
update data selection interface
you-n-g Aug 9, 2021
aa2699f
successfully run exp after refactor some interface
you-n-g Aug 13, 2021
d17aaac
data selection share handler & trainer
you-n-g Aug 20, 2021
82b4115
fix meta model time series bug
you-n-g Aug 22, 2021
5b118c4
fix online workflow set_uri bug
you-n-g Sep 13, 2021
3b073f7
fix set_uri bug
you-n-g Sep 26, 2021
384b670
Merge remote-tracking branch 'origin/main' into ds
you-n-g Sep 26, 2021
b0850b0
update ds docs and delay trainer bug
you-n-g Sep 27, 2021
051b261
Merge remote-tracking branch 'wd_ds/ds' into ds
you-n-g Oct 9, 2021
f10d726
Merge branch 'main' into ds
you-n-g Nov 14, 2021
cdcfe30
Merge remote-tracking branch 'origin/main' into ds
you-n-g Nov 14, 2021
6d61ad0
Merge remote-tracking branch 'origin/main' into ds
you-n-g Nov 16, 2021
f32a7ad
docs
you-n-g Nov 16, 2021
8fb37b6
resume reweighter
you-n-g Nov 16, 2021
21baead
add reweighting result
you-n-g Nov 16, 2021
12afe61
fix qlib model import
you-n-g Nov 17, 2021
1d9732b
make recorder more friendly
you-n-g Nov 17, 2021
20a8fe5
fix experiment workflow bug
you-n-g Nov 18, 2021
faf3e03
commit for merging master in case of conflicts
you-n-g Dec 9, 2021
76d1bd9
Merge remote-tracking branch 'origin/main' into ds
you-n-g Dec 9, 2021
3bc4030
Successful run DDG-DA with a single command
you-n-g Dec 11, 2021
49c4074
remove unused code
you-n-g Dec 11, 2021
ce66d9a
add more docs
you-n-g Dec 13, 2021
cea134d
Update README.md
you-n-g Dec 13, 2021
a4a2b32
Update & fix some bugs.
demon143 Jan 8, 2022
8241832
Update configuration & remove debug functions
wendili-cs Jan 8, 2022
e1b079d
Update README.md
wendili-cs Jan 9, 2022
6a3f471
Modify horizon from code rather than yaml
wendili-cs Jan 9, 2022
c3364cd
Update performance in README.md
wendili-cs Jan 9, 2022
b3d1081
Merge remote-tracking branch 'origin/main' into ds
you-n-g Jan 9, 2022
fa2d047
fix part comments
you-n-g Jan 9, 2022
efab5cb
Remove unfinished TCTS.
wendili-cs Jan 10, 2022
5a184eb
Fix some details.
wendili-cs Jan 10, 2022
8fee1b4
Update meta docs
wendili-cs Jan 10, 2022
a31a4d5
Update README.md of the benchmarks_dynamic
wendili-cs Jan 10, 2022
ca3fe76
Merge branch 'main' into ds
you-n-g Jan 10, 2022
97f61d5
Update README.md files
wendili-cs Jan 10, 2022
2726560
Merge branch 'ds' of wd_git:you-n-g/qlib into ds
wendili-cs Jan 10, 2022
da68103
Add README.md to the rolling_benchmark baseline.
wendili-cs Jan 10, 2022
7e1183b
Refine the docs and link
you-n-g Jan 10, 2022
b0857c2
Rename README.md in benchmarks_dynamic.
wendili-cs Jan 10, 2022
38b83dd
Remove comments.
wendili-cs Jan 10, 2022
34f5bd2
auto download data
you-n-g Jan 10, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ data_handler_config: &data_handler_config
- class: CSRankNorm
kwargs:
fields_group: label
label: ["Ref($close, -2) / Ref($close, -1) - 1"]
port_analysis_config: &port_analysis_config
strategy:
class: TopkDropoutStrategy
Expand Down
29 changes: 18 additions & 11 deletions examples/benchmarks_dynamic/DDG-DA/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
DIRNAME = Path(__file__).absolute().resolve().parent
sys.path.append(str(DIRNAME.parent / "baseline"))
from rolling_benchmark import RollingBenchmark # NOTE: sys.path is changed for import RollingBenchmark
import torch

torch.manual_seed(43)


class DDGDA:
Expand All @@ -32,15 +35,17 @@ class DDGDA:
- `rm -r mlruns`
"""

def __init__(self) -> None:
def __init__(self, model_types=["linear", "linear"]) -> None:
self.step = 20
# NOTE:
# the horizon must match the meaning in the base task template
self.horizon = 1
self.horizon = 20
self.meta_exp_name = "DDG-DA"
self.model_types = model_types # first for calculate IC, second for forecasting models' type

def get_feature_importance(self):
rb = RollingBenchmark()
# this must be lightGBM, because it needs to get the feature importance
rb = RollingBenchmark(model_type="gbdt")
task = rb.basic_task()

model = init_instance_by_config(task["model"])
Expand All @@ -67,7 +72,7 @@ def dump_data_for_proxy_model(self):
fi = self.get_feature_importance()
col_selected = fi.nlargest(topk)

rb = RollingBenchmark()
rb = RollingBenchmark(model_type=self.model_types[0])
task = rb.basic_task()
dataset = init_instance_by_config(task["dataset"])
prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
Expand Down Expand Up @@ -105,10 +110,12 @@ def dump_meta_ipt(self):
Dump data for training meta model.
This function will dump the input data for meta model
"""
rb = RollingBenchmark()
# According to the experiments, the choice of the model type is very important for achieving good results
rb = RollingBenchmark(model_type=self.model_types[0])
sim_task = rb.basic_task()

sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 350})
if self.model_types[0] == "gbdt":
sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150})

exp_name_sim = f"data_sim_s{self.step}"

Expand All @@ -125,7 +132,7 @@ def train_meta_model(self):

# 1) leverage the simplified proxy forecasting model to train meta model.
# - Only the dataset part is important, in current version of meta model will integrate the
rb = RollingBenchmark()
rb = RollingBenchmark(model_type=self.model_types[0])
sim_task = rb.basic_task()
proxy_forecast_model_task = {
# "model": "qlib.contrib.model.linear.LinearModel",
Expand All @@ -146,7 +153,7 @@ def train_meta_model(self):
kwargs = dict(
task_tpl=proxy_forecast_model_task,
step=self.step,
segments=0.5,
segments=0.62,
you-n-g marked this conversation as resolved.
Show resolved Hide resolved
trunc_days=1 + self.horizon,
hist_step_n=30,
fill_method="max",
Expand All @@ -163,7 +170,7 @@ def train_meta_model(self):
# 3) train and logging meta model
with R.start(experiment_name=self.meta_exp_name):
R.log_params(**kwargs)
mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.0001)
mm = MetaModelDS(step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=200)
mm.fit(md)
R.save_objects(model=mm)

Expand Down Expand Up @@ -196,7 +203,7 @@ def meta_inference(self):
hist_step_n = int(param["hist_step_n"])
fill_method = param.get("fill_method", "max")

rb = RollingBenchmark()
rb = RollingBenchmark(model_type=self.model_types[1])
task_l = rb.create_rolling_tasks()

# 2.2) create meta dataset for final dataset
Expand Down Expand Up @@ -226,7 +233,7 @@ def train_and_eval_tasks(self):
"""
with self._task_path.open("rb") as f:
tasks = pickle.load(f)
rb = RollingBenchmark(rolling_exp="rolling_ds")
rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.model_types[1])
rb.train_rolling_tasks(tasks)
rb.ens_rolling()
rb.update_rolling_rec()
Expand Down
6 changes: 4 additions & 2 deletions examples/benchmarks_dynamic/REAMDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@ Modeling the dynamic of market is a very important problem in Quant research.

| Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown |
|------------------|---------|----|------|---------|-----------|-------------------|-------------------|--------------|
| RR[LightGBM] | | | | | | | | |
| DDG-DA[LightGBM] | | | | | | | | |
| RR[Linear] |Alpha158 |0.088|0.570|0.102 |0.622 |0.077 |1.175 |-0.086 |
| DDG-DA[Linear] |Alpha158 |0.093|0.622|0.106 |0.670 |0.085 |1.213 |-0.093 |
| RR[LightGBM] |Alpha158 |0.079|0.566|0.088 |0.592 |0.075 |1.226 |-0.096 |
| DDG-DA[LightGBM] |Alpha158 |0.084|0.639|0.093 |0.664 |0.099 |1.442 |-0.071 |
32 changes: 22 additions & 10 deletions examples/benchmarks_dynamic/baseline/rolling_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,37 @@ class RollingBenchmark:

"""

def __init__(self, rolling_exp="rolling_models") -> None:
def __init__(self, rolling_exp="rolling_models", model_type="linear") -> None:
self.step = 20
self.horizon = 1
self.horizon = 20
self.rolling_exp = rolling_exp
self.model_type = model_type

def basic_task(self):
"""For fast training rolling"""
conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml"
if self.model_type == "gbdt":
conf_path = DIRNAME.parent.parent / "benchmarks" / "LightGBM" / "workflow_config_lightgbm_Alpha158.yaml"
# dump the processed data on to disk for later loading to speed up the processing
h_path = DIRNAME / "lightgbm_alpha158_handler.pkl"
elif self.model_type == "linear":
conf_path = DIRNAME.parent.parent / "benchmarks" / "Linear" / "workflow_config_linear_Alpha158.yaml"
h_path = DIRNAME / "linear_alpha158_handler.pkl"
you-n-g marked this conversation as resolved.
Show resolved Hide resolved
else:
raise AssertionError("Model type is not supported!")
with conf_path.open("r") as f:
conf = yaml.safe_load(f)
task = conf["task"]

# dump the processed data on to disk for later loading to speed up the processing
h_path = DIRNAME / "lightgbm_alpha158_handler.pkl"
# modify dataset horizon
conf["task"]["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
"Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1)
]

task = conf["task"]

if not h_path.exists():
h_conf = task["dataset"]["kwargs"]["handler"]
h = init_instance_by_config(h_conf)
h.to_pickle(h_path, dump_all=True)
# if not h_path.exists():
you-n-g marked this conversation as resolved.
Show resolved Hide resolved
h_conf = task["dataset"]["kwargs"]["handler"]
h = init_instance_by_config(h_conf)
h.to_pickle(h_path, dump_all=True)

task["dataset"]["kwargs"]["handler"] = f"file://{h_path}"
task["record"] = ["qlib.workflow.record_temp.SignalRecord"]
Expand Down
6 changes: 6 additions & 0 deletions qlib/contrib/meta/data_selection/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO
test_idx=d_test["label"].index,
)
)

# debug: record the test period of the current meta-task instance
self.test_period = (
you-n-g marked this conversation as resolved.
Show resolved Hide resolved
d_test["feature"].index.get_level_values("datetime")[0],
d_test["feature"].index.get_level_values("datetime")[-1],
)
# TODO: set device: I think this is not necessary to converting data format.
self.processed_meta_input = data_to_tensor(self.processed_meta_input)

Expand Down
9 changes: 7 additions & 2 deletions qlib/contrib/meta/data_selection/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(
clip_weight=2.0,
criterion="ic_loss",
lr=0.0001,
max_epoch=150,
max_epoch=100,
):
self.step = step
self.hist_step_n = hist_step_n
Expand All @@ -65,7 +65,7 @@ def __init__(
self.fitted = False

def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
if phase == "train": # phase 0 for training, 1 for inference
if phase == "train":
self.tn.train()
torch.set_grad_enabled(True)
else:
Expand Down Expand Up @@ -141,6 +141,11 @@ def fit(self, meta_dataset: MetaDatasetDS):
phases = ["train", "test"]
meta_tasks_l = meta_dataset.prepare_tasks(phases)

if len(meta_tasks_l[1]):
R.log_params(
**dict(proxy_test_begin=meta_tasks_l[1][0].test_period)
) # debug: record when the test phase starts

self.tn = PredNet(
step=self.step, hist_step_n=self.hist_step_n, clip_weight=self.clip_weight, clip_method=self.clip_method
)
Expand Down