microsoft · you-n-g · Dec 30, 2022 · Dec 23, 2022 · Dec 26, 2022 · Dec 26, 2022
diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml
@@ -140,10 +140,7 @@ jobs:
 
     - name: Test workflow by config (install from source)
       run: |
-        # Version 0.52.0 of numba must be installed manually in CI, otherwise it will cause incompatibility with the latest version of numpy.
-        python -m pip install numba==0.52.0
-        # You must update numpy manually, because when installing python tools, it will try to uninstall numpy and cause CI to fail.
-        python -m pip install --upgrade numpy
+        python -m pip install numba
         python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml
 
     - name: Unit tests with Pytest

diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py
@@ -211,16 +211,15 @@ def fit(self, df: pd.DataFrame = None):
         self.min_val = np.nanmin(df[cols].values, axis=0)
         self.max_val = np.nanmax(df[cols].values, axis=0)
         self.ignore = self.min_val == self.max_val
+        for _i, _con in enumerate(self.ignore):
+            if _con:
+                self.min_val[_i] = 0
+                self.max_val[_i] = 1
         self.cols = cols
 
     def __call__(self, df):
-        def normalize(x, min_val=self.min_val, max_val=self.max_val, ignore=self.ignore):
-            if (~ignore).all():
-                return (x - min_val) / (max_val - min_val)
-            for i in range(ignore.size):
-                if not ignore[i]:
-                    x[i] = (x[i] - min_val) / (max_val - min_val)
-            return x
+        def normalize(x, min_val=self.min_val, max_val=self.max_val):
+            return (x - min_val) / (max_val - min_val)
 
         df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
         return df
@@ -242,16 +241,15 @@ def fit(self, df: pd.DataFrame = None):
         self.mean_train = np.nanmean(df[cols].values, axis=0)
         self.std_train = np.nanstd(df[cols].values, axis=0)
         self.ignore = self.std_train == 0
+        for _i, _con in enumerate(self.ignore):
+            if _con:
+                self.std_train[_i] = 1
+                self.mean_train[_i] = 0
         self.cols = cols
 
     def __call__(self, df):
-        def normalize(x, mean_train=self.mean_train, std_train=self.std_train, ignore=self.ignore):
-            if (~ignore).all():
-                return (x - mean_train) / std_train
-            for i in range(ignore.size):
-                if not ignore[i]:
-                    x[i] = (x[i] - mean_train) / std_train
-            return x
+        def normalize(x, mean_train=self.mean_train, std_train=self.std_train):
+            return (x - mean_train) / std_train
 
         df.loc(axis=1)[self.cols] = normalize(df[self.cols].values)
         return df
@@ -313,7 +311,7 @@ def __call__(self, df):
             self.fields_group = [self.fields_group]
         for g in self.fields_group:
             cols = get_group_columns(df, g)
-            df[cols] = df[cols].groupby("datetime", group_keys=False).apply(self.zscore_func)
+            df[cols] = df[cols].groupby("datetime", group_keys=False).mean().apply(self.zscore_func)
         return df
 
 
@@ -361,7 +359,8 @@ def __init__(self, fields_group=None):
 
     def __call__(self, df):
         cols = get_group_columns(df, self.fields_group)
-        df[cols] = df[cols].groupby("datetime").apply(lambda x: x.fillna(x.mean()))
+        df.index.astype(np.datetime64)
+        df[cols] = df[cols].groupby("datetime").mean().apply(lambda x: x.fillna(x.mean()))
         return df
 
 

diff --git a/setup.py b/setup.py
@@ -156,7 +156,14 @@ def get_version(rel_path: str) -> str:
             "baostock",
             "yahooquery",
             "beautifulsoup4",
-            "tianshou",
+            # In version 0.4.11 of tianshou, the code:
+            # logits, hidden = self.actor(batch.obs, state=state, info=batch.info)
+            # was changed in PR787,
+            # which causes pytest errors (AttributeError: 'dict' object has no attribute 'info') in CI,
+            # so we restrict the version of tianshou.
+            # Reference:
+            # https://github.com/thu-ml/tianshou/releases
+            "tianshou<=0.4.10",
             "gym>=0.24",  # If you do not put gym at the end, gym will degrade causing pytest results to fail.
         ],
         "rl": [

diff --git a/tests/test_processor.py b/tests/test_processor.py
@@ -0,0 +1,66 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import io
+import unittest
+import pandas as pd
+from qlib.data import D
+from qlib.tests import TestAutoData
+from qlib.data.dataset.processor import MinMaxNorm, ZScoreNorm, CSZScoreNorm, CSZFillna
+
+
+class TestProcessor(TestAutoData):
+    TEST_INST = "SH600519"
+
+    def test_MinMaxNorm(self):
+        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
+        origin_df["test"] = 0
+        df = origin_df.copy()
+        mmn = MinMaxNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
+        mmn.fit(df)
+        mmn.__call__(df)
+        assert (df.tail(5).iloc[:, :-1] != origin_df.tail(5).iloc[:, :-1]).all().all()
+
+    def test_ZScoreNorm(self):
+        origin_df = D.features([self.TEST_INST], ["$high", "$open", "$low", "$close"]).tail(10)
+        origin_df["test"] = 0
+        df = origin_df.copy()
+        zsn = ZScoreNorm(fields_group=None, fit_start_time="2021-05-31", fit_end_time="2021-06-11")
+        zsn.fit(df)
+        zsn.__call__(df)
+        assert (df.tail(5).iloc[:, :-1] != origin_df.tail(5).iloc[:, :-1]).all().all()
+
+    def test_CSZFillna(self):
+        st = """
+        2000-01-01,1,2
+        2000-01-02,,4
+        2000-01-03,5,6
+        """
+        origin_df = pd.read_csv(io.StringIO(st), header=None)
+        origin_df.columns = ["datetime", "a", "b"]
+        origin_df.set_index("datetime", inplace=True, drop=True)
+        df = origin_df.copy()
+        CSZFillna(fields_group=None).__call__(df)
+        assert ~((origin_df == df).iloc[1, 0])
+
+    def test_CSZScoreNorm(self):
+        st = """
+        2000-01-01,1,2
+        2000-01-02,3,4
+        2000-01-03,5,6
+        """
+        origin_df = pd.read_csv(io.StringIO(st), header=None)
+        origin_df.columns = ["datetime", "a", "b"]
+        origin_df.set_index("datetime", inplace=True, drop=True)
+        df = origin_df.copy()
+        CSZScoreNorm(fields_group=None).__call__(df)
+        assert (df == ((origin_df - origin_df.mean()).div(origin_df.std()))).all().all()
+
+
+def suite():
+    _suite = unittest.TestSuite()
+    _suite.addTest(TestProcessor("test_MinMaxNorm"))
+    _suite.addTest(TestProcessor("test_ZScoreNorm"))
+    _suite.addTest(TestProcessor("test_CSZFillna"))
+    _suite.addTest(TestProcessor("test_CSZScoreNorm"))
+    return _suite