Use the region in qlib.config for FileCalendarStorage. (microsoft#1049)

* Use the region in qlib.config for FileCalendarStorage. * Fix black. * Make region as an optional parameter.
qianyun210603 · Apr 20, 2022 · 5b53491 · 5b53491
1 parent a56ee5d
commit 5b53491
Show file tree

Hide file tree

Showing 3 changed files with 83 additions and 31 deletions.
diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py
@@ -79,6 +79,7 @@ def __init__(self, freq: str, future: bool, provider_uri: dict = None, **kwargs)
         self.future = future
         self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri)
         self.enable_read_cache = True  # TODO: make it configurable
+        self.region = C["region"]
 
     @property
     def file_name(self) -> str:
@@ -130,7 +131,9 @@ def data(self) -> List[CalVT]:
         else:
             _calendar = self._read_calendar()
         if Freq(self._freq_file) != Freq(self.freq):
-            _calendar = resam_calendar(np.array(list(map(pd.Timestamp, _calendar))), self._freq_file, self.freq)
+            _calendar = resam_calendar(
+                np.array(list(map(pd.Timestamp, _calendar))), self._freq_file, self.freq, self.region
+            )
         return _calendar
 
     def _get_storage_freq(self) -> List[str]:

diff --git a/qlib/utils/resam.py b/qlib/utils/resam.py
@@ -6,9 +6,12 @@
 
 from . import lazy_sort_index
 from .time import Freq, cal_sam_minute
+from ..config import C
 
 
-def resam_calendar(calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sam: Union[str, Freq]) -> np.ndarray:
+def resam_calendar(
+    calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sam: Union[str, Freq], region: str = None
+) -> np.ndarray:
     """
     Resample the calendar with frequency freq_raw into the calendar with frequency freq_sam
     Assumption:
@@ -22,12 +25,16 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sa
         Frequency of the raw calendar
     freq_sam : str
         Sample frequency
-
+    region: str
+        Region, for example, "cn", "us"
     Returns
     -------
     np.ndarray
         The calendar with frequency freq_sam
     """
+    if region is None:
+        region = C["region"]
+
     freq_raw = Freq(freq_raw)
     freq_sam = Freq(freq_sam)
     if not len(calendar_raw):
@@ -40,7 +47,7 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sa
         else:
             if freq_raw.count > freq_sam.count:
                 raise ValueError("raw freq must be higher than sampling freq")
-        _calendar_minute = np.unique(list(map(lambda x: cal_sam_minute(x, freq_sam.count), calendar_raw)))
+        _calendar_minute = np.unique(list(map(lambda x: cal_sam_minute(x, freq_sam.count, region), calendar_raw)))
         return _calendar_minute
 
     # else, convert the raw calendar into day calendar, and divide the whole calendar into several bars evenly

diff --git a/qlib/utils/time.py b/qlib/utils/time.py
@@ -4,18 +4,32 @@
 Time related utils are compiled in this script
 """
 import bisect
-from datetime import datetime, time, date
+from datetime import datetime, time, date, timedelta
 from typing import List, Optional, Tuple, Union
 import functools
 import re
 
 import pandas as pd
 
 from qlib.config import C
+from qlib.constant import REG_CN, REG_TW, REG_US
+
+
+CN_TIME = [
+    datetime.strptime("9:30", "%H:%M"),
+    datetime.strptime("11:30", "%H:%M"),
+    datetime.strptime("13:00", "%H:%M"),
+    datetime.strptime("15:00", "%H:%M"),
+]
+US_TIME = [datetime.strptime("9:30", "%H:%M"), datetime.strptime("16:00", "%H:%M")]
+TW_TIME = [
+    datetime.strptime("9:00", "%H:%M"),
+    datetime.strptime("13:30", "%H:%M"),
+]
 
 
 @functools.lru_cache(maxsize=240)
-def get_min_cal(shift: int = 0) -> List[time]:
+def get_min_cal(shift: int = 0, region: str = REG_CN) -> List[time]:
     """
     get the minute level calendar in day period
 
@@ -24,21 +38,39 @@ def get_min_cal(shift: int = 0) -> List[time]:
     shift : int
         the shift direction would be like pandas shift.
         series.shift(1) will replace the value at `i`-th with the one at `i-1`-th
+    region: str
+        Region, for example, "cn", "us"
 
     Returns
     -------
     List[time]:
 
     """
     cal = []
-    for ts in list(pd.date_range("9:30", "11:29", freq="1min") - pd.Timedelta(minutes=shift)) + list(
-        pd.date_range("13:00", "14:59", freq="1min") - pd.Timedelta(minutes=shift)
-    ):
-        cal.append(ts.time())
+
+    if region == REG_CN:
+        for ts in list(
+            pd.date_range(CN_TIME[0], CN_TIME[1] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
+        ) + list(
+            pd.date_range(CN_TIME[2], CN_TIME[3] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
+        ):
+            cal.append(ts.time())
+    elif region == REG_TW:
+        for ts in list(
+            pd.date_range(TW_TIME[0], TW_TIME[1] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
+        ):
+            cal.append(ts.time())
+    elif region == REG_US:
+        for ts in list(
+            pd.date_range(US_TIME[0], US_TIME[1] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
+        ):
+            cal.append(ts.time())
+    else:
+        raise ValueError(f"{region} is not supported")
     return cal
 
 
-def is_single_value(start_time, end_time, freq, region="cn"):
+def is_single_value(start_time, end_time, freq, region: str = REG_CN):
     """Is there only one piece of data for stock market.
 
     Parameters
@@ -48,19 +80,33 @@ def is_single_value(start_time, end_time, freq, region="cn"):
     end_time : Union[pd.Timestamp, str]
         closed end time for data.
     freq :
+    region: str
+        Region, for example, "cn", "us"
     Returns
     -------
     bool
         True means one piece of data to obtain.
     """
-    if region == "cn":
+    if region == REG_CN:
         if end_time - start_time < freq:
             return True
         if start_time.hour == 11 and start_time.minute == 29 and start_time.second == 0:
             return True
         if start_time.hour == 14 and start_time.minute == 59 and start_time.second == 0:
             return True
         return False
+    elif region == REG_TW:
+        if end_time - start_time < freq:
+            return True
+        if start_time.hour == 13 and start_time.minute >= 25 and start_time.second == 0:
+            return True
+        return False
+    elif region == REG_US:
+        if end_time - start_time < freq:
+            return True
+        if start_time.hour == 15 and start_time.minute == 59 and start_time.second == 0:
+            return True
+        return False
     else:
         raise NotImplementedError(f"please implement the is_single_value func for {region}")
 
@@ -72,8 +118,6 @@ class Freq:
     NORM_FREQ_MINUTE = "min"  # using min instead of minute for align with Qlib's data filename
     SUPPORT_CAL_LIST = [NORM_FREQ_MINUTE, NORM_FREQ_DAY]  # FIXME: this list should from data
 
-    MIN_CAL = get_min_cal()
-
     def __init__(self, freq: Union[str, "Freq"]) -> None:
         if isinstance(freq, str):
             self.count, self.base = self.parse(freq)
@@ -211,36 +255,32 @@ def get_recent_freq(base_freq: Union[str, "Freq"], freq_list: List[Union[str, "F
         return min_freq[1] if min_freq else None
 
 
-CN_TIME = [
-    datetime.strptime("9:30", "%H:%M"),
-    datetime.strptime("11:30", "%H:%M"),
-    datetime.strptime("13:00", "%H:%M"),
-    datetime.strptime("15:00", "%H:%M"),
-]
-US_TIME = [datetime.strptime("9:30", "%H:%M"), datetime.strptime("16:00", "%H:%M")]
-
-
-def time_to_day_index(time_obj: Union[str, datetime], region: str = "cn"):
+def time_to_day_index(time_obj: Union[str, datetime], region: str = REG_CN):
     if isinstance(time_obj, str):
         time_obj = datetime.strptime(time_obj, "%H:%M")
 
-    if region == "cn":
+    if region == REG_CN:
         if CN_TIME[0] <= time_obj < CN_TIME[1]:
             return int((time_obj - CN_TIME[0]).total_seconds() / 60)
         elif CN_TIME[2] <= time_obj < CN_TIME[3]:
             return int((time_obj - CN_TIME[2]).total_seconds() / 60) + 120
         else:
             raise ValueError(f"{time_obj} is not the opening time of the {region} stock market")
-    elif region == "us":
+    elif region == REG_US:
         if US_TIME[0] <= time_obj < US_TIME[1]:
             return int((time_obj - US_TIME[0]).total_seconds() / 60)
         else:
             raise ValueError(f"{time_obj} is not the opening time of the {region} stock market")
+    elif region == REG_TW:
+        if TW_TIME[0] <= time_obj < TW_TIME[1]:
+            return int((time_obj - TW_TIME[0]).total_seconds() / 60)
+        else:
+            raise ValueError(f"{time_obj} is not the opening time of the {region} stock market")
     else:
         raise ValueError(f"{region} is not supported")
 
 
-def get_day_min_idx_range(start: str, end: str, freq: str) -> Tuple[int, int]:
+def get_day_min_idx_range(start: str, end: str, freq: str, region: str) -> Tuple[int, int]:
     """
     get the min-bar index in a day for a time range (both left and right is closed) given a fixed frequency
     Parameters
@@ -260,7 +300,7 @@ def get_day_min_idx_range(start: str, end: str, freq: str) -> Tuple[int, int]:
     start = pd.Timestamp(start).time()
     end = pd.Timestamp(end).time()
     freq = Freq(freq)
-    in_day_cal = Freq.MIN_CAL[:: freq.count]
+    in_day_cal = get_min_cal(region=region)[:: freq.count]
     left_idx = bisect.bisect_left(in_day_cal, start)
     right_idx = bisect.bisect_right(in_day_cal, end) - 1
     return left_idx, right_idx
@@ -280,7 +320,7 @@ def concat_date_time(date_obj: date, time_obj: time) -> pd.Timestamp:
     )
 
 
-def cal_sam_minute(x: pd.Timestamp, sam_minutes: int) -> pd.Timestamp:
+def cal_sam_minute(x: pd.Timestamp, sam_minutes: int, region: str = REG_CN) -> pd.Timestamp:
     """
     align the minute-level data to a down sampled calendar
 
@@ -292,13 +332,15 @@ def cal_sam_minute(x: pd.Timestamp, sam_minutes: int) -> pd.Timestamp:
         datetime to be aligned
     sam_minutes : int
         align to `sam_minutes` minute-level calendar
+    region: str
+        Region, for example, "cn", "us"
 
     Returns
     -------
     pd.Timestamp:
         the datetime after aligned
     """
-    cal = get_min_cal(C.min_data_shift)[::sam_minutes]
+    cal = get_min_cal(C.min_data_shift, region)[::sam_minutes]
     idx = bisect.bisect_right(cal, x.time()) - 1
     _date, new_time = x.date(), cal[idx]
     return concat_date_time(_date, new_time)
@@ -332,4 +374,4 @@ def epsilon_change(date_time: pd.Timestamp, direction: str = "backward") -> pd.T
 
 
 if __name__ == "__main__":
-    print(get_day_min_idx_range("8:30", "14:59", "10min"))
+    print(get_day_min_idx_range("8:30", "14:59", "10min", REG_CN))