Skip to content

Commit

Permalink
Use the region in qlib.config for FileCalendarStorage. (microsoft#1049)
Browse files Browse the repository at this point in the history
* Use the region in qlib.config for FileCalendarStorage.

* Fix black.

* Make region an optional parameter.
  • Loading branch information
ChiahungTai committed Apr 20, 2022
1 parent a56ee5d commit 5b53491
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 31 deletions.
5 changes: 4 additions & 1 deletion qlib/data/storage/file_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def __init__(self, freq: str, future: bool, provider_uri: dict = None, **kwargs)
self.future = future
self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri)
self.enable_read_cache = True # TODO: make it configurable
self.region = C["region"]

@property
def file_name(self) -> str:
Expand Down Expand Up @@ -130,7 +131,9 @@ def data(self) -> List[CalVT]:
else:
_calendar = self._read_calendar()
if Freq(self._freq_file) != Freq(self.freq):
_calendar = resam_calendar(np.array(list(map(pd.Timestamp, _calendar))), self._freq_file, self.freq)
_calendar = resam_calendar(
np.array(list(map(pd.Timestamp, _calendar))), self._freq_file, self.freq, self.region
)
return _calendar

def _get_storage_freq(self) -> List[str]:
Expand Down
13 changes: 10 additions & 3 deletions qlib/utils/resam.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@

from . import lazy_sort_index
from .time import Freq, cal_sam_minute
from ..config import C


def resam_calendar(calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sam: Union[str, Freq]) -> np.ndarray:
def resam_calendar(
calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sam: Union[str, Freq], region: str = None
) -> np.ndarray:
"""
Resample the calendar with frequency freq_raw into the calendar with frequency freq_sam
Assumption:
Expand All @@ -22,12 +25,16 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sa
Frequency of the raw calendar
freq_sam : str
Sample frequency
region: str
Region, for example, "cn", "us"
Returns
-------
np.ndarray
The calendar with frequency freq_sam
"""
if region is None:
region = C["region"]

freq_raw = Freq(freq_raw)
freq_sam = Freq(freq_sam)
if not len(calendar_raw):
Expand All @@ -40,7 +47,7 @@ def resam_calendar(calendar_raw: np.ndarray, freq_raw: Union[str, Freq], freq_sa
else:
if freq_raw.count > freq_sam.count:
raise ValueError("raw freq must be higher than sampling freq")
_calendar_minute = np.unique(list(map(lambda x: cal_sam_minute(x, freq_sam.count), calendar_raw)))
_calendar_minute = np.unique(list(map(lambda x: cal_sam_minute(x, freq_sam.count, region), calendar_raw)))
return _calendar_minute

# else, convert the raw calendar into day calendar, and divide the whole calendar into several bars evenly
Expand Down
96 changes: 69 additions & 27 deletions qlib/utils/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,32 @@
Time-related utilities are collected in this script
"""
import bisect
from datetime import datetime, time, date
from datetime import datetime, time, date, timedelta
from typing import List, Optional, Tuple, Union
import functools
import re

import pandas as pd

from qlib.config import C
from qlib.constant import REG_CN, REG_TW, REG_US


CN_TIME = [
datetime.strptime("9:30", "%H:%M"),
datetime.strptime("11:30", "%H:%M"),
datetime.strptime("13:00", "%H:%M"),
datetime.strptime("15:00", "%H:%M"),
]
US_TIME = [datetime.strptime("9:30", "%H:%M"), datetime.strptime("16:00", "%H:%M")]
TW_TIME = [
datetime.strptime("9:00", "%H:%M"),
datetime.strptime("13:30", "%H:%M"),
]


@functools.lru_cache(maxsize=240)
def get_min_cal(shift: int = 0) -> List[time]:
def get_min_cal(shift: int = 0, region: str = REG_CN) -> List[time]:
"""
get the minute-level calendar within a trading day
Expand All @@ -24,21 +38,39 @@ def get_min_cal(shift: int = 0) -> List[time]:
shift : int
the shift direction would be like pandas shift.
series.shift(1) will replace the value at `i`-th with the one at `i-1`-th
region: str
Region, for example, "cn", "us"
Returns
-------
List[time]:
"""
cal = []
for ts in list(pd.date_range("9:30", "11:29", freq="1min") - pd.Timedelta(minutes=shift)) + list(
pd.date_range("13:00", "14:59", freq="1min") - pd.Timedelta(minutes=shift)
):
cal.append(ts.time())

if region == REG_CN:
for ts in list(
pd.date_range(CN_TIME[0], CN_TIME[1] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
) + list(
pd.date_range(CN_TIME[2], CN_TIME[3] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
):
cal.append(ts.time())
elif region == REG_TW:
for ts in list(
pd.date_range(TW_TIME[0], TW_TIME[1] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
):
cal.append(ts.time())
elif region == REG_US:
for ts in list(
pd.date_range(US_TIME[0], US_TIME[1] - timedelta(minutes=1), freq="1min") - pd.Timedelta(minutes=shift)
):
cal.append(ts.time())
else:
raise ValueError(f"{region} is not supported")
return cal


def is_single_value(start_time, end_time, freq, region="cn"):
def is_single_value(start_time, end_time, freq, region: str = REG_CN):
"""Check whether there is only one piece of data for the stock market.
Parameters
Expand All @@ -48,19 +80,33 @@ def is_single_value(start_time, end_time, freq, region="cn"):
end_time : Union[pd.Timestamp, str]
closed end time for data.
freq :
region: str
Region, for example, "cn", "us"
Returns
-------
bool
True means there is only one piece of data to obtain.
"""
if region == "cn":
if region == REG_CN:
if end_time - start_time < freq:
return True
if start_time.hour == 11 and start_time.minute == 29 and start_time.second == 0:
return True
if start_time.hour == 14 and start_time.minute == 59 and start_time.second == 0:
return True
return False
elif region == REG_TW:
if end_time - start_time < freq:
return True
if start_time.hour == 13 and start_time.minute >= 25 and start_time.second == 0:
return True
return False
elif region == REG_US:
if end_time - start_time < freq:
return True
if start_time.hour == 15 and start_time.minute == 59 and start_time.second == 0:
return True
return False
else:
raise NotImplementedError(f"please implement the is_single_value func for {region}")

Expand All @@ -72,8 +118,6 @@ class Freq:
NORM_FREQ_MINUTE = "min" # using min instead of minute for align with Qlib's data filename
SUPPORT_CAL_LIST = [NORM_FREQ_MINUTE, NORM_FREQ_DAY] # FIXME: this list should from data

MIN_CAL = get_min_cal()

def __init__(self, freq: Union[str, "Freq"]) -> None:
if isinstance(freq, str):
self.count, self.base = self.parse(freq)
Expand Down Expand Up @@ -211,36 +255,32 @@ def get_recent_freq(base_freq: Union[str, "Freq"], freq_list: List[Union[str, "F
return min_freq[1] if min_freq else None


CN_TIME = [
datetime.strptime("9:30", "%H:%M"),
datetime.strptime("11:30", "%H:%M"),
datetime.strptime("13:00", "%H:%M"),
datetime.strptime("15:00", "%H:%M"),
]
US_TIME = [datetime.strptime("9:30", "%H:%M"), datetime.strptime("16:00", "%H:%M")]


def time_to_day_index(time_obj: Union[str, datetime], region: str = "cn"):
def time_to_day_index(time_obj: Union[str, datetime], region: str = REG_CN):
if isinstance(time_obj, str):
time_obj = datetime.strptime(time_obj, "%H:%M")

if region == "cn":
if region == REG_CN:
if CN_TIME[0] <= time_obj < CN_TIME[1]:
return int((time_obj - CN_TIME[0]).total_seconds() / 60)
elif CN_TIME[2] <= time_obj < CN_TIME[3]:
return int((time_obj - CN_TIME[2]).total_seconds() / 60) + 120
else:
raise ValueError(f"{time_obj} is not the opening time of the {region} stock market")
elif region == "us":
elif region == REG_US:
if US_TIME[0] <= time_obj < US_TIME[1]:
return int((time_obj - US_TIME[0]).total_seconds() / 60)
else:
raise ValueError(f"{time_obj} is not the opening time of the {region} stock market")
elif region == REG_TW:
if TW_TIME[0] <= time_obj < TW_TIME[1]:
return int((time_obj - TW_TIME[0]).total_seconds() / 60)
else:
raise ValueError(f"{time_obj} is not the opening time of the {region} stock market")
else:
raise ValueError(f"{region} is not supported")


def get_day_min_idx_range(start: str, end: str, freq: str) -> Tuple[int, int]:
def get_day_min_idx_range(start: str, end: str, freq: str, region: str) -> Tuple[int, int]:
"""
get the min-bar index in a day for a time range (both left and right are closed) given a fixed frequency
Parameters
Expand All @@ -260,7 +300,7 @@ def get_day_min_idx_range(start: str, end: str, freq: str) -> Tuple[int, int]:
start = pd.Timestamp(start).time()
end = pd.Timestamp(end).time()
freq = Freq(freq)
in_day_cal = Freq.MIN_CAL[:: freq.count]
in_day_cal = get_min_cal(region=region)[:: freq.count]
left_idx = bisect.bisect_left(in_day_cal, start)
right_idx = bisect.bisect_right(in_day_cal, end) - 1
return left_idx, right_idx
Expand All @@ -280,7 +320,7 @@ def concat_date_time(date_obj: date, time_obj: time) -> pd.Timestamp:
)


def cal_sam_minute(x: pd.Timestamp, sam_minutes: int) -> pd.Timestamp:
def cal_sam_minute(x: pd.Timestamp, sam_minutes: int, region: str = REG_CN) -> pd.Timestamp:
"""
align the minute-level data to a down-sampled calendar
Expand All @@ -292,13 +332,15 @@ def cal_sam_minute(x: pd.Timestamp, sam_minutes: int) -> pd.Timestamp:
datetime to be aligned
sam_minutes : int
align to `sam_minutes` minute-level calendar
region: str
Region, for example, "cn", "us"
Returns
-------
pd.Timestamp:
the datetime after aligned
"""
cal = get_min_cal(C.min_data_shift)[::sam_minutes]
cal = get_min_cal(C.min_data_shift, region)[::sam_minutes]
idx = bisect.bisect_right(cal, x.time()) - 1
_date, new_time = x.date(), cal[idx]
return concat_date_time(_date, new_time)
Expand Down Expand Up @@ -332,4 +374,4 @@ def epsilon_change(date_time: pd.Timestamp, direction: str = "backward") -> pd.T


if __name__ == "__main__":
print(get_day_min_idx_range("8:30", "14:59", "10min"))
print(get_day_min_idx_range("8:30", "14:59", "10min", REG_CN))

0 comments on commit 5b53491

Please sign in to comment.