Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[refactor] Encapsulate _make_future_dataframe and _check_dataframe #1279

Merged
merged 3 commits into from
Apr 18, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion neuralprophet/data/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def _check_dataframe(
check_y: bool = True,
exogenous: bool = True,
future: Optional[bool] = None,
):
) -> pd.DataFrame:
"""Performs basic data sanity checks and ordering

Prepare dataframe for fitting or predicting.
Expand Down
81 changes: 63 additions & 18 deletions neuralprophet/data/split.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from typing import Optional

import pandas as pd

Expand Down Expand Up @@ -52,26 +53,70 @@ def _get_maybe_extend_periods(model, df):
return periods_add


def _make_future_dataframe(model, df, events_df, regressors_df, periods, n_historic_predictions):
def _make_future_dataframe(
model,
df: pd.DataFrame,
events_df: pd.DataFrame,
regressors_df: pd.DataFrame,
periods: Optional[int],
n_historic_predictions: int,
n_forecasts: int,
max_lags: int,
freq: Optional[str],
) -> pd.DataFrame:
"""
Generate a future dataframe by extending the input dataframe into the future.

Parameters
----------
model : NeuralProphet
The model object used for prediction.
df : pd.DataFrame
The input dataframe holding exactly one unique value in its 'ID' column (a single time series) and a 'ds' column containing timestamps.
events_df : pd.DataFrame, optional
The dataframe containing information about external events.
regressors_df : pd.DataFrame, optional
The dataframe containing information about external regressors.
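periods : int, optional
The number of steps to extend the DataFrame into the future. Must be non-negative. If None, defaults to 1 when ``max_lags`` is 0 and to ``n_forecasts`` otherwise.
n_historic_predictions : int or bool
The number of historic predictions to include in the output dataframe. If True, ``len(df) - max_lags`` is used; if False, 0 is used.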
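n_forecasts : int
Same as the ``n_forecasts`` attribute of the NeuralProphet model.
max_lags : int
Same as the ``max_lags`` attribute of the NeuralProphet model.
freq : str, optional
Same as the ``data_freq`` attribute of the NeuralProphet model.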

Returns
-------
pd.DataFrame
The extended dataframe with additional rows for future periods.

Raises
------
ValueError
If future values are not provided for every user-specified regressor.
"""
# Receives df with single ID column
assert len(df["ID"].unique()) == 1
if periods == 0 and n_historic_predictions is True:
log.warning("Not extending df into future as no periods specified." "You can call predict directly instead.")
df = df.copy(deep=True)
_ = df_utils.infer_frequency(df, n_lags=model.max_lags, freq=model.data_freq)
_ = df_utils.infer_frequency(df, n_lags=max_lags, freq=freq)
last_date = pd.to_datetime(df["ds"].copy(deep=True).dropna()).sort_values().max()
if events_df is not None:
events_df = events_df.copy(deep=True).reset_index(drop=True)
if regressors_df is not None:
regressors_df = regressors_df.copy(deep=True).reset_index(drop=True)
if periods is None:
periods = 1 if model.max_lags == 0 else model.n_forecasts
periods = 1 if max_lags == 0 else n_forecasts
else:
assert periods >= 0

if isinstance(n_historic_predictions, bool):
if n_historic_predictions:
n_historic_predictions = len(df) - model.max_lags
n_historic_predictions = len(df) - max_lags
else:
n_historic_predictions = 0
elif not isinstance(n_historic_predictions, int):
Expand All @@ -90,25 +135,25 @@ def _make_future_dataframe(model, df, events_df, regressors_df, periods, n_histo
if regressor not in regressors_df.columns:
raise ValueError(f"Future values of user specified regressor {regressor} not provided")

if len(df) < model.max_lags:
if len(df) < max_lags:
raise ValueError(
"Insufficient input data for a prediction."
"Please supply historic observations (number of rows) of at least max_lags (max of number of n_lags)."
)
elif len(df) < model.max_lags + n_historic_predictions:
elif len(df) < max_lags + n_historic_predictions:
log.warning(
f"Insufficient data for {n_historic_predictions} historic forecasts, reduced to {len(df) - model.max_lags}."
f"Insufficient data for {n_historic_predictions} historic forecasts, reduced to {len(df) - max_lags}."
)
n_historic_predictions = len(df) - model.max_lags
if (n_historic_predictions + model.max_lags) == 0:
n_historic_predictions = len(df) - max_lags
if (n_historic_predictions + max_lags) == 0:
df = pd.DataFrame(columns=df.columns)
else:
df = df[-(model.max_lags + n_historic_predictions) :]
df = df[-(max_lags + n_historic_predictions) :]
nan_at_end = 0
while len(df) > nan_at_end and df["y"].isnull().iloc[-(1 + nan_at_end)]:
nan_at_end += 1
if nan_at_end > 0:
if model.max_lags > 0 and (nan_at_end + 1) >= model.max_lags:
if max_lags > 0 and (nan_at_end + 1) >= max_lags:
raise ValueError(
f"{nan_at_end + 1} missing values were detected at the end of df before df was extended into the future. "
"Please make sure there are no NaN values at the end of df."
Expand All @@ -121,10 +166,10 @@ def _make_future_dataframe(model, df, events_df, regressors_df, periods, n_histo

if len(df) > 0:
if len(df.columns) == 1 and "ds" in df:
assert model.max_lags == 0
assert max_lags == 0
df = _check_dataframe(model, df, check_y=False, exogenous=False)
else:
df = _check_dataframe(model, df, check_y=model.max_lags > 0, exogenous=True, future=True)
df = _check_dataframe(model, df, check_y=max_lags > 0, exogenous=True, future=True)
# future data
# check for external events known in future
if model.config_events is not None and periods > 0 and events_df is None:
Expand All @@ -133,17 +178,17 @@ def _make_future_dataframe(model, df, events_df, regressors_df, periods, n_histo
"All events being treated as not occurring in future"
)

if model.max_lags > 0:
if periods > 0 and periods != model.n_forecasts:
periods = model.n_forecasts
log.warning(f"Number of forecast steps is defined by n_forecasts. Adjusted to {model.n_forecasts}.")
if max_lags > 0:
if periods > 0 and periods != n_forecasts:
periods = n_forecasts
log.warning(f"Number of forecast steps is defined by n_forecasts. Adjusted to {n_forecasts}.")

if periods > 0:
future_df = df_utils.make_future_df(
df_columns=df.columns,
last_date=last_date,
periods=periods,
freq=model.data_freq,
freq=freq,
config_events=model.config_events,
events_df=events_df,
config_regressors=model.config_regressors,
Expand Down
5 changes: 4 additions & 1 deletion neuralprophet/forecaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -1492,12 +1492,15 @@ def make_future_dataframe(
df_future_dataframe = pd.DataFrame()
for df_name, df_i in df.groupby("ID"):
df_aux = _make_future_dataframe(
self,
model=self,
df=df_i,
events_df=events_dict[df_name],
regressors_df=regressors_dict[df_name],
periods=periods,
n_historic_predictions=n_historic_predictions,
n_forecasts=self.n_forecasts,
max_lags=self.max_lags,
freq=self.data_freq,
)
df_aux["ID"] = df_name
df_future_dataframe = pd.concat((df_future_dataframe, df_aux), ignore_index=True)
Expand Down