I have a fairly extensive XGBoost / Random Forest ensemble whose in-sample predictions are quite good. Judging by the validation set, it does not look overfit. However, when I make the actual out-of-sample predictions, the model outputs one single value for every day. The predictions are made at a daily frequency, for business days only.
The correlation plot and the feature importances show that EMA_5_days is an important contributor, so I have used this variable for prediction. The forecast also uses a Monte Carlo simulation, but it still produces useless predictions.
I have no clue what to do next. Below you will find the complete code. Any help is welcome! It looks like the model doesn't know what to do. I also added weekday information to the model, so that it knows which date it is predicting.
import holidays
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split
from xgboost import XGBRegressor

# NOTE: `df_forecast` is not created in this script; it must already exist
# (e.g. from an earlier notebook cell) before this code runs.

# Ensure DATE_TIME_STAMP is in datetime format and recompute the
# datetime-related features.
df_forecast['DATE_TIME_STAMP'] = pd.to_datetime(df_forecast['DATE_TIME_STAMP'])
df_forecast['DayOfWeek'] = df_forecast['DATE_TIME_STAMP'].dt.dayofweek
df_forecast['WeekOfYear'] = df_forecast['DATE_TIME_STAMP'].dt.isocalendar().week
df_forecast['MonthOfYear'] = df_forecast['DATE_TIME_STAMP'].dt.month

# Remove existing one-hot encoded columns if they exist to prevent duplicates
# when this code is re-run on an already encoded frame.
for col_prefix in ['DayOfWeek_', 'WeekOfYear_', 'MonthOfYear_']:
    df_forecast = df_forecast.loc[:, ~df_forecast.columns.str.startswith(col_prefix)]

# Perform one-hot encoding of the calendar features.
df_forecast = pd.get_dummies(df_forecast, columns=['DayOfWeek', 'WeekOfYear', 'MonthOfYear'])

# Ensure no duplicate columns exist after encoding. An explicit raise is used
# instead of `assert` so the check is not stripped when Python runs with -O.
if df_forecast.columns.duplicated().any():
    raise ValueError("Duplicate columns found after processing.")

# `reg_lambda` / `reg_alpha` are the scikit-learn-API names of XGBoost's
# `lambda` / `alpha` regularisation parameters.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.05],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'min_child_weight': [1],
    'gamma': [0.1],
    'reg_lambda': [1],
    'reg_alpha': [0.1],
}

tscv = TimeSeriesSplit(n_splits=5)


def update_ema(last_ema, new_value, alpha=0.1, historical_ema_weight=0.7):
    """Update an Exponential Moving Average (EMA) with one new observation.

    The previous implementation derived the "remaining weight" by subtracting
    weighted *values* (not weights) from 1, so the result scaled with the
    square of the data (e.g. last_ema=100, new_value=100 gave about -7127).
    When the weights are computed correctly, the split of the historical
    weight cancels out and the formula reduces to the standard EMA recursion
    used here.

    :param last_ema: The last calculated EMA value.
    :param new_value: The new data point to be incorporated into the EMA.
    :param alpha: The smoothing factor applied to the new value, between 0 and 1.
    :param historical_ema_weight: Kept for backward compatibility only; it has
        no effect because all weight not given to the new value goes to the
        last EMA.
    :return: The updated EMA value, ``alpha * new_value + (1 - alpha) * last_ema``.
    """
    return alpha * new_value + (1 - alpha) * last_ema


def prepare_features_for_prediction(base_features, model_features):
    """Build a single-row DataFrame holding exactly the model's features.

    :param base_features: Mapping of feature name to value.
    :param model_features: Iterable of feature names expected by the model.
    :return: One-row DataFrame; features missing from ``base_features`` are 0.
    """
    prepared_features = {feature: base_features.get(feature, 0) for feature in model_features}
    return pd.DataFrame([prepared_features])


def naive_monte_carlo_simulation(models, base_features, num_simulations=100):
    """Return the median ensemble prediction over randomly perturbed features.

    Every feature is perturbed with zero-mean Gaussian noise whose standard
    deviation is ``max(0.1 * |value|, 0.01)``; perturbed values that are not
    strictly positive are set to 0. All simulations are predicted in a single
    batch instead of one model call per simulation.

    :param models: Dict with fitted models under the keys 'XGB' and 'RF'.
    :param base_features: Mapping of feature name to numeric value.
    :param num_simulations: Number of perturbed feature rows to draw (>= 1).
    :return: Median of the simulated predictions as a float.
    :raises ValueError: If ``num_simulations`` is smaller than 1.
    """
    if num_simulations < 1:
        raise ValueError("num_simulations must be at least 1.")

    feature_names = list(base_features)
    values = np.array([base_features[name] for name in feature_names], dtype=float)
    scales = np.maximum(0.1 * np.abs(values), 0.01)
    noise = np.random.normal(0.0, scales, size=(num_simulations, len(feature_names)))
    perturbed = values + noise
    # Same semantics as the scalar `max(0, x)`: anything that is not strictly
    # positive (including NaN) becomes 0.
    perturbed = np.where(perturbed > 0, perturbed, 0.0)

    simulated_features = pd.DataFrame(perturbed, columns=feature_names)
    simulated_predictions = predict_with_ensemble(models, simulated_features)
    return float(np.median(simulated_predictions))


def predict_with_ensemble(models, features):
    """Average the XGBoost and Random Forest predictions.

    :param models: Dict with fitted models under the keys 'XGB' and 'RF'.
    :param features: DataFrame containing at least the columns the XGBoost
        booster was trained on; extra columns are dropped and the columns are
        put into training order.
    :return: Array with the mean of both models' predictions per row.
    """
    features = features[models['XGB'].get_booster().feature_names]
    pred_XGB = models['XGB'].predict(features)
    pred_RF = models['RF'].predict(features)
    return (pred_XGB + pred_RF) / 2


def predict_next_n_business_days(models, base_features, last_date, country_code, n_days=30, alpha_ema=0.1):
    """Recursively forecast the next ``n_days`` business days for one country.

    Weekends and the country's public holidays are skipped. Between steps only
    the calendar one-hot features and 'ema_5_days' are updated; every other
    feature keeps the value it has in ``base_features``.

    :param models: Dict with fitted models under the keys 'XGB' and 'RF'.
    :param base_features: Mapping of feature name to value used as the
        starting point. It is copied, so the caller's dict is not modified.
    :param last_date: Last date with observed data; forecasting starts on the
        following day.
    :param country_code: Country code passed to the `holidays` package and
        written to the 'SHIP_TO_COUNTRY' field of each result.
    :param n_days: Number of business days to forecast.
    :param alpha_ema: Smoothing factor used to roll 'ema_5_days' forward.
    :return: List of dicts with the keys 'DATE_TIME_STAMP', 'Forecast' and
        'SHIP_TO_COUNTRY'; an empty list if the country has no holiday data.
    """
    # NOTE(review): a conventional 5-period EMA uses alpha = 2 / (5 + 1). The
    # 0.1 default is kept for backward compatibility - confirm it matches how
    # the 'ema_5_days' training feature was built.
    future_predictions = []
    next_date = pd.to_datetime(last_date)

    # `country_holidays` replaces the deprecated `CountryHoliday`; fall back
    # to the old name on versions of the package that predate it.
    holiday_factory = getattr(holidays, 'country_holidays', None) or holidays.CountryHoliday
    try:
        country_holidays = holiday_factory(country_code)
    except (KeyError, NotImplementedError):
        print(f"Holiday information for country code '{country_code}' not found or not implemented.")
        return []

    # Work on a copy so the caller's feature dict is left untouched.
    features = dict(base_features)

    # Initialize the last EMA value
    last_ema_value = features.get('ema_5_days', 0)

    while len(future_predictions) < n_days:
        next_date += pd.Timedelta(days=1)
        if next_date.weekday() >= 5 or next_date in country_holidays:
            continue

        # Reset all one-hot encoded day, week, and month features to 0, then
        # switch on the ones that belong to the date being predicted.
        for i in range(7):
            features[f'DayOfWeek_{i}'] = 0
        features[f'DayOfWeek_{next_date.dayofweek}'] = 1

        iso_week = next_date.isocalendar().week
        for i in range(1, 54):
            features[f'WeekOfYear_{i}'] = 0
        if iso_week in range(1, 54):
            features[f'WeekOfYear_{iso_week}'] = 1

        for i in range(1, 13):
            features[f'MonthOfYear_{i}'] = 0
        features[f'MonthOfYear_{next_date.month}'] = 1

        # Use naive Monte Carlo simulation for prediction
        median_prediction = naive_monte_carlo_simulation(models, features, num_simulations=100)
        # Ensure forecast is not negative
        forecast = max(0, median_prediction)

        # Roll the EMA forward with the value that is actually reported, so
        # the feature stays consistent with the forecast series.
        last_ema_value = alpha_ema * forecast + (1 - alpha_ema) * last_ema_value
        features['ema_5_days'] = last_ema_value

        future_predictions.append({
            'DATE_TIME_STAMP': next_date.strftime('%Y-%m-%d'),
            'Forecast': forecast,
            'SHIP_TO_COUNTRY': country_code,
        })

    return future_predictions


def _concat_frames(frames):
    """Concatenate a list of DataFrames, returning an empty frame for no input."""
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Main execution starts here
forecast_frames = []
validation_frames = []
out_of_sample_rows = []
scores_list = []
error_log_list = []
models_dict = {}

for item_col in df_forecast.columns:
    if not item_col.startswith('SHIP_TO_COUNTRY_'):
        continue

    country_code = item_col.split('_')[-1]
    df_country = df_forecast[df_forecast[item_col] == 1]

    if 'ema_5_days' not in df_country.columns or len(df_country) < 50:
        error_message = f"Skipping country code {country_code} due to insufficient data or missing 'ema_5_days' column."
        error_log_list.append({'Country_Code': country_code, 'Error': error_message})
        continue

    X = df_country.drop(['UNIQUE_ITEM_COUNT_COUNTRY', 'DATE_TIME_STAMP', item_col], axis=1)
    y = df_country['UNIQUE_ITEM_COUNT_COUNTRY']

    # Chronological 60 / 20 / 20 split; shuffle=False relies on the rows
    # already being in date order.
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, shuffle=False)

    model_XGB = XGBRegressor()
    grid_search_XGB = GridSearchCV(
        model_XGB,
        param_grid,
        cv=tscv,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=1,
    )
    grid_search_XGB.fit(X_train, y_train)

    model_RF = RandomForestRegressor(n_estimators=100, random_state=42)
    model_RF.fit(X_train, y_train)

    models_dict[country_code] = {'XGB': grid_search_XGB.best_estimator_, 'RF': model_RF}

    y_val_pred = predict_with_ensemble(models_dict[country_code], X_val)
    y_test_pred = predict_with_ensemble(models_dict[country_code], X_test)

    validation_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    validation_mae = mean_absolute_error(y_val, y_val_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)

    scores_list.append({
        'Country_Code': country_code,
        'Validation_RMSE': validation_rmse,
        'Validation_MAE': validation_mae,
        'Test_RMSE': test_rmse,
        'Test_MAE': test_mae,
    })

    # Keep the validation predictions too; previously `validation_df_XGB`
    # was printed at the end but never filled.
    validation_frames.append(pd.DataFrame({
        'DATE_TIME_STAMP': df_country.loc[X_val.index, 'DATE_TIME_STAMP'],
        'Forecast': np.maximum(y_val_pred, 0),
        'Actual': y_val,
        'SHIP_TO_COUNTRY': country_code,
    }))

    forecast_frames.append(pd.DataFrame({
        'DATE_TIME_STAMP': df_country.loc[X_test.index, 'DATE_TIME_STAMP'],
        'Forecast': np.maximum(y_test_pred, 0),
        'Actual': y_test,
        'SHIP_TO_COUNTRY': country_code,
    }))

    # The out-of-sample forecast starts right after the last available date,
    # so it must be seeded from the last available row (the end of the test
    # window), not from the end of the train/validation window.
    base_features = X.iloc[-1].to_dict()
    base_features['ema_5_days'] = update_ema(base_features.get('ema_5_days', 0), y.iloc[-1])
    last_prediction_date = df_country['DATE_TIME_STAMP'].max()

    out_of_sample_predictions = predict_next_n_business_days(
        models_dict[country_code],
        base_features,
        last_prediction_date,
        country_code,
        n_days=30,
    )
    out_of_sample_rows.extend(out_of_sample_predictions)

# After processing all countries, build the result frames once instead of
# re-concatenating a growing DataFrame on every iteration.
forecast_df_XGB = _concat_frames(forecast_frames)
validation_df_XGB = _concat_frames(validation_frames)
forecast_out_of_sample_df_XGB = pd.DataFrame(out_of_sample_rows)
error_log_df = pd.DataFrame(error_log_list)
scores_df_XGB = pd.DataFrame(scores_list)

print("Validation Results:\n", validation_df_XGB.head())
print("Forecast Results:\n", forecast_df_XGB.head())
print("Out-of-Sample Forecast Results:\n", forecast_out_of_sample_df_XGB.head(30))
print("Accuracy Scores:\n", scores_df_XGB.head())
print("Error Log:\n", error_log_df.head())