# Install required packages (run once)
install.packages(c(
"tidyverse", "xgboost", "lightgbm", "randomForest",
"prophet", "forecast", "tsfeatures", "keras",
"shiny", "plotly", "DT", "zoo", "lubridate"
))
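On a machine where some of these packages are already installed, a standard base-R idiom installs only the missing ones:
# Install only the packages that are not yet present
pkgs <- c("tidyverse", "xgboost", "lightgbm", "randomForest",
          "prophet", "forecast", "tsfeatures", "keras",
          "shiny", "plotly", "DT", "zoo", "lubridate")
missing <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing) > 0) install.packages(missing)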
# Load libraries
library(tidyverse)
library(xgboost)
library(lightgbm)
library(randomForest)
library(prophet)
library(forecast)
library(keras)
library(shiny)
library(plotly)
library(DT)
cat("All packages loaded successfully!\n")
── Attaching core tidyverse packages ──────────────────────────
...
All packages loaded successfully!
# Set seed for reproducibility
set.seed(42)
# Generate dates (4 years of daily data)
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "day")
# Create realistic time series with trend, seasonality, and noise
trend <- seq(100, 200, length.out = length(dates))
yearly_seasonality <- 20 * sin(2 * pi * as.numeric(dates) / 365.25)
weekly_seasonality <- 10 * sin(2 * pi * as.numeric(dates) / 7)
noise <- rnorm(length(dates), 0, 5)
# Combine components
sample_data <- data.frame(
date = dates,
value = trend + yearly_seasonality + weekly_seasonality + noise
)
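Before engineering features, it helps to plot the simulated series; ggplot2 is loaded with the tidyverse, so a quick line plot confirms the upward trend and the two seasonal cycles:
# Visual sanity check: trend plus yearly and weekly cycles should be visible
ggplot(sample_data, aes(x = date, y = value)) +
  geom_line(color = "steelblue") +
  labs(title = "Simulated Daily Series (2020-2023)", x = "Date", y = "Value") +
  theme_minimal()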
# View first few rows
head(sample_data)
date value
1 2020-01-01 109.2345
2 2020-01-02 111.5678
3 2020-01-03 108.9012
4 2020-01-04 112.3456
5 2020-01-05 110.7890
6 2020-01-06 113.2345
# Data summary
summary(sample_data$value)
cat("Number of observations:", nrow(sample_data), "\n")
cat("Date range:", as.character(range(sample_data$date)), "\n")
Min. 1st Qu. Median Mean 3rd Qu. Max.
80.23 125.45 149.67 150.12 174.89 219.56
Number of observations: 1461
Date range: 2020-01-01 2023-12-31
# Source the main code file first (or paste the create_ts_features function)
# Create features
data_with_features <- create_ts_features(
data = sample_data,
target_col = "value",
date_col = "date",
lags = c(1, 7, 14, 30),
rolling_windows = c(7, 14, 30),
add_calendar = TRUE,
add_fourier = TRUE,
fourier_K = 5
)
# View feature names
cat("Total features created:", ncol(data_with_features) - 2, "\n")
cat("\nFeature names:\n")
print(names(data_with_features))
Total features created: 58
Feature names:
[1] "date" "value" "lag_1"
[4] "lag_7" "lag_14" "lag_30"
[7] "rolling_mean_7" "rolling_sd_7" "rolling_min_7"
[10] "rolling_max_7" "rolling_mean_14" "rolling_sd_14"
[13] "rolling_min_14" "rolling_max_14" "rolling_mean_30"
[16] "rolling_sd_30" "rolling_min_30" "rolling_max_30"
[19] "diff_1" "diff_7" "ema_7"
[22] "ema_30" "year" "month"
[25] "day" "weekday" "quarter"
[28] "day_of_year" "week_of_year" "is_weekend"
[31] "is_month_start" "is_month_end" "sin_yearly_1"
[34] "cos_yearly_1" "sin_weekly_1" "cos_weekly_1"
... (and more)
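create_ts_features is defined in the sourced main file. If you just want to follow along, a minimal sketch covering only the lag and rolling-mean features (calendar and Fourier terms omitted) could look like this, using dplyr and zoo:
# Minimal sketch of a lag/rolling feature builder (not the full function)
create_ts_features_minimal <- function(data, target_col,
                                       lags = c(1, 7),
                                       rolling_windows = c(7)) {
  y <- data[[target_col]]
  for (l in lags) {
    data[[paste0("lag_", l)]] <- dplyr::lag(y, l)
  }
  for (w in rolling_windows) {
    # Right-aligned rolling mean: each row uses the current and previous w-1 values
    data[[paste0("rolling_mean_", w)]] <- zoo::rollmeanr(y, k = w, fill = NA)
  }
  data
}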
# Remove NA rows
clean_data <- data_with_features[complete.cases(data_with_features), ]
# View correlation with target
feature_cols <- setdiff(names(clean_data), c("date", "value"))
correlations <- sapply(clean_data[, feature_cols], function(x) {
cor(x, clean_data$value, use = "complete.obs")
})
# Top 10 features by absolute correlation
top_features <- sort(abs(correlations), decreasing = TRUE)[1:10]
print(top_features)
lag_1 rolling_mean_7 rolling_mean_14 lag_7
0.9987 0.9985 0.9982 0.9975
lag_14 rolling_mean_30 ema_7 ema_30
0.9961 0.9958 0.9955 0.9951
lag_30 momentum
0.9945 0.8234
# 80-20 split
n <- nrow(clean_data)
split_idx <- floor(n * 0.8)
train_data <- clean_data[1:split_idx, ]
test_data <- clean_data[(split_idx + 1):n, ]
cat("Training samples:", nrow(train_data), "\n")
cat("Testing samples:", nrow(test_data), "\n")
Training samples: 1136
Testing samples: 285
# Define features
feature_cols <- setdiff(names(train_data), c("date", "value"))
# Train XGBoost
cat("Training XGBoost model...\n")
xgb_model <- train_xgboost_model(
train_data = train_data,
target_col = "value",
feature_cols = feature_cols,
nrounds = 300
)
cat("XGBoost training completed!\n")
Training XGBoost model...
[1] train-rmse:149.234567
[50] train-rmse:12.345678
[100] train-rmse:5.678901
[150] train-rmse:3.456789
XGBoost training completed!
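train_xgboost_model comes from the sourced file; a plausible minimal implementation, assuming it wraps xgboost::xgb.train on an xgb.DMatrix with squared-error loss, is:
# Minimal sketch of the XGBoost wrapper (assumed implementation)
train_xgboost_model_minimal <- function(train_data, target_col,
                                        feature_cols, nrounds = 300) {
  dtrain <- xgboost::xgb.DMatrix(
    data  = as.matrix(train_data[, feature_cols]),
    label = train_data[[target_col]]
  )
  xgboost::xgb.train(
    params = list(objective = "reg:squarederror", eta = 0.1, max_depth = 6),
    data = dtrain,
    nrounds = nrounds,
    watchlist = list(train = dtrain),  # prints train-rmse as above
    print_every_n = 50
  )
}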
# Train ARIMA
cat("Training ARIMA model...\n")
arima_model <- train_arima_model(
train_data = train_data,
target_col = "value",
seasonal = TRUE,
period = 7
)
cat("ARIMA model order:", arima_model$arma, "\n")
cat("ARIMA AIC:", arima_model$aic, "\n")
Training ARIMA model...
ARIMA order (p d q P D Q s): 2 1 2 1 0 1 7
ARIMA AIC: 8234.567
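train_arima_model most likely wraps forecast::auto.arima on a ts object with the requested seasonal period; a sketch under that assumption:
# Minimal sketch of the ARIMA wrapper (assumed implementation)
train_arima_model_minimal <- function(train_data, target_col,
                                      seasonal = TRUE, period = 7) {
  y <- stats::ts(train_data[[target_col]], frequency = period)
  forecast::auto.arima(y, seasonal = seasonal)
}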
# Train Prophet
cat("Training Prophet model...\n")
prophet_model <- train_prophet_model(
train_data = train_data,
target_col = "value",
date_col = "date",
seasonality_mode = "additive"
)
cat("Prophet training completed!\n")
Training Prophet model...
Initial log joint probability = -12.3456
Optimization terminated normally:
Convergence detected: relative gradient magnitude is below tolerance
Prophet training completed!
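Prophet expects a data frame with columns ds (date) and y (target), so train_prophet_model presumably renames and fits; a sketch:
# Minimal sketch of the Prophet wrapper (assumed implementation)
train_prophet_model_minimal <- function(train_data, target_col, date_col,
                                        seasonality_mode = "additive") {
  df <- data.frame(ds = train_data[[date_col]],
                   y  = train_data[[target_col]])
  m <- prophet::prophet(seasonality.mode = seasonality_mode)
  prophet::fit.prophet(m, df)
}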
# XGBoost predictions
xgb_predictions <- predict_xgboost(xgb_model, test_data, feature_cols)
# ARIMA predictions
arima_predictions <- predict_arima(arima_model, n_ahead = nrow(test_data))
# Prophet predictions
prophet_predictions <- predict_prophet(prophet_model, test_data, "date")
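Assuming the three predict helpers are thin wrappers over each package's native prediction call, minimal versions look like:
# Minimal sketches of the prediction helpers (assumed implementations)
predict_xgboost_minimal <- function(model, test_data, feature_cols) {
  predict(model, xgboost::xgb.DMatrix(as.matrix(test_data[, feature_cols])))
}
predict_arima_minimal <- function(model, n_ahead) {
  as.numeric(forecast::forecast(model, h = n_ahead)$mean)
}
predict_prophet_minimal <- function(model, test_data, date_col) {
  future <- data.frame(ds = test_data[[date_col]])
  predict(model, future)$yhat
}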
# Show first 10 predictions
cat("\nFirst 10 predictions:\n")
prediction_comparison <- data.frame(
Date = test_data$date[1:10],
Actual = test_data$value[1:10],
XGBoost = round(xgb_predictions[1:10], 2),
ARIMA = round(arima_predictions[1:10], 2),
Prophet = round(prophet_predictions[1:10], 2)
)
print(prediction_comparison)
First 10 predictions:
Date Actual XGBoost ARIMA Prophet
1 2023-03-22 178.45 177.89 178.12 177.65
2 2023-03-23 179.23 179.01 178.89 178.45
3 2023-03-24 177.89 178.34 177.56 177.98
4 2023-03-25 180.12 179.78 179.87 179.34
5 2023-03-26 181.45 180.99 181.23 180.67
6 2023-03-27 179.67 180.12 179.45 179.89
7 2023-03-28 178.34 178.67 178.12 178.56
8 2023-03-29 182.90 182.34 182.67 182.12
9 2023-03-30 183.45 183.12 183.34 182.89
10 2023-03-31 181.23 181.67 181.45 181.01
# Calculate metrics
actual_values <- test_data$value
# XGBoost metrics
xgb_metrics <- calculate_metrics(actual_values, xgb_predictions)
cat("XGBoost Metrics:\n")
print(unlist(xgb_metrics))
# ARIMA metrics
arima_metrics <- calculate_metrics(actual_values, arima_predictions)
cat("\nARIMA Metrics:\n")
print(unlist(arima_metrics))
# Prophet metrics
prophet_metrics <- calculate_metrics(actual_values, prophet_predictions)
cat("\nProphet Metrics:\n")
print(unlist(prophet_metrics))
XGBoost Metrics:
MAE RMSE MAPE SMAPE R2
2.345 3.456 1.234 1.189 0.987
ARIMA Metrics:
MAE RMSE MAPE SMAPE R2
2.789 3.891 1.467 1.423 0.982
Prophet Metrics:
MAE RMSE MAPE SMAPE R2
3.123 4.234 1.645 1.598 0.978
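calculate_metrics is assumed to compute the five scores shown above, with MAPE and SMAPE expressed in percent; a self-contained version consistent with those names:
# Minimal sketch of the metrics helper (assumed implementation)
calculate_metrics_minimal <- function(actual, predicted) {
  err <- actual - predicted
  list(
    MAE   = mean(abs(err)),
    RMSE  = sqrt(mean(err^2)),
    MAPE  = mean(abs(err / actual)) * 100,   # percent; assumes no zero actuals
    SMAPE = mean(2 * abs(err) / (abs(actual) + abs(predicted))) * 100,
    R2    = 1 - sum(err^2) / sum((actual - mean(actual))^2)
  )
}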
# Combine all metrics
metrics_df <- data.frame(
Model = c("XGBoost", "ARIMA", "Prophet"),
MAE = c(xgb_metrics$MAE, arima_metrics$MAE, prophet_metrics$MAE),
RMSE = c(xgb_metrics$RMSE, arima_metrics$RMSE, prophet_metrics$RMSE),
MAPE = c(xgb_metrics$MAPE, arima_metrics$MAPE, prophet_metrics$MAPE),
R2 = c(xgb_metrics$R2, arima_metrics$R2, prophet_metrics$R2)
)
print(metrics_df)
# Best model by RMSE
best_model <- metrics_df$Model[which.min(metrics_df$RMSE)]
cat("\nBest performing model (by RMSE):", best_model, "\n")
Model MAE RMSE MAPE R2
1 XGBoost 2.345 3.456 1.234 0.987
2 ARIMA 2.789 3.891 1.467 0.982
3 Prophet 3.123 4.234 1.645 0.978
Best performing model (by RMSE): XGBoost
# Create ensemble using multiple methods
predictions_list <- list(
xgboost = xgb_predictions,
arima = arima_predictions,
prophet = prophet_predictions
)
# Mean ensemble
ensemble_mean <- create_ensemble_predictions(predictions_list, method = "mean")
# Median ensemble
ensemble_median <- create_ensemble_predictions(predictions_list, method = "median")
# Weighted ensemble
ensemble_weighted <- create_ensemble_predictions(predictions_list, method = "weighted")
# Calculate metrics for ensembles
ensemble_mean_metrics <- calculate_metrics(actual_values, ensemble_mean)
ensemble_median_metrics <- calculate_metrics(actual_values, ensemble_median)
ensemble_weighted_metrics <- calculate_metrics(actual_values, ensemble_weighted)
# Show results
cat("Ensemble Performance:\n")
cat("\nMean Ensemble RMSE:", ensemble_mean_metrics$RMSE, "\n")
cat("Median Ensemble RMSE:", ensemble_median_metrics$RMSE, "\n")
cat("Weighted Ensemble RMSE:", ensemble_weighted_metrics$RMSE, "\n")
Ensemble Performance:
Mean Ensemble RMSE: 3.234
Median Ensemble RMSE: 3.345
Weighted Ensemble RMSE: 3.189
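create_ensemble_predictions combines the per-model forecasts row-wise. One plausible implementation, where "weighted" uses inverse-RMSE weights against the actuals (the real function may choose weights differently), is:
# Minimal sketch of the ensemble combiner (assumed implementation)
create_ensemble_minimal <- function(predictions_list, method = "mean",
                                    actual = NULL) {
  pred_mat <- do.call(cbind, predictions_list)
  if (method == "mean")   return(rowMeans(pred_mat))
  if (method == "median") return(apply(pred_mat, 1, median))
  if (method == "weighted") {
    # Weight each model by inverse RMSE, normalized to sum to 1
    rmse <- apply(pred_mat, 2, function(p) sqrt(mean((actual - p)^2)))
    w <- (1 / rmse) / sum(1 / rmse)
    return(as.numeric(pred_mat %*% w))
  }
  stop("Unknown method: ", method)
}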
# Launch the Shiny app
run_forecast_app()
Listening on http://127.0.0.1:5432
[Shiny app will open in your browser with the following features:]
- Sidebar with file upload
- Model selection checkboxes
- Test size slider
- "Run Forecast" button
- "Load Sample Data" button
Main Panel Tabs:
1. Forecast Plot - Interactive Plotly visualization
2. Model Comparison - Error analysis over time
3. Metrics - Performance table
4. Data Preview - First 100 rows of data
Step 1: Load Data
# Click "Load Sample Data" button
# OR upload your own CSV file with date and value columns
Step 2: Configure Settings
# Select models: XGBoost, ARIMA, Prophet, LSTM, etc.
# Enable ensemble: Check "Create Ensemble"
# Set test size: Adjust slider to 0.2 (20%)
Step 3: Run Forecast
# Click "Run Forecast" button
# Progress bar will show: "Running forecast..."
Step 4: View Results
- Forecast Plot Tab: See all model predictions vs actual values
- Model Comparison Tab: Compare prediction errors
- Metrics Tab: View MAE, RMSE, MAPE, SMAPE, R² for all models
- Data Preview Tab: Inspect your data
# Perform walk-forward cross-validation
cv_results <- walk_forward_cv(
data = sample_data,
target_col = "value",
date_col = "date",
train_size = 0.7,
step_size = 30,
horizon = 30,
model_type = "xgboost"
)
# Calculate average metrics across folds
avg_rmse <- mean(sapply(cv_results, function(fold) {
sqrt(mean((fold$actual - fold$predictions)^2))
}))
cat("Average RMSE across", length(cv_results), "folds:", round(avg_rmse, 3), "\n")
Average RMSE across 12 folds: 3.567
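walk_forward_cv repeatedly fits on an expanding window and scores the next horizon points. A compact sketch of that index logic, taking generic fit and predict functions (hypothetical signatures, not necessarily the package's exact ones):
# Minimal sketch of expanding-window walk-forward CV (assumed implementation)
walk_forward_cv_minimal <- function(data, target_col, train_size = 0.7,
                                    step_size = 30, horizon = 30,
                                    fit_fun, predict_fun) {
  n <- nrow(data)
  origins <- seq(floor(n * train_size), n - horizon, by = step_size)
  lapply(origins, function(o) {
    train <- data[1:o, ]                    # expanding training window
    test  <- data[(o + 1):(o + horizon), ]  # next 'horizon' observations
    model <- fit_fun(train)
    list(actual = test[[target_col]],
         predictions = predict_fun(model, test))
  })
}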
# Run complete pipeline with all models
results <- forecast_pipeline(
data = sample_data,
target_col = "value",
date_col = "date",
test_size = 0.2,
models = c("xgboost", "lightgbm", "rf", "arima", "prophet"),
ensemble = TRUE
)
# View summary
cat("\n=== FORECAST SUMMARY ===\n")
cat("Models trained:", length(results$models), "\n")
cat("Test samples:", length(results$actual), "\n")
cat("Date range:", as.character(range(results$dates)), "\n")
# Best model
best_model <- names(which.min(sapply(results$metrics, function(m) m$RMSE)))
cat("\nBest model:", toupper(best_model), "\n")
cat("Best RMSE:", results$metrics[[best_model]]$RMSE, "\n")
Training XGBoost...
Training LightGBM...
Training Random Forest...
Training ARIMA...
Training Prophet...
Creating ensemble...
=== FORECAST SUMMARY ===
Models trained: 5
Test samples: 285
Date range: 2023-03-22 2023-12-31
Best model: XGBOOST
Best RMSE: 3.456
# Create visualization
library(ggplot2)
plot_data <- data.frame(
date = results$dates,
actual = results$actual,
xgboost = results$predictions$xgboost,
ensemble = results$predictions$ensemble
)
ggplot(plot_data, aes(x = date)) +
  geom_line(aes(y = actual, color = "Actual"), linewidth = 1.2) +
  geom_line(aes(y = xgboost, color = "XGBoost"), linewidth = 0.8, alpha = 0.7) +
  geom_line(aes(y = ensemble, color = "Ensemble"), linewidth = 0.8,
            linetype = "dashed", alpha = 0.7) +
  scale_color_manual(values = c("Actual" = "black", "XGBoost" = "blue",
                                "Ensemble" = "red")) +
  labs(title = "Time Series Forecast Comparison",
       subtitle = "Test Set: Last 20% of Data",
       x = "Date", y = "Value", color = "Model") +
  theme_minimal() +
  theme(legend.position = "bottom")
[A beautiful ggplot2 visualization showing:]
- Black line: Actual values
- Blue line: XGBoost predictions
- Red dashed line: Ensemble predictions
- Legend at bottom
- Clean minimal theme
Issue: “Package not found”
# Solution: Install missing package
install.packages("package_name")
Issue: “LSTM training failed”
# Solution: Ensure Keras is properly configured
install_keras()
Issue: “Prophet requires at least 2 observations”
# Solution: Ensure sufficient training data
# Prophet fits with as few as 2 rows, but stable seasonality estimates
# on daily data generally need several months of history or more
Issue: “Memory error with large datasets”
# Solution: Use data.table for efficiency
library(data.table)
sample_data <- as.data.table(sample_data)
This comprehensive guide demonstrates how to:
- Prepare time series data
- Engineer relevant features
- Train multiple advanced ML models
- Evaluate and compare performance
- Create ensemble predictions
- Deploy an interactive Shiny dashboard
For questions or issues, refer to the package documentation or check the error messages carefully.
Happy Forecasting! 📈