Load Data and Initial Look
library(quantmod)
library(timeSeries)
library(tseries)
library(forecast)
library(xts)
# Pull data from Yahoo Finance
getSymbols('SPY', from='2015-01-01', to='2025-04-23')
## [1] "SPY"
# Extract closing prices
SPY_Close_Prices <- SPY[,4]
# Plot closing prices
plot(SPY_Close_Prices, main="SPY Closing Prices (2015-2025)")

# ACF and PACF plots
par(mfrow=c(1,2))
Acf(SPY_Close_Prices, main='ACF of SPY Closing Prices')
Pacf(SPY_Close_Prices, main='PACF of SPY Closing Prices')

# Auto ARIMA model
auto.arima(SPY_Close_Prices, seasonal=FALSE)
## Series: SPY_Close_Prices
## ARIMA(4,1,1) with drift
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 drift
## -0.9391 -0.0332 -0.0121 -0.0951 0.8658 0.1246
## s.e. 0.0444 0.0272 0.0270 0.0207 0.0407 0.0706
##
## sigma^2 = 16.07: log likelihood = -7267.87
## AIC=14549.74 AICc=14549.79 BIC=14590.76
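Before moving on, it is worth verifying that the selected ARIMA(4,1,1) leaves roughly white-noise residuals. A minimal sketch, refitting under an illustrative name and using forecast's checkresiduals() (residual plot, ACF, and Ljung-Box test in one call):
# Hypothetical name; same auto.arima fit as above
fit_prices <- auto.arima(SPY_Close_Prices, seasonal = FALSE)
checkresiduals(fit_prices)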
Log Returns to Remove Non-Stationarity
# Compute log returns
logs <- diff(log(SPY_Close_Prices), lag=1)
logs <- logs[!is.na(logs)]
# Plot log returns
par(mfrow=c(1,1))
plot(logs, type='l', main='Log Returns Plot')
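As a side note, quantmod can produce the same series directly; a small sketch (the column name differs, and the first observation may differ by construction):
# Equivalent log-return series via quantmod's built-in helper
logs_qm <- dailyReturn(SPY_Close_Prices, type = "log")
head(merge(logs, logs_qm))  # the two series should agree after the first observation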

# ADF test
print(adf.test(logs))
## Warning in adf.test(logs): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: logs
## Dickey-Fuller = -13.793, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
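The ADF test (null: unit root) rejects non-stationarity; as a complementary check, a KPSS test (null: stationarity) should fail to reject. A sketch using tseries' kpss.test():
# KPSS complements ADF: here a large p-value supports stationarity
kpss.test(as.numeric(logs), null = "Level")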
# Auto ARIMA on log returns
auto.arima(logs, seasonal=FALSE)
## Series: logs
## ARIMA(5,0,2) with zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 ma2
## -1.6983 -0.8341 -0.0112 -0.0226 -0.0042 1.6173 0.7366
## s.e. 0.0805 0.0823 0.0422 0.0403 0.0266 0.0780 0.0663
##
## sigma^2 = 0.0001257: log likelihood = 7959.39
## AIC=-15902.78 AICc=-15902.72 BIC=-15855.9
Split Data into Training and Testing Sets
set.seed(109)
sample_size <- floor(0.80 * nrow(logs))
train_indices <- sample(seq_len(nrow(logs)), size=sample_size)
train <- logs[train_indices, ]
test <- logs[-train_indices, ]
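Note that sample() draws rows at random, which destroys the serial ordering ARIMA depends on; this is why auto.arima on the shuffled training set below selects ARIMA(0,0,0), i.e. white noise. A chronological split, sketched here with hypothetical names, preserves the dependence structure:
# Time-ordered 80/20 split (keeps autocorrelation intact)
split_idx <- floor(0.80 * nrow(logs))
train_ts <- logs[1:split_idx, ]
test_ts <- logs[(split_idx + 1):nrow(logs), ]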
# ACF and PACF plots
par(mfrow=c(1,2))
Acf(train, main='ACF of Training Log Returns')
Pacf(train, main='PACF of Training Log Returns')

# Auto ARIMA on training set
auto.arima(train, seasonal=FALSE)
## Series: train
## ARIMA(0,0,0) with zero mean
##
## sigma^2 = 0.0001334: log likelihood = 6303.54
## AIC=-12605.08 AICc=-12605.08 BIC=-12599.45
Model Fitting and Residual Analysis
# Fit ARIMA models
fit1 <- auto.arima(train, seasonal=FALSE)
tsdisplay(residuals(fit1), lag.max=40, main='ARIMA(0,0,0) Model Residuals (Training Returns)')

fit2 <- arima(train, order=c(4,1,2))
tsdisplay(residuals(fit2), lag.max=40, main='(4,1,2) Model Residuals')

fit3 <- auto.arima(SPY_Close_Prices, seasonal=FALSE)
tsdisplay(residuals(fit3), lag.max=40, main='ARIMA(4,1,1) Model Residuals (Closing Prices)')

fit4 <- arima(SPY_Close_Prices, order=c(1,1,1))
tsdisplay(residuals(fit4), lag.max=40, main='(1,1,1) Model Residuals on Original Data')
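A compact way to compare the four fits is a Ljung-Box test on each residual series; a sketch using stats' Box.test() (the lag of 20 is an illustrative choice):
# Ljung-Box p-values per model (large p-value: no residual autocorrelation left)
sapply(list(fit1 = fit1, fit2 = fit2, fit3 = fit3, fit4 = fit4),
       function(f) Box.test(residuals(f), lag = 20, type = "Ljung-Box")$p.value)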

Forecasting
par(mfrow=c(2,2))
period <- 100
fcast1 <- forecast(fit1, h=period)
plot(fcast1, main="Auto ARIMA (0,0,0) on Training Returns")
fcast2 <- forecast(fit2, h=period)
plot(fcast2, main="Custom ARIMA (4,1,2) on Training Returns")
fcast3 <- forecast(fit3, h=period)
plot(fcast3, main="Auto ARIMA (4,1,1) on Closing Prices")
fcast4 <- forecast(fit4, h=period)
plot(fcast4, main="Custom ARIMA (1,1,1) on Closing Prices")

Accuracy and Conclusion
accuracy(fcast1) # MPE/MAPE are uninformative here: log returns hover near zero
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0003176177 0.01154864 0.007542545 100 100 0.6935458
## ACF1
## Training set -0.009980763
accuracy(fcast2) # same caveat: percentage errors blow up when actual values are near zero
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.0001128358 0.01150001 0.007542242 NaN Inf 0.6935179 0.001256132
accuracy(fcast3) # MAPE of ~0.75% on prices, i.e. roughly 99.25% training accuracy
## ME RMSE MAE MPE MAPE MASE
## Training set -5.197405e-05 4.002719 2.581256 -0.01158895 0.7471461 1.000948
## ACF1
## Training set -0.0006094235
accuracy(fcast4) # MAPE of ~0.75%, essentially identical training accuracy
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1321011 4.035444 2.582789 0.03182545 0.7480869 1.001542
## ACF1
## Training set 0.003668324
fcast3
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2592 529.3481 524.2114 534.4847 521.4923 537.2039
## 2593 527.2724 520.2691 534.2758 516.5617 537.9831
## 2594 530.4419 521.8699 539.0140 517.3322 543.5517
## 2595 526.4963 516.7096 536.2829 511.5289 541.4636
## 2596 530.1808 519.4344 540.9272 513.7457 546.6160
## ... (rows 2597 through 2690 omitted) ...
## 2691 540.3924 494.1974 586.5875 469.7432 611.0416
fcast4
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2592 525.6595 520.4868 530.8321 517.7486 533.5703
## 2593 526.3222 519.3231 533.3213 515.6180 537.0264
## 2594 526.0461 517.4990 534.5932 512.9744 539.1177
## 2595 526.1611 516.3456 535.9767 511.1495 541.1727
## 2596 526.1132 515.1606 537.0657 509.3627 542.8637
## ... (rows 2597 through 2690 omitted) ...
## 2691 526.1273 477.6073 574.6473 451.9224 600.3322
XGBoost Modeling with Lagged Features
library(xgboost)
library(caret)
# Ensure 'logs' is numeric vector, not xts
logs_numeric <- as.numeric(logs)
# Create lagged features
lagged_data <- data.frame(
  Lag1 = dplyr::lag(logs_numeric, n = 1),
  Lag2 = dplyr::lag(logs_numeric, n = 2),
  Lag3 = dplyr::lag(logs_numeric, n = 3),
  Target = logs_numeric
)
# Remove NA rows
lagged_data <- na.omit(lagged_data)
# Check structure
str(lagged_data)
## 'data.frame': 2587 obs. of 4 variables:
## $ Lag1 : num 0.01238 0.01759 -0.00805 -0.00786 -0.00282 ...
## $ Lag2 : num -0.00946 0.01238 0.01759 -0.00805 -0.00786 ...
## $ Lag3 : num -0.01822 -0.00946 0.01238 0.01759 -0.00805 ...
## $ Target: num 0.01759 -0.00805 -0.00786 -0.00282 -0.00606 ...
## - attr(*, "na.action")= 'omit' Named int [1:3] 1 2 3
## ..- attr(*, "names")= chr [1:3] "1" "2" "3"
# Split into train/test
set.seed(109)
sample_size <- floor(0.80 * nrow(lagged_data))
train_indices <- sample(seq_len(nrow(lagged_data)), size = sample_size)
train_data <- lagged_data[train_indices, ]
test_data <- lagged_data[-train_indices, ]
# Create data matrices for XGBoost
train_matrix <- xgb.DMatrix(data = as.matrix(train_data[, c("Lag1", "Lag2", "Lag3")]),
                            label = train_data$Target)
test_matrix <- xgb.DMatrix(data = as.matrix(test_data[, c("Lag1", "Lag2", "Lag3")]),
                           label = test_data$Target)
# Set XGBoost parameters
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse"
)
# Train the model
xgb_model <- xgboost(params = params, data = train_matrix, nrounds = 100, verbose = 0)
# Predict
xgb_preds <- predict(xgb_model, test_matrix)
# Evaluate performance
xgb_rmse <- sqrt(mean((xgb_preds - test_data$Target)^2))
xgb_mae <- mean(abs(xgb_preds - test_data$Target))
cat("XGBoost RMSE:", round(xgb_rmse, 6), "\n")
## XGBoost RMSE: 0.011483
cat("XGBoost MAE:", round(xgb_mae, 6), "\n")
## XGBoost MAE: 0.007831
Compare ARIMA vs XGBoost
# Get ARIMA forecast values from fit1; note fit1 is ARIMA(0,0,0) with zero mean,
# so every forecast is exactly zero (a constant zero-return benchmark)
arima_preds <- forecast(fit1, h = nrow(test_data))$mean
arima_preds <- as.numeric(arima_preds)
arima_rmse <- sqrt(mean((arima_preds - test_data$Target)^2))
arima_mae <- mean(abs(arima_preds - test_data$Target))
comparison <- data.frame(
Model = c("ARIMA (fit1)", "XGBoost"),
RMSE = c(round(arima_rmse, 6), round(xgb_rmse, 6)),
MAE = c(round(arima_mae, 6), round(xgb_mae, 6))
)
print(comparison)
## Model RMSE MAE
## 1 ARIMA (fit1) 0.009418 0.006667
## 2 XGBoost 0.011483 0.007831
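Point-error metrics are only part of the story for returns; directional accuracy is often more relevant in practice. A sketch using the objects above:
# Share of test days where the predicted sign matches the realized sign
hit_rate <- mean(sign(xgb_preds) == sign(test_data$Target))
cat("XGBoost directional accuracy:", round(hit_rate, 4), "\n")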
XGBoost Model and Visualization
#–– 1. Load libraries
library(xgboost)
library(dplyr)
library(tidyr)
library(ggplot2)
library(xts)
library(zoo)
library(dygraphs)
#–– 2. Prepare lagged dataset
n_lags <- 5
df <- data.frame(
  Date = index(logs),
  Return = as.numeric(logs)
)
# create lags 1…n_lags
for (i in 1:n_lags) {
  df[[paste0("Lag", i)]] <- dplyr::lag(df$Return, n = i)
}
df <- tidyr::drop_na(df) # remove rows with any NA
#–– 3. Split into training (80%) and testing (20%)
split_pt <- floor(0.8 * nrow(df))
train_df <- df[ 1:split_pt, ]
test_df <- df[(split_pt+1):nrow(df), ]
#–– 4. Create xgb.DMatrix objects
dtrain <- xgb.DMatrix(
  data = as.matrix(train_df[, paste0("Lag", 1:n_lags)]),
  label = train_df$Return
)
dtest <- xgb.DMatrix(
  data = as.matrix(test_df[, paste0("Lag", 1:n_lags)]),
  label = test_df$Return
)
#–– 5. Train the model
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse"
)
xgb_mod <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  watchlist = list(train = dtrain, test = dtest),
  verbose = 0
)
#–– 6. Predict on test set
preds <- predict(xgb_mod, dtest)
#–– 7. Plot Actual vs Predicted
results <- data.frame(
  Date = test_df$Date,
  Actual = test_df$Return,
  Predicted = preds
)
ggplot(results, aes(x = Date)) +
  geom_line(aes(y = Actual, color = "Actual"), linewidth = 0.6) +
  geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 0.6, linetype = "dashed") +
  scale_color_manual(
    name = "Series",
    values = c(Actual = "steelblue", Predicted = "firebrick")
  ) +
  labs(
    title = "XGBoost: Actual vs Predicted Log Returns",
    y = "Log Return"
  ) +
  theme_minimal()

pred_xts <- xts(
  results[, c("Actual", "Predicted")],
  order.by = results$Date
)
dygraph(pred_xts, main = "Actual vs Predicted Log Returns") %>%
  dySeries("Actual", label = "Actual") %>%
  dySeries("Predicted", label = "Predicted", strokePattern = "dashed") %>%
  dyOptions(colors = c("steelblue", "firebrick"))
#–– 8. Plot Feature Importance
imp <- xgb.importance(
  feature_names = paste0("Lag", 1:n_lags),
  model = xgb_mod
)
xgb.plot.importance(imp, top_n = n_lags)

library(xgboost)
library(purrr)
library(dplyr)
# 2a) Define grid of parameters to try
param_grid <- expand.grid(
  max_depth = c(3, 5, 7),
  eta = c(0.01, 0.1, 0.3),
  subsample = c(0.7, 1.0),
  colsample_bytree = c(0.7, 1.0)
)
# 2b) Run 5-fold CV for each configuration and record its best RMSE
cv_results <- pmap_dfr(param_grid, function(max_depth, eta, subsample, colsample_bytree) {
  params <- list(
    objective = "reg:squarederror",
    eval_metric = "rmse",
    max_depth = max_depth,
    eta = eta,
    subsample = subsample,
    colsample_bytree = colsample_bytree
  )
  cv <- xgb.cv(
    params = params,
    data = dtrain,
    nrounds = 200,
    nfold = 5,
    early_stopping_rounds = 10,
    verbose = FALSE
  )
  tibble(
    max_depth = max_depth,
    eta = eta,
    subsample = subsample,
    colsample = colsample_bytree,
    best_nrounds = cv$best_iteration,
    cv_rmse = cv$evaluation_log[cv$best_iteration, test_rmse_mean]
  )
})
# 2c) See top 5 configs
top5 <- cv_results %>%
  arrange(cv_rmse) %>%
  head(5)
print(top5)
## # A tibble: 5 × 6
## max_depth eta subsample colsample best_nrounds cv_rmse
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 3 0.1 0.7 1 57 0.0118
## 2 3 0.1 0.7 0.7 70 0.0119
## 3 3 0.3 0.7 0.7 18 0.0119
## 4 7 0.3 1 0.7 18 0.0119
## 5 3 0.1 1 0.7 60 0.0120
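The winning configuration can then be refit on the full training matrix; a sketch assuming the first row of top5 holds the best settings:
# Refit with the best CV configuration (illustrative names)
best <- top5[1, ]
best_params <- list(
  objective = "reg:squarederror", eval_metric = "rmse",
  max_depth = best$max_depth, eta = best$eta,
  subsample = best$subsample, colsample_bytree = best$colsample
)
xgb_best <- xgb.train(params = best_params, data = dtrain,
                      nrounds = best$best_nrounds, verbose = 0)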
Add Rolling-Volatility and VIX Features (with Re-split)
library(zoo)
library(quantmod)
library(dplyr)
#— A) rolling 5‑day SD of returns
df$vol5 <- rollapply(df$Return, width = 5, FUN = sd,
                     fill = NA, align = "right")
#— B) fetch VIX log‐returns and merge
getSymbols("^VIX", from = "2015-01-01")
## [1] "VIX"
vix_ret <- na.omit(diff(log(Cl(VIX))))
vix_df <- tibble(
  Date = index(vix_ret),
  VIX_Return = as.numeric(coredata(vix_ret))  # coerce from matrix to a plain vector
)
df <- df %>%
  left_join(vix_df, by = "Date") %>%
  drop_na()  # removes any rows with NA in vol5 or VIX_Return
#— C) define features vector
features <- c(paste0("Lag", 1:n_lags), "vol5", "VIX_Return")
#— D) re‑split into train/test (80/20 by time order)
split_pt <- floor(0.8 * nrow(df))
train_df <- df[ 1:split_pt, ]
test_df <- df[(split_pt+1):nrow(df), ]
#— E) rebuild xgboost DMatrices
dtrain <- xgb.DMatrix(
  data = as.matrix(train_df[, features]),
  label = train_df$Return
)
dtest <- xgb.DMatrix(
  data = as.matrix(test_df[, features]),
  label = test_df$Return
)
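With the enriched matrices in place, the one-day-ahead model can be retrained and scored before moving to a longer horizon; a minimal sketch (xgb_feat is a hypothetical name):
# Retrain on lagged returns + vol5 + VIX_Return, then score the held-out period
xgb_feat <- xgb.train(params = params, data = dtrain, nrounds = 100, verbose = 0)
feat_rmse <- sqrt(mean((predict(xgb_feat, dtest) - test_df$Return)^2))
cat("Test RMSE with vol5 + VIX features:", round(feat_rmse, 6), "\n")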
# Shift the target to predict the return 'horizon' days ahead
horizon <- 5
df_h <- df %>%
  mutate(Target = dplyr::lead(Return, n = horizon)) %>%
  drop_na()
# rebuild train/test on df_h
split_pt <- floor(0.8 * nrow(df_h))
tr_h <- df_h[1:split_pt, ]
te_h <- df_h[(split_pt+1):nrow(df_h), ]
dtr_h <- xgb.DMatrix(as.matrix(tr_h[features]), label = tr_h$Target)
dte_h <- xgb.DMatrix(as.matrix(te_h[features]), label = te_h$Target)
# train & predict
mod_h <- xgb.train(params, data = dtr_h, nrounds = 100, watchlist = list(train=dtr_h))
## [1] train-rmse:0.350038
## [2] train-rmse:0.245224
## ... (iterations 3 through 98 omitted) ...
## [99] train-rmse:0.002454
## [100] train-rmse:0.002448
pred_h <- predict(mod_h, dte_h)
# plot
plot_df <- data.frame(
  Date = te_h$Date,
  Actual_h = te_h$Target,
  Predicted_h = pred_h
)
ggplot(plot_df, aes(Date)) +
  geom_line(aes(y = Actual_h, color = "Actual")) +
  geom_line(aes(y = Predicted_h, color = "Pred"), linetype = "dashed") +
  labs(title = paste0(horizon, "-Day Ahead Forecast"), y = "Log Return") +
  theme_minimal()
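The horizon model's fit can also be summarized numerically; a sketch of its out-of-sample RMSE:
# Out-of-sample RMSE for the 5-day-ahead model
h_rmse <- sqrt(mean((pred_h - te_h$Target)^2))
cat(horizon, "-day-ahead test RMSE: ", round(h_rmse, 6), "\n", sep = "")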

# Rolling-window CV via caret's "timeslice": with initialWindow = 0.8*N and
# horizon = 0.2*N this yields essentially one chronological train/test slice
library(caret)
ts_ctrl <- trainControl(
  method = "timeslice",
  initialWindow = floor(0.8 * nrow(df)),
  horizon = floor(0.2 * nrow(df)),
  fixedWindow = TRUE
)
# define a small tuneGrid if desired
tuneGrid <- expand.grid(
  nrounds = 100,
  max_depth = 5,
  eta = 0.1,
  gamma = 0,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample = 0.8
)
caret_mod <- train(
  x = df[features],
  y = df$Return,
  method = "xgbTree",
  trControl = ts_ctrl,
  tuneGrid = tuneGrid,
  metric = "RMSE"
)
caret_mod$results
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1 100 5 0.1 0 0.8 1 0.8
## RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.006541774 0.5903679 0.004105603 8.850498e-06 0.002748341 4.218602e-05
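Finally, the caret-tuned model can score new observations through the standard predict() interface; a minimal sketch using the most recent feature row:
# One-step prediction from the latest available features (illustrative)
latest <- df[nrow(df), features]
predict(caret_mod, newdata = latest)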