Load Data and Initial Look

library(quantmod)
## Warning: package 'quantmod' was built under R version 4.4.3
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(timeSeries)
## Loading required package: timeDate
## 
## Attaching package: 'timeSeries'
## The following object is masked from 'package:zoo':
## 
##     time<-
## The following objects are masked from 'package:graphics':
## 
##     lines, points
library(tseries)
library(forecast)
library(xts)

# Pull data from Yahoo Finance
getSymbols('SPY', from='2015-01-01', to='2025-04-23')
## [1] "SPY"
# Extract closing prices
SPY_Close_Prices <- SPY[,4]

# Plot closing prices
plot(SPY_Close_Prices, main="SPY Closing Prices (2015-2025)")

# ACF and PACF plots
par(mfrow=c(1,2))
Acf(SPY_Close_Prices, main='ACF for SPY Closing Prices')
Pacf(SPY_Close_Prices, main='PACF for SPY Closing Prices')
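
As a quick cross-check on the degree of differencing required, forecast::ndiffs estimates it directly; for price levels like these it typically returns 1, matching the d = 1 that auto.arima selects below. A minimal sketch:

# Estimate how many differences are needed for stationarity (typically 1 for price levels)
ndiffs(SPY_Close_Prices)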

# Auto ARIMA model
auto.arima(SPY_Close_Prices, seasonal=FALSE)
## Series: SPY_Close_Prices 
## ARIMA(4,1,1) with drift 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4     ma1   drift
##       -0.9391  -0.0332  -0.0121  -0.0951  0.8658  0.1246
## s.e.   0.0444   0.0272   0.0270   0.0207  0.0407  0.0706
## 
## sigma^2 = 16.07:  log likelihood = -7267.87
## AIC=14549.74   AICc=14549.79   BIC=14590.76

Log Returns to Remove Non-Stationarity

# Compute log returns
logs <- diff(log(SPY_Close_Prices), lag=1)
logs <- logs[!is.na(logs)]

# Plot log returns
par(mfrow=c(1,1))
plot(logs, type='l', main='Log Returns Plot')

# ADF test
print(adf.test(logs))
## Warning in adf.test(logs): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  logs
## Dickey-Fuller = -13.793, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
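
As a complementary check, the KPSS test (also from tseries) reverses the hypotheses: its null is stationarity, so a large p-value here would corroborate the ADF result. A minimal sketch:

# KPSS test: the null hypothesis is level stationarity, so failing to reject supports stationarity
kpss.test(as.numeric(logs))
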
# Auto ARIMA on log returns
auto.arima(logs, seasonal=FALSE)
## Series: logs 
## ARIMA(5,0,2) with zero mean 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5     ma1     ma2
##       -1.6983  -0.8341  -0.0112  -0.0226  -0.0042  1.6173  0.7366
## s.e.   0.0805   0.0823   0.0422   0.0403   0.0266  0.0780  0.0663
## 
## sigma^2 = 0.0001257:  log likelihood = 7959.39
## AIC=-15902.78   AICc=-15902.72   BIC=-15855.9

Split Data into Training and Testing Sets

# Random 80/20 split (note: sampling shuffles away temporal order; a
# chronological alternative is sketched after this chunk)
set.seed(109)
sample_size <- floor(0.80 * nrow(logs))
train_indices <- sample(seq_len(nrow(logs)), size=sample_size)
train <- logs[train_indices, ]
test <- logs[-train_indices, ]
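
Because log returns are a time series, an alternative is to hold out the most recent 20% of observations rather than a random sample; the XGBoost sections later in this report use that style of split. A minimal sketch:

# Chronological 80/20 split: train on earlier observations, test on the most recent
split_point <- floor(0.80 * nrow(logs))
train_chrono <- logs[1:split_point, ]
test_chrono  <- logs[(split_point + 1):nrow(logs), ]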

# ACF and PACF plots
par(mfrow=c(1,2))
Acf(train, main='ACF for Log Returns (Training Sample)')
Pacf(train, main='PACF for Log Returns (Training Sample)')

# Auto ARIMA on training set
auto.arima(train, seasonal=FALSE)
## Series: train 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 0.0001334:  log likelihood = 6303.54
## AIC=-12605.08   AICc=-12605.08   BIC=-12599.45

Model Fitting and Residual Analysis

# Fit ARIMA models. Note that auto.arima on the shuffled training sample
# selects ARIMA(0,0,0): random sampling destroys the serial correlation.
fit1 <- auto.arima(train, seasonal=FALSE)
tsdisplay(residuals(fit1), lag.max=40, main='Auto ARIMA (0,0,0) Model Residuals')

fit2 <- arima(train, order=c(4,1,2))
tsdisplay(residuals(fit2), lag.max=40, main='(4,1,2) Model Residuals')

fit3 <- auto.arima(SPY_Close_Prices, seasonal=FALSE)
tsdisplay(residuals(fit3), lag.max=40, main='Original Data Model Residuals')

fit4 <- arima(SPY_Close_Prices, order=c(1,1,1))
tsdisplay(residuals(fit4), lag.max=40, main='(1,1,1) Model Residuals on Original Data')
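
To test the residuals more formally than by eye, a Ljung-Box test checks for remaining autocorrelation; a small p-value would suggest the model has not captured all the serial structure. A minimal sketch for fit4 (fitdf = 2 accounts for the AR and MA coefficients of the (1,1,1) model):

# Ljung-Box test on the (1,1,1) residuals; a large p-value means no evidence of leftover autocorrelation
Box.test(residuals(fit4), lag = 20, type = "Ljung-Box", fitdf = 2)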

Forecasting

par(mfrow=c(2,2))
period <- 100

fcast1 <- forecast(fit1, h=period)
plot(fcast1, main="Auto ARIMA (0,0,0)")

fcast2 <- forecast(fit2, h=period)
plot(fcast2, main="Custom ARIMA (4,1,2)")

fcast3 <- forecast(fit3, h=period)
plot(fcast3, main="Auto ARIMA (4,1,1) on Original Data")

fcast4 <- forecast(fit4, h=period)
plot(fcast4, main="Custom ARIMA (1,1,1) on Original Data")

Accuracy and Conclusion

accuracy(fcast1) # fitted values are identically zero, so MPE/MAPE degenerate to 100
##                        ME       RMSE         MAE MPE MAPE      MASE
## Training set 0.0003176177 0.01154864 0.007542545 100  100 0.6935458
##                      ACF1
## Training set -0.009980763
accuracy(fcast2) # similarly flat long-run forecast; MPE/MAPE are NaN/Inf because returns cross zero
##                        ME       RMSE         MAE MPE MAPE      MASE        ACF1
## Training set 0.0001128358 0.01150001 0.007542242 NaN  Inf 0.6935179 0.001256132
accuracy(fcast3) # MAPE of about 0.75% on training prices, i.e. roughly 99.25% accuracy
##                         ME     RMSE      MAE         MPE      MAPE     MASE
## Training set -5.197405e-05 4.002719 2.581256 -0.01158895 0.7471461 1.000948
##                       ACF1
## Training set -0.0006094235
accuracy(fcast4) # MAPE of about 0.75% as well, essentially tied with the auto model
##                     ME     RMSE      MAE        MPE      MAPE     MASE
## Training set 0.1321011 4.035444 2.582789 0.03182545 0.7480869 1.001542
##                     ACF1
## Training set 0.003668324
fcast3
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2592       529.3481 524.2114 534.4847 521.4923 537.2039
## 2593       527.2724 520.2691 534.2758 516.5617 537.9831
## 2594       530.4419 521.8699 539.0140 517.3322 543.5517
## 2595       526.4963 516.7096 536.2829 511.5289 541.4636
## 2596       530.1808 519.4344 540.9272 513.7457 546.6160
## 2597       527.2703 515.5227 539.0180 509.3039 545.2368
## 2598       529.8863 517.3231 542.4494 510.6726 549.1000
## 2599       528.1166 514.6883 541.5450 507.5797 548.6535
## 2600       529.6352 515.4719 543.7986 507.9743 551.2962
## 2601       528.7725 513.8521 543.6929 505.9537 551.5913
## 2602       529.5638 513.9659 545.1617 505.7089 553.4188
## 2603       529.2587 512.9817 545.5357 504.3652 554.1522
## 2604       529.6441 512.7353 546.5528 503.7844 555.5038
## 2605       529.6240 512.0933 547.1548 502.8131 556.4350
## 2606       529.8176 511.6942 547.9410 502.1003 557.5350
## 2607       529.9201 511.2182 548.6220 501.3180 558.5222
## 2608       530.0402 510.7796 549.3008 500.5836 559.4968
## 2609       530.1828 510.3781 549.9874 499.8942 560.4713
## 2610       530.2844 509.9506 550.6183 499.1865 561.3823
## 2611       530.4322 509.5828 551.2817 498.5457 562.3187
## 2612       530.5361 509.1831 551.8891 497.8796 563.1926
## 2613       530.6780 508.8336 552.5225 497.2698 564.0863
## 2614       530.7890 508.4635 553.1146 496.6450 564.9330
## 2615       530.9240 508.1278 553.7201 496.0603 565.7877
## 2616       531.0412 507.7837 554.2987 495.4719 566.6104
## 2617       531.1710 507.4613 554.8806 494.9102 567.4318
## 2618       531.2922 507.1387 555.4457 494.3526 568.2318
## 2619       531.4190 506.8297 556.0082 493.8130 569.0250
## 2620       531.5424 506.5249 556.5598 493.2815 569.8033
## 2621       531.6677 506.2293 557.1061 492.7630 570.5724
## 2622       531.7920 505.9395 557.6446 492.2540 571.3301
## 2623       531.9167 505.6566 558.1769 491.7553 572.0782
## 2624       532.0414 505.3799 558.7030 491.2662 572.8167
## 2625       532.1660 505.1090 559.2229 490.7859 573.5460
## 2626       532.2907 504.8441 559.7374 490.3147 574.2668
## 2627       532.4152 504.5843 560.2461 489.8515 574.9789
## 2628       532.5400 504.3301 560.7499 489.3966 575.6834
## 2629       532.6645 504.0806 561.2485 488.9492 576.3799
## 2630       532.7893 503.8362 561.7424 488.5093 577.0692
## 2631       532.9139 503.5962 562.2315 488.0764 577.7513
## 2632       533.0385 503.3609 562.7162 487.6505 578.4266
## 2633       533.1632 503.1297 563.1966 487.2310 579.0953
## 2634       533.2878 502.9028 563.6728 486.8180 579.7576
## 2635       533.4124 502.6799 564.1450 486.4111 580.4138
## 2636       533.5371 502.4609 564.6133 486.0102 581.0640
## 2637       533.6617 502.2457 565.0778 485.6150 581.7085
## 2638       533.7864 502.0340 565.5387 485.2254 582.3474
## 2639       533.9110 501.8260 565.9961 484.8412 582.9809
## 2640       534.0357 501.6213 566.4500 484.4621 583.6092
## 2641       534.1603 501.4199 566.9007 484.0882 584.2324
## 2642       534.2849 501.2218 567.3481 483.7192 584.8507
## 2643       534.4096 501.0267 567.7924 483.3549 585.4643
## 2644       534.5342 500.8347 568.2337 482.9953 586.0732
## 2645       534.6589 500.6457 568.6721 482.6402 586.6776
## 2646       534.7835 500.4595 569.1075 482.2895 587.2776
## 2647       534.9082 500.2761 569.5402 481.9430 587.8733
## 2648       535.0328 500.0954 569.9702 481.6007 588.4649
## 2649       535.1574 499.9174 570.3975 481.2624 589.0524
## 2650       535.2821 499.7419 570.8222 480.9281 589.6360
## 2651       535.4067 499.5690 571.2445 480.5976 590.2158
## 2652       535.5314 499.3985 571.6642 480.2709 590.7918
## 2653       535.6560 499.2304 572.0816 479.9479 591.3642
## 2654       535.7807 499.0646 572.4967 479.6284 591.9330
## 2655       535.9053 498.9011 572.9094 479.3123 592.4983
## 2656       536.0299 498.7399 573.3200 478.9997 593.0601
## 2657       536.1546 498.5808 573.7284 478.6904 593.6187
## 2658       536.2792 498.4238 574.1346 478.3844 594.1740
## 2659       536.4039 498.2690 574.5388 478.0816 594.7262
## 2660       536.5285 498.1161 574.9409 477.7818 595.2752
## 2661       536.6531 497.9653 575.3410 477.4851 595.8212
## 2662       536.7778 497.8164 575.7392 477.1914 596.3642
## 2663       536.9024 497.6694 576.1355 476.9006 596.9043
## 2664       537.0271 497.5242 576.5299 476.6127 597.4415
## 2665       537.1517 497.3809 576.9225 476.3275 597.9759
## 2666       537.2764 497.2394 577.3133 476.0451 598.5076
## 2667       537.4010 497.0997 577.7023 475.7654 599.0366
## 2668       537.5256 496.9616 578.0897 475.4883 599.5630
## 2669       537.6503 496.8253 578.4753 475.2138 600.0867
## 2670       537.7749 496.6906 578.8593 474.9419 600.6080
## 2671       537.8996 496.5576 579.2416 474.6724 601.1267
## 2672       538.0242 496.4261 579.6223 474.4054 601.6430
## 2673       538.1489 496.2962 580.0015 474.1408 602.1570
## 2674       538.2735 496.1679 580.3791 473.8785 602.6685
## 2675       538.3981 496.0410 580.7553 473.6185 603.1778
## 2676       538.5228 495.9157 581.1299 473.3608 603.6848
## 2677       538.6474 495.7918 581.5031 473.1053 604.1895
## 2678       538.7721 495.6693 581.8748 472.8521 604.6921
## 2679       538.8967 495.5482 582.2452 472.6009 605.1925
## 2680       539.0214 495.4286 582.6141 472.3519 605.6908
## 2681       539.1460 495.3103 582.9817 472.1050 606.1870
## 2682       539.2706 495.1933 583.3480 471.8601 606.6811
## 2683       539.3953 495.0776 583.7129 471.6173 607.1733
## 2684       539.5199 494.9633 584.0766 471.3764 607.6634
## 2685       539.6446 494.8502 584.4389 471.1375 608.1517
## 2686       539.7692 494.7384 584.8001 470.9005 608.6379
## 2687       539.8938 494.6278 585.1599 470.6654 609.1223
## 2688       540.0185 494.5184 585.5186 470.4321 609.6049
## 2689       540.1431 494.4102 585.8761 470.2007 610.0856
## 2690       540.2678 494.3032 586.2323 469.9710 610.5645
## 2691       540.3924 494.1974 586.5875 469.7432 611.0416
fcast4
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2592       525.6595 520.4868 530.8321 517.7486 533.5703
## 2593       526.3222 519.3231 533.3213 515.6180 537.0264
## 2594       526.0461 517.4990 534.5932 512.9744 539.1177
## 2595       526.1611 516.3456 535.9767 511.1495 541.1727
## 2596       526.1132 515.1606 537.0657 509.3627 542.8637
## 2597       526.1332 514.1566 538.1097 507.8166 544.4497
## 2598       526.1248 513.2031 539.0466 506.3627 545.8870
## 2599       526.1283 512.3267 539.9299 505.0206 547.2360
## 2600       526.1269 511.4979 540.7558 503.7538 548.4999
## 2601       526.1275 510.7157 541.5392 502.5572 549.6977
## 2602       526.1272 509.9704 542.2840 501.4175 550.8369
## 2603       526.1273 509.2584 542.9962 500.3285 551.9261
## 2604       526.1273 508.5751 543.6794 499.2835 552.9710
## 2605       526.1273 507.9175 544.3371 498.2778 553.9768
## 2606       526.1273 507.2828 544.9718 497.3071 554.9475
## 2607       526.1273 506.6688 545.5858 496.3680 555.8865
## 2608       526.1273 506.0735 546.1810 495.4577 556.7968
## 2609       526.1273 505.4955 546.7591 494.5737 557.6809
## 2610       526.1273 504.9332 547.3214 493.7137 558.5409
## 2611       526.1273 504.3854 547.8691 492.8760 559.3786
## 2612       526.1273 503.8511 548.4034 492.0589 560.1957
## 2613       526.1273 503.3294 548.9252 491.2609 560.9937
## 2614       526.1273 502.8193 549.4353 490.4807 561.7738
## 2615       526.1273 502.3201 549.9345 489.7173 562.5372
## 2616       526.1273 501.8312 550.4234 488.9696 563.2850
## 2617       526.1273 501.3519 550.9027 488.2366 564.0180
## 2618       526.1273 500.8817 551.3728 487.5175 564.7370
## 2619       526.1273 500.4202 551.8344 486.8116 565.4429
## 2620       526.1273 499.9667 552.2878 486.1182 566.1364
## 2621       526.1273 499.5210 552.7335 485.4365 566.8181
## 2622       526.1273 499.0827 553.1719 484.7661 567.4885
## 2623       526.1273 498.6513 553.6033 484.1064 568.1482
## 2624       526.1273 498.2266 554.0280 483.4568 568.7977
## 2625       526.1273 497.8083 554.4463 482.8171 569.4375
## 2626       526.1273 497.3960 554.8586 482.1866 570.0680
## 2627       526.1273 496.9896 555.2650 481.5650 570.6895
## 2628       526.1273 496.5888 555.6658 480.9520 571.3025
## 2629       526.1273 496.1933 556.0612 480.3473 571.9073
## 2630       526.1273 495.8030 556.4515 479.7503 572.5042
## 2631       526.1273 495.4177 556.8369 479.1610 573.0935
## 2632       526.1273 495.0371 557.2174 478.5790 573.6756
## 2633       526.1273 494.6612 557.5934 478.0040 574.2505
## 2634       526.1273 494.2897 557.9649 477.4358 574.8187
## 2635       526.1273 493.9224 558.3321 476.8742 575.3804
## 2636       526.1273 493.5593 558.6952 476.3189 575.9357
## 2637       526.1273 493.2002 559.0543 475.7697 576.4848
## 2638       526.1273 492.8450 559.4095 475.2265 577.0281
## 2639       526.1273 492.4936 559.7610 474.6890 577.5656
## 2640       526.1273 492.1457 560.1088 474.1570 578.0976
## 2641       526.1273 491.8014 560.4531 473.6304 578.6241
## 2642       526.1273 491.4606 560.7940 473.1091 579.1455
## 2643       526.1273 491.1230 561.1316 472.5928 579.6617
## 2644       526.1273 490.7886 561.4659 472.0815 580.1731
## 2645       526.1273 490.4574 561.7971 471.5750 580.6796
## 2646       526.1273 490.1293 562.1253 471.0731 581.1815
## 2647       526.1273 489.8041 562.4505 470.5757 581.6788
## 2648       526.1273 489.4818 562.7728 470.0828 582.1718
## 2649       526.1273 489.1623 563.0923 469.5942 582.6604
## 2650       526.1273 488.8455 563.4091 469.1097 583.1448
## 2651       526.1273 488.5314 563.7231 468.6294 583.6252
## 2652       526.1273 488.2199 564.0346 468.1530 584.1016
## 2653       526.1273 487.9110 564.3436 467.6805 584.5741
## 2654       526.1273 487.6045 564.6501 467.2118 585.0428
## 2655       526.1273 487.3004 564.9541 466.7468 585.5078
## 2656       526.1273 486.9988 565.2558 466.2854 585.9692
## 2657       526.1273 486.6994 565.5552 465.8275 586.4271
## 2658       526.1273 486.4022 565.8523 465.3731 586.8815
## 2659       526.1273 486.1073 566.1472 464.9220 587.3325
## 2660       526.1273 485.8146 566.4400 464.4743 587.7803
## 2661       526.1273 485.5239 566.7307 464.0298 588.2248
## 2662       526.1273 485.2353 567.0192 463.5884 588.6661
## 2663       526.1273 484.9488 567.3058 463.1502 589.1044
## 2664       526.1273 484.6642 567.5904 462.7149 589.5396
## 2665       526.1273 484.3815 567.8730 462.2827 589.9719
## 2666       526.1273 484.1008 568.1538 461.8533 590.4013
## 2667       526.1273 483.8219 568.4327 461.4268 590.8278
## 2668       526.1273 483.5448 568.7097 461.0031 591.2515
## 2669       526.1273 483.2696 568.9850 460.5821 591.6725
## 2670       526.1273 482.9961 569.2585 460.1638 592.0908
## 2671       526.1273 482.7243 569.5303 459.7481 592.5064
## 2672       526.1273 482.4542 569.8004 459.3351 592.9195
## 2673       526.1273 482.1858 570.0688 458.9245 593.3300
## 2674       526.1273 481.9190 570.3356 458.5165 593.7381
## 2675       526.1273 481.6538 570.6008 458.1109 594.1436
## 2676       526.1273 481.3901 570.8644 457.7077 594.5468
## 2677       526.1273 481.1281 571.1265 457.3069 594.9477
## 2678       526.1273 480.8675 571.3871 456.9084 595.3462
## 2679       526.1273 480.6084 571.6462 456.5122 595.7424
## 2680       526.1273 480.3508 571.9038 456.1182 596.1364
## 2681       526.1273 480.0946 572.1599 455.7264 596.5281
## 2682       526.1273 479.8399 572.4147 455.3368 596.9177
## 2683       526.1273 479.5865 572.6680 454.9493 597.3052
## 2684       526.1273 479.3345 572.9200 454.5640 597.6906
## 2685       526.1273 479.0839 573.1707 454.1807 598.0739
## 2686       526.1273 478.8346 573.4200 453.7994 598.4552
## 2687       526.1273 478.5866 573.6680 453.4201 598.8345
## 2688       526.1273 478.3399 573.9147 453.0428 599.2118
## 2689       526.1273 478.0945 574.1601 452.6674 599.5872
## 2690       526.1273 477.8503 574.4043 452.2939 599.9606
## 2691       526.1273 477.6073 574.6473 451.9224 600.3322

XGBoost Modeling with Lagged Features

library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
# Ensure 'logs' is numeric vector, not xts
logs_numeric <- as.numeric(logs)

# Create lagged features
lagged_data <- data.frame(
  Lag1 = dplyr::lag(logs_numeric, n = 1),
  Lag2 = dplyr::lag(logs_numeric, n = 2),
  Lag3 = dplyr::lag(logs_numeric, n = 3),
  Target = logs_numeric
)

# Remove NA rows
lagged_data <- na.omit(lagged_data)

# Check structure
str(lagged_data)
## 'data.frame':    2587 obs. of  4 variables:
##  $ Lag1  : num  0.01238 0.01759 -0.00805 -0.00786 -0.00282 ...
##  $ Lag2  : num  -0.00946 0.01238 0.01759 -0.00805 -0.00786 ...
##  $ Lag3  : num  -0.01822 -0.00946 0.01238 0.01759 -0.00805 ...
##  $ Target: num  0.01759 -0.00805 -0.00786 -0.00282 -0.00606 ...
##  - attr(*, "na.action")= 'omit' Named int [1:3] 1 2 3
##   ..- attr(*, "names")= chr [1:3] "1" "2" "3"
# Split into train/test
set.seed(109)
sample_size <- floor(0.80 * nrow(lagged_data))
train_indices <- sample(seq_len(nrow(lagged_data)), size = sample_size)
train_data <- lagged_data[train_indices, ]
test_data <- lagged_data[-train_indices, ]

# Create data matrices for XGBoost
train_matrix <- xgb.DMatrix(data = as.matrix(train_data[, c("Lag1", "Lag2", "Lag3")]),
                            label = train_data$Target)
test_matrix <- xgb.DMatrix(data = as.matrix(test_data[, c("Lag1", "Lag2", "Lag3")]),
                           label = test_data$Target)

# Set XGBoost parameters
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse"
)

# Train the model
xgb_model <- xgboost(params = params, data = train_matrix, nrounds = 100, verbose = 0)

# Predict
xgb_preds <- predict(xgb_model, test_matrix)

# Evaluate performance
xgb_rmse <- sqrt(mean((xgb_preds - test_data$Target)^2))
xgb_mae <- mean(abs(xgb_preds - test_data$Target))

cat("XGBoost RMSE:", round(xgb_rmse, 6), "\n")
## XGBoost RMSE: 0.011483
cat("XGBoost MAE:", round(xgb_mae, 6), "\n")
## XGBoost MAE: 0.007831

Compare ARIMA vs XGBoost

# Get ARIMA forecast values from fit1
arima_preds <- forecast(fit1, h = nrow(test_data))$mean
arima_preds <- as.numeric(arima_preds)
arima_rmse <- sqrt(mean((arima_preds - test_data$Target)^2))
arima_mae <- mean(abs(arima_preds - test_data$Target))

comparison <- data.frame(
  Model = c("ARIMA (fit1)", "XGBoost"),
  RMSE = c(round(arima_rmse, 6), round(xgb_rmse, 6)),
  MAE = c(round(arima_mae, 6), round(xgb_mae, 6))
)

print(comparison)
##          Model     RMSE      MAE
## 1 ARIMA (fit1) 0.009418 0.006667
## 2      XGBoost 0.011483 0.007831
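
This comparison favors ARIMA somewhat artificially: fit1 is ARIMA(0,0,0) with zero mean, so its "forecast" is a constant zero, and the test rows were drawn at random rather than held out chronologically. A useful sanity check is a naive always-zero baseline on the same test set; a minimal sketch:

# Naive baseline: always predict a zero return (identical to the ARIMA(0,0,0) point forecast)
naive_rmse <- sqrt(mean((0 - test_data$Target)^2))
naive_mae  <- mean(abs(0 - test_data$Target))
cat("Naive zero-return RMSE:", round(naive_rmse, 6), "\n")
cat("Naive zero-return MAE:", round(naive_mae, 6), "\n")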

XGBoost Model and Visualization

#–– 1. Load libraries
library(xgboost)
library(dplyr)
library(tidyr)
library(ggplot2)
library(xts)
library(zoo)
library(dygraphs)

#–– 2. Prepare lagged dataset
n_lags <- 5
df <- data.frame(
  Date   = index(logs),
  Return = as.numeric(logs)
)

# create lags 1…n_lags
for(i in 1:n_lags) {
  df[[ paste0("Lag", i) ]] <- dplyr::lag(df$Return, n = i)
}
df <- tidyr::drop_na(df)   # remove rows with any NA

#–– 3. Split into training (80%) and testing (20%)
split_pt   <- floor(0.8 * nrow(df))
train_df   <- df[      1:split_pt, ]
test_df    <- df[(split_pt+1):nrow(df), ]

#–– 4. Create xgb.DMatrix objects
dtrain <- xgb.DMatrix(
  data  = as.matrix(train_df[ , paste0("Lag", 1:n_lags)]),
  label = train_df$Return
)
dtest <- xgb.DMatrix(
  data  = as.matrix(test_df[ , paste0("Lag", 1:n_lags)]),
  label = test_df$Return
)

#–– 5. Train the model
params <- list(
  objective   = "reg:squarederror",
  eval_metric = "rmse"
)
xgb_mod <- xgb.train(
  params    = params,
  data      = dtrain,
  nrounds   = 100,
  watchlist = list(train = dtrain, test = dtest),
  verbose   = 0
)

#–– 6. Predict on test set
preds <- predict(xgb_mod, dtest)

#–– 7. Plot Actual vs Predicted
results <- data.frame(
  Date      = test_df$Date,
  Actual    = test_df$Return,
  Predicted = preds
)
ggplot(results, aes(x = Date)) +
  geom_line(aes(y = Actual,    color = "Actual"),    linewidth = 0.6) +
  geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 0.6, linetype = "dashed") +
  scale_color_manual(
    name   = "Series",
    values = c(Actual = "steelblue", Predicted = "firebrick")
  ) +
  labs(
    title = "XGBoost: Actual vs Predicted Log Returns",
    y     = "Log Return"
  ) +
  theme_minimal()
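
For a trading-oriented view, directional accuracy (how often the predicted sign matches the realized sign) is often more informative than RMSE on near-zero returns. A minimal sketch on the same test set:

# Share of test days where the predicted direction matches the actual direction
hit_rate <- mean(sign(results$Predicted) == sign(results$Actual))
cat("Directional accuracy:", round(hit_rate, 4), "\n")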

pred_xts <- xts(
  results[, c("Actual","Predicted")],
  order.by = results$Date
)

dygraph(pred_xts, main = "Actual vs Predicted Log Returns") %>%
  dySeries("Actual",    label = "Actual") %>%
  dySeries("Predicted", label = "Predicted", strokePattern = "dashed") %>%
  dyOptions(colors = c("steelblue","firebrick"))
#–– 8. Plot Feature Importance
imp <- xgb.importance(
  feature_names = paste0("Lag", 1:n_lags),
  model         = xgb_mod
)
xgb.plot.importance(imp, top_n = n_lags)

library(xgboost)
library(purrr)
library(dplyr) 

# 2a) Define grid of parameters to try
param_grid <- expand.grid(
  max_depth        = c(3, 5, 7),
  eta              = c(0.01, 0.1, 0.3),
  subsample        = c(0.7, 1.0),
  colsample_bytree = c(0.7, 1.0)
)

# 2b) Run 5-fold CV for each grid configuration and record its best iteration and RMSE
cv_results <- pmap_dfr(param_grid, function(max_depth, eta, subsample, colsample_bytree) {
  params <- list(
    objective        = "reg:squarederror",
    eval_metric      = "rmse",
    max_depth        = max_depth,
    eta              = eta,
    subsample        = subsample,
    colsample_bytree = colsample_bytree
  )
  cv <- xgb.cv(
    params              = params,
    data                = dtrain,
    nrounds             = 200,
    nfold               = 5,
    early_stopping_rounds = 10,
    verbose             = FALSE
  )
  tibble(
    max_depth = max_depth,
    eta       = eta,
    subsample = subsample,
    colsample = colsample_bytree,
    best_nrounds = cv$best_iteration,
    cv_rmse      = cv$evaluation_log[cv$best_iteration, test_rmse_mean]
  )
})

# 2c) See top 5 configs
top5 <- cv_results %>% 
  arrange(cv_rmse) %>% 
    head(5)
print(top5)
## # A tibble: 5 × 6
##   max_depth   eta subsample colsample best_nrounds cv_rmse
##       <dbl> <dbl>     <dbl>     <dbl>        <int>   <dbl>
## 1         3   0.1       0.7       1             57  0.0118
## 2         3   0.1       0.7       0.7           70  0.0119
## 3         3   0.3       0.7       0.7           18  0.0119
## 4         7   0.3       1         0.7           18  0.0119
## 5         3   0.1       1         0.7           60  0.0120
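
A final model would typically be refit with the best configuration from the grid search. A sketch using the top row of top5 above (values hard-coded here for illustration):

# Refit on the full training matrix with the best CV configuration (row 1 of top5)
best_params <- list(
  objective        = "reg:squarederror",
  eval_metric      = "rmse",
  max_depth        = 3,
  eta              = 0.1,
  subsample        = 0.7,
  colsample_bytree = 1.0
)
xgb_best <- xgb.train(params = best_params, data = dtrain, nrounds = 57, verbose = 0)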

Add Rolling-Volatility and VIX Features (with Re-Split)

library(zoo)
library(quantmod)
library(dplyr)

#— A) rolling 5‑day SD of returns
df$vol5 <- rollapply(df$Return, width = 5, FUN = sd,
                     fill = NA, align = "right")

#— B) fetch VIX log‐returns and merge
getSymbols("^VIX", from = "2015-01-01")
## [1] "VIX"
vix_ret <- na.omit(diff(log(Cl(VIX))))
vix_df  <- tibble(
  Date       = index(vix_ret),
  VIX_Return = as.numeric(coredata(vix_ret))  # flatten to a plain vector, not a 1-column matrix
)

df <- df %>%
  left_join(vix_df, by = "Date") %>%
  drop_na()   # removes any rows w/ NA in vol5 or VIX_Return

#— C) define features vector
features <- c(paste0("Lag", 1:n_lags), "vol5", "VIX_Return")

#— D) re‑split into train/test (80/20 by time order)
split_pt <- floor(0.8 * nrow(df))
train_df <- df[       1:split_pt, ]
test_df  <- df[(split_pt+1):nrow(df), ]

#— E) rebuild xgboost DMatrices
dtrain <- xgb.DMatrix(
  data  = as.matrix(train_df[, features]),
  label = train_df$Return
)
dtest  <- xgb.DMatrix(
  data  = as.matrix(test_df[,  features]),
  label = test_df$Return
)
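
The enriched matrices can be used to retrain and check whether vol5 and VIX_Return help at the one-day horizon, reusing the params list defined earlier. A minimal sketch:

# Retrain with the extra volatility/VIX features and score on the chronological test set
mod_feat  <- xgb.train(params = params, data = dtrain, nrounds = 100, verbose = 0)
feat_rmse <- sqrt(mean((predict(mod_feat, dtest) - test_df$Return)^2))
cat("Test RMSE with vol5 + VIX features:", round(feat_rmse, 6), "\n")
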
# Target: the single-day return `horizon` trading days ahead (not a cumulative return)
horizon <- 5
df_h <- df %>%
  mutate(Target = dplyr::lead(Return, n = horizon)) %>%
  drop_na()

# rebuild train/test on df_h
split_pt  <- floor(0.8 * nrow(df_h))
tr_h      <- df_h[1:split_pt, ]
te_h      <- df_h[(split_pt+1):nrow(df_h), ]

dtr_h <- xgb.DMatrix(as.matrix(tr_h[features]), label = tr_h$Target)
dte_h <- xgb.DMatrix(as.matrix(te_h[features]), label = te_h$Target)

# train & predict
mod_h <- xgb.train(params, data = dtr_h, nrounds = 100, watchlist = list(train=dtr_h))
## [1]  train-rmse:0.350038 
## [2]  train-rmse:0.245224 
## [3]  train-rmse:0.171901 
## [4]  train-rmse:0.120648 
## [5]  train-rmse:0.084875 
## [6]  train-rmse:0.059986 
## [7]  train-rmse:0.042752 
## [8]  train-rmse:0.030863 
## [9]  train-rmse:0.022789 
## [10] train-rmse:0.017434 
## [11] train-rmse:0.013984 
## [12] train-rmse:0.011795 
## [13] train-rmse:0.010513 
## [14] train-rmse:0.009655 
## [15] train-rmse:0.009139 
## [16] train-rmse:0.008885 
## [17] train-rmse:0.008692 
## [18] train-rmse:0.008529 
## [19] train-rmse:0.008395 
## [20] train-rmse:0.008310 
## [21] train-rmse:0.008114 
## [22] train-rmse:0.008043 
## [23] train-rmse:0.007938 
## [24] train-rmse:0.007835 
## [25] train-rmse:0.007640 
## [26] train-rmse:0.007430 
## [27] train-rmse:0.007315 
## [28] train-rmse:0.007224 
## [29] train-rmse:0.007094 
## [30] train-rmse:0.006884 
## [31] train-rmse:0.006703 
## [32] train-rmse:0.006538 
## [33] train-rmse:0.006495 
## [34] train-rmse:0.006324 
## [35] train-rmse:0.006221 
## [36] train-rmse:0.006137 
## [37] train-rmse:0.006009 
## [38] train-rmse:0.005920 
## [39] train-rmse:0.005715 
## [40] train-rmse:0.005633 
## [41] train-rmse:0.005483 
## [42] train-rmse:0.005364 
## [43] train-rmse:0.005327 
## [44] train-rmse:0.005310 
## [45] train-rmse:0.005247 
## [46] train-rmse:0.005205 
## [47] train-rmse:0.005191 
## [48] train-rmse:0.005084 
## [49] train-rmse:0.005050 
## [50] train-rmse:0.004885 
## [51] train-rmse:0.004850 
## [52] train-rmse:0.004800 
## [53] train-rmse:0.004702 
## [54] train-rmse:0.004676 
## [55] train-rmse:0.004607 
## [56] train-rmse:0.004538 
## [57] train-rmse:0.004495 
## [58] train-rmse:0.004425 
## [59] train-rmse:0.004359 
## [60] train-rmse:0.004271 
## [61] train-rmse:0.004195 
## [62] train-rmse:0.004114 
## [63] train-rmse:0.004028 
## [64] train-rmse:0.003930 
## [65] train-rmse:0.003858 
## [66] train-rmse:0.003804 
## [67] train-rmse:0.003768 
## [68] train-rmse:0.003707 
## [69] train-rmse:0.003661 
## [70] train-rmse:0.003642 
## [71] train-rmse:0.003577 
## [72] train-rmse:0.003531 
## [73] train-rmse:0.003520 
## [74] train-rmse:0.003471 
## [75] train-rmse:0.003452 
## [76] train-rmse:0.003364 
## [77] train-rmse:0.003303 
## [78] train-rmse:0.003239 
## [79] train-rmse:0.003203 
## [80] train-rmse:0.003149 
## [81] train-rmse:0.003097 
## [82] train-rmse:0.003057 
## [83] train-rmse:0.003013 
## [84] train-rmse:0.002963 
## [85] train-rmse:0.002950 
## [86] train-rmse:0.002914 
## [87] train-rmse:0.002899 
## [88] train-rmse:0.002852 
## [89] train-rmse:0.002788 
## [90] train-rmse:0.002732 
## [91] train-rmse:0.002687 
## [92] train-rmse:0.002670 
## [93] train-rmse:0.002642 
## [94] train-rmse:0.002629 
## [95] train-rmse:0.002597 
## [96] train-rmse:0.002569 
## [97] train-rmse:0.002538 
## [98] train-rmse:0.002493 
## [99] train-rmse:0.002454 
## [100]    train-rmse:0.002448
pred_h <- predict(mod_h, dte_h)

# plot
plot_df <- data.frame(
  Date      = te_h$Date,
  Actual_h  = te_h$Target,
  Predicted_h = pred_h
)
ggplot(plot_df, aes(Date)) +
  geom_line(aes(y = Actual_h,    color="Actual")) +
  geom_line(aes(y = Predicted_h, color="Pred"), linetype="dashed") +
  labs(title = paste0(horizon,"‑Day Ahead Forecast"), y="Log Return") +
  theme_minimal()
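
The training log above shows train RMSE falling steadily, a sign of overfitting; the out-of-sample error at the 5-day horizon is the more honest number. A minimal sketch:

# Out-of-sample RMSE for the 5-day-ahead target
rmse_h <- sqrt(mean((pred_h - te_h$Target)^2))
cat(paste0(horizon, "-day ahead test RMSE:"), round(rmse_h, 6), "\n")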

# time-slice validation: one train/assess split with initialWindow = 0.8*N and
# horizon = 0.2*N (with fixedWindow = TRUE this yields a single resample,
# not a rolling window)
library(caret)

ts_ctrl <- trainControl(
  method      = "timeslice",
  initialWindow = floor(0.8 * nrow(df)),
  horizon     = floor(0.2 * nrow(df)),
  fixedWindow = TRUE
)

# define small tuneGrid if desired
tuneGrid <- expand.grid(
  nrounds          = 100,
  max_depth        = 5,
  eta              = 0.1,
  gamma            = 0,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample        = 0.8
)

caret_mod <- train(
  x            = df[ features ],
  y            = df$Return,
  method       = "xgbTree",
  trControl    = ts_ctrl,
  tuneGrid     = tuneGrid,
  metric       = "RMSE"
)

caret_mod$results
##   nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1     100         5 0.1     0              0.8                1       0.8
##          RMSE  Rsquared         MAE       RMSESD  RsquaredSD        MAESD
## 1 0.006541774 0.5903679 0.004105603 8.850498e-06 0.002748341 4.218602e-05
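
caret also exposes variable importance for the fitted xgbTree model, which can be compared with the earlier xgb.plot.importance view. A minimal sketch:

# Variable importance from the final caret-fitted model
varImp(caret_mod)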