Load Data and Initial Look

library(quantmod)
## Warning: package 'quantmod' was built under R version 4.4.3
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(timeSeries)
## Loading required package: timeDate
## 
## Attaching package: 'timeSeries'
## The following object is masked from 'package:zoo':
## 
##     time<-
## The following objects are masked from 'package:graphics':
## 
##     lines, points
library(tseries)
library(forecast)
library(xts)

# Pull data from Yahoo Finance
getSymbols('SPY', from='2015-01-01', to='2025-04-23')
## [1] "SPY"
# Extract closing prices
SPY_Close_Prices <- SPY[,4]

# Plot closing prices
plot(SPY_Close_Prices, main="SPY Closing Prices (2015-2025)")

# ACF and PACF plots
par(mfrow=c(1,2))
Acf(SPY_Close_Prices, main='ACF for SPY Closing Prices')
Pacf(SPY_Close_Prices, main='PACF for SPY Closing Prices')
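
As a quick cross-check on the degree of differencing required, forecast::ndiffs estimates it directly; for price levels like these it typically returns 1, matching the d = 1 that auto.arima selects below. A minimal sketch:

# Estimate how many differences are needed for stationarity (typically 1 for price levels)
ndiffs(SPY_Close_Prices)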

# Auto ARIMA model
auto.arima(SPY_Close_Prices, seasonal=FALSE)
## Series: SPY_Close_Prices 
## ARIMA(4,1,1) with drift 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4     ma1   drift
##       -0.9391  -0.0332  -0.0121  -0.0951  0.8658  0.1246
## s.e.   0.0444   0.0272   0.0270   0.0207  0.0407  0.0706
## 
## sigma^2 = 16.07:  log likelihood = -7267.87
## AIC=14549.74   AICc=14549.79   BIC=14590.76

Log Returns to Remove Non-Stationarity

# Compute log returns
logs <- diff(log(SPY_Close_Prices), lag=1)
logs <- logs[!is.na(logs)]

# Plot log returns
par(mfrow=c(1,1))
plot(logs, type='l', main='Log Returns Plot')

# ADF test
print(adf.test(logs))
## Warning in adf.test(logs): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  logs
## Dickey-Fuller = -13.793, Lag order = 13, p-value = 0.01
## alternative hypothesis: stationary
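
As a complementary check, the KPSS test (also from tseries) reverses the hypotheses: its null is stationarity, so a large p-value here would corroborate the ADF result. A minimal sketch:

# KPSS test: the null hypothesis is level stationarity, so failing to reject supports stationarity
kpss.test(as.numeric(logs))
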
# Auto ARIMA on log returns
auto.arima(logs, seasonal=FALSE)
## Series: logs 
## ARIMA(5,0,2) with zero mean 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ar5     ma1     ma2
##       -1.6983  -0.8341  -0.0112  -0.0226  -0.0042  1.6173  0.7366
## s.e.   0.0805   0.0823   0.0422   0.0403   0.0266  0.0780  0.0663
## 
## sigma^2 = 0.0001257:  log likelihood = 7959.39
## AIC=-15902.78   AICc=-15902.72   BIC=-15855.9

Split Data into Training and Testing Sets

# Random 80/20 split (note: sampling shuffles away temporal order; a
# chronological alternative is sketched after this chunk)
set.seed(109)
sample_size <- floor(0.80 * nrow(logs))
train_indices <- sample(seq_len(nrow(logs)), size=sample_size)
train <- logs[train_indices, ]
test <- logs[-train_indices, ]
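
Because log returns are a time series, an alternative is to hold out the most recent 20% of observations rather than a random sample; the XGBoost sections later in this report use that style of split. A minimal sketch:

# Chronological 80/20 split: train on earlier observations, test on the most recent
split_point <- floor(0.80 * nrow(logs))
train_chrono <- logs[1:split_point, ]
test_chrono  <- logs[(split_point + 1):nrow(logs), ]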

# ACF and PACF plots
par(mfrow=c(1,2))
Acf(train, main='ACF for Log Returns (Training Sample)')
Pacf(train, main='PACF for Log Returns (Training Sample)')

# Auto ARIMA on training set
auto.arima(train, seasonal=FALSE)
## Series: train 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 0.0001334:  log likelihood = 6303.54
## AIC=-12605.08   AICc=-12605.08   BIC=-12599.45

Model Fitting and Residual Analysis

# Fit ARIMA models. Note that auto.arima on the shuffled training sample
# selects ARIMA(0,0,0): random sampling destroys the serial correlation.
fit1 <- auto.arima(train, seasonal=FALSE)
tsdisplay(residuals(fit1), lag.max=40, main='Auto ARIMA (0,0,0) Model Residuals')

fit2 <- arima(train, order=c(4,1,2))
tsdisplay(residuals(fit2), lag.max=40, main='(4,1,2) Model Residuals')

fit3 <- auto.arima(SPY_Close_Prices, seasonal=FALSE)
tsdisplay(residuals(fit3), lag.max=40, main='Original Data Model Residuals')

fit4 <- arima(SPY_Close_Prices, order=c(1,1,1))
tsdisplay(residuals(fit4), lag.max=40, main='(1,1,1) Model Residuals on Original Data')
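
To test the residuals more formally than by eye, a Ljung-Box test checks for remaining autocorrelation; a small p-value would suggest the model has not captured all the serial structure. A minimal sketch for fit4 (fitdf = 2 accounts for the AR and MA coefficients of the (1,1,1) model):

# Ljung-Box test on the (1,1,1) residuals; a large p-value means no evidence of leftover autocorrelation
Box.test(residuals(fit4), lag = 20, type = "Ljung-Box", fitdf = 2)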

Forecasting

par(mfrow=c(2,2))
period <- 100

fcast1 <- forecast(fit1, h=period)
plot(fcast1, main="Auto ARIMA (0,0,0)")

fcast2 <- forecast(fit2, h=period)
plot(fcast2, main="Custom ARIMA (4,1,2)")

fcast3 <- forecast(fit3, h=period)
plot(fcast3, main="Auto ARIMA (4,1,1) on Original Data")

fcast4 <- forecast(fit4, h=period)
plot(fcast4, main="Custom ARIMA (1,1,1) on Original Data")

Accuracy and Conclusion

accuracy(fcast1) # fitted values are identically zero, so MPE/MAPE degenerate to 100
##                        ME       RMSE         MAE MPE MAPE      MASE
## Training set 0.0003176177 0.01154864 0.007542545 100  100 0.6935458
##                      ACF1
## Training set -0.009980763
accuracy(fcast2) # similarly flat long-run forecast; MPE/MAPE are NaN/Inf because returns cross zero
##                        ME       RMSE         MAE MPE MAPE      MASE        ACF1
## Training set 0.0001128358 0.01150001 0.007542242 NaN  Inf 0.6935179 0.001256132
accuracy(fcast3) # MAPE of about 0.75% on training prices, i.e. roughly 99.25% accuracy
##                         ME     RMSE      MAE         MPE      MAPE     MASE
## Training set -5.197405e-05 4.002719 2.581256 -0.01158895 0.7471461 1.000948
##                       ACF1
## Training set -0.0006094235
accuracy(fcast4) # MAPE of about 0.75% as well, essentially tied with the auto model
##                     ME     RMSE      MAE        MPE      MAPE     MASE
## Training set 0.1321011 4.035444 2.582789 0.03182545 0.7480869 1.001542
##                     ACF1
## Training set 0.003668324
fcast3
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2592       529.3481 524.2114 534.4847 521.4923 537.2039
## 2593       527.2724 520.2691 534.2758 516.5617 537.9831
## 2594       530.4419 521.8699 539.0140 517.3322 543.5517
## 2595       526.4963 516.7096 536.2829 511.5289 541.4636
## 2596       530.1808 519.4344 540.9272 513.7457 546.6160
## 2597       527.2703 515.5227 539.0180 509.3039 545.2368
## 2598       529.8863 517.3231 542.4494 510.6726 549.1000
## 2599       528.1166 514.6883 541.5450 507.5797 548.6535
## 2600       529.6352 515.4719 543.7986 507.9743 551.2962
## 2601       528.7725 513.8521 543.6929 505.9537 551.5913
## 2602       529.5638 513.9659 545.1617 505.7089 553.4188
## 2603       529.2587 512.9817 545.5357 504.3652 554.1522
## 2604       529.6441 512.7353 546.5528 503.7844 555.5038
## 2605       529.6240 512.0933 547.1548 502.8131 556.4350
## 2606       529.8176 511.6942 547.9410 502.1003 557.5350
## 2607       529.9201 511.2182 548.6220 501.3180 558.5222
## 2608       530.0402 510.7796 549.3008 500.5836 559.4968
## 2609       530.1828 510.3781 549.9874 499.8942 560.4713
## 2610       530.2844 509.9506 550.6183 499.1865 561.3823
## 2611       530.4322 509.5828 551.2817 498.5457 562.3187
## 2612       530.5361 509.1831 551.8891 497.8796 563.1926
## 2613       530.6780 508.8336 552.5225 497.2698 564.0863
## 2614       530.7890 508.4635 553.1146 496.6450 564.9330
## 2615       530.9240 508.1278 553.7201 496.0603 565.7877
## 2616       531.0412 507.7837 554.2987 495.4719 566.6104
## 2617       531.1710 507.4613 554.8806 494.9102 567.4318
## 2618       531.2922 507.1387 555.4457 494.3526 568.2318
## 2619       531.4190 506.8297 556.0082 493.8130 569.0250
## 2620       531.5424 506.5249 556.5598 493.2815 569.8033
## 2621       531.6677 506.2293 557.1061 492.7630 570.5724
## 2622       531.7920 505.9395 557.6446 492.2540 571.3301
## 2623       531.9167 505.6566 558.1769 491.7553 572.0782
## 2624       532.0414 505.3799 558.7030 491.2662 572.8167
## 2625       532.1660 505.1090 559.2229 490.7859 573.5460
## 2626       532.2907 504.8441 559.7374 490.3147 574.2668
## 2627       532.4152 504.5843 560.2461 489.8515 574.9789
## 2628       532.5400 504.3301 560.7499 489.3966 575.6834
## 2629       532.6645 504.0806 561.2485 488.9492 576.3799
## 2630       532.7893 503.8362 561.7424 488.5093 577.0692
## 2631       532.9139 503.5962 562.2315 488.0764 577.7513
## 2632       533.0385 503.3609 562.7162 487.6505 578.4266
## 2633       533.1632 503.1297 563.1966 487.2310 579.0953
## 2634       533.2878 502.9028 563.6728 486.8180 579.7576
## 2635       533.4124 502.6799 564.1450 486.4111 580.4138
## 2636       533.5371 502.4609 564.6133 486.0102 581.0640
## 2637       533.6617 502.2457 565.0778 485.6150 581.7085
## 2638       533.7864 502.0340 565.5387 485.2254 582.3474
## 2639       533.9110 501.8260 565.9961 484.8412 582.9809
## 2640       534.0357 501.6213 566.4500 484.4621 583.6092
## 2641       534.1603 501.4199 566.9007 484.0882 584.2324
## 2642       534.2849 501.2218 567.3481 483.7192 584.8507
## 2643       534.4096 501.0267 567.7924 483.3549 585.4643
## 2644       534.5342 500.8347 568.2337 482.9953 586.0732
## 2645       534.6589 500.6457 568.6721 482.6402 586.6776
## 2646       534.7835 500.4595 569.1075 482.2895 587.2776
## 2647       534.9082 500.2761 569.5402 481.9430 587.8733
## 2648       535.0328 500.0954 569.9702 481.6007 588.4649
## 2649       535.1574 499.9174 570.3975 481.2624 589.0524
## 2650       535.2821 499.7419 570.8222 480.9281 589.6360
## 2651       535.4067 499.5690 571.2445 480.5976 590.2158
## 2652       535.5314 499.3985 571.6642 480.2709 590.7918
## 2653       535.6560 499.2304 572.0816 479.9479 591.3642
## 2654       535.7807 499.0646 572.4967 479.6284 591.9330
## 2655       535.9053 498.9011 572.9094 479.3123 592.4983
## 2656       536.0299 498.7399 573.3200 478.9997 593.0601
## 2657       536.1546 498.5808 573.7284 478.6904 593.6187
## 2658       536.2792 498.4238 574.1346 478.3844 594.1740
## 2659       536.4039 498.2690 574.5388 478.0816 594.7262
## 2660       536.5285 498.1161 574.9409 477.7818 595.2752
## 2661       536.6531 497.9653 575.3410 477.4851 595.8212
## 2662       536.7778 497.8164 575.7392 477.1914 596.3642
## 2663       536.9024 497.6694 576.1355 476.9006 596.9043
## 2664       537.0271 497.5242 576.5299 476.6127 597.4415
## 2665       537.1517 497.3809 576.9225 476.3275 597.9759
## 2666       537.2764 497.2394 577.3133 476.0451 598.5076
## 2667       537.4010 497.0997 577.7023 475.7654 599.0366
## 2668       537.5256 496.9616 578.0897 475.4883 599.5630
## 2669       537.6503 496.8253 578.4753 475.2138 600.0867
## 2670       537.7749 496.6906 578.8593 474.9419 600.6080
## 2671       537.8996 496.5576 579.2416 474.6724 601.1267
## 2672       538.0242 496.4261 579.6223 474.4054 601.6430
## 2673       538.1489 496.2962 580.0015 474.1408 602.1570
## 2674       538.2735 496.1679 580.3791 473.8785 602.6685
## 2675       538.3981 496.0410 580.7553 473.6185 603.1778
## 2676       538.5228 495.9157 581.1299 473.3608 603.6848
## 2677       538.6474 495.7918 581.5031 473.1053 604.1895
## 2678       538.7721 495.6693 581.8748 472.8521 604.6921
## 2679       538.8967 495.5482 582.2452 472.6009 605.1925
## 2680       539.0214 495.4286 582.6141 472.3519 605.6908
## 2681       539.1460 495.3103 582.9817 472.1050 606.1870
## 2682       539.2706 495.1933 583.3480 471.8601 606.6811
## 2683       539.3953 495.0776 583.7129 471.6173 607.1733
## 2684       539.5199 494.9633 584.0766 471.3764 607.6634
## 2685       539.6446 494.8502 584.4389 471.1375 608.1517
## 2686       539.7692 494.7384 584.8001 470.9005 608.6379
## 2687       539.8938 494.6278 585.1599 470.6654 609.1223
## 2688       540.0185 494.5184 585.5186 470.4321 609.6049
## 2689       540.1431 494.4102 585.8761 470.2007 610.0856
## 2690       540.2678 494.3032 586.2323 469.9710 610.5645
## 2691       540.3924 494.1974 586.5875 469.7432 611.0416
fcast4
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 2592       525.6595 520.4868 530.8321 517.7486 533.5703
## 2593       526.3222 519.3231 533.3213 515.6180 537.0264
## 2594       526.0461 517.4990 534.5932 512.9744 539.1177
## 2595       526.1611 516.3456 535.9767 511.1495 541.1727
## 2596       526.1132 515.1606 537.0657 509.3627 542.8637
## 2597       526.1332 514.1566 538.1097 507.8166 544.4497
## 2598       526.1248 513.2031 539.0466 506.3627 545.8870
## 2599       526.1283 512.3267 539.9299 505.0206 547.2360
## 2600       526.1269 511.4979 540.7558 503.7538 548.4999
## 2601       526.1275 510.7157 541.5392 502.5572 549.6977
## 2602       526.1272 509.9704 542.2840 501.4175 550.8369
## 2603       526.1273 509.2584 542.9962 500.3285 551.9261
## 2604       526.1273 508.5751 543.6794 499.2835 552.9710
## 2605       526.1273 507.9175 544.3371 498.2778 553.9768
## 2606       526.1273 507.2828 544.9718 497.3071 554.9475
## 2607       526.1273 506.6688 545.5858 496.3680 555.8865
## 2608       526.1273 506.0735 546.1810 495.4577 556.7968
## 2609       526.1273 505.4955 546.7591 494.5737 557.6809
## 2610       526.1273 504.9332 547.3214 493.7137 558.5409
## 2611       526.1273 504.3854 547.8691 492.8760 559.3786
## 2612       526.1273 503.8511 548.4034 492.0589 560.1957
## 2613       526.1273 503.3294 548.9252 491.2609 560.9937
## 2614       526.1273 502.8193 549.4353 490.4807 561.7738
## 2615       526.1273 502.3201 549.9345 489.7173 562.5372
## 2616       526.1273 501.8312 550.4234 488.9696 563.2850
## 2617       526.1273 501.3519 550.9027 488.2366 564.0180
## 2618       526.1273 500.8817 551.3728 487.5175 564.7370
## 2619       526.1273 500.4202 551.8344 486.8116 565.4429
## 2620       526.1273 499.9667 552.2878 486.1182 566.1364
## 2621       526.1273 499.5210 552.7335 485.4365 566.8181
## 2622       526.1273 499.0827 553.1719 484.7661 567.4885
## 2623       526.1273 498.6513 553.6033 484.1064 568.1482
## 2624       526.1273 498.2266 554.0280 483.4568 568.7977
## 2625       526.1273 497.8083 554.4463 482.8171 569.4375
## 2626       526.1273 497.3960 554.8586 482.1866 570.0680
## 2627       526.1273 496.9896 555.2650 481.5650 570.6895
## 2628       526.1273 496.5888 555.6658 480.9520 571.3025
## 2629       526.1273 496.1933 556.0612 480.3473 571.9073
## 2630       526.1273 495.8030 556.4515 479.7503 572.5042
## 2631       526.1273 495.4177 556.8369 479.1610 573.0935
## 2632       526.1273 495.0371 557.2174 478.5790 573.6756
## 2633       526.1273 494.6612 557.5934 478.0040 574.2505
## 2634       526.1273 494.2897 557.9649 477.4358 574.8187
## 2635       526.1273 493.9224 558.3321 476.8742 575.3804
## 2636       526.1273 493.5593 558.6952 476.3189 575.9357
## 2637       526.1273 493.2002 559.0543 475.7697 576.4848
## 2638       526.1273 492.8450 559.4095 475.2265 577.0281
## 2639       526.1273 492.4936 559.7610 474.6890 577.5656
## 2640       526.1273 492.1457 560.1088 474.1570 578.0976
## 2641       526.1273 491.8014 560.4531 473.6304 578.6241
## 2642       526.1273 491.4606 560.7940 473.1091 579.1455
## 2643       526.1273 491.1230 561.1316 472.5928 579.6617
## 2644       526.1273 490.7886 561.4659 472.0815 580.1731
## 2645       526.1273 490.4574 561.7971 471.5750 580.6796
## 2646       526.1273 490.1293 562.1253 471.0731 581.1815
## 2647       526.1273 489.8041 562.4505 470.5757 581.6788
## 2648       526.1273 489.4818 562.7728 470.0828 582.1718
## 2649       526.1273 489.1623 563.0923 469.5942 582.6604
## 2650       526.1273 488.8455 563.4091 469.1097 583.1448
## 2651       526.1273 488.5314 563.7231 468.6294 583.6252
## 2652       526.1273 488.2199 564.0346 468.1530 584.1016
## 2653       526.1273 487.9110 564.3436 467.6805 584.5741
## 2654       526.1273 487.6045 564.6501 467.2118 585.0428
## 2655       526.1273 487.3004 564.9541 466.7468 585.5078
## 2656       526.1273 486.9988 565.2558 466.2854 585.9692
## 2657       526.1273 486.6994 565.5552 465.8275 586.4271
## 2658       526.1273 486.4022 565.8523 465.3731 586.8815
## 2659       526.1273 486.1073 566.1472 464.9220 587.3325
## 2660       526.1273 485.8146 566.4400 464.4743 587.7803
## 2661       526.1273 485.5239 566.7307 464.0298 588.2248
## 2662       526.1273 485.2353 567.0192 463.5884 588.6661
## 2663       526.1273 484.9488 567.3058 463.1502 589.1044
## 2664       526.1273 484.6642 567.5904 462.7149 589.5396
## 2665       526.1273 484.3815 567.8730 462.2827 589.9719
## 2666       526.1273 484.1008 568.1538 461.8533 590.4013
## 2667       526.1273 483.8219 568.4327 461.4268 590.8278
## 2668       526.1273 483.5448 568.7097 461.0031 591.2515
## 2669       526.1273 483.2696 568.9850 460.5821 591.6725
## 2670       526.1273 482.9961 569.2585 460.1638 592.0908
## 2671       526.1273 482.7243 569.5303 459.7481 592.5064
## 2672       526.1273 482.4542 569.8004 459.3351 592.9195
## 2673       526.1273 482.1858 570.0688 458.9245 593.3300
## 2674       526.1273 481.9190 570.3356 458.5165 593.7381
## 2675       526.1273 481.6538 570.6008 458.1109 594.1436
## 2676       526.1273 481.3901 570.8644 457.7077 594.5468
## 2677       526.1273 481.1281 571.1265 457.3069 594.9477
## 2678       526.1273 480.8675 571.3871 456.9084 595.3462
## 2679       526.1273 480.6084 571.6462 456.5122 595.7424
## 2680       526.1273 480.3508 571.9038 456.1182 596.1364
## 2681       526.1273 480.0946 572.1599 455.7264 596.5281
## 2682       526.1273 479.8399 572.4147 455.3368 596.9177
## 2683       526.1273 479.5865 572.6680 454.9493 597.3052
## 2684       526.1273 479.3345 572.9200 454.5640 597.6906
## 2685       526.1273 479.0839 573.1707 454.1807 598.0739
## 2686       526.1273 478.8346 573.4200 453.7994 598.4552
## 2687       526.1273 478.5866 573.6680 453.4201 598.8345
## 2688       526.1273 478.3399 573.9147 453.0428 599.2118
## 2689       526.1273 478.0945 574.1601 452.6674 599.5872
## 2690       526.1273 477.8503 574.4043 452.2939 599.9606
## 2691       526.1273 477.6073 574.6473 451.9224 600.3322

XGBoost Modeling with Lagged Features

library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
# Ensure 'logs' is numeric vector, not xts
logs_numeric <- as.numeric(logs)

# Create lagged features
lagged_data <- data.frame(
  Lag1 = dplyr::lag(logs_numeric, n = 1),
  Lag2 = dplyr::lag(logs_numeric, n = 2),
  Lag3 = dplyr::lag(logs_numeric, n = 3),
  Target = logs_numeric
)

# Remove NA rows
lagged_data <- na.omit(lagged_data)

# Check structure
str(lagged_data)
## 'data.frame':    2587 obs. of  4 variables:
##  $ Lag1  : num  0.01238 0.01759 -0.00805 -0.00786 -0.00282 ...
##  $ Lag2  : num  -0.00946 0.01238 0.01759 -0.00805 -0.00786 ...
##  $ Lag3  : num  -0.01822 -0.00946 0.01238 0.01759 -0.00805 ...
##  $ Target: num  0.01759 -0.00805 -0.00786 -0.00282 -0.00606 ...
##  - attr(*, "na.action")= 'omit' Named int [1:3] 1 2 3
##   ..- attr(*, "names")= chr [1:3] "1" "2" "3"
# Split into train/test
set.seed(109)
sample_size <- floor(0.80 * nrow(lagged_data))
train_indices <- sample(seq_len(nrow(lagged_data)), size = sample_size)
train_data <- lagged_data[train_indices, ]
test_data <- lagged_data[-train_indices, ]

# Create data matrices for XGBoost
train_matrix <- xgb.DMatrix(data = as.matrix(train_data[, c("Lag1", "Lag2", "Lag3")]),
                            label = train_data$Target)
test_matrix <- xgb.DMatrix(data = as.matrix(test_data[, c("Lag1", "Lag2", "Lag3")]),
                           label = test_data$Target)

# Set XGBoost parameters
params <- list(
  objective = "reg:squarederror",
  eval_metric = "rmse"
)

# Train the model
xgb_model <- xgboost(params = params, data = train_matrix, nrounds = 100, verbose = 0)

# Predict
xgb_preds <- predict(xgb_model, test_matrix)

# Evaluate performance
xgb_rmse <- sqrt(mean((xgb_preds - test_data$Target)^2))
xgb_mae <- mean(abs(xgb_preds - test_data$Target))

cat("XGBoost RMSE:", round(xgb_rmse, 6), "\n")
## XGBoost RMSE: 0.011483
cat("XGBoost MAE:", round(xgb_mae, 6), "\n")
## XGBoost MAE: 0.007831

Compare ARIMA vs XGBoost

# Get ARIMA forecast values from fit1
arima_preds <- forecast(fit1, h = nrow(test_data))$mean
arima_preds <- as.numeric(arima_preds)
arima_rmse <- sqrt(mean((arima_preds - test_data$Target)^2))
arima_mae <- mean(abs(arima_preds - test_data$Target))

comparison <- data.frame(
  Model = c("ARIMA (fit1)", "XGBoost"),
  RMSE = c(round(arima_rmse, 6), round(xgb_rmse, 6)),
  MAE = c(round(arima_mae, 6), round(xgb_mae, 6))
)

print(comparison)
##          Model     RMSE      MAE
## 1 ARIMA (fit1) 0.009418 0.006667
## 2      XGBoost 0.011483 0.007831
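
This comparison favors ARIMA somewhat artificially: fit1 is ARIMA(0,0,0) with zero mean, so its "forecast" is a constant zero, and the test rows were drawn at random rather than held out chronologically. A useful sanity check is a naive always-zero baseline on the same test set; a minimal sketch:

# Naive baseline: always predict a zero return (identical to the ARIMA(0,0,0) point forecast)
naive_rmse <- sqrt(mean((0 - test_data$Target)^2))
naive_mae  <- mean(abs(0 - test_data$Target))
cat("Naive zero-return RMSE:", round(naive_rmse, 6), "\n")
cat("Naive zero-return MAE:", round(naive_mae, 6), "\n")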

XGBoost Model and Visualization

#–– 1. Load libraries
library(xgboost)
library(dplyr)
library(tidyr)
library(ggplot2)
library(xts)
library(zoo)
library(dygraphs)

#–– 2. Prepare lagged dataset
n_lags <- 5
df <- data.frame(
  Date   = index(logs),
  Return = as.numeric(logs)
)

# create lags 1…n_lags
for(i in 1:n_lags) {
  df[[ paste0("Lag", i) ]] <- dplyr::lag(df$Return, n = i)
}
df <- tidyr::drop_na(df)   # remove rows with any NA

#–– 3. Split into training (80%) and testing (20%)
split_pt   <- floor(0.8 * nrow(df))
train_df   <- df[      1:split_pt, ]
test_df    <- df[(split_pt+1):nrow(df), ]

#–– 4. Create xgb.DMatrix objects
dtrain <- xgb.DMatrix(
  data  = as.matrix(train_df[ , paste0("Lag", 1:n_lags)]),
  label = train_df$Return
)
dtest <- xgb.DMatrix(
  data  = as.matrix(test_df[ , paste0("Lag", 1:n_lags)]),
  label = test_df$Return
)

#–– 5. Train the model
params <- list(
  objective   = "reg:squarederror",
  eval_metric = "rmse"
)
xgb_mod <- xgb.train(
  params    = params,
  data      = dtrain,
  nrounds   = 100,
  watchlist = list(train = dtrain, test = dtest),
  verbose   = 0
)

#–– 6. Predict on test set
preds <- predict(xgb_mod, dtest)

#–– 7. Plot Actual vs Predicted
results <- data.frame(
  Date      = test_df$Date,
  Actual    = test_df$Return,
  Predicted = preds
)
ggplot(results, aes(x = Date)) +
  geom_line(aes(y = Actual,    color = "Actual"),    linewidth = 0.6) +
  geom_line(aes(y = Predicted, color = "Predicted"), linewidth = 0.6, linetype = "dashed") +
  scale_color_manual(
    name   = "Series",
    values = c(Actual = "steelblue", Predicted = "firebrick")
  ) +
  labs(
    title = "XGBoost: Actual vs Predicted Log Returns",
    y     = "Log Return"
  ) +
  theme_minimal()
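
For a trading-oriented view, directional accuracy (how often the predicted sign matches the realized sign) is often more informative than RMSE on near-zero returns. A minimal sketch on the same test set:

# Share of test days where the predicted direction matches the actual direction
hit_rate <- mean(sign(results$Predicted) == sign(results$Actual))
cat("Directional accuracy:", round(hit_rate, 4), "\n")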

pred_xts <- xts(
  results[, c("Actual","Predicted")],
  order.by = results$Date
)

dygraph(pred_xts, main = "Actual vs Predicted Log Returns") %>%
  dySeries("Actual",    label = "Actual") %>%
  dySeries("Predicted", label = "Predicted", strokePattern = "dashed") %>%
  dyOptions(colors = c("steelblue","firebrick"))
#–– 8. Plot Feature Importance
imp <- xgb.importance(
  feature_names = paste0("Lag", 1:n_lags),
  model         = xgb_mod
)
xgb.plot.importance(imp, top_n = n_lags)

library(xgboost)
library(purrr)
library(dplyr) 

# 2a) Define grid of parameters to try
param_grid <- expand.grid(
  max_depth        = c(3, 5, 7),
  eta              = c(0.01, 0.1, 0.3),
  subsample        = c(0.7, 1.0),
  colsample_bytree = c(0.7, 1.0)
)

# 2b) Run 5-fold CV for each grid configuration and record its best iteration and RMSE
cv_results <- pmap_dfr(param_grid, function(max_depth, eta, subsample, colsample_bytree) {
  params <- list(
    objective        = "reg:squarederror",
    eval_metric      = "rmse",
    max_depth        = max_depth,
    eta              = eta,
    subsample        = subsample,
    colsample_bytree = colsample_bytree
  )
  cv <- xgb.cv(
    params              = params,
    data                = dtrain,
    nrounds             = 200,
    nfold               = 5,
    early_stopping_rounds = 10,
    verbose             = FALSE
  )
  tibble(
    max_depth = max_depth,
    eta       = eta,
    subsample = subsample,
    colsample = colsample_bytree,
    best_nrounds = cv$best_iteration,
    cv_rmse      = cv$evaluation_log[cv$best_iteration, test_rmse_mean]
  )
})

# 2c) See top 5 configs
top5 <- cv_results %>% 
  arrange(cv_rmse) %>% 
    head(5)
print(top5)
## # A tibble: 5 × 6
##   max_depth   eta subsample colsample best_nrounds cv_rmse
##       <dbl> <dbl>     <dbl>     <dbl>        <int>   <dbl>
## 1         3   0.1       0.7       1             57  0.0118
## 2         3   0.1       0.7       0.7           70  0.0119
## 3         3   0.3       0.7       0.7           18  0.0119
## 4         7   0.3       1         0.7           18  0.0119
## 5         3   0.1       1         0.7           60  0.0120
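
A final model would typically be refit with the best configuration from the grid search. A sketch using the top row of top5 above (values hard-coded here for illustration):

# Refit on the full training matrix with the best CV configuration (row 1 of top5)
best_params <- list(
  objective        = "reg:squarederror",
  eval_metric      = "rmse",
  max_depth        = 3,
  eta              = 0.1,
  subsample        = 0.7,
  colsample_bytree = 1.0
)
xgb_best <- xgb.train(params = best_params, data = dtrain, nrounds = 57, verbose = 0)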

Add Rolling-Volatility and VIX Features (with Re-Split)

library(zoo)
library(quantmod)
library(dplyr)

#— A) rolling 5‑day SD of returns
df$vol5 <- rollapply(df$Return, width = 5, FUN = sd,
                     fill = NA, align = "right")

#— B) fetch VIX log‐returns and merge
getSymbols("^VIX", from = "2015-01-01")
## [1] "VIX"
vix_ret <- na.omit(diff(log(Cl(VIX))))
vix_df  <- tibble(
  Date       = index(vix_ret),
  VIX_Return = as.numeric(coredata(vix_ret))  # flatten to a plain vector, not a 1-column matrix
)

df <- df %>%
  left_join(vix_df, by = "Date") %>%
  drop_na()   # removes any rows w/ NA in vol5 or VIX_Return

#— C) define features vector
features <- c(paste0("Lag", 1:n_lags), "vol5", "VIX_Return")

#— D) re‑split into train/test (80/20 by time order)
split_pt <- floor(0.8 * nrow(df))
train_df <- df[       1:split_pt, ]
test_df  <- df[(split_pt+1):nrow(df), ]

#— E) rebuild xgboost DMatrices
dtrain <- xgb.DMatrix(
  data  = as.matrix(train_df[, features]),
  label = train_df$Return
)
dtest  <- xgb.DMatrix(
  data  = as.matrix(test_df[,  features]),
  label = test_df$Return
)
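
The enriched matrices can be used to retrain and check whether vol5 and VIX_Return help at the one-day horizon, reusing the params list defined earlier. A minimal sketch:

# Retrain with the extra volatility/VIX features and score on the chronological test set
mod_feat  <- xgb.train(params = params, data = dtrain, nrounds = 100, verbose = 0)
feat_rmse <- sqrt(mean((predict(mod_feat, dtest) - test_df$Return)^2))
cat("Test RMSE with vol5 + VIX features:", round(feat_rmse, 6), "\n")
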
# Target: the single-day return `horizon` trading days ahead (not a cumulative return)
horizon <- 5
df_h <- df %>%
  mutate(Target = dplyr::lead(Return, n = horizon)) %>%
  drop_na()

# rebuild train/test on df_h
split_pt  <- floor(0.8 * nrow(df_h))
tr_h      <- df_h[1:split_pt, ]
te_h      <- df_h[(split_pt+1):nrow(df_h), ]

dtr_h <- xgb.DMatrix(as.matrix(tr_h[features]), label = tr_h$Target)
dte_h <- xgb.DMatrix(as.matrix(te_h[features]), label = te_h$Target)

# train & predict
mod_h <- xgb.train(params, data = dtr_h, nrounds = 100, watchlist = list(train=dtr_h))
## [1]  train-rmse:0.350038 
## [2]  train-rmse:0.245224 
## [3]  train-rmse:0.171901 
## [4]  train-rmse:0.120648 
## [5]  train-rmse:0.084875 
## [6]  train-rmse:0.059986 
## [7]  train-rmse:0.042752 
## [8]  train-rmse:0.030863 
## [9]  train-rmse:0.022789 
## [10] train-rmse:0.017434 
## [11] train-rmse:0.013984 
## [12] train-rmse:0.011795 
## [13] train-rmse:0.010513 
## [14] train-rmse:0.009655 
## [15] train-rmse:0.009139 
## [16] train-rmse:0.008885 
## [17] train-rmse:0.008692 
## [18] train-rmse:0.008529 
## [19] train-rmse:0.008395 
## [20] train-rmse:0.008310 
## [21] train-rmse:0.008114 
## [22] train-rmse:0.008043 
## [23] train-rmse:0.007938 
## [24] train-rmse:0.007835 
## [25] train-rmse:0.007640 
## [26] train-rmse:0.007430 
## [27] train-rmse:0.007315 
## [28] train-rmse:0.007224 
## [29] train-rmse:0.007094 
## [30] train-rmse:0.006884 
## [31] train-rmse:0.006703 
## [32] train-rmse:0.006538 
## [33] train-rmse:0.006495 
## [34] train-rmse:0.006324 
## [35] train-rmse:0.006221 
## [36] train-rmse:0.006137 
## [37] train-rmse:0.006009 
## [38] train-rmse:0.005920 
## [39] train-rmse:0.005715 
## [40] train-rmse:0.005633 
## [41] train-rmse:0.005483 
## [42] train-rmse:0.005364 
## [43] train-rmse:0.005327 
## [44] train-rmse:0.005310 
## [45] train-rmse:0.005247 
## [46] train-rmse:0.005205 
## [47] train-rmse:0.005191 
## [48] train-rmse:0.005084 
## [49] train-rmse:0.005050 
## [50] train-rmse:0.004885 
## [51] train-rmse:0.004850 
## [52] train-rmse:0.004800 
## [53] train-rmse:0.004702 
## [54] train-rmse:0.004676 
## [55] train-rmse:0.004607 
## [56] train-rmse:0.004538 
## [57] train-rmse:0.004495 
## [58] train-rmse:0.004425 
## [59] train-rmse:0.004359 
## [60] train-rmse:0.004271 
## [61] train-rmse:0.004195 
## [62] train-rmse:0.004114 
## [63] train-rmse:0.004028 
## [64] train-rmse:0.003930 
## [65] train-rmse:0.003858 
## [66] train-rmse:0.003804 
## [67] train-rmse:0.003768 
## [68] train-rmse:0.003707 
## [69] train-rmse:0.003661 
## [70] train-rmse:0.003642 
## [71] train-rmse:0.003577 
## [72] train-rmse:0.003531 
## [73] train-rmse:0.003520 
## [74] train-rmse:0.003471 
## [75] train-rmse:0.003452 
## [76] train-rmse:0.003364 
## [77] train-rmse:0.003303 
## [78] train-rmse:0.003239 
## [79] train-rmse:0.003203 
## [80] train-rmse:0.003149 
## [81] train-rmse:0.003097 
## [82] train-rmse:0.003057 
## [83] train-rmse:0.003013 
## [84] train-rmse:0.002963 
## [85] train-rmse:0.002950 
## [86] train-rmse:0.002914 
## [87] train-rmse:0.002899 
## [88] train-rmse:0.002852 
## [89] train-rmse:0.002788 
## [90] train-rmse:0.002732 
## [91] train-rmse:0.002687 
## [92] train-rmse:0.002670 
## [93] train-rmse:0.002642 
## [94] train-rmse:0.002629 
## [95] train-rmse:0.002597 
## [96] train-rmse:0.002569 
## [97] train-rmse:0.002538 
## [98] train-rmse:0.002493 
## [99] train-rmse:0.002454 
## [100]    train-rmse:0.002448
pred_h <- predict(mod_h, dte_h)

# plot
plot_df <- data.frame(
  Date      = te_h$Date,
  Actual_h  = te_h$Target,
  Predicted_h = pred_h
)
ggplot(plot_df, aes(Date)) +
  geom_line(aes(y = Actual_h,    color="Actual")) +
  geom_line(aes(y = Predicted_h, color="Pred"), linetype="dashed") +
  labs(title = paste0(horizon,"‑Day Ahead Forecast"), y="Log Return") +
  theme_minimal()
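
The training log above shows train RMSE falling steadily, a sign of overfitting; the out-of-sample error at the 5-day horizon is the more honest number. A minimal sketch:

# Out-of-sample RMSE for the 5-day-ahead target
rmse_h <- sqrt(mean((pred_h - te_h$Target)^2))
cat(paste0(horizon, "-day ahead test RMSE:"), round(rmse_h, 6), "\n")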

# time-slice validation: one train/assess split with initialWindow = 0.8*N and
# horizon = 0.2*N (with fixedWindow = TRUE this yields a single resample,
# not a rolling window)
library(caret)

ts_ctrl <- trainControl(
  method      = "timeslice",
  initialWindow = floor(0.8 * nrow(df)),
  horizon     = floor(0.2 * nrow(df)),
  fixedWindow = TRUE
)

# define small tuneGrid if desired
tuneGrid <- expand.grid(
  nrounds          = 100,
  max_depth        = 5,
  eta              = 0.1,
  gamma            = 0,
  colsample_bytree = 0.8,
  min_child_weight = 1,
  subsample        = 0.8
)

caret_mod <- train(
  x            = df[ features ],
  y            = df$Return,
  method       = "xgbTree",
  trControl    = ts_ctrl,
  tuneGrid     = tuneGrid,
  metric       = "RMSE"
)

caret_mod$results
##   nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1     100         5 0.1     0              0.8                1       0.8
##          RMSE  Rsquared         MAE       RMSESD  RsquaredSD        MAESD
## 1 0.006541774 0.5903679 0.004105603 8.850498e-06 0.002748341 4.218602e-05
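
caret also exposes variable importance for the fitted xgbTree model, which can be compared with the earlier xgb.plot.importance view. A minimal sketch:

# Variable importance from the final caret-fitted model
varImp(caret_mod)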