library(readr)
library(dplyr)
library(tidyr)
library(jsonlite)
library(lubridate)
library(ggplot2)
library(forecast)
library(zoo)
library(lmtest)
data is from CoinMarketCap
bitcoin_data <- read_delim(
"/Users/joshuazhang/Desktop/Stats 429 Project/FinalProjectData/BTC.daily.csv",
delim = ";", col_names = TRUE, trim_ws = TRUE, show_col_types = FALSE
)
bitcoin_data <- bitcoin_data %>%
mutate(
Date = as.Date(timeOpen) # convert timestamp → Date
) %>%
filter(Date >= as.Date("2025-01-01")) %>% # keep only 2025+
select(Date, Open = open) %>% # keep date + open price
arrange(Date)
plot(bitcoin_data, type = "l", col = "red",
main = "BTC Opening Price",
ylab = "USD")
par(mfrow = c(2, 2))
btc_ts <- ts(bitcoin_data$Open)
acf(btc_ts, main = "ACF of BTC Opening Price (2025)")
pacf(btc_ts, main = "PACF of BTC Opening Price (2025)")
ndiffs(btc_ts) # differecing test; tells you how many differences needed
[1] 1
par(mfrow = c(1, 1))
btc_diff <- diff(btc_ts)
plot(btc_diff, type = "l", col = "red",
main = "Differenced BTC Opening Price",
ylab = "1st Difference")
par(mfrow = c(2, 2))
acf(btc_diff, main = "ACF of BTC Opening Price (2025); 1st diff")
pacf(btc_diff, main = "PACF of BTC Opening Price (2025); 1st diff")
par(mfrow = c(1, 1))
auto.arima(btc_ts)
Series: btc_ts
ARIMA(0,1,0)
sigma^2 = 4732593: log likelihood = -3031.61
AIC=6065.22 AICc=6065.23 BIC=6069.02
forecast_data <- tail(bitcoin_data, 10)
forecast_data
data is from CoinMarketCap
eth_data <- read_delim(
"/Users/joshuazhang/Desktop/Stats 429 Project/FinalProjectData/ETH_daily.csv",
delim = ";", col_names = TRUE, trim_ws = TRUE, show_col_types = FALSE
)
eth_data <- eth_data %>%
mutate(
Date = as.Date(timeOpen)
) %>%
filter(Date >= as.Date("2025-01-01")) %>%
select(Date, Open = open) %>%
arrange(Date)
plot(eth_data, type = "l", col = "red",
main = "ETA Opening Price",
ylab = "USD")
# ETH time series
eth_ts <- ts(eth_data$Open)
# ACF & PACF (original series)
par(mfrow = c(2, 2))
acf(eth_ts, main = "ACF of ETH Opening Price (2025)")
pacf(eth_ts, main = "PACF of ETH Opening Price (2025)")
ndiffs(eth_ts)
[1] 1
par(mfrow = c(1, 1))
# First difference
eth_diff <- diff(eth_ts)
plot(
eth_diff, type = "l", col = "red",
main = "Differenced ETH Opening Price",
ylab = "1st Difference"
)
# ACF & PACF of differenced series
par(mfrow = c(2, 2))
acf(eth_diff, main = "ACF of ETH Opening Price (2025); 1st diff")
pacf(eth_diff, main = "PACF of ETH Opening Price (2025); 1st diff")
par(mfrow = c(1, 1))
# Auto-ARIMA recommendation
auto.arima(eth_ts)
Series: eth_ts
ARIMA(0,1,0)
sigma^2 = 14896: log likelihood = -2072.38
AIC=4146.75 AICc=4146.77 BIC=4150.56
from investing.com
sp500_data <- read_delim(
"/Users/joshuazhang/Desktop/Stats 429 Project/FinalProjectData/S&P500_daily.csv",
delim = ",", col_names = TRUE, trim_ws = TRUE, show_col_types = FALSE
)
sp500_data <- sp500_data %>%
mutate(
Date = as.Date(Date, format = "%m/%d/%Y") # convert character → Date
) %>%
filter(Date >= as.Date("2025-01-01")) %>% # keep only after 2025-01-01
select(Date, Open) %>% # keep Date + Open
arrange(Date)
sp500_ts <- ts(sp500_data$Open)
plot(sp500_data, type = "l", col = "red",
main = "S&P500 Opening Price",
ylab = "USD")
# 2. ACF & PACF (original series)
par(mfrow = c(2, 2))
acf(sp500_ts, main = "ACF of S&P500 Opening Price (2025)")
pacf(sp500_ts, main = "PACF of S&P500 Opening Price (2025)")
ndiffs(sp500_ts)
[1] 1
par(mfrow = c(1, 1))
# First difference
sp500_diff <- diff(sp500_ts)
plot(
sp500_diff, type = "l", col = "red",
main = "Differenced S&P500 Opening Price",
ylab = "1st Difference"
)
par(mfrow = c(2, 2))
acf(sp500_diff, main = "ACF of S&P500 Opening Price (2025); 1st diff")
pacf(sp500_diff, main = "PACF of S&P500 Opening Price (2025); 1st diff")
par(mfrow = c(1, 1))
# Auto-ARIMA recommendation
auto.arima(sp500_ts)
Series: sp500_ts
ARIMA(2,1,2)
Coefficients:
ar1 ar2 ma1 ma2
-1.2117 -0.6758 1.0461 0.7085
s.e. 0.1784 0.1351 0.1616 0.1030
sigma^2 = 4453: log likelihood = -1279.42
AIC=2568.83 AICc=2569.1 BIC=2585.98
from investing.com
gold_data <- read_delim(
"/Users/joshuazhang/Desktop/Stats 429 Project/FinalProjectData/Gold_daily.csv",
delim = ",", col_names = TRUE, trim_ws = TRUE, show_col_types = FALSE
)
gold_data <- gold_data %>%
mutate(
Date = as.Date(Date, format = "%m/%d/%Y") # convert character → Date
) %>%
filter(Date >= as.Date("2025-01-01")) %>% # keep only after 2025-01-01
select(Date, Open) %>% # keep Date + Open only
arrange(Date)
gold_ts <- ts(gold_data$Open)
plot(gold_data, type = "l", col = "red",
main = "Gold Opening Price",
ylab = "USD")
# 2. ACF & PACF (original series)
par(mfrow = c(2, 2))
acf(gold_ts, main = "ACF of Gold Price (2025)")
pacf(gold_ts, main = "PACF of Gold Price (2025)")
ndiffs(gold_ts)
[1] 1
par(mfrow = c(1, 1))
# 3. First difference
gold_diff <- diff(gold_ts)
plot(
gold_diff, type = "l", col = "red",
main = "Differenced Gold Price",
ylab = "1st Difference"
)
# ACF & PACF of differenced series
par(mfrow = c(2, 2))
acf(gold_diff, main = "ACF of Gold Price (2025); 1st diff")
pacf(gold_diff, main = "PACF of Gold Price (2025); 1st diff")
par(mfrow = c(1, 1))
# Auto-ARIMA recommendation
auto.arima(gold_ts)
Series: gold_ts
ARIMA(0,1,0) with drift
Coefficients:
drift
6.8200
s.e. 2.8546
sigma^2 = 1939: log likelihood = -1232.85
AIC=2469.69 AICc=2469.75 BIC=2476.63
#VIX from investing.com
vix_data <- read_delim(
"/Users/joshuazhang/Desktop/Stats 429 Project/FinalProjectData/VIX_daily.csv",
delim = ",", col_names = TRUE, trim_ws = TRUE, show_col_types = FALSE
)
vix_data <- vix_data %>%
mutate(
Date = as.Date(Date, format = "%m/%d/%Y") # convert to Date
) %>%
filter(Date >= as.Date("2025-01-01")) %>% # keep only after Jan 1st 2025
select(Date, Open) %>% # keep Date + Open only
arrange(Date)
vix_ts <- ts(vix_data$Open)
plot(vix_data, type = "l", col = "red",
main = "VIX",
ylab = "index")
# ACF & PACF (original series)
par(mfrow = c(2, 2))
acf(vix_ts, main = "ACF of VIX (2025)")
pacf(vix_ts, main = "PACF of VIX (2025)")
ndiffs(vix_ts)
[1] 1
par(mfrow = c(1, 1))
# First difference
vix_diff <- diff(vix_ts)
plot(
vix_diff, type = "l", col = "red",
main = "Differenced VIX",
ylab = "1st Difference"
)
# ACF & PACF of differenced VIX
par(mfrow = c(2, 2))
acf(vix_diff, main = "ACF of VIX (1st diff)")
pacf(vix_diff, main = "PACF of VIX (1st diff)")
par(mfrow = c(1, 1))
# 5. Auto-ARIMA recommendation
auto.arima(vix_ts)
Series: vix_ts
ARIMA(2,1,2)
Coefficients:
ar1 ar2 ma1 ma2
-1.3088 -0.7008 1.0897 0.7362
s.e. 0.1053 0.1005 0.1029 0.0858
sigma^2 = 7.331: log likelihood = -568.27
AIC=1146.55 AICc=1146.81 BIC=1163.87
library(ggplot2)
library(ggplot2)
library(zoo)
library(dplyr)
# Rename columns BEFORE joining (avoids suffix issues)
btc <- bitcoin_data %>% rename(BTC = Open)
eth <- eth_data %>% rename(ETH = Open)
sp <- sp500_data %>% rename(SP500 = Open)
gold <- gold_data %>% rename(GOLD = Open)
vix <- vix_data %>% rename(VIX = Open)
# Join all datasets by Date
combined <- btc %>%
left_join(eth, by = "Date") %>%
left_join(sp, by = "Date") %>%
left_join(gold, by = "Date") %>%
left_join(vix, by = "Date")
# Drop rows where SP500, GOLD, or VIX are NA (market closed)
combined <- combined %>%
drop_na()
combined
model_data <- combined %>%
mutate(
rBTC = log(BTC) - lag(log(BTC)),
rETH = log(ETH) - lag(log(ETH)),
rSP = log(SP500) - lag(log(SP500)),
rGOLD = log(GOLD) - lag(log(GOLD)),
rVIX = log(VIX) - lag(log(VIX))
) %>%
drop_na()
model_data
# Extract vectors
date <- model_data$Date
rBTC <- model_data$rBTC
rETH <- model_data$rETH
rSP <- model_data$rSP
rGOLD <- model_data$rGOLD
rVIX <- model_data$rVIX
# Plot
plot(
date, rBTC, type="l", col="blue",
main="",
xlab="Date", ylab="Log Returns",
ylim=range(c(rBTC, rETH, rSP, rGOLD, rVIX), na.rm=TRUE)
)
lines(date, rETH, col="red")
lines(date, rSP, col="darkgreen")
lines(date, rGOLD, col="orange")
lines(date, rVIX, col="purple")
legend(
"topright",
legend=c("BTC", "ETH", "SP500", "GOLD", "VIX"),
col=c("blue", "red", "darkgreen", "orange", "purple"),
lty=1, cex=0.8
)
date <- combined$Date
BTC <- combined$BTC
ETH <- combined$ETH
SP500 <- combined$SP500
GOLD <- combined$GOLD
VIX <- combined$VIX
plot(
date, BTC, type="l", col="blue",
main="Raw Prices of All Variables",
xlab="Date", ylab="Value",
ylim = range(c(BTC, ETH, SP500, GOLD, VIX), na.rm = TRUE)
)
lines(date, ETH, col="red")
lines(date, SP500, col="darkgreen")
lines(date, GOLD, col="orange")
lines(date, VIX, col="purple")
legend("topright",
legend=c("BTC","ETH","SP500","GOLD","VIX"),
col=c("blue","red","darkgreen","orange","purple"),
lty=1, cex=0.8)
m1 <- lm(rBTC ~ rETH + rSP + rGOLD + rVIX, data=model_data)
summary(m1)
Call:
lm(formula = rBTC ~ rETH + rSP + rGOLD + rVIX, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-0.071285 -0.009311 0.000348 0.010019 0.065623
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.0004208 0.0010955 -0.384 0.7012
rETH 0.4188940 0.0267214 15.676 <2e-16 ***
rSP 0.2897472 0.1563326 1.853 0.0652 .
rGOLD 0.1390304 0.0857231 1.622 0.1063
rVIX 0.0040839 0.0193357 0.211 0.8329
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.01623 on 222 degrees of freedom
Multiple R-squared: 0.6457, Adjusted R-squared: 0.6393
F-statistic: 101.1 on 4 and 222 DF, p-value: < 2.2e-16
# Residual ACF/PACF
par(mfrow=c(2,2))
acf(residuals(m1), main="ACF of Residuals (Model 1)")
pacf(residuals(m1), main="PACF of Residuals (Model 1)")
par(mfrow=c(1,1))
# box test
box_m1 <- Box.test(residuals(m1), lag = 10, type = "Ljung-Box")
box_m1
Box-Ljung test
data: residuals(m1)
X-squared = 13.933, df = 10, p-value = 0.1761
m2 <- lm(rBTC ~ rETH + rSP, data=model_data)
summary(m2)
Call:
lm(formula = rBTC ~ rETH + rSP, data = model_data)
Residuals:
Min 1Q Median 3Q Max
-0.071889 -0.009141 0.000111 0.010297 0.064819
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.0001262 0.0010812 -0.117 0.9072
rETH 0.4162239 0.0264277 15.750 <2e-16 ***
rSP 0.2712468 0.0998252 2.717 0.0071 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.01625 on 224 degrees of freedom
Multiple R-squared: 0.6414, Adjusted R-squared: 0.6382
F-statistic: 200.3 on 2 and 224 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
acf(residuals(m2), main="ACF of Residuals (Model 2)")
pacf(residuals(m2), main="PACF of Residuals (Model 2)")
par(mfrow=c(1,1))
# Model 2 – Reduced Model
box_m2 <- Box.test(residuals(m2), lag = 10, type = "Ljung-Box")
box_m2
Box-Ljung test
data: residuals(m2)
X-squared = 13.507, df = 10, p-value = 0.1967
lags <- 1:3 # possible lags
results <- data.frame(
Model = character(),
ETH_lag = integer(),
SP_lag = integer(),
VIX_lag = integer(),
AIC = numeric(),
AdjR2 = numeric(),
stringsAsFactors = FALSE
)
models <- list()
# Loop through all combinations
for (e in lags) {
for (s in lags) {
for (v in lags) {
# Create lagged data
temp <- model_data %>%
mutate(
rETH_lag = lag(rETH, e),
rSP_lag = lag(rSP, s),
rVIX_lag = lag(rVIX, v)
) %>%
drop_na()
# Fit model
m <- lm(rBTC ~ rETH_lag + rSP_lag + rGOLD + rVIX_lag, data=temp)
# Model name
model_name <- paste0("ETH", e, "_SP", s, "_VIX", v)
# Store model
models[[model_name]] <- m
# Save results
results <- rbind(
results,
data.frame(
Model = model_name,
ETH_lag = e,
SP_lag = s,
VIX_lag = v,
AIC = AIC(m),
AdjR2 = summary(m)$adj.r.squared
)
)
}
}
}
# Sort best models (lowest AIC)
results_sorted <- results[order(results$AIC), ]
results_sorted
# Identify best model (lowest AIC)
best_name <- results_sorted$Model[1]
cat("Best model:", best_name, "\n\n")
Best model: ETH2_SP1_VIX2
# Extract the model object
best_model <- models[[best_name]]
# Model summary
summary(best_model)
Call:
lm(formula = rBTC ~ rETH_lag + rSP_lag + rGOLD + rVIX_lag, data = temp)
Residuals:
Min 1Q Median 3Q Max
-0.109246 -0.013153 -0.000259 0.016136 0.097002
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.0004082 0.0017924 -0.228 0.8201
rETH_lag -0.0840910 0.0428283 -1.963 0.0509 .
rSP_lag -0.1135312 0.1465738 -0.775 0.4394
rGOLD 0.0763406 0.1403397 0.544 0.5870
rVIX_lag -0.0671813 0.0210187 -3.196 0.0016 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0265 on 220 degrees of freedom
Multiple R-squared: 0.05874, Adjusted R-squared: 0.04162
F-statistic: 3.432 on 4 and 220 DF, p-value: 0.009571
# Best Lag Model
best_model <- models[[best_name]]
summary(best_model)
Call:
lm(formula = rBTC ~ rETH_lag + rSP_lag + rGOLD + rVIX_lag, data = temp)
Residuals:
Min 1Q Median 3Q Max
-0.109246 -0.013153 -0.000259 0.016136 0.097002
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.0004082 0.0017924 -0.228 0.8201
rETH_lag -0.0840910 0.0428283 -1.963 0.0509 .
rSP_lag -0.1135312 0.1465738 -0.775 0.4394
rGOLD 0.0763406 0.1403397 0.544 0.5870
rVIX_lag -0.0671813 0.0210187 -3.196 0.0016 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0265 on 220 degrees of freedom
Multiple R-squared: 0.05874, Adjusted R-squared: 0.04162
F-statistic: 3.432 on 4 and 220 DF, p-value: 0.009571
# Residual ACF/PACF for the lag model
par(mfrow=c(2,2))
acf(residuals(best_model), main=paste("ACF of Residuals (", best_name, ")"))
pacf(residuals(best_model), main=paste("PACF of Residuals (", best_name, ")"))
par(mfrow=c(1,1))
# Model 3 – Lag Model (best model)
box_m3 <- Box.test(residuals(best_model), lag = 10, type = "Ljung-Box")
box_m3
Box-Ljung test
data: residuals(best_model)
X-squared = 6.4267, df = 10, p-value = 0.7782
# --- AIC, BIC, R-square extractor function ---
extract_metrics <- function(model) {
s <- summary(model)
data.frame(
AIC = AIC(model),
BIC = BIC(model),
R2 = s$r.squared,
AdjR2 = s$adj.r.squared
)
}
# --- Apply to all three models ---
metrics_m1 <- extract_metrics(m1)
metrics_m2 <- extract_metrics(m2)
metrics_m3 <- extract_metrics(best_model)
# Combine into one table
model_comparison <- rbind(
cbind(Model = "Full Model (m1)", metrics_m1),
cbind(Model = "Reduced Model (m2)", metrics_m2),
cbind(Model = "Best Lag Model (m3)", metrics_m3)
)
model_comparison
NA
### ============================================================
### 1. Fit individual regressions
### ============================================================
m_eth <- lm(rBTC ~ rETH, data = model_data)
m_sp <- lm(rBTC ~ rSP, data = model_data)
m_gold <- lm(rBTC ~ rGOLD, data = model_data)
m_vix <- lm(rBTC ~ rVIX, data = model_data)
### Helper function to extract stats
get_stats <- function(model){
s <- summary(model)
coef_val <- s$coefficients[2, 1] # slope coefficient
p_val <- s$coefficients[2, 4] # p-value of predictor
aic_val <- AIC(model)
adj_r2 <- s$adj.r.squared
return(c(coef_val, p_val, aic_val, adj_r2))
}
### ============================================================
### 2. Build results table
### ============================================================
results_individual <- data.frame(
Predictor = c("ETH", "SP500", "GOLD", "VIX"),
Coefficient = c(
get_stats(m_eth)[1],
get_stats(m_sp)[1],
get_stats(m_gold)[1],
get_stats(m_vix)[1]
),
P_value = c(
get_stats(m_eth)[2],
get_stats(m_sp)[2],
get_stats(m_gold)[2],
get_stats(m_vix)[2]
),
AIC = c(
get_stats(m_eth)[3],
get_stats(m_sp)[3],
get_stats(m_gold)[3],
get_stats(m_vix)[3]
),
Adj_R2 = c(
get_stats(m_eth)[4],
get_stats(m_sp)[4],
get_stats(m_gold)[4],
get_stats(m_vix)[4]
)
)
print(results_individual)
NA
NA
### Define the best-model lags
lag_e <- 1 # rETH lag
lag_s <- 2 # rSP lag
lag_v <- 2 # rVIX lag
### Select last k days
k <- 5
compare_raw <- tail(model_data, k + max(lag_e, lag_s, lag_v))
### Recreate lag variables needed by best_model
compare_data <- compare_raw %>%
mutate(
rETH_lag = lag(rETH, lag_e),
rSP_lag = lag(rSP, lag_s),
rVIX_lag = lag(rVIX, lag_v)
) %>%
drop_na() %>% # remove rows where lag data not available
tail(k) # keep last k rows only
### Get the price BEFORE the comparison window
last_price_before_window <- tail(model_data$BTC, k+1)[1]
### Helper function: returns → prices
predict_price <- function(model, data_window, last_price){
pred_r <- predict(model, newdata = data_window)
pred_price <- numeric(length(pred_r))
pred_price[1] <- last_price * exp(pred_r[1])
for(i in 2:length(pred_r)){
pred_price[i] <- pred_price[i-1] * exp(pred_r[i])
}
return(pred_price)
}
### Predict using all 3 models
# Full model (m1)
pred_m1 <- predict_price(m1, compare_data, last_price_before_window)
# Reduced model (m2)
pred_m2 <- predict_price(m2, compare_data, last_price_before_window)
# Best lag model (m3)
pred_m3 <- predict_price(best_model, compare_data, last_price_before_window)
### Build comparison table
comparison_table <- data.frame(
Date = compare_data$Date,
Actual = compare_data$BTC,
Full_Model = pred_m1,
Reduced_Model = pred_m2,
Best_Lag_Model = pred_m3
)
print(comparison_table)
### Plot all three models vs actual
matplot(
comparison_table$Date,
comparison_table[, 2:5],
type="l", lwd=2,
col=c("black","red","blue","darkgreen"),
xlab="Date", ylab="BTC Price (USD)",
main="Actual vs Predicted BTC Price"
)
legend(
"topleft",
legend=c("Actual","Full Model","Reduced Model","Best Lag Model"),
col=c("black","red","blue","darkgreen"),
lwd=2
)
### 8. Compute RMSE + MAE
RMSE <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
MAE <- function(actual, predicted) mean(abs(actual - predicted))
error_table <- data.frame(
Model = c("Full Model","Reduced Model","Best Lag Model"),
RMSE = c(
RMSE(comparison_table$Actual, comparison_table$Full_Model),
RMSE(comparison_table$Actual, comparison_table$Reduced_Model),
RMSE(comparison_table$Actual, comparison_table$Best_Lag_Model)
),
MAE = c(
MAE(comparison_table$Actual, comparison_table$Full_Model),
MAE(comparison_table$Actual, comparison_table$Reduced_Model),
MAE(comparison_table$Actual, comparison_table$Best_Lag_Model)
)
)
print(error_table)
NA