# Load the weekly Dow Jones index data (comma-separated, with a header row)
# and inspect the type of every column.
dowjones <- read.table(
  file = "dow_jones_index.data",
  header = TRUE,
  sep = ","
)
str(dowjones)
## 'data.frame': 750 obs. of 16 variables:
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ stock : chr "AA" "AA" "AA" "AA" ...
## $ date : chr "1/7/2011" "1/14/2011" "1/21/2011" "1/28/2011" ...
## $ open : chr "$15.82" "$16.71" "$16.19" "$15.87" ...
## $ high : chr "$16.72" "$16.71" "$16.38" "$16.63" ...
## $ low : chr "$15.78" "$15.64" "$15.60" "$15.82" ...
## $ close : chr "$16.42" "$15.97" "$15.79" "$16.13" ...
## $ volume : int 239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 114332562 ...
## $ percent_change_price : num 3.79 -4.43 -2.47 1.64 5.93 ...
## $ percent_change_volume_over_last_wk: num NA 1.38 -43.02 9.36 1.99 ...
## $ previous_weeks_volume : int NA 239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 ...
## $ next_weeks_open : chr "$16.71" "$16.19" "$15.87" "$16.18" ...
## $ next_weeks_close : chr "$15.97" "$15.79" "$16.13" "$17.14" ...
## $ percent_change_next_weeks_price : num -4.428 -2.471 1.638 5.933 0.231 ...
## $ days_to_next_dividend : int 26 19 12 5 97 90 83 76 69 62 ...
## $ percent_return_next_dividend : num 0.183 0.188 0.19 0.186 0.175 ...
Find missing values
# Count the missing values in every column.
vapply(dowjones, function(column) sum(is.na(column)), numeric(1))
## quarter stock
## 0 0
## date open
## 0 0
## high low
## 0 0
## close volume
## 0 0
## percent_change_price percent_change_volume_over_last_wk
## 0 30
## previous_weeks_volume next_weeks_open
## 30 0
## next_weeks_close percent_change_next_weeks_price
## 0 0
## days_to_next_dividend percent_return_next_dividend
## 0 0
Those missing values occur in the variables that refer to the previous week (“previous_weeks_volume” and “percent_change_volume_over_last_wk”): the first observation of each stock has no earlier week to refer to, so it is NA. We will fill in those empty observations with the mean value of the same stock, so that our future analysis isn’t affected by them.
# Replace every NA in a vector with the mean of its non-missing values.
# Called inside a grouped mutate(), so the mean is taken per stock.
fill_with_group_mean <- function(values) {
  ifelse(is.na(values), mean(values, na.rm = TRUE), values)
}

dowjones <- dowjones %>%
  group_by(stock) %>%
  mutate(
    percent_change_volume_over_last_wk =
      fill_with_group_mean(percent_change_volume_over_last_wk),
    previous_weeks_volume = fill_with_group_mean(previous_weeks_volume)
  ) %>%
  ungroup()

# Alternative kept for reference: impute with the overall mean across stocks.
#dowjones$previous_weeks_volume[is.na(dowjones$previous_weeks_volume)] = mean(dowjones$previous_weeks_volume, na.rm = T)
#dowjones$percent_change_volume_over_last_wk[is.na(dowjones$percent_change_volume_over_last_wk)] = mean(dowjones$percent_change_volume_over_last_wk, na.rm = T)

# Re-count the missing values per column after the imputation.
vapply(dowjones, function(column) sum(is.na(column)), numeric(1))
## quarter stock
## 0 0
## date open
## 0 0
## high low
## 0 0
## close volume
## 0 0
## percent_change_price percent_change_volume_over_last_wk
## 0 0
## previous_weeks_volume next_weeks_open
## 0 0
## next_weeks_close percent_change_next_weeks_price
## 0 0
## days_to_next_dividend percent_return_next_dividend
## 0 0
We will change character variables to numeric by removing the $ sign.
# Strip the "$" prefix from every price column and store the values as numbers.
price_columns <- c(
  "open", "high", "close", "low", "next_weeks_open", "next_weeks_close"
)
dowjones[price_columns] <- lapply(dowjones[price_columns], parse_number)
Likewise, the date variable is stored as text, so we need to convert it to a proper Date type.
# Parse the month/day/year strings as dates and treat the ticker as a factor,
# then re-inspect the column types.
dowjones$date <- as.Date(dowjones$date, format = "%m/%d/%Y")
dowjones$stock <- factor(dowjones$stock)
str(dowjones)
## tibble [750 x 16] (S3: tbl_df/tbl/data.frame)
## $ quarter : int [1:750] 1 1 1 1 1 1 1 1 1 1 ...
## $ stock : Factor w/ 30 levels "AA","AXP","BA",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date[1:750], format: "2011-01-07" "2011-01-14" ...
## $ open : num [1:750] 15.8 16.7 16.2 15.9 16.2 ...
## $ high : num [1:750] 16.7 16.7 16.4 16.6 17.4 ...
## $ low : num [1:750] 15.8 15.6 15.6 15.8 16.2 ...
## $ close : num [1:750] 16.4 16 15.8 16.1 17.1 ...
## $ volume : int [1:750] 239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 114332562 ...
## $ percent_change_price : num [1:750] 3.79 -4.43 -2.47 1.64 5.93 ...
## $ percent_change_volume_over_last_wk: num [1:750] 4.03 1.38 -43.02 9.36 1.99 ...
## $ previous_weeks_volume : num [1:750] 1.31e+08 2.40e+08 2.43e+08 1.38e+08 1.51e+08 ...
## $ next_weeks_open : num [1:750] 16.7 16.2 15.9 16.2 17.3 ...
## $ next_weeks_close : num [1:750] 16 15.8 16.1 17.1 17.4 ...
## $ percent_change_next_weeks_price : num [1:750] -4.428 -2.471 1.638 5.933 0.231 ...
## $ days_to_next_dividend : int [1:750] 26 19 12 5 97 90 83 76 69 62 ...
## $ percent_return_next_dividend : num [1:750] 0.183 0.188 0.19 0.186 0.175 ...
We will look at the scatter plot matrix to get an idea of which variables might be strongly correlated with each other.
# Scatter plot matrix of every column against every other column.
pairs(x = dowjones)
The variables that appear strongly correlated are: open, high, low, close, next_weeks_open and next_weeks_close.
Full model: our full model explains 72.4% of the variance of percent_change_next_weeks_price. However, this is cheating, since the model includes “next_weeks_open” and “next_weeks_close”, which together perfectly determine next week's price and therefore the percent_change_next_weeks_price.
# Regress next week's percentage price change on every other column and
# report the share of variance explained.
model_full <- lm(percent_change_next_weeks_price ~ ., data = dowjones)
summary(model_full)[["r.squared"]]
## [1] 0.7235275
When we delete these terms from the model, the R-squared drops tremendously to 13.28%.
# Same regression with the two look-ahead price columns removed.
model_lm1 <- lm(
  percent_change_next_weeks_price ~ . - next_weeks_open - next_weeks_close,
  data = dowjones
)
summary(model_lm1)[["r.squared"]]
## [1] 0.1328077
Here we tried several things, always excluding the “next_weeks_open” and “next_weeks_close” variables. This model only reports previous_weeks_volume as a significant variable (because of the noise introduced by the stock variable being treated as a predictor).
# Subset selection by sequential replacement (method "leapSeq"), tuned over
# model sizes 1 to 5 with 10-fold cross-validation.
# Both look-ahead price columns must be excluded from the candidate
# predictors. The formula previously subtracted next_weeks_close twice, which
# left next_weeks_open available to the selection procedure.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
step.model <- train(
  percent_change_next_weeks_price ~ . - next_weeks_open - next_weeks_close,
  data = dowjones,
  method = "leapSeq",
  tuneGrid = data.frame(nvmax = 1:5),
  trControl = train.control
)
# Cross-validated metrics for each model size, then the selected subsets.
step.model$results
summary(step.model$finalModel)
## Subset selection object
## 42 Variables (and intercept)
## Forced in Forced out
## quarter FALSE FALSE
## stockAXP FALSE FALSE
## stockBA FALSE FALSE
## stockBAC FALSE FALSE
## stockCAT FALSE FALSE
## stockCSCO FALSE FALSE
## stockCVX FALSE FALSE
## stockDD FALSE FALSE
## stockDIS FALSE FALSE
## stockGE FALSE FALSE
## stockHD FALSE FALSE
## stockHPQ FALSE FALSE
## stockIBM FALSE FALSE
## stockINTC FALSE FALSE
## stockJNJ FALSE FALSE
## stockJPM FALSE FALSE
## stockKO FALSE FALSE
## stockKRFT FALSE FALSE
## stockMCD FALSE FALSE
## stockMMM FALSE FALSE
## stockMRK FALSE FALSE
## stockMSFT FALSE FALSE
## stockPFE FALSE FALSE
## stockPG FALSE FALSE
## stockT FALSE FALSE
## stockTRV FALSE FALSE
## stockUTX FALSE FALSE
## stockVZ FALSE FALSE
## stockWMT FALSE FALSE
## stockXOM FALSE FALSE
## date FALSE FALSE
## open FALSE FALSE
## high FALSE FALSE
## low FALSE FALSE
## close FALSE FALSE
## volume FALSE FALSE
## percent_change_price FALSE FALSE
## percent_change_volume_over_last_wk FALSE FALSE
## previous_weeks_volume FALSE FALSE
## next_weeks_open FALSE FALSE
## days_to_next_dividend FALSE FALSE
## percent_return_next_dividend FALSE FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: 'sequential replacement'
## quarter stockAXP stockBA stockBAC stockCAT stockCSCO stockCVX stockDD
## 1 ( 1 ) " " " " " " " " " " "*" " " " "
## 2 ( 1 ) " " " " " " "*" " " "*" " " " "
## 3 ( 1 ) " " " " " " "*" " " "*" " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " " " "
## stockDIS stockGE stockHD stockHPQ stockIBM stockINTC stockJNJ stockJPM
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " "*" " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " " " "
## stockKO stockKRFT stockMCD stockMMM stockMRK stockMSFT stockPFE
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " "
## stockPG stockT stockTRV stockUTX stockVZ stockWMT stockXOM date open
## 1 ( 1 ) " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " " "*"
## 5 ( 1 ) " " " " " " " " " " " " " " " " "*"
## high low close volume percent_change_price
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " "
## percent_change_volume_over_last_wk previous_weeks_volume
## 1 ( 1 ) " " " "
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## next_weeks_open days_to_next_dividend percent_return_next_dividend
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) "*" " " "*"
## 5 ( 1 ) "*" " " "*"
When running the same model but now without stocks, we can find the other variables that could help us. However, our Rsquared decreases all the way to 3%!
# Same sequential-replacement subset selection (sizes 1 to 5, 10-fold CV),
# this time also dropping the stock factor from the candidate predictors.
# Both look-ahead price columns must be excluded. The formula previously
# subtracted next_weeks_close twice, which left next_weeks_open available to
# the selection procedure.
set.seed(123)
train.control <- trainControl(method = "cv", number = 10)
step.model <- train(
  percent_change_next_weeks_price ~
    . - next_weeks_open - next_weeks_close - stock,
  data = dowjones,
  method = "leapSeq",
  tuneGrid = data.frame(nvmax = 1:5),
  trControl = train.control
)
# Cross-validated metrics for each model size, then the selected subsets.
step.model$results
summary(step.model$finalModel)
## Subset selection object
## 13 Variables (and intercept)
## Forced in Forced out
## quarter FALSE FALSE
## date FALSE FALSE
## open FALSE FALSE
## high FALSE FALSE
## low FALSE FALSE
## close FALSE FALSE
## volume FALSE FALSE
## percent_change_price FALSE FALSE
## percent_change_volume_over_last_wk FALSE FALSE
## previous_weeks_volume FALSE FALSE
## next_weeks_open FALSE FALSE
## days_to_next_dividend FALSE FALSE
## percent_return_next_dividend FALSE FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: 'sequential replacement'
## quarter date open high low close volume percent_change_price
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " "*" "*" " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " "
## 4 ( 1 ) " " " " "*" "*" " " " " " " " "
## 5 ( 1 ) " " " " "*" "*" " " "*" " " " "
## percent_change_volume_over_last_wk previous_weeks_volume
## 1 ( 1 ) " " "*"
## 2 ( 1 ) " " " "
## 3 ( 1 ) " " " "
## 4 ( 1 ) " " " "
## 5 ( 1 ) " " " "
## next_weeks_open days_to_next_dividend percent_return_next_dividend
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) "*" " " "*"
## 5 ( 1 ) "*" " " "*"
Here we can see the following candidate models: (a) open + high, (b) quarter + date + open, (c) open + high + next_weeks_open + percent_return_next_dividend, with (c) being the model that reaches the 2.81% R-squared.
The intermediate steps are not shown here because we applied the VIF cutoff manually: we started with all the variables and removed them one by one until every remaining variable satisfied our VIF cutoff (< 8.00).
# Linear model on the dividend and volume predictors, with its full summary.
model <- lm(
  percent_change_next_weeks_price ~
    percent_return_next_dividend + volume + previous_weeks_volume +
    days_to_next_dividend,
  data = dowjones
)
summary(model)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ percent_return_next_dividend +
## volume + previous_weeks_volume + days_to_next_dividend, data = dowjones)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2112 -1.5600 -0.0927 1.5783 9.7029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.509e-02 2.930e-01 0.154 0.8778
## percent_return_next_dividend 6.024e-01 3.343e-01 1.802 0.0720 .
## volume 1.677e-09 1.295e-09 1.295 0.1958
## previous_weeks_volume -2.667e-09 1.293e-09 -2.063 0.0395 *
## days_to_next_dividend -2.044e-03 2.120e-03 -0.964 0.3352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.667 on 745 degrees of freedom
## Multiple R-squared: 0.01469, Adjusted R-squared: 0.009401
## F-statistic: 2.777 on 4 and 745 DF, p-value: 0.02611
# Variance inflation factor of each predictor in `model`
# (presumably car::vif; the providing package is not visible here).
vif(model)
## percent_return_next_dividend volume
## 1.098254 4.433020
## previous_weeks_volume days_to_next_dividend
## 4.436957 1.015691
# Extends `model` with the two percent-change predictors.
# The stray unary "+" ("+ +percent_return_next_dividend") was removed from the
# formula; the set and order of terms are unchanged.
model2 <- lm(
  percent_change_next_weeks_price ~
    percent_change_price + percent_change_volume_over_last_wk +
    percent_return_next_dividend + volume + previous_weeks_volume +
    days_to_next_dividend,
  data = dowjones
)
summary(model2)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ percent_change_price +
## percent_change_volume_over_last_wk + +percent_return_next_dividend +
## volume + previous_weeks_volume + days_to_next_dividend, data = dowjones)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.3447 -1.5159 -0.0915 1.5733 9.7191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.651e-02 2.943e-01 0.192 0.8478
## percent_change_price 2.104e-02 4.000e-02 0.526 0.5991
## percent_change_volume_over_last_wk -2.298e-03 3.122e-03 -0.736 0.4620
## percent_return_next_dividend 5.979e-01 3.346e-01 1.787 0.0743 .
## volume 2.524e-09 1.615e-09 1.563 0.1185
## previous_weeks_volume -3.468e-09 1.601e-09 -2.166 0.0306 *
## days_to_next_dividend -2.083e-03 2.122e-03 -0.982 0.3266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.669 on 743 degrees of freedom
## Multiple R-squared: 0.01597, Adjusted R-squared: 0.00802
## F-statistic: 2.009 on 6 and 743 DF, p-value: 0.06218
# Lag plots (lags 1 to 4) of the closing price, the volume and the response.
# Each series is the full column, i.e. all stocks stacked in row order.
lag.plot(dowjones$close, set.lags = 1:4, pch = ".")
lag.plot(dowjones$volume, set.lags = 1:4, pch = ".")
lag.plot(dowjones$percent_change_next_weeks_price, set.lags = 1:4, pch = ".")
# Add one-row lagged copies of close, volume and the response, computed within
# each stock so that a lag never crosses from one stock into the next.
dowjones_lag <- dowjones %>%
  group_by(stock) %>%
  mutate(
    close_lag = lag(close, n = 1),
    volume_lag = lag(volume, n = 1),
    percent_change_next_weeks_price_lag =
      lag(percent_change_next_weeks_price, n = 1)
  ) %>%
  ungroup()
head(dowjones_lag)
# Two side-by-side scatter plots: the response against lagged close and
# against lagged volume.
par(mfrow = c(1, 2))
plot(
  x = dowjones_lag$close_lag,
  y = dowjones_lag$percent_change_next_weeks_price
)
plot(
  x = dowjones_lag$volume_lag,
  y = dowjones_lag$percent_change_next_weeks_price
)
# The first row of every stock has no previous row, so its lag columns are NA.
# Fill those gaps with the mean of the same stock's non-missing lag values.
# `na.rm = TRUE` is spelled out: `T` is an ordinary, reassignable variable.
dowjones_lag <- dowjones_lag %>%
  group_by(stock) %>%
  mutate(
    close_lag = ifelse(is.na(close_lag),
                       mean(close_lag, na.rm = TRUE),
                       close_lag),
    volume_lag = ifelse(is.na(volume_lag),
                        mean(volume_lag, na.rm = TRUE),
                        volume_lag),
    percent_change_next_weeks_price_lag =
      ifelse(is.na(percent_change_next_weeks_price_lag),
             mean(percent_change_next_weeks_price_lag, na.rm = TRUE),
             percent_change_next_weeks_price_lag)
  ) %>%
  ungroup()
# Linear model on the stock factor, the three lagged columns and the weekly
# volume change, with its full summary.
model3 <- lm(
  percent_change_next_weeks_price ~
    stock + percent_change_next_weeks_price_lag + close_lag + volume_lag +
    percent_change_volume_over_last_wk,
  data = dowjones_lag
)
summary(model3)
##
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + percent_change_next_weeks_price_lag +
## close_lag + volume_lag + percent_change_volume_over_last_wk,
## data = dowjones_lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7899 -1.4073 -0.1243 1.4309 8.9183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.857e+00 8.355e-01 4.617 4.62e-06 ***
## stockAXP 9.212e+00 1.355e+00 6.797 2.25e-11 ***
## stockBA 1.582e+01 2.251e+00 7.029 4.86e-12 ***
## stockBAC -3.137e+00 1.255e+00 -2.499 0.0127 *
## stockCAT 2.401e+01 3.312e+00 7.249 1.09e-12 ***
## stockCSCO -1.133e+00 8.382e-01 -1.352 0.1769
## stockCVX 2.358e+01 3.243e+00 7.271 9.38e-13 ***
## stockDD 1.073e+01 1.554e+00 6.903 1.12e-11 ***
## stockDIS 7.220e+00 1.192e+00 6.057 2.24e-09 ***
## stockGE 7.703e-01 7.812e-01 0.986 0.3245
## stockHD 6.146e+00 1.062e+00 5.788 1.07e-08 ***
## stockHPQ 6.391e+00 1.192e+00 5.361 1.12e-07 ***
## stockIBM 4.018e+01 5.499e+00 7.307 7.30e-13 ***
## stockINTC 1.625e+00 8.123e-01 2.000 0.0459 *
## stockJNJ 1.302e+01 1.869e+00 6.965 7.42e-12 ***
## stockJPM 7.717e+00 1.282e+00 6.018 2.82e-09 ***
## stockKO 1.387e+01 1.968e+00 7.047 4.30e-12 ***
## stockKRFT 5.406e+00 9.633e-01 5.612 2.86e-08 ***
## stockMCD 1.710e+01 2.377e+00 7.193 1.61e-12 ***
## stockMMM 2.094e+01 2.898e+00 7.225 1.28e-12 ***
## stockMRK 5.028e+00 9.960e-01 5.048 5.67e-07 ***
## stockMSFT 2.009e+00 8.612e-01 2.332 0.0200 *
## stockPFE 1.443e+00 7.671e-01 1.880 0.0605 .
## stockPG 1.309e+01 1.920e+00 6.821 1.93e-11 ***
## stockT 3.979e+00 8.860e-01 4.491 8.24e-06 ***
## stockTRV 1.232e+01 1.765e+00 6.979 6.77e-12 ***
## stockUTX 1.903e+01 2.624e+00 7.251 1.07e-12 ***
## stockVZ 5.712e+00 1.051e+00 5.437 7.45e-08 ***
## stockWMT 1.052e+01 1.582e+00 6.647 5.95e-11 ***
## stockXOM 1.818e+01 2.557e+00 7.110 2.81e-12 ***
## percent_change_next_weeks_price_lag -8.124e-02 4.185e-02 -1.941 0.0526 .
## close_lag -2.655e-01 3.712e-02 -7.152 2.12e-12 ***
## volume_lag 2.481e-09 1.691e-09 1.467 0.1429
## percent_change_volume_over_last_wk 2.179e-03 2.616e-03 0.833 0.4051
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.607 on 716 degrees of freedom
## Multiple R-squared: 0.09496, Adjusted R-squared: 0.05325
## F-statistic: 2.277 on 33 and 716 DF, p-value: 7.582e-05
# Rows dated on or before 2011-03-31 form the training set; all later rows
# form the test set. The mask is built from `dowjones` and applied to
# `dowjones_lag`.
train <- dowjones$date <= as.Date("2011-03-31")
dowj_train <- dowjones_lag[train, ]
dowj_test <- dowjones_lag[!train, ]
# Per-stock linear regression: fit on that stock's training rows and score on
# its held-out test rows. One RMSE per stock is collected in lm_pred_metrics.
#cols = c("percent_change_next_weeks_price", "percent_change_next_weeks_price_lag", "close_lag", "volume_lag", "percent_change_volume_over_last_wk", "stock")
stocks <- as.factor(unique(dowj_train$stock))
RMSE <- rep(NA, length(stocks)) # placeholder column, overwritten per stock
#RSquared = rep(NA, length(stocks))
lm_pred_metrics <- data.frame(Stock = stocks, RMSE = RMSE)
for (i in seq_along(stocks)) {
  stock_train <- subset(dowj_train, stock == stocks[i])
  # Take the evaluation rows from the test split. This previously subset
  # dowj_train again, so the reported RMSE was an in-sample error.
  stock_test <- subset(dowj_test, stock == stocks[i])
  lm_fit <- lm(
    percent_change_next_weeks_price ~
      volume_lag + open + close + high + low + volume,
    data = stock_train
  )
  # Alternative specification based on the lagged predictors:
  #lm_fit = lm(percent_change_next_weeks_price ~ percent_change_next_weeks_price_lag +
  #              close_lag + volume_lag + percent_change_volume_over_last_wk,
  #            data = stock_train)
  lm.preds <- predict(lm_fit, newdata = stock_test)
  # Compare the predictions with the observed values of the same test rows.
  lm.rmse <- rmse(stock_test$percent_change_next_weeks_price, lm.preds)
  lm_pred_metrics[i, "RMSE"] <- lm.rmse
}
lm_pred_metrics
# Per-stock regression tree: fit on that stock's training rows and score on
# its held-out test rows. One RMSE per stock is collected in dt_pred_metrics.
dt_pred_metrics <- data.frame(Stock = stocks, RMSE = RMSE)
# seq_along() instead of 1:length(): the latter yields c(1, 0) for an empty
# vector and would run the body with invalid indices.
for (i in seq_along(stocks)) {
  stock_train <- subset(dowj_train, stock == stocks[i])
  stock_test <- subset(dowj_test, stock == stocks[i])
  dt_model <- tree(
    percent_change_next_weeks_price ~
      volume_lag + open + close + high + low + volume,
    data = stock_train
  )
  # Alternative specification based on the lagged predictors:
  #dt_model = tree(percent_change_next_weeks_price ~ percent_change_next_weeks_price_lag +
  #                  close_lag + volume_lag + percent_change_volume_over_last_wk,
  #                data = stock_train)
  dt_preds <- predict(dt_model, newdata = stock_test)
  dt_rmse <- rmse(stock_test$percent_change_next_weeks_price, dt_preds)
  dt_pred_metrics[i, "RMSE"] <- dt_rmse
  #tree_r2 = R2(stock_test$percent_change_next_weeks_price, trees_preds)
  #dt_pred_metrics[i, "R2"] = tree_r2
}
head(dt_pred_metrics)
# Average per-stock RMSE of the linear models, for comparison with the trees.
mean(lm_pred_metrics$RMSE)
## [1] 1.27709
# Average per-stock RMSE of the regression trees.
mean(dt_pred_metrics[["RMSE"]])
## [1] 3.011236
# Per-stock polynomial-kernel SVM (method "svmPoly"), tuned by cross-validation
# on that stock's training rows and scored on its held-out test rows.
# `RMSE` in data.frame() below is the NA placeholder vector created earlier;
# inside the loop RMSE(...) is called as a function.
svm_pred_metrics <- data.frame(stocks, RMSE)
Preds <- rep(NA, length(stocks))
svm_predictions <- data.frame(stocks, Preds)
# seq_along() instead of 1:length(): the latter yields c(1, 0) for an empty
# vector and would run the body with invalid indices.
for (i in seq_along(stocks)) {
  stock_train <- subset(dowj_train, stock == stocks[i])
  stock_test <- subset(dowj_test, stock == stocks[i])
  # Re-seed before every fit so the cross-validation folds are reproducible.
  set.seed(1)
  svm_fit <- train(
    percent_change_next_weeks_price ~
      volume_lag + open + close + high + low + volume,
    data = stock_train, method = "svmPoly",
    metric = "RMSE", preProcess = c("center", "scale"),
    trControl = trainControl(method = "cv")
  )
  # Alternative specification based on the lagged predictors:
  #svm_fit <- train(percent_change_next_weeks_price ~ percent_change_next_weeks_price_lag +
  #                   close_lag + volume_lag + percent_change_volume_over_last_wk,
  #                 data = stock_train, method = "svmPoly",
  #                 metric = "RMSE", preProcess = c("center","scale"),
  #                 trControl = trainControl(method = "cv"))
  svm_preds <- predict(svm_fit, stock_test)
  # NOTE(review): only the 4th test-row prediction is kept for each stock;
  # the reason for index 4 is not visible here - confirm it is intended.
  svm_predictions[i, "Preds"] <- svm_preds[4]
  svm_rmse <- RMSE(stock_test$percent_change_next_weeks_price, svm_preds)
  svm_pred_metrics[i, "RMSE"] <- svm_rmse
  # svm.r2 <- R2(stock.test$percent_change_next_weeks_price, svm.preds)
  # svm.stock.predictions[i, "RSquared"] <- svm.r2
}
# Average per-stock RMSE of the SVM models.
mean(svm_pred_metrics$RMSE)
## [1] 4.381084
# Weekly index level: sum of all closing prices on each date divided by 0.132
# (presumably the Dow divisor - TODO confirm), then its week-over-week change.
dowj <- aggregate(dowjones$close, by = list(dowjones$date),
                  FUN = function(x) sum(x) / 0.132)
return_dow <- na.omit(Delt(dowj[, 2]))

# One column of weekly returns per stock plus a final DOW column.
# Dimensions and column names are derived from the data, so every column is
# labelled with the stock it was computed from. Previously the labels were a
# hard-coded list that had to match the order of `stocks` by position.
#stocks <- unique(dowjones$stock)
stock_names <- as.character(stocks)
return_stocks <- data.frame(
  matrix(0, ncol = length(stock_names), nrow = NROW(return_dow))
)
colnames(return_stocks) <- stock_names
for (i in seq_along(stock_names)) {
  dow.sub <- subset(dowjones, stock == stock_names[i])
  # Store plain numeric vectors; the first change is NA and is dropped.
  return_stocks[[stock_names[i]]] <- as.numeric(na.omit(Delt(dow.sub$close)))
}
return_stocks$DOW <- as.numeric(return_dow)
# Beta of one stock: the slope of a regression of that stock's weekly returns
# on the index returns (column DOW of return_stocks).
# The symbol is back-quoted in the formula so that the column named `T` is
# read as a column and not as the logical shorthand.
stock_beta <- function(symbol) {
  fit <- lm(as.formula(sprintf("`%s` ~ DOW", symbol)), data = return_stocks)
  coef(fit)[2]
}

# Stocks in the order used for the result table.
beta_symbols <- c(
  "AA", "AXP", "BA", "BAC", "CAT", "CSCO", "CVX", "DD", "DIS", "GE",
  "HD", "HPQ", "IBM", "INTC", "JNJ", "JPM", "KRFT", "KO", "MCD", "MMM",
  "MRK", "MSFT", "PFE", "PG", "T", "TRV", "UTX", "VZ", "WMT", "XOM"
)
betas <- lapply(beta_symbols, stock_beta)

# Keep the individual beta.<SYMBOL> variables (beta.AA, beta.AXP, ...) so that
# any code referring to them by name continues to work.
invisible(
  list2env(setNames(betas, paste0("beta.", beta_symbols)),
           envir = environment())
)

# Result table: one row per stock with its beta.
df <- data.frame(Stock = beta_symbols,
                 Beta = unlist(betas, use.names = FALSE))