library(tidyverse)
library(readxl)
library(corrplot)
#install.packages("Hmisc")
library(Hmisc)
set.seed(123)
setwd("C:/Users/Emily/Desktop/Brit School/2021 Fall/dow jones")
# read in data
dow_jones = read.csv("dow_jones_index.data")
# split into test and train
### Use quarter 1 (Jan-Mar) data for training and quarter 2 (Apr-Jun) data for testing.
train_dj = dow_jones %>% filter(quarter == 1)
test_dj = dow_jones %>% filter(quarter == 2)
head(train_dj)
head(test_dj)
nrow(train_dj)
[1] 360
nrow(test_dj)
[1] 390
train_dj %>%
mutate(
test_date = as.Date(date, format = "%m/%d/%Y")
) %>%
group_by(test_date) %>%
dplyr::summarize(count = n())
`summarise()` ungrouping output (override with `.groups` argument)
# count of nulls by column
train_dj %>%
summarise_all(funs(sum(is.na(.))))
`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas:
# Simple named list:
list(mean = mean, median = median)
# Auto named with `tibble::lst()`:
tibble::lst(mean, median)
# Using lambdas
list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
# visualizing time by each stock
train_dj %>%
ggplot(aes(x = as.Date(date, format = "%m/%d/%Y"), y = percent_change_next_weeks_price)) +
geom_line() +
geom_point() +
facet_wrap(~stock) +
labs(title = "Percent Change Next Weeks Price")

train_dj %>%
group_by(date) %>%
dplyr::summarize(avg_chg = mean(percent_change_next_weeks_price, na.rm = TRUE)) %>%
ggplot(aes(x = as.Date(date,, format = "%m/%d/%Y"), y =avg_chg)) +
geom_line() +
labs(title = "Average Daily Percent Change - Next Weeks Price")
`summarise()` ungrouping output (override with `.groups` argument)

# correlations with variables that researchers used
# percent_change_price,
# percent_change_volume_over_last_wk, days_to_next_dividend, and percent_return_next_dividend
train_dj_subset =
train_dj %>%
dplyr::select(percent_change_price, percent_change_volume_over_last_wk, previous_weeks_volume, percent_change_next_weeks_price, days_to_next_dividend, percent_return_next_dividend) %>%
filter(!is.na(.))
head(train_dj_subset)
corrplot(
cor(train_dj_subset, use = "pairwise.complete.obs"),
type = "lower",
addCoef.col = "black"
)

Decision Tree Model
Fitting a decision tree model using the subset of data only containing the variables the researchers used
# install and load needed packages
#install.packages("rpart")
library(rpart)
#install.packages('Metrics')
library(Metrics)
#install.packages('randomForest')
library(randomForest)
#install.packages('e1071')
library(e1071)
decision_tree_fit =
rpart(
percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
method = "anova",
data = train_dj_subset
)
# plot the tree
plot(decision_tree_fit,
uniform = TRUE)
text(decision_tree_fit, use.n = TRUE, cex = .6)

dt_pred = predict(decision_tree_fit, test_dj, method = "anova")
rmse(actual = test_dj$percent_change_next_weeks_price, predicted = dt_pred)
[1] 3.144843
SVR
svr_fit = svm( percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
data = train_dj_subset)
svr_pred = predict(svr_fit, test_dj)
rmse(predicted = svr_pred, actual = test_dj$percent_change_next_weeks_price)
[1] 2.945372
Tune SVR
svr_tuned_fit =tune(svm,
percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
data=train_dj_subset,
ranges=list(epsilon=seq(0,1,0.1), cost= 2^(2:9)))
#Print optimum value of parameters
print(svr_tuned_fit)
Parameter tuning of ‘svm’:
- sampling method: 10-fold cross validation
- best parameters:
- best performance: 6.7274
plot(svr_tuned_fit)

NA
NA
NA
best_svr = svr_tuned_fit$best.model
best_svr_pred = predict(best_svr, test_dj)
rmse(predicted = best_svr_pred, actual = test_dj$percent_change_next_weeks_price)
[1] 2.972027
Now going to try the grid search on a smaller set of values since the darkest regions tend to be under cost = 20
svr_tuned_fit2 =tune(svm,
percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
data=train_dj_subset,
ranges=list(epsilon=seq(0,1,0.1), cost= 2^(2:5)))
#Print optimum value of parameters
print(svr_tuned_fit)
Parameter tuning of ‘svm’:
- sampling method: 10-fold cross validation
- best parameters:
- best performance: 6.7274
plot(svr_tuned_fit2)

fit tuned SVR and calculate RMSE
best_svr2 = svr_tuned_fit2$best.model
best_svr_pred2 = predict(best_svr2, test_dj)
rmse(predicted = best_svr_pred2, actual = test_dj$percent_change_next_weeks_price)
[1] 2.972027
Trying the gridsearch on a smaller set of values did not help the RMSE over the first tuned model.
Results
The best performing model when comparing the decision tree fit to the SVR fit is the tuned SVR.
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCmBgYHtyLCB3YXJuaW5nPUZBTFNFLG1lc3NhZ2U9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkocmVhZHhsKQ0KbGlicmFyeShjb3JycGxvdCkNCiNpbnN0YWxsLnBhY2thZ2VzKCJIbWlzYyIpDQpsaWJyYXJ5KEhtaXNjKQ0KDQoNCnNldC5zZWVkKDEyMykNCg0Kc2V0d2QoIkM6L1VzZXJzL0VtaWx5L0Rlc2t0b3AvQnJpdCBTY2hvb2wvMjAyMSBGYWxsL2RvdyBqb25lcyIpDQoNCmBgYA0KDQpgYGB7cn0NCiMgcmVhZCBpbiBkYXRhDQpkb3dfam9uZXMgPSByZWFkLmNzdigiZG93X2pvbmVzX2luZGV4LmRhdGEiKQ0KDQojIHNwbGl0IGludG8gdGVzdCBhbmQgdHJhaW4NCiMjIyBVc2UgcXVhcnRlciAxIChKYW4tTWFyKSBkYXRhIGZvciB0cmFpbmluZyBhbmQgcXVhcnRlciAyIChBcHItSnVuKSBkYXRhIGZvciB0ZXN0aW5nLg0KdHJhaW5fZGogPSBkb3dfam9uZXMgJT4lIGZpbHRlcihxdWFydGVyID09IDEpDQoNCnRlc3RfZGogPSBkb3dfam9uZXMgJT4lIGZpbHRlcihxdWFydGVyID09IDIpDQoNCmhlYWQodHJhaW5fZGopDQpoZWFkKHRlc3RfZGopDQoNCm5yb3codHJhaW5fZGopDQpucm93KHRlc3RfZGopDQoNCnRyYWluX2RqICU+JQ0KICBtdXRhdGUoDQogICAgdGVzdF9kYXRlID0gYXMuRGF0ZShkYXRlLCBmb3JtYXQgPSAiJW0vJWQvJVkiKQ0KICApICU+JQ0KICBncm91cF9ieSh0ZXN0X2RhdGUpICU+JQ0KICBkcGx5cjo6c3VtbWFyaXplKGNvdW50ID0gbigpKSANCg0KDQoNCmBgYA0KDQpgYGB7cn0NCiMgY291bnQgb2YgbnVsbHMgYnkgY29sdW1uDQp0cmFpbl9kaiAlPiUNCiAgc3VtbWFyaXNlX2FsbChmdW5zKHN1bShpcy5uYSguKSkpKQ0KDQoNCmBgYA0KDQoNCg0KYGBge3IsIGZpZy53aWR0aCA9IDEyfQ0KIyB2aXN1YWxpemluZyB0aW1lIGJ5IGVhY2ggc3RvY2sNCnRyYWluX2RqICU+JQ0KICBnZ3Bsb3QoYWVzKHggPSBhcy5EYXRlKGRhdGUsIGZvcm1hdCA9ICIlbS8lZC8lWSIpLCB5ID0gcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSkpICsNCiAgZ2VvbV9saW5lKCkgKw0KICBnZW9tX3BvaW50KCkgKw0KICBmYWNldF93cmFwKH5zdG9jaykgKw0KICBsYWJzKHRpdGxlID0gIlBlcmNlbnQgQ2hhbmdlIE5leHQgV2Vla3MgUHJpY2UiKQ0KYGBgDQoNCmBgYHtyfQ0KdHJhaW5fZGogJT4lDQogIGdyb3VwX2J5KGRhdGUpICU+JQ0KICBkcGx5cjo6c3VtbWFyaXplKGF2Z19jaGcgPSBtZWFuKHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UsIG5hLnJtID0gVFJVRSkpICU+JQ0KICBnZ3Bsb3QoYWVzKHggPSBhcy5EYXRlKGRhdGUsLCBmb3JtYXQgPSAiJW0vJWQvJVkiKSwgeSAgPWF2Z19jaGcpKSArDQogIGdlb21fbGluZSgpICsNCiAgbGFicyh0aXRsZSA9ICJBdmVyYWdlIERhaWx5IFBlcmNlbnQgQ2hhbmdlIC0gTmV4dCBXZWVrcyBQcmljZSIpDQoNCmBgYA0KDQoNCg0KDQpgYGB7cn0NCiMgY29ycmVsYXRpb25zIHdpdGggdmFyaWFibGVzIHRoYXQgcmVzZWFyY2hlcnMgdXNlZA0KIyBwZXJjZW50X2NoYW5nZV9wcmljZSwNCiMgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93aywgZGF5c190b19uZXh0X2RpdmlkZW5kLCBhbmQgcGVyY2VudF9yZXR1cm5fbmV4dF9kaXZpZGVuZA0KDQp0cmFpbl9kal9zdWJzZXQgPSANCnRyYWluX2RqICU+JSANCiAgZHBseXI6OnNlbGVjdChwZXJjZW50X2NoYW5nZV9wcmljZSwgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93aywgcHJldmlvdXNfd2Vla3Nfdm9sdW1lLCBwZXJjZW50X2NoYW5nZV9uZXh0X3dlZWtzX3ByaWNlLCBkYXlzX3RvX25leHRfZGl2aWRlbmQsIHBlcmNlbnRfcmV0dXJuX25leHRfZGl2aWRlbmQpICU+JQ0KICBmaWx0ZXIoIWlzLm5hKC4pKQ0KDQpoZWFkKHRyYWluX2RqX3N1YnNldCkNCg0KY29ycnBsb3QoDQogIGNvcih0cmFpbl9kal9zdWJzZXQsIHVzZSA9ICJwYWlyd2lzZS5jb21wbGV0ZS5vYnMiKSwNCiAgdHlwZSA9ICJsb3dlciIsDQogIGFkZENvZWYuY29sID0gImJsYWNrIg0KKQ0KDQpgYGANCg0KDQoNCiMgRGVjaXNpb24gVHJlZSBNb2RlbA0KRml0dGluZyBhIGRlY2lzaW9uIHRyZWUgbW9kZWwgdXNpbmcgdGhlIHN1YnNldCBvZiBkYXRhIG9ubHkgY29udGFpbmluZyB0aGUgdmFyaWFibGVzIHRoZSByZXNlYXJjaGVycyB1c2VkDQpgYGB7ciwgd2FybmluZz1GQUxTRSwgbWVzc2FnZT1GQUxTRX0NCiMgaW5zdGFsbCBhbmQgbG9hZCBuZWVkZWQgcGFja2FnZXMNCiNpbnN0YWxsLnBhY2thZ2VzKCJycGFydCIpDQpsaWJyYXJ5KHJwYXJ0KQ0KDQojaW5zdGFsbC5wYWNrYWdlcygnTWV0cmljcycpDQpsaWJyYXJ5KE1ldHJpY3MpDQoNCiNpbnN0YWxsLnBhY2thZ2VzKCdyYW5kb21Gb3Jlc3QnKQ0KbGlicmFyeShyYW5kb21Gb3Jlc3QpDQoNCiNpbnN0YWxsLnBhY2thZ2VzKCdlMTA3MScpDQpsaWJyYXJ5KGUxMDcxKQ0KYGBgDQoNCmBgYHtyfQ0KZGVjaXNpb25fdHJlZV9maXQgPSANCiAgcnBhcnQoDQogICAgcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSB+IHBlcmNlbnRfY2hhbmdlX3ByaWNlICsgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93ayArIHByZXZpb3VzX3dlZWtzX3ZvbHVtZSArIGRheXNfdG9fbmV4dF9kaXZpZGVuZCArIHBlcmNlbnRfcmV0dXJuX25leHRfZGl2aWRlbmQsDQogICAgbWV0aG9kID0gImFub3ZhIiwNCiAgICBkYXRhID0gdHJhaW5fZGpfc3Vic2V0DQogICkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBwbG90IHRoZSB0cmVlDQpwbG90KGRlY2lzaW9uX3RyZWVfZml0LA0KICAgICB1bmlmb3JtID0gVFJVRSkNCnRleHQoZGVjaXNpb25fdHJlZV9maXQsIHVzZS5uID0gVFJVRSwgY2V4ID0gLjYpDQpgYGANCg0KYGBge3J9DQpkdF9wcmVkID0gcHJlZGljdChkZWNpc2lvbl90cmVlX2ZpdCwgdGVzdF9kaiwgbWV0aG9kID0gImFub3ZhIikNCnJtc2UoYWN0dWFsID0gdGVzdF9kaiRwZXJjZW50X2NoYW5nZV9uZXh0X3dlZWtzX3ByaWNlLCBwcmVkaWN0ZWQgPSBkdF9wcmVkKQ0KDQpgYGANCg0KDQoNCg0KIyBTVlINCmBgYHtyfQ0Kc3ZyX2ZpdCA9IHN2bSggICAgcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSB+IHBlcmNlbnRfY2hhbmdlX3ByaWNlICsgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93ayArIHByZXZpb3VzX3dlZWtzX3ZvbHVtZSArIGRheXNfdG9fbmV4dF9kaXZpZGVuZCArIHBlcmNlbnRfcmV0dXJuX25leHRfZGl2aWRlbmQsDQogICAgICAgICAgICAgICAgICBkYXRhID0gdHJhaW5fZGpfc3Vic2V0KQ0KYGBgDQoNCg0KDQpgYGB7cn0NCnN2cl9wcmVkID0gcHJlZGljdChzdnJfZml0LCB0ZXN0X2RqKQ0KDQpybXNlKHByZWRpY3RlZCA9IHN2cl9wcmVkLCBhY3R1YWwgPSB0ZXN0X2RqJHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UpDQpgYGANCg0KVHVuZSBTVlINCmBgYHtyfQ0Kc3ZyX3R1bmVkX2ZpdCA9dHVuZShzdm0sDQogICAgICAgICAgICAgICAgICAgIHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UgfiBwZXJjZW50X2NoYW5nZV9wcmljZSArIHBlcmNlbnRfY2hhbmdlX3ZvbHVtZV9vdmVyX2xhc3Rfd2sgKyBwcmV2aW91c193ZWVrc192b2x1bWUgKyBkYXlzX3RvX25leHRfZGl2aWRlbmQgKyBwZXJjZW50X3JldHVybl9uZXh0X2RpdmlkZW5kLA0KICAgICAgICAgICAgICAgICAgICBkYXRhPXRyYWluX2RqX3N1YnNldCwNCiAgICAgICAgICAgICAgICAgICAgcmFuZ2VzPWxpc3QoZXBzaWxvbj1zZXEoMCwxLDAuMSksIGNvc3Q9ICAyXigyOjkpKSkNCg0KI1ByaW50IG9wdGltdW0gdmFsdWUgb2YgcGFyYW1ldGVycw0KcHJpbnQoc3ZyX3R1bmVkX2ZpdCkNCnBsb3Qoc3ZyX3R1bmVkX2ZpdCkNCg0KDQoNCmBgYA0KYGBge3J9DQpiZXN0X3N2ciA9IHN2cl90dW5lZF9maXQkYmVzdC5tb2RlbA0KDQoNCmJlc3Rfc3ZyX3ByZWQgPSBwcmVkaWN0KGJlc3Rfc3ZyLCB0ZXN0X2RqKQ0KDQpybXNlKHByZWRpY3RlZCA9IGJlc3Rfc3ZyX3ByZWQsIGFjdHVhbCA9IHRlc3RfZGokcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSkNCmBgYA0KDQpOb3cgZ29pbmcgdG8gdHJ5IHRoZSBncmlkIHNlYXJjaCBvbiBhIHNtYWxsZXIgc2V0IG9mIHZhbHVlcyBzaW5jZSB0aGUgZGFya2VzdCByZWdpb25zIHRlbmQgdG8gYmUgdW5kZXIgY29zdCA9IDIwDQoNCg0KYGBge3J9DQpzdnJfdHVuZWRfZml0MiA9dHVuZShzdm0sDQogICAgICAgICAgICAgICAgICAgIHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UgfiBwZXJjZW50X2NoYW5nZV9wcmljZSArIHBlcmNlbnRfY2hhbmdlX3ZvbHVtZV9vdmVyX2xhc3Rfd2sgKyBwcmV2aW91c193ZWVrc192b2x1bWUgKyBkYXlzX3RvX25leHRfZGl2aWRlbmQgKyBwZXJjZW50X3JldHVybl9uZXh0X2RpdmlkZW5kLA0KICAgICAgICAgICAgICAgICAgICBkYXRhPXRyYWluX2RqX3N1YnNldCwNCiAgICAgICAgICAgICAgICAgICAgcmFuZ2VzPWxpc3QoZXBzaWxvbj1zZXEoMCwxLDAuMSksIGNvc3Q9ICAyXigyOjUpKSkNCg0KI1ByaW50IG9wdGltdW0gdmFsdWUgb2YgcGFyYW1ldGVycw0KcHJpbnQoc3ZyX3R1bmVkX2ZpdCkNCnBsb3Qoc3ZyX3R1bmVkX2ZpdDIpDQpgYGANCg0KDQoNCmZpdCB0dW5lZCBTVlIgYW5kIGNhbGN1bGF0ZSBSTVNFDQpgYGB7cn0NCmJlc3Rfc3ZyMiA9IHN2cl90dW5lZF9maXQyJGJlc3QubW9kZWwNCg0KDQpiZXN0X3N2cl9wcmVkMiA9IHByZWRpY3QoYmVzdF9zdnIyLCB0ZXN0X2RqKQ0KDQpybXNlKHByZWRpY3RlZCA9IGJlc3Rfc3ZyX3ByZWQyLCBhY3R1YWwgPSB0ZXN0X2RqJHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UpDQpgYGANClRyeWluZyB0aGUgZ3JpZHNlYXJjaCBvbiBhIHNtYWxsZXIgc2V0IG9mIHZhbHVlcyBkaWQgbm90IGhlbHAgdGhlIFJNU0Ugb3ZlciB0aGUgZmlyc3QgdHVuZWQgbW9kZWwuDQoNCiMgUmVzdWx0cw0KDQpUaGUgYmVzdCBwZXJmb3JtaW5nIG1vZGVsIHdoZW4gY29tcGFyaW5nIHRoZSBkZWNpc2lvbiB0cmVlIGZpdCB0byB0aGUgU1ZSIGZpdCBpcyB0aGUgdHVuZWQgU1ZSLiANCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg==