library(tidyverse)
library(readxl)
library(corrplot)
#install.packages("Hmisc")
library(Hmisc)


set.seed(123)

setwd("C:/Users/Emily/Desktop/Brit School/2021 Fall/dow jones")
# read in data
dow_jones = read.csv("dow_jones_index.data")

# split into test and train
### Use quarter 1 (Jan-Mar) data for training and quarter 2 (Apr-Jun) data for testing.
train_dj = dow_jones %>% filter(quarter == 1)

test_dj = dow_jones %>% filter(quarter == 2)

head(train_dj)
head(test_dj)

nrow(train_dj)
[1] 360
nrow(test_dj)
[1] 390
train_dj %>%
  mutate(
    test_date = as.Date(date, format = "%m/%d/%Y")
  ) %>%
  group_by(test_date) %>%
  dplyr::summarize(count = n()) 
`summarise()` ungrouping output (override with `.groups` argument)
# count of nulls by column
train_dj %>%
  summarise_all(funs(sum(is.na(.))))
`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
# visualizing time by each stock
train_dj %>%
  ggplot(aes(x = as.Date(date, format = "%m/%d/%Y"), y = percent_change_next_weeks_price)) +
  geom_line() +
  geom_point() +
  facet_wrap(~stock) +
  labs(title = "Percent Change Next Weeks Price")

train_dj %>%
  group_by(date) %>%
  dplyr::summarize(avg_chg = mean(percent_change_next_weeks_price, na.rm = TRUE)) %>%
  ggplot(aes(x = as.Date(date,, format = "%m/%d/%Y"), y  =avg_chg)) +
  geom_line() +
  labs(title = "Average Daily Percent Change - Next Weeks Price")
`summarise()` ungrouping output (override with `.groups` argument)

# correlations with variables that researchers used
# percent_change_price,
# percent_change_volume_over_last_wk, days_to_next_dividend, and percent_return_next_dividend

train_dj_subset = 
train_dj %>% 
  dplyr::select(percent_change_price, percent_change_volume_over_last_wk, previous_weeks_volume, percent_change_next_weeks_price, days_to_next_dividend, percent_return_next_dividend) %>%
  filter(!is.na(.))

head(train_dj_subset)

corrplot(
  cor(train_dj_subset, use = "pairwise.complete.obs"),
  type = "lower",
  addCoef.col = "black"
)

Decision Tree Model

Fitting a decision tree model using the subset of data only containing the variables the researchers used

# install and load needed packages
#install.packages("rpart")
library(rpart)

#install.packages('Metrics')
library(Metrics)

#install.packages('randomForest')
library(randomForest)

#install.packages('e1071')
library(e1071)
decision_tree_fit = 
  rpart(
    percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
    method = "anova",
    data = train_dj_subset
  )
# plot the tree
plot(decision_tree_fit,
     uniform = TRUE)
text(decision_tree_fit, use.n = TRUE, cex = .6)

dt_pred = predict(decision_tree_fit, test_dj, method = "anova")
rmse(actual = test_dj$percent_change_next_weeks_price, predicted = dt_pred)
[1] 3.144843

SVR

svr_fit = svm(    percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
                  data = train_dj_subset)
svr_pred = predict(svr_fit, test_dj)

rmse(predicted = svr_pred, actual = test_dj$percent_change_next_weeks_price)
[1] 2.945372

Tune SVR

svr_tuned_fit =tune(svm,
                    percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
                    data=train_dj_subset,
                    ranges=list(epsilon=seq(0,1,0.1), cost=  2^(2:9)))

#Print optimum value of parameters
print(svr_tuned_fit)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:

- best performance: 6.7274 
plot(svr_tuned_fit)

NA
NA
NA
best_svr = svr_tuned_fit$best.model


best_svr_pred = predict(best_svr, test_dj)

rmse(predicted = best_svr_pred, actual = test_dj$percent_change_next_weeks_price)
[1] 2.972027

Now going to try the grid search on a smaller set of values since the darkest regions tend to be under cost = 20

svr_tuned_fit2 =tune(svm,
                    percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk + previous_weeks_volume + days_to_next_dividend + percent_return_next_dividend,
                    data=train_dj_subset,
                    ranges=list(epsilon=seq(0,1,0.1), cost=  2^(2:5)))

#Print optimum value of parameters
print(svr_tuned_fit)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:

- best performance: 6.7274 
plot(svr_tuned_fit2)

fit tuned SVR and calculate RMSE

best_svr2 = svr_tuned_fit2$best.model


best_svr_pred2 = predict(best_svr2, test_dj)

rmse(predicted = best_svr_pred2, actual = test_dj$percent_change_next_weeks_price)
[1] 2.972027

Trying the gridsearch on a smaller set of values did not help the RMSE over the first tuned model.

Results

The best performing model when comparing the decision tree fit to the SVR fit is the tuned SVR.

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCmBgYHtyLCB3YXJuaW5nPUZBTFNFLG1lc3NhZ2U9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkocmVhZHhsKQ0KbGlicmFyeShjb3JycGxvdCkNCiNpbnN0YWxsLnBhY2thZ2VzKCJIbWlzYyIpDQpsaWJyYXJ5KEhtaXNjKQ0KDQoNCnNldC5zZWVkKDEyMykNCg0Kc2V0d2QoIkM6L1VzZXJzL0VtaWx5L0Rlc2t0b3AvQnJpdCBTY2hvb2wvMjAyMSBGYWxsL2RvdyBqb25lcyIpDQoNCmBgYA0KDQpgYGB7cn0NCiMgcmVhZCBpbiBkYXRhDQpkb3dfam9uZXMgPSByZWFkLmNzdigiZG93X2pvbmVzX2luZGV4LmRhdGEiKQ0KDQojIHNwbGl0IGludG8gdGVzdCBhbmQgdHJhaW4NCiMjIyBVc2UgcXVhcnRlciAxIChKYW4tTWFyKSBkYXRhIGZvciB0cmFpbmluZyBhbmQgcXVhcnRlciAyIChBcHItSnVuKSBkYXRhIGZvciB0ZXN0aW5nLg0KdHJhaW5fZGogPSBkb3dfam9uZXMgJT4lIGZpbHRlcihxdWFydGVyID09IDEpDQoNCnRlc3RfZGogPSBkb3dfam9uZXMgJT4lIGZpbHRlcihxdWFydGVyID09IDIpDQoNCmhlYWQodHJhaW5fZGopDQpoZWFkKHRlc3RfZGopDQoNCm5yb3codHJhaW5fZGopDQpucm93KHRlc3RfZGopDQoNCnRyYWluX2RqICU+JQ0KICBtdXRhdGUoDQogICAgdGVzdF9kYXRlID0gYXMuRGF0ZShkYXRlLCBmb3JtYXQgPSAiJW0vJWQvJVkiKQ0KICApICU+JQ0KICBncm91cF9ieSh0ZXN0X2RhdGUpICU+JQ0KICBkcGx5cjo6c3VtbWFyaXplKGNvdW50ID0gbigpKSANCg0KDQoNCmBgYA0KDQpgYGB7cn0NCiMgY291bnQgb2YgbnVsbHMgYnkgY29sdW1uDQp0cmFpbl9kaiAlPiUNCiAgc3VtbWFyaXNlX2FsbChmdW5zKHN1bShpcy5uYSguKSkpKQ0KDQoNCmBgYA0KDQoNCg0KYGBge3IsIGZpZy53aWR0aCA9IDEyfQ0KIyB2aXN1YWxpemluZyB0aW1lIGJ5IGVhY2ggc3RvY2sNCnRyYWluX2RqICU+JQ0KICBnZ3Bsb3QoYWVzKHggPSBhcy5EYXRlKGRhdGUsIGZvcm1hdCA9ICIlbS8lZC8lWSIpLCB5ID0gcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSkpICsNCiAgZ2VvbV9saW5lKCkgKw0KICBnZW9tX3BvaW50KCkgKw0KICBmYWNldF93cmFwKH5zdG9jaykgKw0KICBsYWJzKHRpdGxlID0gIlBlcmNlbnQgQ2hhbmdlIE5leHQgV2Vla3MgUHJpY2UiKQ0KYGBgDQoNCmBgYHtyfQ0KdHJhaW5fZGogJT4lDQogIGdyb3VwX2J5KGRhdGUpICU+JQ0KICBkcGx5cjo6c3VtbWFyaXplKGF2Z19jaGcgPSBtZWFuKHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UsIG5hLnJtID0gVFJVRSkpICU+JQ0KICBnZ3Bsb3QoYWVzKHggPSBhcy5EYXRlKGRhdGUsLCBmb3JtYXQgPSAiJW0vJWQvJVkiKSwgeSAgPWF2Z19jaGcpKSArDQogIGdlb21fbGluZSgpICsNCiAgbGFicyh0aXRsZSA9ICJBdmVyYWdlIERhaWx5IFBlcmNlbnQgQ2hhbmdlIC0gTmV4dCBXZWVrcyBQcmljZSIpDQoNCmBgYA0KDQoNCg0KDQpgYGB7cn0NCiMgY29ycmVsYXRpb25zIHdpdGggdmFyaWFibGVzIHRoYXQgcmVzZWFyY2hlcnMgdXNlZA0KIyBwZXJjZW50X2NoYW5nZV9wcmljZSwNCiMgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93aywgZGF5c190b19uZXh0X2RpdmlkZW5kLCBhbmQgcGVyY2VudF9yZXR1cm5fbmV4dF9kaXZpZGVuZA0KDQp0cmFpbl9kal9zdWJzZXQgPSANCnRyYWluX2RqICU+JSANCiAgZHBseXI6OnNlbGVjdChwZXJjZW50X2NoYW5nZV9wcmljZSwgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93aywgcHJldmlvdXNfd2Vla3Nfdm9sdW1lLCBwZXJjZW50X2NoYW5nZV9uZXh0X3dlZWtzX3ByaWNlLCBkYXlzX3RvX25leHRfZGl2aWRlbmQsIHBlcmNlbnRfcmV0dXJuX25leHRfZGl2aWRlbmQpICU+JQ0KICBmaWx0ZXIoIWlzLm5hKC4pKQ0KDQpoZWFkKHRyYWluX2RqX3N1YnNldCkNCg0KY29ycnBsb3QoDQogIGNvcih0cmFpbl9kal9zdWJzZXQsIHVzZSA9ICJwYWlyd2lzZS5jb21wbGV0ZS5vYnMiKSwNCiAgdHlwZSA9ICJsb3dlciIsDQogIGFkZENvZWYuY29sID0gImJsYWNrIg0KKQ0KDQpgYGANCg0KDQoNCiMgRGVjaXNpb24gVHJlZSBNb2RlbA0KRml0dGluZyBhIGRlY2lzaW9uIHRyZWUgbW9kZWwgdXNpbmcgdGhlIHN1YnNldCBvZiBkYXRhIG9ubHkgY29udGFpbmluZyB0aGUgdmFyaWFibGVzIHRoZSByZXNlYXJjaGVycyB1c2VkDQpgYGB7ciwgd2FybmluZz1GQUxTRSwgbWVzc2FnZT1GQUxTRX0NCiMgaW5zdGFsbCBhbmQgbG9hZCBuZWVkZWQgcGFja2FnZXMNCiNpbnN0YWxsLnBhY2thZ2VzKCJycGFydCIpDQpsaWJyYXJ5KHJwYXJ0KQ0KDQojaW5zdGFsbC5wYWNrYWdlcygnTWV0cmljcycpDQpsaWJyYXJ5KE1ldHJpY3MpDQoNCiNpbnN0YWxsLnBhY2thZ2VzKCdyYW5kb21Gb3Jlc3QnKQ0KbGlicmFyeShyYW5kb21Gb3Jlc3QpDQoNCiNpbnN0YWxsLnBhY2thZ2VzKCdlMTA3MScpDQpsaWJyYXJ5KGUxMDcxKQ0KYGBgDQoNCmBgYHtyfQ0KZGVjaXNpb25fdHJlZV9maXQgPSANCiAgcnBhcnQoDQogICAgcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSB+IHBlcmNlbnRfY2hhbmdlX3ByaWNlICsgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93ayArIHByZXZpb3VzX3dlZWtzX3ZvbHVtZSArIGRheXNfdG9fbmV4dF9kaXZpZGVuZCArIHBlcmNlbnRfcmV0dXJuX25leHRfZGl2aWRlbmQsDQogICAgbWV0aG9kID0gImFub3ZhIiwNCiAgICBkYXRhID0gdHJhaW5fZGpfc3Vic2V0DQogICkNCmBgYA0KDQoNCmBgYHtyfQ0KIyBwbG90IHRoZSB0cmVlDQpwbG90KGRlY2lzaW9uX3RyZWVfZml0LA0KICAgICB1bmlmb3JtID0gVFJVRSkNCnRleHQoZGVjaXNpb25fdHJlZV9maXQsIHVzZS5uID0gVFJVRSwgY2V4ID0gLjYpDQpgYGANCg0KYGBge3J9DQpkdF9wcmVkID0gcHJlZGljdChkZWNpc2lvbl90cmVlX2ZpdCwgdGVzdF9kaiwgbWV0aG9kID0gImFub3ZhIikNCnJtc2UoYWN0dWFsID0gdGVzdF9kaiRwZXJjZW50X2NoYW5nZV9uZXh0X3dlZWtzX3ByaWNlLCBwcmVkaWN0ZWQgPSBkdF9wcmVkKQ0KDQpgYGANCg0KDQoNCg0KIyBTVlINCmBgYHtyfQ0Kc3ZyX2ZpdCA9IHN2bSggICAgcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSB+IHBlcmNlbnRfY2hhbmdlX3ByaWNlICsgcGVyY2VudF9jaGFuZ2Vfdm9sdW1lX292ZXJfbGFzdF93ayArIHByZXZpb3VzX3dlZWtzX3ZvbHVtZSArIGRheXNfdG9fbmV4dF9kaXZpZGVuZCArIHBlcmNlbnRfcmV0dXJuX25leHRfZGl2aWRlbmQsDQogICAgICAgICAgICAgICAgICBkYXRhID0gdHJhaW5fZGpfc3Vic2V0KQ0KYGBgDQoNCg0KDQpgYGB7cn0NCnN2cl9wcmVkID0gcHJlZGljdChzdnJfZml0LCB0ZXN0X2RqKQ0KDQpybXNlKHByZWRpY3RlZCA9IHN2cl9wcmVkLCBhY3R1YWwgPSB0ZXN0X2RqJHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UpDQpgYGANCg0KVHVuZSBTVlINCmBgYHtyfQ0Kc3ZyX3R1bmVkX2ZpdCA9dHVuZShzdm0sDQogICAgICAgICAgICAgICAgICAgIHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UgfiBwZXJjZW50X2NoYW5nZV9wcmljZSArIHBlcmNlbnRfY2hhbmdlX3ZvbHVtZV9vdmVyX2xhc3Rfd2sgKyBwcmV2aW91c193ZWVrc192b2x1bWUgKyBkYXlzX3RvX25leHRfZGl2aWRlbmQgKyBwZXJjZW50X3JldHVybl9uZXh0X2RpdmlkZW5kLA0KICAgICAgICAgICAgICAgICAgICBkYXRhPXRyYWluX2RqX3N1YnNldCwNCiAgICAgICAgICAgICAgICAgICAgcmFuZ2VzPWxpc3QoZXBzaWxvbj1zZXEoMCwxLDAuMSksIGNvc3Q9ICAyXigyOjkpKSkNCg0KI1ByaW50IG9wdGltdW0gdmFsdWUgb2YgcGFyYW1ldGVycw0KcHJpbnQoc3ZyX3R1bmVkX2ZpdCkNCnBsb3Qoc3ZyX3R1bmVkX2ZpdCkNCg0KDQoNCmBgYA0KYGBge3J9DQpiZXN0X3N2ciA9IHN2cl90dW5lZF9maXQkYmVzdC5tb2RlbA0KDQoNCmJlc3Rfc3ZyX3ByZWQgPSBwcmVkaWN0KGJlc3Rfc3ZyLCB0ZXN0X2RqKQ0KDQpybXNlKHByZWRpY3RlZCA9IGJlc3Rfc3ZyX3ByZWQsIGFjdHVhbCA9IHRlc3RfZGokcGVyY2VudF9jaGFuZ2VfbmV4dF93ZWVrc19wcmljZSkNCmBgYA0KDQpOb3cgZ29pbmcgdG8gdHJ5IHRoZSBncmlkIHNlYXJjaCBvbiBhIHNtYWxsZXIgc2V0IG9mIHZhbHVlcyBzaW5jZSB0aGUgZGFya2VzdCByZWdpb25zIHRlbmQgdG8gYmUgdW5kZXIgY29zdCA9IDIwDQoNCg0KYGBge3J9DQpzdnJfdHVuZWRfZml0MiA9dHVuZShzdm0sDQogICAgICAgICAgICAgICAgICAgIHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UgfiBwZXJjZW50X2NoYW5nZV9wcmljZSArIHBlcmNlbnRfY2hhbmdlX3ZvbHVtZV9vdmVyX2xhc3Rfd2sgKyBwcmV2aW91c193ZWVrc192b2x1bWUgKyBkYXlzX3RvX25leHRfZGl2aWRlbmQgKyBwZXJjZW50X3JldHVybl9uZXh0X2RpdmlkZW5kLA0KICAgICAgICAgICAgICAgICAgICBkYXRhPXRyYWluX2RqX3N1YnNldCwNCiAgICAgICAgICAgICAgICAgICAgcmFuZ2VzPWxpc3QoZXBzaWxvbj1zZXEoMCwxLDAuMSksIGNvc3Q9ICAyXigyOjUpKSkNCg0KI1ByaW50IG9wdGltdW0gdmFsdWUgb2YgcGFyYW1ldGVycw0KcHJpbnQoc3ZyX3R1bmVkX2ZpdCkNCnBsb3Qoc3ZyX3R1bmVkX2ZpdDIpDQpgYGANCg0KDQoNCmZpdCB0dW5lZCBTVlIgYW5kIGNhbGN1bGF0ZSBSTVNFDQpgYGB7cn0NCmJlc3Rfc3ZyMiA9IHN2cl90dW5lZF9maXQyJGJlc3QubW9kZWwNCg0KDQpiZXN0X3N2cl9wcmVkMiA9IHByZWRpY3QoYmVzdF9zdnIyLCB0ZXN0X2RqKQ0KDQpybXNlKHByZWRpY3RlZCA9IGJlc3Rfc3ZyX3ByZWQyLCBhY3R1YWwgPSB0ZXN0X2RqJHBlcmNlbnRfY2hhbmdlX25leHRfd2Vla3NfcHJpY2UpDQpgYGANClRyeWluZyB0aGUgZ3JpZHNlYXJjaCBvbiBhIHNtYWxsZXIgc2V0IG9mIHZhbHVlcyBkaWQgbm90IGhlbHAgdGhlIFJNU0Ugb3ZlciB0aGUgZmlyc3QgdHVuZWQgbW9kZWwuDQoNCiMgUmVzdWx0cw0KDQpUaGUgYmVzdCBwZXJmb3JtaW5nIG1vZGVsIHdoZW4gY29tcGFyaW5nIHRoZSBkZWNpc2lvbiB0cmVlIGZpdCB0byB0aGUgU1ZSIGZpdCBpcyB0aGUgdHVuZWQgU1ZSLiANCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg==