DATA

dowjones = read.table("dow_jones_index.data", sep = ",", header = TRUE)
str(dowjones)
## 'data.frame':    750 obs. of  16 variables:
##  $ quarter                           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stock                             : chr  "AA" "AA" "AA" "AA" ...
##  $ date                              : chr  "1/7/2011" "1/14/2011" "1/21/2011" "1/28/2011" ...
##  $ open                              : chr  "$15.82" "$16.71" "$16.19" "$15.87" ...
##  $ high                              : chr  "$16.72" "$16.71" "$16.38" "$16.63" ...
##  $ low                               : chr  "$15.78" "$15.64" "$15.60" "$15.82" ...
##  $ close                             : chr  "$16.42" "$15.97" "$15.79" "$16.13" ...
##  $ volume                            : int  239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 114332562 ...
##  $ percent_change_price              : num  3.79 -4.43 -2.47 1.64 5.93 ...
##  $ percent_change_volume_over_last_wk: num  NA 1.38 -43.02 9.36 1.99 ...
##  $ previous_weeks_volume             : int  NA 239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 ...
##  $ next_weeks_open                   : chr  "$16.71" "$16.19" "$15.87" "$16.18" ...
##  $ next_weeks_close                  : chr  "$15.97" "$15.79" "$16.13" "$17.14" ...
##  $ percent_change_next_weeks_price   : num  -4.428 -2.471 1.638 5.933 0.231 ...
##  $ days_to_next_dividend             : int  26 19 12 5 97 90 83 76 69 62 ...
##  $ percent_return_next_dividend      : num  0.183 0.188 0.19 0.186 0.175 ...



a) Missing data

Find missing values

colSums(is.na(dowjones))
##                            quarter                              stock 
##                                  0                                  0 
##                               date                               open 
##                                  0                                  0 
##                               high                                low 
##                                  0                                  0 
##                              close                             volume 
##                                  0                                  0 
##               percent_change_price percent_change_volume_over_last_wk 
##                                  0                                 30 
##              previous_weeks_volume                    next_weeks_open 
##                                 30                                  0 
##                   next_weeks_close    percent_change_next_weeks_price 
##                                  0                                  0 
##              days_to_next_dividend       percent_return_next_dividend 
##                                  0                                  0

Those missing values correspond to the variables that have past observations “previous_weeks_volume”, so the first line has no data, because it has nothing to refer to, therefore it is an NA. We will fill in those empty observations with the mean value, so that our future analysis doesn’t get affected by it.

dowjones = dowjones %>% 
  group_by(stock) %>%
  mutate(percent_change_volume_over_last_wk = ifelse(is.na(percent_change_volume_over_last_wk),
                                                     mean(percent_change_volume_over_last_wk,
                                                     na.rm=TRUE),
                                                     percent_change_volume_over_last_wk),
         previous_weeks_volume = ifelse(is.na(previous_weeks_volume),
                                        mean(previous_weeks_volume, na.rm=TRUE), 
                                        previous_weeks_volume)) %>% 
  ungroup()


#dowjones$previous_weeks_volume[is.na(dowjones$previous_weeks_volume)] = mean(dowjones$previous_weeks_volume, na.rm = T)
#dowjones$percent_change_volume_over_last_wk[is.na(dowjones$percent_change_volume_over_last_wk)] = mean(dowjones$percent_change_volume_over_last_wk, na.rm = T)
colSums(is.na(dowjones))
##                            quarter                              stock 
##                                  0                                  0 
##                               date                               open 
##                                  0                                  0 
##                               high                                low 
##                                  0                                  0 
##                              close                             volume 
##                                  0                                  0 
##               percent_change_price percent_change_volume_over_last_wk 
##                                  0                                  0 
##              previous_weeks_volume                    next_weeks_open 
##                                  0                                  0 
##                   next_weeks_close    percent_change_next_weeks_price 
##                                  0                                  0 
##              days_to_next_dividend       percent_return_next_dividend 
##                                  0                                  0

b) Cleaning the data

We will change character variables to numeric by removing the $ sign.

dowjones$open = parse_number(dowjones$open)
dowjones$high = parse_number(dowjones$high)
dowjones$close = parse_number(dowjones$close)
dowjones$low = parse_number(dowjones$low)
dowjones$next_weeks_open = parse_number(dowjones$next_weeks_open)
dowjones$next_weeks_close = parse_number(dowjones$next_weeks_close)

Same with the date variable, we need to assign the proper date variable.

dowjones$date = as.Date(dowjones$date, "%m/%d/%Y")
dowjones$stock = as.factor(dowjones$stock)
str(dowjones)
## tibble [750 x 16] (S3: tbl_df/tbl/data.frame)
##  $ quarter                           : int [1:750] 1 1 1 1 1 1 1 1 1 1 ...
##  $ stock                             : Factor w/ 30 levels "AA","AXP","BA",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date                              : Date[1:750], format: "2011-01-07" "2011-01-14" ...
##  $ open                              : num [1:750] 15.8 16.7 16.2 15.9 16.2 ...
##  $ high                              : num [1:750] 16.7 16.7 16.4 16.6 17.4 ...
##  $ low                               : num [1:750] 15.8 15.6 15.6 15.8 16.2 ...
##  $ close                             : num [1:750] 16.4 16 15.8 16.1 17.1 ...
##  $ volume                            : int [1:750] 239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 114332562 ...
##  $ percent_change_price              : num [1:750] 3.79 -4.43 -2.47 1.64 5.93 ...
##  $ percent_change_volume_over_last_wk: num [1:750] 4.03 1.38 -43.02 9.36 1.99 ...
##  $ previous_weeks_volume             : num [1:750] 1.31e+08 2.40e+08 2.43e+08 1.38e+08 1.51e+08 ...
##  $ next_weeks_open                   : num [1:750] 16.7 16.2 15.9 16.2 17.3 ...
##  $ next_weeks_close                  : num [1:750] 16 15.8 16.1 17.1 17.4 ...
##  $ percent_change_next_weeks_price   : num [1:750] -4.428 -2.471 1.638 5.933 0.231 ...
##  $ days_to_next_dividend             : int [1:750] 26 19 12 5 97 90 83 76 69 62 ...
##  $ percent_return_next_dividend      : num [1:750] 0.183 0.188 0.19 0.186 0.175 ...

c) Exloratory Data analysis

Scatter Plot

We will looks at the Scatter plot to have an idea of the variables that might be strongly correlated to each other.

pairs(dowjones)

The ones we can see strongly correlated are: Open, High, Low, Close, next_weeks_open, next_weeks_close.

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
corrplot(cor(dowjones[,-(1:3)]))

ggplot(dowjones, aes(x= date, y = percent_change_next_weeks_price, colour = stock)) +
  geom_line() + theme_bw() + labs(title = "Per Price Change Next Week", x = "Date", y= "quarterly Returns", subtitle = "") +
  scale_x_date(breaks = seq(as.Date("2011-01-07"), as.Date("2011-06-24"), by = "week"), minor_breaks = NULL) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

d) Selecting the variables

Full model

Full model (our full model explains 72.4% of percent_change_next_weeks_price), however, this is cheating, since our model includes “next_weeks_open” and “next_weeks_close” which perfectly predicts the price, and therefore the percent_change_next_weeks_price.

model_full = lm(percent_change_next_weeks_price ~ ., data = dowjones )
summary(model_full)$r.squared
## [1] 0.7235275

When we delete these terms from the model, the r-square drops tremendously to 13.37%

model_lm1 = lm(percent_change_next_weeks_price ~ . - next_weeks_open - next_weeks_close, data = dowjones )
summary(model_lm1)$r.squared
## [1] 0.1328077
Model selection using leaps

In here we tried multiple things: for sure deleting the “next_weeks_open” and “next_weeks_close” variables. This model will only bring that the significant variable is: previous_weeks_volume (due to the noise of the stocks variable seen as a variable)

set.seed(123)
train.control = trainControl(method="cv", number = 10)
step.model = train(percent_change_next_weeks_price ~ .- next_weeks_close - next_weeks_close, data = dowjones,
                   method = "leapSeq",
                   tuneGrid = data.frame(nvmax = 1:5),
                   trControl = train.control)

step.model$results
summary(step.model$finalModel)
## Subset selection object
## 42 Variables  (and intercept)
##                                    Forced in Forced out
## quarter                                FALSE      FALSE
## stockAXP                               FALSE      FALSE
## stockBA                                FALSE      FALSE
## stockBAC                               FALSE      FALSE
## stockCAT                               FALSE      FALSE
## stockCSCO                              FALSE      FALSE
## stockCVX                               FALSE      FALSE
## stockDD                                FALSE      FALSE
## stockDIS                               FALSE      FALSE
## stockGE                                FALSE      FALSE
## stockHD                                FALSE      FALSE
## stockHPQ                               FALSE      FALSE
## stockIBM                               FALSE      FALSE
## stockINTC                              FALSE      FALSE
## stockJNJ                               FALSE      FALSE
## stockJPM                               FALSE      FALSE
## stockKO                                FALSE      FALSE
## stockKRFT                              FALSE      FALSE
## stockMCD                               FALSE      FALSE
## stockMMM                               FALSE      FALSE
## stockMRK                               FALSE      FALSE
## stockMSFT                              FALSE      FALSE
## stockPFE                               FALSE      FALSE
## stockPG                                FALSE      FALSE
## stockT                                 FALSE      FALSE
## stockTRV                               FALSE      FALSE
## stockUTX                               FALSE      FALSE
## stockVZ                                FALSE      FALSE
## stockWMT                               FALSE      FALSE
## stockXOM                               FALSE      FALSE
## date                                   FALSE      FALSE
## open                                   FALSE      FALSE
## high                                   FALSE      FALSE
## low                                    FALSE      FALSE
## close                                  FALSE      FALSE
## volume                                 FALSE      FALSE
## percent_change_price                   FALSE      FALSE
## percent_change_volume_over_last_wk     FALSE      FALSE
## previous_weeks_volume                  FALSE      FALSE
## next_weeks_open                        FALSE      FALSE
## days_to_next_dividend                  FALSE      FALSE
## percent_return_next_dividend           FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: 'sequential replacement'
##          quarter stockAXP stockBA stockBAC stockCAT stockCSCO stockCVX stockDD
## 1  ( 1 ) " "     " "      " "     " "      " "      "*"       " "      " "    
## 2  ( 1 ) " "     " "      " "     "*"      " "      "*"       " "      " "    
## 3  ( 1 ) " "     " "      " "     "*"      " "      "*"       " "      " "    
## 4  ( 1 ) " "     " "      " "     " "      " "      " "       " "      " "    
## 5  ( 1 ) " "     "*"      " "     " "      " "      " "       " "      " "    
##          stockDIS stockGE stockHD stockHPQ stockIBM stockINTC stockJNJ stockJPM
## 1  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
## 2  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
## 3  ( 1 ) " "      " "     " "     "*"      " "      " "       " "      " "     
## 4  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
## 5  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
##          stockKO stockKRFT stockMCD stockMMM stockMRK stockMSFT stockPFE
## 1  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 2  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 3  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 4  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 5  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
##          stockPG stockT stockTRV stockUTX stockVZ stockWMT stockXOM date open
## 1  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  " " 
## 2  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  " " 
## 3  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  " " 
## 4  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  "*" 
## 5  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  "*" 
##          high low close volume percent_change_price
## 1  ( 1 ) " "  " " " "   " "    " "                 
## 2  ( 1 ) " "  " " " "   " "    " "                 
## 3  ( 1 ) " "  " " " "   " "    " "                 
## 4  ( 1 ) "*"  " " " "   " "    " "                 
## 5  ( 1 ) "*"  " " " "   " "    " "                 
##          percent_change_volume_over_last_wk previous_weeks_volume
## 1  ( 1 ) " "                                " "                  
## 2  ( 1 ) " "                                " "                  
## 3  ( 1 ) " "                                " "                  
## 4  ( 1 ) " "                                " "                  
## 5  ( 1 ) " "                                " "                  
##          next_weeks_open days_to_next_dividend percent_return_next_dividend
## 1  ( 1 ) " "             " "                   " "                         
## 2  ( 1 ) " "             " "                   " "                         
## 3  ( 1 ) " "             " "                   " "                         
## 4  ( 1 ) "*"             " "                   "*"                         
## 5  ( 1 ) "*"             " "                   "*"

When running the same model but now without stocks, we can find the other variables that could help us. However, our Rsquared decreases all the way to 3%!

set.seed(123)
train.control = trainControl(method="cv", number = 10)
step.model = train(percent_change_next_weeks_price ~ .- next_weeks_close - next_weeks_close - stock, data = dowjones,
                   method = "leapSeq",
                   tuneGrid = data.frame(nvmax = 1:5),
                   trControl = train.control)

step.model$results
summary(step.model$finalModel)
## Subset selection object
## 13 Variables  (and intercept)
##                                    Forced in Forced out
## quarter                                FALSE      FALSE
## date                                   FALSE      FALSE
## open                                   FALSE      FALSE
## high                                   FALSE      FALSE
## low                                    FALSE      FALSE
## close                                  FALSE      FALSE
## volume                                 FALSE      FALSE
## percent_change_price                   FALSE      FALSE
## percent_change_volume_over_last_wk     FALSE      FALSE
## previous_weeks_volume                  FALSE      FALSE
## next_weeks_open                        FALSE      FALSE
## days_to_next_dividend                  FALSE      FALSE
## percent_return_next_dividend           FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: 'sequential replacement'
##          quarter date open high low close volume percent_change_price
## 1  ( 1 ) " "     " "  " "  " "  " " " "   " "    " "                 
## 2  ( 1 ) " "     " "  "*"  "*"  " " " "   " "    " "                 
## 3  ( 1 ) "*"     "*"  "*"  " "  " " " "   " "    " "                 
## 4  ( 1 ) " "     " "  "*"  "*"  " " " "   " "    " "                 
## 5  ( 1 ) " "     " "  "*"  "*"  " " "*"   " "    " "                 
##          percent_change_volume_over_last_wk previous_weeks_volume
## 1  ( 1 ) " "                                "*"                  
## 2  ( 1 ) " "                                " "                  
## 3  ( 1 ) " "                                " "                  
## 4  ( 1 ) " "                                " "                  
## 5  ( 1 ) " "                                " "                  
##          next_weeks_open days_to_next_dividend percent_return_next_dividend
## 1  ( 1 ) " "             " "                   " "                         
## 2  ( 1 ) " "             " "                   " "                         
## 3  ( 1 ) " "             " "                   " "                         
## 4  ( 1 ) "*"             " "                   "*"                         
## 5  ( 1 ) "*"             " "                   "*"

Here we can see the following possible models: (a) open + high, (b) quarter + date + open, (c) open + high + next_weeks_open + percent_return_next_dividend. Being (c) the model with the 2.81% Rsquared.

Model Selection using VIF cuttoff

Here, you can’t see the process we went through because we did a manual VIF cutoff, but we tried with all the variables and getting rid of them, until we have the variables that satisfy our VIF cutoff (< 8.00)

model = lm(percent_change_next_weeks_price ~ percent_return_next_dividend  + volume + previous_weeks_volume  + days_to_next_dividend, data = dowjones )
summary(model)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ percent_return_next_dividend + 
##     volume + previous_weeks_volume + days_to_next_dividend, data = dowjones)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2112  -1.5600  -0.0927   1.5783   9.7029 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                   4.509e-02  2.930e-01   0.154   0.8778  
## percent_return_next_dividend  6.024e-01  3.343e-01   1.802   0.0720 .
## volume                        1.677e-09  1.295e-09   1.295   0.1958  
## previous_weeks_volume        -2.667e-09  1.293e-09  -2.063   0.0395 *
## days_to_next_dividend        -2.044e-03  2.120e-03  -0.964   0.3352  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.667 on 745 degrees of freedom
## Multiple R-squared:  0.01469,    Adjusted R-squared:  0.009401 
## F-statistic: 2.777 on 4 and 745 DF,  p-value: 0.02611
vif(model)
## percent_return_next_dividend                       volume 
##                     1.098254                     4.433020 
##        previous_weeks_volume        days_to_next_dividend 
##                     4.436957                     1.015691
model2 = lm(percent_change_next_weeks_price ~ percent_change_price + percent_change_volume_over_last_wk+ + percent_return_next_dividend  + volume + previous_weeks_volume  + days_to_next_dividend, data = dowjones )
summary(model2)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ percent_change_price + 
##     percent_change_volume_over_last_wk + +percent_return_next_dividend + 
##     volume + previous_weeks_volume + days_to_next_dividend, data = dowjones)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3447  -1.5159  -0.0915   1.5733   9.7191 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                         5.651e-02  2.943e-01   0.192   0.8478  
## percent_change_price                2.104e-02  4.000e-02   0.526   0.5991  
## percent_change_volume_over_last_wk -2.298e-03  3.122e-03  -0.736   0.4620  
## percent_return_next_dividend        5.979e-01  3.346e-01   1.787   0.0743 .
## volume                              2.524e-09  1.615e-09   1.563   0.1185  
## previous_weeks_volume              -3.468e-09  1.601e-09  -2.166   0.0306 *
## days_to_next_dividend              -2.083e-03  2.122e-03  -0.982   0.3266  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.669 on 743 degrees of freedom
## Multiple R-squared:  0.01597,    Adjusted R-squared:  0.00802 
## F-statistic: 2.009 on 6 and 743 DF,  p-value: 0.06218
Lag for close
lag.plot(dowjones$close, pch = ".", set.lags = 1:4)

Lag for volume
lag.plot(dowjones$volume, pch = ".", set.lags = 1:4)

  • We can see the first lag provides the best correlation and hence we will use lag = 1 for our modeling.
Lag for percent_change_next_weeks_price
lag.plot(dowjones$percent_change_next_weeks_price, pch = ".", set.lags = 1:4)

Procedure

Create new lags
dowjones_lag = dowjones %>% 
  group_by(stock) %>% 
  mutate(close_lag = lag(close, n = 1), 
         volume_lag = lag(volume, n = 1),
         percent_change_next_weeks_price_lag = lag(percent_change_next_weeks_price, n = 1)) %>% 
  ungroup()

head(dowjones_lag)
Remove the NAs from the first obsevation in lagged variables
dowjones_lag = dowjones_lag %>% 
  group_by(stock) %>%
  mutate(close_lag = ifelse(is.na(close_lag), mean(close_lag, na.rm=T), close_lag),
         volume_lag = ifelse(is.na(volume_lag), mean(volume_lag, na.rm=T), volume_lag),
         percent_change_next_weeks_price_lag = ifelse(is.na(percent_change_next_weeks_price_lag),
                                             mean(percent_change_next_weeks_price_lag, na.rm=T),
                                             percent_change_next_weeks_price_lag)) %>% 
  ungroup()
par(mfrow = c(1,2))
plot(dowjones_lag$close_lag,dowjones_lag$percent_change_next_weeks_price)
plot(dowjones_lag$volume_lag,dowjones_lag$percent_change_next_weeks_price)

model3 = lm(percent_change_next_weeks_price ~ stock+ percent_change_next_weeks_price_lag +
               close_lag + volume_lag + percent_change_volume_over_last_wk, 
               data = dowjones_lag )
summary(model3)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + percent_change_next_weeks_price_lag + 
##     close_lag + volume_lag + percent_change_volume_over_last_wk, 
##     data = dowjones_lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.7899  -1.4073  -0.1243   1.4309   8.9183 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          3.857e+00  8.355e-01   4.617 4.62e-06 ***
## stockAXP                             9.212e+00  1.355e+00   6.797 2.25e-11 ***
## stockBA                              1.582e+01  2.251e+00   7.029 4.86e-12 ***
## stockBAC                            -3.137e+00  1.255e+00  -2.499   0.0127 *  
## stockCAT                             2.401e+01  3.312e+00   7.249 1.09e-12 ***
## stockCSCO                           -1.133e+00  8.382e-01  -1.352   0.1769    
## stockCVX                             2.358e+01  3.243e+00   7.271 9.38e-13 ***
## stockDD                              1.073e+01  1.554e+00   6.903 1.12e-11 ***
## stockDIS                             7.220e+00  1.192e+00   6.057 2.24e-09 ***
## stockGE                              7.703e-01  7.812e-01   0.986   0.3245    
## stockHD                              6.146e+00  1.062e+00   5.788 1.07e-08 ***
## stockHPQ                             6.391e+00  1.192e+00   5.361 1.12e-07 ***
## stockIBM                             4.018e+01  5.499e+00   7.307 7.30e-13 ***
## stockINTC                            1.625e+00  8.123e-01   2.000   0.0459 *  
## stockJNJ                             1.302e+01  1.869e+00   6.965 7.42e-12 ***
## stockJPM                             7.717e+00  1.282e+00   6.018 2.82e-09 ***
## stockKO                              1.387e+01  1.968e+00   7.047 4.30e-12 ***
## stockKRFT                            5.406e+00  9.633e-01   5.612 2.86e-08 ***
## stockMCD                             1.710e+01  2.377e+00   7.193 1.61e-12 ***
## stockMMM                             2.094e+01  2.898e+00   7.225 1.28e-12 ***
## stockMRK                             5.028e+00  9.960e-01   5.048 5.67e-07 ***
## stockMSFT                            2.009e+00  8.612e-01   2.332   0.0200 *  
## stockPFE                             1.443e+00  7.671e-01   1.880   0.0605 .  
## stockPG                              1.309e+01  1.920e+00   6.821 1.93e-11 ***
## stockT                               3.979e+00  8.860e-01   4.491 8.24e-06 ***
## stockTRV                             1.232e+01  1.765e+00   6.979 6.77e-12 ***
## stockUTX                             1.903e+01  2.624e+00   7.251 1.07e-12 ***
## stockVZ                              5.712e+00  1.051e+00   5.437 7.45e-08 ***
## stockWMT                             1.052e+01  1.582e+00   6.647 5.95e-11 ***
## stockXOM                             1.818e+01  2.557e+00   7.110 2.81e-12 ***
## percent_change_next_weeks_price_lag -8.124e-02  4.185e-02  -1.941   0.0526 .  
## close_lag                           -2.655e-01  3.712e-02  -7.152 2.12e-12 ***
## volume_lag                           2.481e-09  1.691e-09   1.467   0.1429    
## percent_change_volume_over_last_wk   2.179e-03  2.616e-03   0.833   0.4051    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.607 on 716 degrees of freedom
## Multiple R-squared:  0.09496,    Adjusted R-squared:  0.05325 
## F-statistic: 2.277 on 33 and 716 DF,  p-value: 7.582e-05
Split
train=(dowjones$quarter == 1)
dowj_train= dowjones_lag[train ,]
dowj_test= dowjones_lag[!train ,]
stocks = (unique(dowj_train$stock))   # list of stocks
rmse = rep(NA, length(stocks))        # store root mean square error for each stock

dates = unique(dowj_test$date)        # list of dates for predictions
Preds <- rep(NA, length(stocks))                                    # list of dates for predictions

lm_predictions = data.frame(matrix(NA, ncol = 30, nrow = 13)) # df to store prediction of 30 stocks, 13 weeks
colnames(lm_predictions)= stocks

#Preds_lmodel <- rep(NA, length(stocks))                                    # list of dates for predictions
#lm_predictions <- data.frame(dates,stocks, Preds_lmodel)                        # df to store prediction of 30 stocks


lm_pred_metrics = data.frame(Stock = stocks, rmse = rmse)   # df to store metrics

for(i in 1:length(stocks)){
  
  stock_train = subset(dowj_train, stock == stocks[i])      # train set for each stocks
  stock_test = subset(dowj_test, stock == stocks[i])        # test set for each stocks
                                                            # fit linear regression for each stocks
  lm_fit = lm(percent_change_next_weeks_price ~  volume_lag + close + high + low,  
               data = stock_train) 
  
  lm_preds <-  predict(lm_fit, stock_test)                          # make predictions
  lm_predictions[i] <- lm_preds                   # store predictions for each week for each stocks
  
  lm_rmse = rmse(stock_test$percent_change_next_weeks_price, lm_preds)   # compute errors
  lm_pred_metrics[i,"rmse"] = lm_rmse                                    # store rmses

}
lm_predictions = data.frame(dates,lm_predictions)                    # store predictions 
dt_pred_metrics = data.frame(Stock = stocks, rmse = rmse)           # df to store metrics

for(i in 1:length(stocks)){

  stock_train = subset(dowj_train, stock == stocks[i])              # train set for each stocks
  stock_test = subset(dowj_test, stock == stocks[i])                # test set for each stocks
                                                                    # fit decision tree for each stocks 
  dt_model = tree(percent_change_next_weeks_price ~ volume_lag + close + high + low, 
                  data = stock_train)  

  dt_preds = predict(dt_model, newdata = stock_test)                    # make predictions
  
  dt_rmse = rmse(stock_test$percent_change_next_weeks_price, dt_preds)  # compute errors
  dt_pred_metrics[i, "rmse"] = dt_rmse                                  # store rmse
}
svm_pred_metrics <- data.frame(stocks, rmse)                        # df to store metrics

svm_predictions = data.frame(matrix(NA, ncol = 30, nrow = 13))      # df to store prediction of 30 stocks,                                                                       13 weeks
colnames(svm_predictions)= stocks


for(i in 1:length(stocks)){
  
  stock_train = subset(dowj_train, stock == stocks[i])              # train set for each stocks
  stock_test = subset(dowj_test, stock == stocks[i])                # test set for each stocks  
  set.seed(1)
                                                                    # fit decision tree for each stocks
  tuned <- tune.svm(percent_change_next_weeks_price ~ volume + close + low + high + open, 
                    data = stock_train,  gamma = seq(0.1, 1, by = 0.1), 
                    cost = seq(0.1,1, by = 0.1), scale=TRUE)
  
  svm_fit <- svm(percent_change_next_weeks_price ~ volume + close + low + high + open, 
                   data = stock_train,  gamma = tuned$best.parameters$gamma,
                   cost = tuned$best.parameters$cost, scale=TRUE) 

  svm_preds <- predict(svm_fit, stock_test)                         # make prediction
  svm_predictions[i] <- svm_preds                      # store decision tree predictions
  
  svm_rmse <- rmse(stock_test$percent_change_next_weeks_price, svm_preds)    # compute errors
  svm_pred_metrics[i, "rmse"] <- svm_rmse                                    # store rmse
}

Compare the RMSE of the three ML models

models_rmse = data.frame(stocks, lm_pred_metrics$rmse, dt_pred_metrics$rmse, svm_pred_metrics$rmse)
colnames(models_rmse)= c("stock", "lm_rmse", "dt_rmse", "svm_rmse")
colMeans(models_rmse[,2:4])
##  lm_rmse  dt_rmse svm_rmse 
## 4.917591 3.022875 2.930730
models_rmse %>% summarise(across(where(is.numeric), mean))

For CAPM

calculate return in percentage change

# Compute Dow's values from the 30 stocks included in the dow industrial average using the Dow divisor for quarter 2

dowj <- aggregate(dowj_test$close, by = list(dowj_test$date), FUN = function(x) sum(x)/0.132)   
return_dow <- na.omit(Delt(dowj[,2]))

return_stocks <- data.frame(matrix(0.0, ncol = 30, nrow = 12))                        # df to store return
colnames(return_stocks) = c("AA", "AXP", "BA", "BAC", "CAT", "CSCO", "CVX", "DD", 
                            "DIS", "GE", "HD", "HPQ", "IBM", "INTC", "JNJ", "JPM", 
                            "KRFT", "KO", "MCD", "MMM", "MRK", "MSFT", "PFE", "PG", 
                            "T", "TRV", "UTX", "VZ", "WMT", "XOM")

all_Stocks =svm_predictions %>% pivot_longer(cols = 1:30,         # represent prediction in long format
                      names_to = "stock", values_to = "return")

for(i in 1:length(stocks)){                                       # compute returns 
  
  dow.sub = subset(dowj_test, stock == stocks[i])
  return_stocks[i] = na.omit(Delt(dow.sub$close)) 
}

return_stocks <- data.frame(return_stocks, return_dow) %>% 
  rename(DOW = Delt.1.arithmetic)              # compute average returns for each stock.  
returns_stk = t(return_stocks[,-31] %>% summarise(across(where(is.numeric), mean)))
colnames(returns_stk) = "return"
returns_stk= as.data.frame(returns_stk)

Calculate betas for the stocks

beta_AA = lm(AA ~ DOW, data = return_stocks)$coef[2]
beta_AXP = lm(AXP ~ DOW, data = return_stocks)$coef[2]
beta_BA = lm(BA ~ DOW, data = return_stocks)$coef[2]
beta_BAC = lm(BAC ~ DOW, data = return_stocks)$coef[2]
beta_CAT = lm(CAT ~ DOW, data = return_stocks)$coef[2]
beta_CSCO = lm(CSCO ~ DOW, data = return_stocks)$coef[2]
beta_CVX = lm(CVX ~ DOW, data = return_stocks)$coef[2]
beta_DD = lm(DD ~ DOW, data = return_stocks)$coef[2]
beta_DIS = lm(DIS ~ DOW, data = return_stocks)$coef[2]
beta_GE = lm(GE ~ DOW, data = return_stocks)$coef[2]
beta_HD = lm(HD ~ DOW, data = return_stocks)$coef[2]
beta_HPQ = lm(HPQ ~ DOW, data = return_stocks)$coef[2]
beta_IBM = lm(IBM ~ DOW, data = return_stocks)$coef[2]
beta_INTC = lm(INTC ~ DOW, data = return_stocks)$coef[2]
beta_JNJ = lm(JNJ ~ DOW, data = return_stocks)$coef[2]
beta_JPM = lm(JPM ~ DOW, data = return_stocks)$coef[2]
beta_KRFT = lm(KRFT ~ DOW, data = return_stocks)$coef[2]
beta_KO = lm(KO ~ DOW, data = return_stocks)$coef[2]
beta_MCD = lm(MCD ~ DOW, data = return_stocks)$coef[2]
beta_MMM = lm(MMM ~ DOW, data = return_stocks)$coef[2]
beta_MRK = lm(MRK ~ DOW, data = return_stocks)$coef[2]
beta_MSFT = lm(MSFT ~ DOW, data = return_stocks)$coef[2]
beta_PFE = lm(PFE ~ DOW, data = return_stocks)$coef[2]
beta_PG = lm(PG ~ DOW, data = return_stocks)$coef[2]
beta_T = lm(`T` ~ DOW, data = return_stocks)$coef[2]
beta_TRV = lm(TRV ~ DOW, data = return_stocks)$coef[2]
beta_UTX = lm(UTX ~ DOW, data = return_stocks)$coef[2]
beta_VZ = lm(VZ ~ DOW, data = return_stocks)$coef[2]
beta_WMT = lm(WMT ~ DOW, data = return_stocks)$coef[2]
beta_XOM = lm(XOM ~ DOW, data = return_stocks)$coef[2]

df_capm = data.frame(Stock = c("AA", "AXP", "BA", "BAC", "CAT", "CSCO", "CVX", "DD", "DIS", "GE",
                           "HD", "HPQ", "IBM", "INTC", "JNJ", "JPM", "KRFT", "KO", "MCD", "MMM",
                           "MRK", "MSFT", "PFE", "PG", "T", "TRV", "UTX", "VZ", "WMT", "XOM"),
                 Beta = c(beta_AA, beta_AXP, beta_BA, beta_BAC, beta_CAT, beta_CSCO,
                          beta_CVX, beta_DD, beta_DIS, beta_GE, beta_HD, beta_HPQ, beta_IBM,
                          beta_INTC, beta_JNJ, beta_JPM, beta_KRFT, beta_KO, beta_MCD,
                          beta_MMM, beta_MRK, beta_MSFT, beta_PFE, beta_PG, beta_T, beta_TRV,
                          beta_UTX, beta_VZ, beta_WMT, beta_XOM)) 
df_capm <- data.frame(df_capm, Return =returns_stk$return)
df_capm %>% 
  arrange(-desc(Beta)) %>% 
  mutate(Return = scales::percent(Return))
return_stocks %>% pivot_longer(cols = 1:30, names_to = "stock", values_to = "return") %>%
  ggplot(aes(x = return, y = fct_rev(stock))) +
  geom_boxplot() + 
  scale_x_continuous(labels = scales::percent_format()) +
  labs(title = "Dow Jones Stock Return Distributions \n", x = "\n Expected Return", y = "Stock\n")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  coord_flip()