2. Data

# Load the raw Dow Jones index file (comma-separated, first row holds the
# column names) and inspect the column types before any cleaning.
dowjones <- read.table(file = "dow_jones_index.data", header = TRUE, sep = ",")
str(dowjones)
## 'data.frame':    750 obs. of  16 variables:
##  $ quarter                           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stock                             : chr  "AA" "AA" "AA" "AA" ...
##  $ date                              : chr  "1/7/2011" "1/14/2011" "1/21/2011" "1/28/2011" ...
##  $ open                              : chr  "$15.82" "$16.71" "$16.19" "$15.87" ...
##  $ high                              : chr  "$16.72" "$16.71" "$16.38" "$16.63" ...
##  $ low                               : chr  "$15.78" "$15.64" "$15.60" "$15.82" ...
##  $ close                             : chr  "$16.42" "$15.97" "$15.79" "$16.13" ...
##  $ volume                            : int  239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 114332562 ...
##  $ percent_change_price              : num  3.79 -4.43 -2.47 1.64 5.93 ...
##  $ percent_change_volume_over_last_wk: num  NA 1.38 -43.02 9.36 1.99 ...
##  $ previous_weeks_volume             : int  NA 239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 ...
##  $ next_weeks_open                   : chr  "$16.71" "$16.19" "$15.87" "$16.18" ...
##  $ next_weeks_close                  : chr  "$15.97" "$15.79" "$16.13" "$17.14" ...
##  $ percent_change_next_weeks_price   : num  -4.428 -2.471 1.638 5.933 0.231 ...
##  $ days_to_next_dividend             : int  26 19 12 5 97 90 83 76 69 62 ...
##  $ percent_return_next_dividend      : num  0.183 0.188 0.19 0.186 0.175 ...



2.a. Missing data

Find missing values

# Count the NA entries in every column of dowjones.
colSums(is.na(dowjones))
##                            quarter                              stock 
##                                  0                                  0 
##                               date                               open 
##                                  0                                  0 
##                               high                                low 
##                                  0                                  0 
##                              close                             volume 
##                                  0                                  0 
##               percent_change_price percent_change_volume_over_last_wk 
##                                  0                                 30 
##              previous_weeks_volume                    next_weeks_open 
##                                 30                                  0 
##                   next_weeks_close    percent_change_next_weeks_price 
##                                  0                                  0 
##              days_to_next_dividend       percent_return_next_dividend 
##                                  0                                  0

These missing values occur in the variables that refer to the previous week (“percent_change_volume_over_last_wk” and “previous_weeks_volume”): the first observation of each stock has no earlier week to refer to, so it is NA. We will fill in those empty observations with the mean value of the same stock, so that our later analysis is not affected by them.

# Replace the NA entries of a vector with the mean of its non-missing values.
fill_na_with_mean <- function(x) {
  replace(x, is.na(x), mean(x, na.rm = TRUE))
}

# Impute the two "previous week" columns stock by stock, so each gap is filled
# with the mean of that stock's own observed values.
dowjones <- dowjones %>%
  group_by(stock) %>%
  mutate(
    percent_change_volume_over_last_wk =
      fill_na_with_mean(percent_change_volume_over_last_wk),
    previous_weeks_volume = fill_na_with_mean(previous_weeks_volume)
  ) %>%
  ungroup()


# Earlier alternative (overall mean instead of per-stock mean), kept for reference:
#dowjones$previous_weeks_volume[is.na(dowjones$previous_weeks_volume)] = mean(dowjones$previous_weeks_volume, na.rm = T)
#dowjones$percent_change_volume_over_last_wk[is.na(dowjones$percent_change_volume_over_last_wk)] = mean(dowjones$percent_change_volume_over_last_wk, na.rm = T)
# Re-count the NA entries per column after the imputation.
colSums(is.na(dowjones))
##                            quarter                              stock 
##                                  0                                  0 
##                               date                               open 
##                                  0                                  0 
##                               high                                low 
##                                  0                                  0 
##                              close                             volume 
##                                  0                                  0 
##               percent_change_price percent_change_volume_over_last_wk 
##                                  0                                  0 
##              previous_weeks_volume                    next_weeks_open 
##                                  0                                  0 
##                   next_weeks_close    percent_change_next_weeks_price 
##                                  0                                  0 
##              days_to_next_dividend       percent_return_next_dividend 
##                                  0                                  0

2.b. Cleaning the data

We will change character variables to numeric by removing the $ sign.

# Convert every "$"-prefixed price column from character to numeric in one pass.
price_columns <- c("open", "high", "close", "low",
                   "next_weeks_open", "next_weeks_close")
dowjones[price_columns] <- lapply(dowjones[price_columns], parse_number)

Likewise, the date variable needs to be converted to a proper Date type, and the stock variable to a factor.

# Parse the month/day/year strings into Date values and turn the ticker
# symbols into a factor, then re-inspect the resulting column types.
dowjones <- dowjones %>%
  mutate(
    date = as.Date(date, format = "%m/%d/%Y"),
    stock = as.factor(stock)
  )
str(dowjones)
## tibble [750 x 16] (S3: tbl_df/tbl/data.frame)
##  $ quarter                           : int [1:750] 1 1 1 1 1 1 1 1 1 1 ...
##  $ stock                             : Factor w/ 30 levels "AA","AXP","BA",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date                              : Date[1:750], format: "2011-01-07" "2011-01-14" ...
##  $ open                              : num [1:750] 15.8 16.7 16.2 15.9 16.2 ...
##  $ high                              : num [1:750] 16.7 16.7 16.4 16.6 17.4 ...
##  $ low                               : num [1:750] 15.8 15.6 15.6 15.8 16.2 ...
##  $ close                             : num [1:750] 16.4 16 15.8 16.1 17.1 ...
##  $ volume                            : int [1:750] 239655616 242963398 138428495 151379173 154387761 114691279 80023895 132981863 109493077 114332562 ...
##  $ percent_change_price              : num [1:750] 3.79 -4.43 -2.47 1.64 5.93 ...
##  $ percent_change_volume_over_last_wk: num [1:750] 4.03 1.38 -43.02 9.36 1.99 ...
##  $ previous_weeks_volume             : num [1:750] 1.31e+08 2.40e+08 2.43e+08 1.38e+08 1.51e+08 ...
##  $ next_weeks_open                   : num [1:750] 16.7 16.2 15.9 16.2 17.3 ...
##  $ next_weeks_close                  : num [1:750] 16 15.8 16.1 17.1 17.4 ...
##  $ percent_change_next_weeks_price   : num [1:750] -4.428 -2.471 1.638 5.933 0.231 ...
##  $ days_to_next_dividend             : int [1:750] 26 19 12 5 97 90 83 76 69 62 ...
##  $ percent_return_next_dividend      : num [1:750] 0.183 0.188 0.19 0.186 0.175 ...

2.c. Exploratory Data Analysis

Scatter Plot

We will look at the scatter plot matrix to get an idea of which variables might be strongly correlated with each other.

# Scatter plot matrix over all columns of dowjones.
pairs(dowjones)

The ones we can see strongly correlated are: Open, High, Low, Close, next_weeks_open, next_weeks_close.

3.a. Selecting the variables

Full model

Our full model explains 72.4% of the variance in percent_change_next_weeks_price. However, this is cheating, since the model includes “next_weeks_open” and “next_weeks_close”, which together determine next week's price change, and therefore percent_change_next_weeks_price.

# Baseline: regress next week's percent price change on every other column,
# then report the share of variance explained.
model_full <- lm(percent_change_next_weeks_price ~ ., data = dowjones)
model_full_summary <- summary(model_full)
model_full_summary$r.squared
## [1] 0.7235275

When we delete these terms from the model, the R-squared drops tremendously, to 13.28%.

# Same regression without the two next-week price columns.
model_lm1 <- lm(
  percent_change_next_weeks_price ~ . - next_weeks_open - next_weeks_close,
  data = dowjones
)
model_lm1_summary <- summary(model_lm1)
model_lm1_summary$r.squared
## [1] 0.1328077
Model selection using leaps

Here we tried several things, always deleting the “next_weeks_open” and “next_weeks_close” variables. This model only identifies previous_weeks_volume as a significant variable (because of the noise introduced when each stock is treated as a separate variable).

# Subset selection (leaps, sequential replacement) with 10-fold cross-validation,
# trying model sizes 1 to 5.
# Both next-week price columns are removed from the candidate set: they are
# look-ahead information for percent_change_next_weeks_price. The formula
# previously removed next_weeks_close twice and left next_weeks_open in.
# NOTE(review): the recorded output below still lists next_weeks_open because
# it was produced before this correction; re-run the chunk to refresh it.
set.seed(123)
train.control = trainControl(method = "cv", number = 10)
step.model = train(percent_change_next_weeks_price ~ . - next_weeks_open - next_weeks_close,
                   data = dowjones,
                   method = "leapSeq",
                   tuneGrid = data.frame(nvmax = 1:5),
                   trControl = train.control)

step.model$results
summary(step.model$finalModel)
## Subset selection object
## 42 Variables  (and intercept)
##                                    Forced in Forced out
## quarter                                FALSE      FALSE
## stockAXP                               FALSE      FALSE
## stockBA                                FALSE      FALSE
## stockBAC                               FALSE      FALSE
## stockCAT                               FALSE      FALSE
## stockCSCO                              FALSE      FALSE
## stockCVX                               FALSE      FALSE
## stockDD                                FALSE      FALSE
## stockDIS                               FALSE      FALSE
## stockGE                                FALSE      FALSE
## stockHD                                FALSE      FALSE
## stockHPQ                               FALSE      FALSE
## stockIBM                               FALSE      FALSE
## stockINTC                              FALSE      FALSE
## stockJNJ                               FALSE      FALSE
## stockJPM                               FALSE      FALSE
## stockKO                                FALSE      FALSE
## stockKRFT                              FALSE      FALSE
## stockMCD                               FALSE      FALSE
## stockMMM                               FALSE      FALSE
## stockMRK                               FALSE      FALSE
## stockMSFT                              FALSE      FALSE
## stockPFE                               FALSE      FALSE
## stockPG                                FALSE      FALSE
## stockT                                 FALSE      FALSE
## stockTRV                               FALSE      FALSE
## stockUTX                               FALSE      FALSE
## stockVZ                                FALSE      FALSE
## stockWMT                               FALSE      FALSE
## stockXOM                               FALSE      FALSE
## date                                   FALSE      FALSE
## open                                   FALSE      FALSE
## high                                   FALSE      FALSE
## low                                    FALSE      FALSE
## close                                  FALSE      FALSE
## volume                                 FALSE      FALSE
## percent_change_price                   FALSE      FALSE
## percent_change_volume_over_last_wk     FALSE      FALSE
## previous_weeks_volume                  FALSE      FALSE
## next_weeks_open                        FALSE      FALSE
## days_to_next_dividend                  FALSE      FALSE
## percent_return_next_dividend           FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: 'sequential replacement'
##          quarter stockAXP stockBA stockBAC stockCAT stockCSCO stockCVX stockDD
## 1  ( 1 ) " "     " "      " "     " "      " "      "*"       " "      " "    
## 2  ( 1 ) " "     " "      " "     "*"      " "      "*"       " "      " "    
## 3  ( 1 ) " "     " "      " "     "*"      " "      "*"       " "      " "    
## 4  ( 1 ) " "     " "      " "     " "      " "      " "       " "      " "    
## 5  ( 1 ) " "     "*"      " "     " "      " "      " "       " "      " "    
##          stockDIS stockGE stockHD stockHPQ stockIBM stockINTC stockJNJ stockJPM
## 1  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
## 2  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
## 3  ( 1 ) " "      " "     " "     "*"      " "      " "       " "      " "     
## 4  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
## 5  ( 1 ) " "      " "     " "     " "      " "      " "       " "      " "     
##          stockKO stockKRFT stockMCD stockMMM stockMRK stockMSFT stockPFE
## 1  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 2  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 3  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 4  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
## 5  ( 1 ) " "     " "       " "      " "      " "      " "       " "     
##          stockPG stockT stockTRV stockUTX stockVZ stockWMT stockXOM date open
## 1  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  " " 
## 2  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  " " 
## 3  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  " " 
## 4  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  "*" 
## 5  ( 1 ) " "     " "    " "      " "      " "     " "      " "      " "  "*" 
##          high low close volume percent_change_price
## 1  ( 1 ) " "  " " " "   " "    " "                 
## 2  ( 1 ) " "  " " " "   " "    " "                 
## 3  ( 1 ) " "  " " " "   " "    " "                 
## 4  ( 1 ) "*"  " " " "   " "    " "                 
## 5  ( 1 ) "*"  " " " "   " "    " "                 
##          percent_change_volume_over_last_wk previous_weeks_volume
## 1  ( 1 ) " "                                " "                  
## 2  ( 1 ) " "                                " "                  
## 3  ( 1 ) " "                                " "                  
## 4  ( 1 ) " "                                " "                  
## 5  ( 1 ) " "                                " "                  
##          next_weeks_open days_to_next_dividend percent_return_next_dividend
## 1  ( 1 ) " "             " "                   " "                         
## 2  ( 1 ) " "             " "                   " "                         
## 3  ( 1 ) " "             " "                   " "                         
## 4  ( 1 ) "*"             " "                   "*"                         
## 5  ( 1 ) "*"             " "                   "*"

When running the same model but now without stocks, we can find the other variables that could help us. However, our Rsquared decreases all the way to 3%!

# Same cross-validated subset selection, this time also dropping the stock
# factor so that only the numeric predictors compete.
# Both next-week price columns are removed from the candidate set: they are
# look-ahead information for percent_change_next_weeks_price. The formula
# previously removed next_weeks_close twice and left next_weeks_open in.
# NOTE(review): the recorded output below still lists next_weeks_open because
# it was produced before this correction; re-run the chunk to refresh it.
set.seed(123)
train.control = trainControl(method = "cv", number = 10)
step.model = train(percent_change_next_weeks_price ~ . - next_weeks_open - next_weeks_close - stock,
                   data = dowjones,
                   method = "leapSeq",
                   tuneGrid = data.frame(nvmax = 1:5),
                   trControl = train.control)

step.model$results
summary(step.model$finalModel)
## Subset selection object
## 13 Variables  (and intercept)
##                                    Forced in Forced out
## quarter                                FALSE      FALSE
## date                                   FALSE      FALSE
## open                                   FALSE      FALSE
## high                                   FALSE      FALSE
## low                                    FALSE      FALSE
## close                                  FALSE      FALSE
## volume                                 FALSE      FALSE
## percent_change_price                   FALSE      FALSE
## percent_change_volume_over_last_wk     FALSE      FALSE
## previous_weeks_volume                  FALSE      FALSE
## next_weeks_open                        FALSE      FALSE
## days_to_next_dividend                  FALSE      FALSE
## percent_return_next_dividend           FALSE      FALSE
## 1 subsets of each size up to 5
## Selection Algorithm: 'sequential replacement'
##          quarter date open high low close volume percent_change_price
## 1  ( 1 ) " "     " "  " "  " "  " " " "   " "    " "                 
## 2  ( 1 ) " "     " "  "*"  "*"  " " " "   " "    " "                 
## 3  ( 1 ) "*"     "*"  "*"  " "  " " " "   " "    " "                 
## 4  ( 1 ) " "     " "  "*"  "*"  " " " "   " "    " "                 
## 5  ( 1 ) " "     " "  "*"  "*"  " " "*"   " "    " "                 
##          percent_change_volume_over_last_wk previous_weeks_volume
## 1  ( 1 ) " "                                "*"                  
## 2  ( 1 ) " "                                " "                  
## 3  ( 1 ) " "                                " "                  
## 4  ( 1 ) " "                                " "                  
## 5  ( 1 ) " "                                " "                  
##          next_weeks_open days_to_next_dividend percent_return_next_dividend
## 1  ( 1 ) " "             " "                   " "                         
## 2  ( 1 ) " "             " "                   " "                         
## 3  ( 1 ) " "             " "                   " "                         
## 4  ( 1 ) "*"             " "                   "*"                         
## 5  ( 1 ) "*"             " "                   "*"

Here we can see the following possible models: (a) open + high, (b) quarter + date + open, (c) open + high + next_weeks_open + percent_return_next_dividend. Model (c) is the one with the 2.81% R-squared.

Model Selection using VIF cutoff

The process is not shown here because we applied the VIF cutoff manually: we started with all the variables and removed variables until the remaining ones satisfied our VIF cutoff (< 8.00).

# Model kept after the manual VIF screening: dividend and volume terms only.
model <- lm(
  percent_change_next_weeks_price ~
    percent_return_next_dividend + volume + previous_weeks_volume +
    days_to_next_dividend,
  data = dowjones
)
summary(model)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ percent_return_next_dividend + 
##     volume + previous_weeks_volume + days_to_next_dividend, data = dowjones)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2112  -1.5600  -0.0927   1.5783   9.7029 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                   4.509e-02  2.930e-01   0.154   0.8778  
## percent_return_next_dividend  6.024e-01  3.343e-01   1.802   0.0720 .
## volume                        1.677e-09  1.295e-09   1.295   0.1958  
## previous_weeks_volume        -2.667e-09  1.293e-09  -2.063   0.0395 *
## days_to_next_dividend        -2.044e-03  2.120e-03  -0.964   0.3352  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.667 on 745 degrees of freedom
## Multiple R-squared:  0.01469,    Adjusted R-squared:  0.009401 
## F-statistic: 2.777 on 4 and 745 DF,  p-value: 0.02611
# Variance inflation factor of each predictor in `model` (checked against the < 8.00 cutoff).
vif(model)
## percent_return_next_dividend                       volume 
##                     1.098254                     4.433020 
##        previous_weeks_volume        days_to_next_dividend 
##                     4.436957                     1.015691
# Extends `model` with the two percent-change predictors.
# A stray doubled "+ +" in the formula was removed; it was parsed as a unary
# plus, so the set of fitted terms is unchanged.
model2 <- lm(
  percent_change_next_weeks_price ~
    percent_change_price + percent_change_volume_over_last_wk +
    percent_return_next_dividend + volume + previous_weeks_volume +
    days_to_next_dividend,
  data = dowjones
)
summary(model2)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ percent_change_price + 
##     percent_change_volume_over_last_wk + +percent_return_next_dividend + 
##     volume + previous_weeks_volume + days_to_next_dividend, data = dowjones)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3447  -1.5159  -0.0915   1.5733   9.7191 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                         5.651e-02  2.943e-01   0.192   0.8478  
## percent_change_price                2.104e-02  4.000e-02   0.526   0.5991  
## percent_change_volume_over_last_wk -2.298e-03  3.122e-03  -0.736   0.4620  
## percent_return_next_dividend        5.979e-01  3.346e-01   1.787   0.0743 .
## volume                              2.524e-09  1.615e-09   1.563   0.1185  
## previous_weeks_volume              -3.468e-09  1.601e-09  -2.166   0.0306 *
## days_to_next_dividend              -2.083e-03  2.122e-03  -0.982   0.3266  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.669 on 743 degrees of freedom
## Multiple R-squared:  0.01597,    Adjusted R-squared:  0.00802 
## F-statistic: 2.009 on 6 and 743 DF,  p-value: 0.06218
Lag for close
# Lag plots of the close column for lags 1 to 4.
# NOTE(review): the column stacks all stocks in one vector, so lagged pairs at
# a stock boundary mix two different stocks — confirm this is acceptable.
lag.plot(dowjones$close, pch = ".", set.lags = 1:4)

Lag for volume
# Lag plots of the volume column for lags 1 to 4.
# NOTE(review): the column stacks all stocks in one vector, so lagged pairs at
# a stock boundary mix two different stocks — confirm this is acceptable.
lag.plot(dowjones$volume, pch = ".", set.lags = 1:4)

  • We can see the first lag provides the best correlation and hence we will use lag = 1 for our modeling.
Lag for percent_change_next_weeks_price
# Lag plots of the response column for lags 1 to 4.
# NOTE(review): the column stacks all stocks in one vector, so lagged pairs at
# a stock boundary mix two different stocks — confirm this is acceptable.
lag.plot(dowjones$percent_change_next_weeks_price, pch = ".", set.lags = 1:4)

3. Methodology

Create new lags
# Add one-week lags of close, volume and the response. The lag is taken within
# each stock, so the first week of every stock gets NA.
dowjones_lag <- dowjones %>%
  group_by(stock) %>%
  mutate(
    close_lag = lag(close, n = 1),
    volume_lag = lag(volume, n = 1),
    percent_change_next_weeks_price_lag =
      lag(percent_change_next_weeks_price, n = 1)
  ) %>%
  ungroup()

head(dowjones_lag)

# Two panels side by side: response against lagged close and lagged volume.
par(mfrow = c(1, 2))
plot(dowjones_lag$close_lag, dowjones_lag$percent_change_next_weeks_price)
plot(dowjones_lag$volume_lag, dowjones_lag$percent_change_next_weeks_price)

Remove the NAs from the first observation in lagged variables
# Fill the NA that lagging leaves in the lagged columns with the mean of the
# same column within the same stock.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
dowjones_lag <- dowjones_lag %>%
  group_by(stock) %>%
  mutate(
    close_lag = ifelse(is.na(close_lag),
                       mean(close_lag, na.rm = TRUE),
                       close_lag),
    volume_lag = ifelse(is.na(volume_lag),
                        mean(volume_lag, na.rm = TRUE),
                        volume_lag),
    percent_change_next_weeks_price_lag =
      ifelse(is.na(percent_change_next_weeks_price_lag),
             mean(percent_change_next_weeks_price_lag, na.rm = TRUE),
             percent_change_next_weeks_price_lag)
  ) %>%
  ungroup()
# Pooled regression on the lagged predictors with a separate intercept per stock.
model3 <- lm(
  percent_change_next_weeks_price ~
    stock + percent_change_next_weeks_price_lag + close_lag + volume_lag +
    percent_change_volume_over_last_wk,
  data = dowjones_lag
)
summary(model3)
## 
## Call:
## lm(formula = percent_change_next_weeks_price ~ stock + percent_change_next_weeks_price_lag + 
##     close_lag + volume_lag + percent_change_volume_over_last_wk, 
##     data = dowjones_lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.7899  -1.4073  -0.1243   1.4309   8.9183 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          3.857e+00  8.355e-01   4.617 4.62e-06 ***
## stockAXP                             9.212e+00  1.355e+00   6.797 2.25e-11 ***
## stockBA                              1.582e+01  2.251e+00   7.029 4.86e-12 ***
## stockBAC                            -3.137e+00  1.255e+00  -2.499   0.0127 *  
## stockCAT                             2.401e+01  3.312e+00   7.249 1.09e-12 ***
## stockCSCO                           -1.133e+00  8.382e-01  -1.352   0.1769    
## stockCVX                             2.358e+01  3.243e+00   7.271 9.38e-13 ***
## stockDD                              1.073e+01  1.554e+00   6.903 1.12e-11 ***
## stockDIS                             7.220e+00  1.192e+00   6.057 2.24e-09 ***
## stockGE                              7.703e-01  7.812e-01   0.986   0.3245    
## stockHD                              6.146e+00  1.062e+00   5.788 1.07e-08 ***
## stockHPQ                             6.391e+00  1.192e+00   5.361 1.12e-07 ***
## stockIBM                             4.018e+01  5.499e+00   7.307 7.30e-13 ***
## stockINTC                            1.625e+00  8.123e-01   2.000   0.0459 *  
## stockJNJ                             1.302e+01  1.869e+00   6.965 7.42e-12 ***
## stockJPM                             7.717e+00  1.282e+00   6.018 2.82e-09 ***
## stockKO                              1.387e+01  1.968e+00   7.047 4.30e-12 ***
## stockKRFT                            5.406e+00  9.633e-01   5.612 2.86e-08 ***
## stockMCD                             1.710e+01  2.377e+00   7.193 1.61e-12 ***
## stockMMM                             2.094e+01  2.898e+00   7.225 1.28e-12 ***
## stockMRK                             5.028e+00  9.960e-01   5.048 5.67e-07 ***
## stockMSFT                            2.009e+00  8.612e-01   2.332   0.0200 *  
## stockPFE                             1.443e+00  7.671e-01   1.880   0.0605 .  
## stockPG                              1.309e+01  1.920e+00   6.821 1.93e-11 ***
## stockT                               3.979e+00  8.860e-01   4.491 8.24e-06 ***
## stockTRV                             1.232e+01  1.765e+00   6.979 6.77e-12 ***
## stockUTX                             1.903e+01  2.624e+00   7.251 1.07e-12 ***
## stockVZ                              5.712e+00  1.051e+00   5.437 7.45e-08 ***
## stockWMT                             1.052e+01  1.582e+00   6.647 5.95e-11 ***
## stockXOM                             1.818e+01  2.557e+00   7.110 2.81e-12 ***
## percent_change_next_weeks_price_lag -8.124e-02  4.185e-02  -1.941   0.0526 .  
## close_lag                           -2.655e-01  3.712e-02  -7.152 2.12e-12 ***
## volume_lag                           2.481e-09  1.691e-09   1.467   0.1429    
## percent_change_volume_over_last_wk   2.179e-03  2.616e-03   0.833   0.4051    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.607 on 716 degrees of freedom
## Multiple R-squared:  0.09496,    Adjusted R-squared:  0.05325 
## F-statistic: 2.277 on 33 and 716 DF,  p-value: 7.582e-05
# Chronological split: rows dated up to 2011-03-31 are used for training, later
# rows for testing. The mask is built from dowjones_lag itself (the frame it
# indexes) and compared against an explicit Date, so it cannot drift out of
# step with the rows being selected.
train <- dowjones_lag$date <= as.Date("2011-03-31")
dowj_train <- dowjones_lag[train, ]
dowj_test <- dowjones_lag[!train, ]
Split
#cols = c("percent_change_next_weeks_price", "percent_change_next_weeks_price_lag", "close_lag", "volume_lag", "percent_change_volume_over_last_wk", "stock")
# Per-stock linear models: fit on the stock's training rows and score on the
# held-out rows of the same stock.
# The test subset is taken from dowj_test and the RMSE is computed against the
# test responses. Previously both came from the training rows, which reported
# an in-sample error.
# NOTE(review): the mean linear-model RMSE recorded further down was produced
# before this correction; re-run to refresh it.
stocks = as.factor(unique(dowj_train$stock))
RMSE = rep(NA, length(stocks))  # placeholder column, filled in stock by stock
#RSquared = rep(NA, length(stocks))
lm_pred_metrics = data.frame(Stock = stocks, RMSE = RMSE)

for (i in seq_along(stocks)) {

  stock_train = subset(dowj_train, stock == stocks[i])
  stock_test = subset(dowj_test, stock == stocks[i])

  lm_fit = lm(percent_change_next_weeks_price ~ volume_lag + open + close + high + low + volume,
               data = stock_train)

  #lm_fit = lm(percent_change_next_weeks_price ~ percent_change_next_weeks_price_lag +
  #             close_lag + volume_lag + percent_change_volume_over_last_wk,
  #             data = stock_train)

  # Predict the held-out weeks and compare with their observed responses.
  lm.preds = predict(lm_fit, stock_test)

  lm.rmse = rmse(stock_test$percent_change_next_weeks_price, lm.preds)
  lm_pred_metrics[i, "RMSE"] = lm.rmse
}
lm_pred_metrics
# Per-stock regression trees: fit on the stock's training rows, predict its
# test rows, and store the test RMSE.
dt_pred_metrics <- data.frame(Stock = stocks, RMSE = RMSE)

for (i in seq_along(stocks)) {
  stock_train <- subset(dowj_train, stock == stocks[i])
  stock_test <- subset(dowj_test, stock == stocks[i])

  dt_model <- tree(
    percent_change_next_weeks_price ~ volume_lag + open + close + high + low + volume,
    data = stock_train
  )

  # Alternative predictor set (lagged variables), kept for reference:
  #dt_model = tree(percent_change_next_weeks_price ~ percent_change_next_weeks_price_lag +
  #                close_lag + volume_lag + percent_change_volume_over_last_wk,
  #                 data = stock_train)

  dt_preds <- predict(dt_model, newdata = stock_test)

  dt_rmse <- rmse(stock_test$percent_change_next_weeks_price, dt_preds)
  dt_pred_metrics[i, "RMSE"] <- dt_rmse

  #tree_r2 = R2(stock_test$percent_change_next_weeks_price, trees_preds)
  #dt_pred_metrics[i, "R2"] = tree_r2
}
head(dt_pred_metrics)
# Average per-stock RMSE of the linear models.
# NOTE(review): the value recorded below was obtained while the linear-model
# loop predicted on, and was scored against, its own training rows (stock_test
# was subset from dowj_train), so it is an in-sample error and is not
# comparable with the out-of-sample tree figure.
mean(lm_pred_metrics$RMSE)
## [1] 1.27709
# Average per-stock RMSE of the regression trees on the test rows.
mean(dt_pred_metrics$RMSE)
## [1] 3.011236
# Per-stock polynomial-kernel SVMs tuned with caret: fit on the stock's
# training rows, predict its test rows, and store the test RMSE.
Preds <- rep(NA, length(stocks))
svm_predictions <- data.frame(stocks, Preds)
svm_pred_metrics <- data.frame(stocks, RMSE)

for (i in seq_along(stocks)) {
  stock_train <- subset(dowj_train, stock == stocks[i])
  stock_test <- subset(dowj_test, stock == stocks[i])

  # Same seed for every stock so the cross-validation folds are reproducible.
  set.seed(1)

  svm_fit <- train(
    percent_change_next_weeks_price ~ volume_lag + open + close + high + low + volume,
    data = stock_train,
    method = "svmPoly",
    metric = "RMSE",
    preProcess = c("center", "scale"),
    trControl = trainControl(method = "cv")
  )

  # Alternative predictor set (lagged variables), kept for reference:
  #svm_fit <- train(percent_change_next_weeks_price ~ percent_change_next_weeks_price_lag +
  #                 close_lag + volume_lag + percent_change_volume_over_last_wk,
  #                 data = stock_train,  method = "svmPoly",
  #                 metric = "RMSE", preProcess = c("center","scale"),
  #                 trControl = trainControl(method = "cv"))

  svm_preds <- predict(svm_fit, stock_test)
  # Only the 4th test-row prediction is stored per stock.
  # NOTE(review): the reason for picking index 4 is not visible here — confirm.
  svm_predictions[i, "Preds"] <- svm_preds[4]

  svm_rmse <- RMSE(stock_test$percent_change_next_weeks_price, svm_preds)
  svm_pred_metrics[i, "RMSE"] <- svm_rmse

  # svm.r2 <- R2(stock.test$percent_change_next_weeks_price, svm.preds)
  # svm.stock.predictions[i, "RSquared"] <- svm.r2
}
mean(svm_pred_metrics$RMSE)
## [1] 4.381084

For CAPM

calculate return in percentage change

# Index proxy per date: the sum of all closing prices on that date divided by
# a fixed constant.
# NOTE(review): 0.132 looks like an index divisor — confirm its source.
dowj <- aggregate(dowjones$close, by = list(dowjones$date), FUN = function(x) sum(x)/0.132)
# Period-over-period change of the proxy; the first date has no predecessor
# and is dropped.
return_dow <- na.omit(Delt(dowj[,2]))

#stocks <- unique(dowjones$stock)
return_stocks <- data.frame(matrix(0, ncol = 30, nrow = 24)) #30 stocks, 24 weeks
return_stocks <- cbind(return_stocks, return_dow)
colnames(return_stocks) = c("AA", "AXP", "BA", "BAC", "CAT", "CSCO", "CVX", "DD", "DIS",
                            "GE", "HD", "HPQ", "IBM", "INTC", "JNJ", "JPM", "KRFT", "KO",
                            "MCD", "MMM", "MRK", "MSFT", "PFE", "PG", "T", "TRV", "UTX",
                            "VZ", "WMT", "XOM", "DOW")
#colnames(return_stocks) = stocks

# Fill each stock's return column by ticker name rather than by position, so a
# stock's returns cannot land under another ticker's header when the order of
# `stocks` differs from the hard-coded column names above.
for (i in seq_along(stocks)) {
  ticker <- as.character(stocks[i])
  # Fail loudly on an unknown ticker instead of silently appending a column.
  stopifnot(ticker %in% colnames(return_stocks))
  dow.sub = subset(dowjones, stock == ticker)
  # NOTE(review): assumes the rows of each stock are in date order — confirm.
  return_stocks[[ticker]] = as.numeric(na.omit(Delt(dow.sub$close)))
}

Calculate betas for the stocks

# Slope of the regression of one stock's returns on the index returns (the
# column `DOW`); `ticker` is the name of a column of `returns`.
stock_beta <- function(ticker, returns = return_stocks) {
  coef(lm(reformulate("DOW", response = ticker), data = returns))[2]
}

beta.AA <- stock_beta("AA")
beta.AXP <- stock_beta("AXP")
beta.BA <- stock_beta("BA")
beta.BAC <- stock_beta("BAC")
beta.CAT <- stock_beta("CAT")
beta.CSCO <- stock_beta("CSCO")
beta.CVX <- stock_beta("CVX")
beta.DD <- stock_beta("DD")
beta.DIS <- stock_beta("DIS")
beta.GE <- stock_beta("GE")
beta.HD <- stock_beta("HD")
beta.HPQ <- stock_beta("HPQ")
beta.IBM <- stock_beta("IBM")
beta.INTC <- stock_beta("INTC")
beta.JNJ <- stock_beta("JNJ")
beta.JPM <- stock_beta("JPM")
beta.KRFT <- stock_beta("KRFT")
beta.KO <- stock_beta("KO")
beta.MCD <- stock_beta("MCD")
beta.MMM <- stock_beta("MMM")
beta.MRK <- stock_beta("MRK")
beta.MSFT <- stock_beta("MSFT")
beta.PFE <- stock_beta("PFE")
beta.PG <- stock_beta("PG")
beta.T <- stock_beta("T")
beta.TRV <- stock_beta("TRV")
beta.UTX <- stock_beta("UTX")
beta.VZ <- stock_beta("VZ")
beta.WMT <- stock_beta("WMT")
beta.XOM <- stock_beta("XOM")

# One row per stock: ticker and its estimated beta, listed in the same order.
df <- data.frame(
  Stock = c("AA", "AXP", "BA", "BAC", "CAT", "CSCO", "CVX", "DD", "DIS", "GE",
            "HD", "HPQ", "IBM", "INTC", "JNJ", "JPM", "KRFT", "KO", "MCD", "MMM",
            "MRK", "MSFT", "PFE", "PG", "T", "TRV", "UTX", "VZ", "WMT", "XOM"),
  Beta = c(beta.AA, beta.AXP, beta.BA, beta.BAC, beta.CAT, beta.CSCO,
           beta.CVX, beta.DD, beta.DIS, beta.GE, beta.HD, beta.HPQ, beta.IBM,
           beta.INTC, beta.JNJ, beta.JPM, beta.KRFT, beta.KO, beta.MCD,
           beta.MMM, beta.MRK, beta.MSFT, beta.PFE, beta.PG, beta.T, beta.TRV,
           beta.UTX, beta.VZ, beta.WMT, beta.XOM)
)