Load Data, Set Up Lag Variables, Train Test Split, Lag Plots

#load data
dow=read.table("dow_jones_index.data", sep = ',', header=TRUE)
dow=na.omit(dow)
dow$date <- as.Date(dow$date, format = "%m/%d/%y")
dow$open = as.numeric(gsub("[\\$,]", "", dow$open))
dow$high = as.numeric(gsub("[\\$,]", "", dow$high))
dow$low = as.numeric(gsub("[\\$,]", "", dow$low))
dow$close = as.numeric(gsub("[\\$,]", "", dow$close))
dow$next_weeks_open = as.numeric(gsub("[\\$,]", "", dow$next_weeks_open))
dow$next_weeks_close = as.numeric(gsub("[\\$,]", "", dow$next_weeks_close))

#set up lag variables (lag 1 seems to show most correlation based on plots)
dow=dow %>%
group_by(stock)%>%
  mutate(open.lag=dplyr::lag(open,n=1))
dow=dow %>%
group_by(stock)%>%
  mutate(high.lag=dplyr::lag(high,n=1))
dow=dow %>%
group_by(stock)%>%
  mutate(low.lag=dplyr::lag(low,n=1))
dow=dow %>%
group_by(stock)%>%
  mutate(close.lag=dplyr::lag(close,n=1))
dow=dow %>%
group_by(stock)%>%
  mutate(percent_change_volume_over_last_wk.lag=dplyr::lag(percent_change_volume_over_last_wk,n=1))
dow=dow %>%
group_by(stock)%>%
  mutate(percent_change_next_weeks_price.lag=dplyr::lag(percent_change_next_weeks_price,n=1))
dow=dow %>%
group_by(stock)%>%
  mutate(percent_return_next_dividend.lag=dplyr::lag(percent_return_next_dividend,n=1))
dow=dow %>%
group_by(stock)%>%
  mutate(next_weeks_close.lag=dplyr::lag(next_weeks_close,n=1))
dow= dow %>%
  group_by(stock) %>%
  mutate(next_weeks_open.lag = dplyr::lag(next_weeks_open, n=1))

#train test split
data=split(dow,dow$quarter)
train<-data[[1]]
test<-data[[2]]
train<-split(train,train$stock)
test<-split(test,test$stock)
#lag plots
lag.plot(dow$open,set.lag=1:12)

lag.plot(dow$high, set.lag=1:12)

lag.plot(dow$low, set.lag=1:12)

lag.plot(dow$close, set.lag=1:12)

We have chosen to use lag(n=1) for the variables in our model as they seem to show the most correlation.

Multicollinearity

model = glm(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + close.lag + volume, data=train[[1]])
vif(model)
##  open.lag  high.lag   low.lag close.lag    volume 
##  30.13307 186.85735  56.30198 356.83660  30.53064
model = glm(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag  + volume, data=train[[1]])
vif(model)
## open.lag high.lag  low.lag   volume 
## 2.476961 5.373253 5.213612 1.183033

We removed close.lag from our model due to VIF > 10.

Linear Regression Loop

mse.list = vector(mode="list")
for(i in names(train))
{
model = glm(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + volume, data=train[[i]])

preds = predict(model, test[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
mse.list[i]=mse
}
mse.df=data.frame(mse.list)
mse.df=t(mse.df)
print("MSE values")
print(str(mse.list))
print("Mean MSE")
print(mean(mse.df))
## [1] "MSE values"
## List of 30
##  $ AA  : num 16.9
##  $ AXP : num 17.8
##  $ BA  : num 15.4
##  $ BAC : num 9.77
##  $ CAT : num 24.7
##  $ CSCO: num 207
##  $ CVX : num 21.8
##  $ DD  : num 11.3
##  $ DIS : num 8.58
##  $ GE  : num 9.74
##  $ HD  : num 3.7
##  $ HPQ : num 17.4
##  $ IBM : num 5.7
##  $ INTC: num 63.5
##  $ JNJ : num 31
##  $ JPM : num 26.2
##  $ KO  : num 3.28
##  $ KRFT: num 106
##  $ MCD : num 10.3
##  $ MMM : num 11.1
##  $ MRK : num 14.1
##  $ MSFT: num 11.4
##  $ PFE : num 5.45
##  $ PG  : num 10.9
##  $ T   : num 93.9
##  $ TRV : num 5.16
##  $ UTX : num 12.1
##  $ VZ  : num 5.38
##  $ WMT : num 2.49
##  $ XOM : num 10.6
## NULL
## [1] "Mean MSE"
## [1] 26.42919

Decision Tree Loop

mse.list = vector(mode="list")
for(i in names(train))
{
model = tree(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + volume, data=train[[i]])
preds = predict(model, test[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
mse.list[i]=mse
}
mse.df=data.frame(mse.list)
mse.df=t(mse.df)
print("MSE values")
print(str(mse.list))
print("Mean MSE")
print(mean(mse.df))
## [1] "MSE values"
## List of 30
##  $ AA  : num 19.3
##  $ AXP : num 15.1
##  $ BA  : num 9.64
##  $ BAC : num 7.93
##  $ CAT : num 23.3
##  $ CSCO: num 11.4
##  $ CVX : num 13.3
##  $ DD  : num 8.32
##  $ DIS : num 9.4
##  $ GE  : num 12.6
##  $ HD  : num 6.48
##  $ HPQ : num 12.7
##  $ IBM : num 4.54
##  $ INTC: num 17.7
##  $ JNJ : num 7.28
##  $ JPM : num 10.7
##  $ KO  : num 3.22
##  $ KRFT: num 7.07
##  $ MCD : num 2.88
##  $ MMM : num 7.4
##  $ MRK : num 5.3
##  $ MSFT: num 10.6
##  $ PFE : num 8.47
##  $ PG  : num 4.62
##  $ T   : num 3.01
##  $ TRV : num 4.65
##  $ UTX : num 11.6
##  $ VZ  : num 4.71
##  $ WMT : num 1.26
##  $ XOM : num 12.1
## NULL
## [1] "Mean MSE"
## [1] 9.218837

SVR Loop

#tune parameters
form1 = percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + volume
tuned = tune.svm(form1, data = train[[1]], gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))
tuned$best.parameters
## [1] gamma cost 
## <0 rows> (or 0-length row.names)

Our best parameters include a gamma of 0.05 and a cost of 1.

mse.list = vector(mode="list")
for(i in names(train))
{
model = svm(formula = form1, data= train[[i]], gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
preds = predict(model, test[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
mse.list[i]=mse
}
mse.df=data.frame(mse.list)
mse.df=t(mse.df)
print("MSE values")
print(str(mse.list))
print("Mean MSE")
print(mean(mse.df))
## [1] "MSE values"
## List of 30
##  $ AA  : num 19.7
##  $ AXP : num 10.7
##  $ BA  : num 8.16
##  $ BAC : num 7.88
##  $ CAT : num 25
##  $ CSCO: num 11.1
##  $ CVX : num 15.2
##  $ DD  : num 10.8
##  $ DIS : num 9.55
##  $ GE  : num 7.54
##  $ HD  : num 5.66
##  $ HPQ : num 13.9
##  $ IBM : num 3.5
##  $ INTC: num 18.9
##  $ JNJ : num 5.84
##  $ JPM : num 7.91
##  $ KO  : num 4.15
##  $ KRFT: num 3.5
##  $ MCD : num 3.34
##  $ MMM : num 5.92
##  $ MRK : num 7.26
##  $ MSFT: num 10.3
##  $ PFE : num 5.6
##  $ PG  : num 5.3
##  $ T   : num 2.07
##  $ TRV : num 5.65
##  $ UTX : num 8.93
##  $ VZ  : num 5.21
##  $ WMT : num 2.24
##  $ XOM : num 11.1
## NULL
## [1] "Mean MSE"
## [1] 8.725331

We note that SVR seems to be performing the best based on MSE criteria. The MSE means for all the stocks for each method are as follows: Linear Regression: 26.42919, Decision Tree: 9.218837, SVR: 8.830263.

Predict on future week with SVR (using only week 6-24-2011 data)

#create test set 2 (ONLY with last weeks data)
test2=split(dow,dow$date)
test2 = test2[[24]]
test2 = split(test2, test2$stock)
#svr loop on each stock, using train to train the model, and predicting on test2
pred.list = vector(mode="list")
for(i in names(train))
{
model = svm(formula = form1, data= train[[i]], gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
preds = predict(model, test2[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
pred.list[i]=preds
}
print("Percent Change Next Weeks Price Predictions")
print(str(pred.list))
pred.df=data.frame(pred.list)
pred.df=t(pred.df)
## [1] "Percent Change Next Weeks Price Predictions"
## List of 30
##  $ AA  : num 0.617
##  $ AXP : num -0.209
##  $ BA  : num 0.262
##  $ BAC : num -0.712
##  $ CAT : num 2.39
##  $ CSCO: num -0.944
##  $ CVX : num 2.31
##  $ DD  : num 1.28
##  $ DIS : num 0.568
##  $ GE  : num 0.0798
##  $ HD  : num 0.205
##  $ HPQ : num -0.51
##  $ IBM : num 0.458
##  $ INTC: num -0.159
##  $ JNJ : num 0.116
##  $ JPM : num 0.246
##  $ KO  : num 1.28
##  $ KRFT: num 0.613
##  $ MCD : num 0.331
##  $ MMM : num 0.362
##  $ MRK : num -0.591
##  $ MSFT: num -0.915
##  $ PFE : num 0.115
##  $ PG  : num -0.87
##  $ T   : num 0.438
##  $ TRV : num 0.953
##  $ UTX : num 0.405
##  $ VZ  : num 0.567
##  $ WMT : num -0.0635
##  $ XOM : num 0.306
## NULL

According to our SVR model and running our model only on the last week of data available. We conclude that the stock CAT is most likely to produce the greatest rate of return in the following week.

#import data
dow2=read.table("dow_jones_index.data", sep = ',', header=TRUE)
dow2=na.omit(dow2)
dow2$date <- as.Date(dow$date, format = "%m/%d/%y")
dow2$open = as.numeric(gsub("[\\$,]", "", dow2$open))
dow2$high = as.numeric(gsub("[\\$,]", "", dow2$high))
dow2$low = as.numeric(gsub("[\\$,]", "", dow2$low))
dow2$close = as.numeric(gsub("[\\$,]", "", dow2$close))
dow2$next_weeks_open = as.numeric(gsub("[\\$,]", "", dow2$next_weeks_open))
dow2$next_weeks_close = as.numeric(gsub("[\\$,]", "", dow2$next_weeks_close))
dow2 = split(dow2, dow2$stock)
SP500 <- read.csv("INDEX_US_S&P US_SPX.csv",)
SP500 = na.omit(SP500)
SP500$Close = as.numeric(gsub("[\\$,]", "", SP500$Close))
SP500<- SP500[seq(dim(SP500)[1],1),]
SP500 = SP500[-1,]

CAPM Analysis

#generate return variables
ReturnSP500 = na.omit(Delt(SP500[,5])); ReturnSP500
ReturnAA = na.omit(Delt(dow2$AA[,7])); ReturnAA
ReturnAXP = na.omit(Delt(dow2$AXP[,7])); ReturnAXP
ReturnBA = na.omit(Delt(dow2$BA[,7])); ReturnBA
ReturnBAC = na.omit(Delt(dow2$BAC[,7])); ReturnBAC
ReturnCAT = na.omit(Delt(dow2$CAT[,7])); ReturnCAT
ReturnCSCO = na.omit(Delt(dow2$CSCO[,7])); ReturnCSCO
ReturnCVX = na.omit(Delt(dow2$CVX[,7])); ReturnCVX
ReturnDD = na.omit(Delt(dow2$DD[,7])); ReturnDD
ReturnDIS = na.omit(Delt(dow2$DIS[,7])); ReturnDIS
ReturnGE = na.omit(Delt(dow2$GE[,7])); ReturnGE
ReturnHD = na.omit(Delt(dow2$HD[,7])); ReturnHD
ReturnHPQ = na.omit(Delt(dow2$HPQ[,7])); ReturnHPQ
ReturnIBM = na.omit(Delt(dow2$IBM[,7])); ReturnIBM
ReturnINTC = na.omit(Delt(dow2$INTC[,7])); ReturnINTC
ReturnJNJ = na.omit(Delt(dow2$JNJ[,7])); ReturnJNJ
ReturnJPM = na.omit(Delt(dow2$JPM[,7])); ReturnJPM
ReturnKO = na.omit(Delt(dow2$KO[,7])); ReturnKO
ReturnKRFT = na.omit(Delt(dow2$KRFT[,7])); ReturnKRFT
ReturnMCD = na.omit(Delt(dow2$MCD[,7])); ReturnMCD
ReturnMMM = na.omit(Delt(dow2$MMM[,7])); ReturnMMM
ReturnMRK = na.omit(Delt(dow2$MRK[,7])); ReturnMRK
ReturnMSFT = na.omit(Delt(dow2$MSFT[,7])); ReturnMSFT
ReturnPFE = na.omit(Delt(dow2$PFE[,7])); ReturnPFE
ReturnPG = na.omit(Delt(dow2$PG[,7])); ReturnPG
ReturnT = na.omit(Delt(dow2$T[,7])); ReturnT
ReturnTRV = na.omit(Delt(dow2$TRV[,7])); ReturnTRV
ReturnUTX = na.omit(Delt(dow2$UTX[,7])); ReturnUTX
ReturnWMT = na.omit(Delt(dow2$WMT[,7])); ReturnWMT
ReturnVZ = na.omit(Delt(dow2$VZ[,7])); ReturnVZ
ReturnXOM = na.omit(Delt(dow2$XOM[,7])); ReturnXOM
#combine return variables in dataframe
MyData = cbind(ReturnSP500,ReturnAA,ReturnAXP,ReturnBA,ReturnBAC,ReturnCAT,ReturnCSCO,ReturnCVX,ReturnDD,ReturnDIS,ReturnGE,ReturnHD,ReturnHPQ,ReturnIBM,ReturnINTC,ReturnJNJ,ReturnJPM,ReturnKO,ReturnKRFT,ReturnMCD,ReturnMMM,ReturnMRK,ReturnMSFT,ReturnPFE,ReturnPG,ReturnT,ReturnTRV,ReturnUTX,ReturnWMT,ReturnVZ,ReturnXOM)
colnames(MyData) = c("SP500", "AA", "AXP", "BA", "BAC", "CAT", "CSCO", "CVX", "DD", "DIS", "GE", "HD", "HPQ", "IBM", "INTC", "JNJ", "JPM", "KO", "KRFT", "MCD", "MMM", "MRK", "MSFT", "PFE", "PG", "T", "TRV", "UTX", "WMT", "VZ", "XOM")
head(MyData)
#obtain mean and sd for each stock
DataMean=apply(MyData, 2, mean)
DataSD=apply(MyData, 2, sd)
cbind(DataMean,DataSD)
#lm on each stock to generate beta
lm.AA <- lm(AA~ SP500, data = as.data.frame(MyData))
lm.AXP <- lm(AXP~ SP500, data = as.data.frame(MyData))
lm.BA <- lm(BA~ SP500, data = as.data.frame(MyData))
lm.BAC <- lm(BAC~ SP500, data = as.data.frame(MyData))
lm.CAT <- lm(CAT~ SP500, data = as.data.frame(MyData))
lm.CSCO <- lm(CSCO~ SP500, data = as.data.frame(MyData))
lm.CVX <- lm(CVX~ SP500, data = as.data.frame(MyData))
lm.DD <- lm(DD~ SP500, data = as.data.frame(MyData))
lm.DIS <- lm(DIS~ SP500, data = as.data.frame(MyData))
lm.GE <- lm(GE~ SP500, data = as.data.frame(MyData))
lm.HD <- lm(HD~ SP500, data = as.data.frame(MyData))
lm.HPQ <- lm(HPQ~ SP500, data = as.data.frame(MyData))
lm.IBM <- lm(IBM~ SP500, data = as.data.frame(MyData))
lm.INTC <- lm(INTC~ SP500, data = as.data.frame(MyData))
lm.JNJ <- lm(JNJ~ SP500, data = as.data.frame(MyData))
lm.JPM <- lm(JPM~ SP500, data = as.data.frame(MyData))
lm.KO <- lm(KO~ SP500, data = as.data.frame(MyData))
lm.KRFT <- lm(KRFT~ SP500, data = as.data.frame(MyData))
lm.MCD <- lm(MCD~ SP500, data = as.data.frame(MyData))
lm.MMM <- lm(MMM~ SP500, data = as.data.frame(MyData))
lm.MRK <- lm(MRK~ SP500, data = as.data.frame(MyData))
lm.MSFT <- lm(MSFT~ SP500, data = as.data.frame(MyData))
lm.PFE <- lm(PFE~ SP500, data = as.data.frame(MyData))
lm.PG <- lm(PG~ SP500, data = as.data.frame(MyData))
lm.T <- lm(T~ SP500, data = as.data.frame(MyData))
lm.TRV <- lm(TRV~ SP500, data = as.data.frame(MyData))
lm.UTX <- lm(UTX~ SP500, data = as.data.frame(MyData))
lm.WMT <- lm(WMT~ SP500, data = as.data.frame(MyData))
lm.VZ <- lm(VZ~ SP500, data = as.data.frame(MyData))
lm.XOM <- lm(XOM~ SP500, data = as.data.frame(MyData))
#put betas into a dataframe
beta.list = vector(mode="list")
beta.list[1] = summary(lm.AA)$coefficient[2,1]
beta.list[2] = summary(lm.AXP)$coefficient[2,1]
beta.list[3] = summary(lm.BA)$coefficient[2,1]
beta.list[4] = summary(lm.BAC)$coefficient[2,1]
beta.list[5] = summary(lm.CAT)$coefficient[2,1]
beta.list[6] = summary(lm.CSCO)$coefficient[2,1]
beta.list[7] = summary(lm.CVX)$coefficient[2,1]
beta.list[8] = summary(lm.DD)$coefficient[2,1]
beta.list[9] = summary(lm.DIS)$coefficient[2,1]
beta.list[10] = summary(lm.GE)$coefficient[2,1]
beta.list[11] = summary(lm.HD)$coefficient[2,1]
beta.list[12] = summary(lm.HPQ)$coefficient[2,1]
beta.list[13] = summary(lm.IBM)$coefficient[2,1]
beta.list[14] = summary(lm.INTC)$coefficient[2,1]
beta.list[15] = summary(lm.JNJ)$coefficient[2,1]
beta.list[16] = summary(lm.JPM)$coefficient[2,1]
beta.list[17] = summary(lm.KO)$coefficient[2,1]
beta.list[18] = summary(lm.KRFT)$coefficient[2,1]
beta.list[19] = summary(lm.MCD)$coefficient[2,1]
beta.list[20] = summary(lm.MMM)$coefficient[2,1]
beta.list[21] = summary(lm.MRK)$coefficient[2,1]
beta.list[22] = summary(lm.MSFT)$coefficient[2,1]
beta.list[23] = summary(lm.PFE)$coefficient[2,1]
beta.list[24] = summary(lm.PG)$coefficient[2,1]
beta.list[25] = summary(lm.T)$coefficient[2,1]
beta.list[26] = summary(lm.TRV)$coefficient[2,1]
beta.list[27] = summary(lm.UTX)$coefficient[2,1]
beta.list[28] = summary(lm.WMT)$coefficient[2,1]
beta.list[29] = summary(lm.VZ)$coefficient[2,1]
beta.list[30] = summary(lm.XOM)$coefficient[2,1]
beta.df = data.frame(beta.list)
beta.df = t(beta.df)
#combine predictions and beta
output = cbind(pred.df,beta.df)
colnames(output) = c("Percent Change Next Weeks Price Predictions", "Beta")
output
##      Percent Change Next Weeks Price Predictions      Beta
## AA                                     0.6169720 1.4552380
## AXP                                   -0.2090623 0.9274837
## BA                                     0.2617704 1.4731589
## BAC                                   -0.7118690 0.7126986
## CAT                                    2.3934434 1.5635308
## CSCO                                  -0.9436846 0.6966052
## CVX                                    2.3079950 0.8862385
## DD                                     1.2767755 1.2648750
## DIS                                    0.5684285 1.4666356
## GE                                     0.0798065 1.1416111
## HD                                     0.2046310 0.8143882
## HPQ                                   -0.5104214 1.1254713
## IBM                                    0.4578060 0.8071604
## INTC                                  -0.1590134 1.1726496
## JNJ                                    0.1163575 0.7436642
## JPM                                    0.2461876 0.7075853
## KO                                     1.2827774 0.6800020
## KRFT                                   0.6130615 0.2256578
## MCD                                    0.3306353 0.6895718
## MMM                                    0.3624885 1.0753253
## MRK                                   -0.5910745 0.5914761
## MSFT                                  -0.9151817 0.6697241
## PFE                                    0.1147030 0.7420847
## PG                                    -0.8703965 0.3534124
## T                                      0.4383295 0.8389334
## TRV                                    0.9526584 0.9400376
## UTX                                    0.4048329 1.1004080
## VZ                                     0.5667790 0.4727700
## WMT                                   -0.0634735 0.8047183
## XOM                                    0.3065000 1.1984868

Using the SP500 data as the market, we note that a beta greater than 1 indicates that the security’s price is more volatile than the market. We can see that CAT offers a high return but there is also high risk associated with it. A better stock to invest in would be CVX as it has a very high return and is less risky than the market.