#load data
dow=read.table("dow_jones_index.data", sep = ',', header=TRUE)
dow=na.omit(dow)
dow$date <- as.Date(dow$date, format = "%m/%d/%y")
dow$open = as.numeric(gsub("[\\$,]", "", dow$open))
dow$high = as.numeric(gsub("[\\$,]", "", dow$high))
dow$low = as.numeric(gsub("[\\$,]", "", dow$low))
dow$close = as.numeric(gsub("[\\$,]", "", dow$close))
dow$next_weeks_open = as.numeric(gsub("[\\$,]", "", dow$next_weeks_open))
dow$next_weeks_close = as.numeric(gsub("[\\$,]", "", dow$next_weeks_close))
#set up lag variables (lag 1 seems to show most correlation based on plots)
dow=dow %>%
group_by(stock)%>%
mutate(open.lag=dplyr::lag(open,n=1))
dow=dow %>%
group_by(stock)%>%
mutate(high.lag=dplyr::lag(high,n=1))
dow=dow %>%
group_by(stock)%>%
mutate(low.lag=dplyr::lag(low,n=1))
dow=dow %>%
group_by(stock)%>%
mutate(close.lag=dplyr::lag(close,n=1))
dow=dow %>%
group_by(stock)%>%
mutate(percent_change_volume_over_last_wk.lag=dplyr::lag(percent_change_volume_over_last_wk,n=1))
dow=dow %>%
group_by(stock)%>%
mutate(percent_change_next_weeks_price.lag=dplyr::lag(percent_change_next_weeks_price,n=1))
dow=dow %>%
group_by(stock)%>%
mutate(percent_return_next_dividend.lag=dplyr::lag(percent_return_next_dividend,n=1))
dow=dow %>%
group_by(stock)%>%
mutate(next_weeks_close.lag=dplyr::lag(next_weeks_close,n=1))
dow= dow %>%
group_by(stock) %>%
mutate(next_weeks_open.lag = dplyr::lag(next_weeks_open, n=1))
#train test split
data=split(dow,dow$quarter)
train<-data[[1]]
test<-data[[2]]
train<-split(train,train$stock)
test<-split(test,test$stock)
#lag plots
lag.plot(dow$open,set.lag=1:12)
lag.plot(dow$high, set.lag=1:12)
lag.plot(dow$low, set.lag=1:12)
lag.plot(dow$close, set.lag=1:12)
We have chosen to use lag(n=1) for the variables in our model as they seem to show the most correlation.
model = glm(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + close.lag + volume, data=train[[1]])
vif(model)
## open.lag high.lag low.lag close.lag volume
## 30.13307 186.85735 56.30198 356.83660 30.53064
model = glm(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + volume, data=train[[1]])
vif(model)
## open.lag high.lag low.lag volume
## 2.476961 5.373253 5.213612 1.183033
We removed close.lag from our model due to VIF > 10.
mse.list = vector(mode="list")
for(i in names(train))
{
model = glm(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + volume, data=train[[i]])
preds = predict(model, test[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
mse.list[i]=mse
}
mse.df=data.frame(mse.list)
mse.df=t(mse.df)
print("MSE values")
print(str(mse.list))
print("Mean MSE")
print(mean(mse.df))
## [1] "MSE values"
## List of 30
## $ AA : num 16.9
## $ AXP : num 17.8
## $ BA : num 15.4
## $ BAC : num 9.77
## $ CAT : num 24.7
## $ CSCO: num 207
## $ CVX : num 21.8
## $ DD : num 11.3
## $ DIS : num 8.58
## $ GE : num 9.74
## $ HD : num 3.7
## $ HPQ : num 17.4
## $ IBM : num 5.7
## $ INTC: num 63.5
## $ JNJ : num 31
## $ JPM : num 26.2
## $ KO : num 3.28
## $ KRFT: num 106
## $ MCD : num 10.3
## $ MMM : num 11.1
## $ MRK : num 14.1
## $ MSFT: num 11.4
## $ PFE : num 5.45
## $ PG : num 10.9
## $ T : num 93.9
## $ TRV : num 5.16
## $ UTX : num 12.1
## $ VZ : num 5.38
## $ WMT : num 2.49
## $ XOM : num 10.6
## NULL
## [1] "Mean MSE"
## [1] 26.42919
mse.list = vector(mode="list")
for(i in names(train))
{
model = tree(percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + volume, data=train[[i]])
preds = predict(model, test[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
mse.list[i]=mse
}
mse.df=data.frame(mse.list)
mse.df=t(mse.df)
print("MSE values")
print(str(mse.list))
print("Mean MSE")
print(mean(mse.df))
## [1] "MSE values"
## List of 30
## $ AA : num 19.3
## $ AXP : num 15.1
## $ BA : num 9.64
## $ BAC : num 7.93
## $ CAT : num 23.3
## $ CSCO: num 11.4
## $ CVX : num 13.3
## $ DD : num 8.32
## $ DIS : num 9.4
## $ GE : num 12.6
## $ HD : num 6.48
## $ HPQ : num 12.7
## $ IBM : num 4.54
## $ INTC: num 17.7
## $ JNJ : num 7.28
## $ JPM : num 10.7
## $ KO : num 3.22
## $ KRFT: num 7.07
## $ MCD : num 2.88
## $ MMM : num 7.4
## $ MRK : num 5.3
## $ MSFT: num 10.6
## $ PFE : num 8.47
## $ PG : num 4.62
## $ T : num 3.01
## $ TRV : num 4.65
## $ UTX : num 11.6
## $ VZ : num 4.71
## $ WMT : num 1.26
## $ XOM : num 12.1
## NULL
## [1] "Mean MSE"
## [1] 9.218837
#tune parameters
form1 = percent_change_next_weeks_price ~ open.lag + high.lag + low.lag + volume
tuned = tune.svm(form1, data = train[[1]], gamma = seq(0.01, 0.1, by = 0.01), cost = seq(0.1, 1, by = 0.1))
tuned$best.parameters
## [1] gamma cost
## <0 rows> (or 0-length row.names)
Our best parameters include a gamma of 0.05 and a cost of 1.
mse.list = vector(mode="list")
for(i in names(train))
{
model = svm(formula = form1, data= train[[i]], gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
preds = predict(model, test[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
mse.list[i]=mse
}
mse.df=data.frame(mse.list)
mse.df=t(mse.df)
print("MSE values")
print(str(mse.list))
print("Mean MSE")
print(mean(mse.df))
## [1] "MSE values"
## List of 30
## $ AA : num 19.7
## $ AXP : num 10.7
## $ BA : num 8.16
## $ BAC : num 7.88
## $ CAT : num 25
## $ CSCO: num 11.1
## $ CVX : num 15.2
## $ DD : num 10.8
## $ DIS : num 9.55
## $ GE : num 7.54
## $ HD : num 5.66
## $ HPQ : num 13.9
## $ IBM : num 3.5
## $ INTC: num 18.9
## $ JNJ : num 5.84
## $ JPM : num 7.91
## $ KO : num 4.15
## $ KRFT: num 3.5
## $ MCD : num 3.34
## $ MMM : num 5.92
## $ MRK : num 7.26
## $ MSFT: num 10.3
## $ PFE : num 5.6
## $ PG : num 5.3
## $ T : num 2.07
## $ TRV : num 5.65
## $ UTX : num 8.93
## $ VZ : num 5.21
## $ WMT : num 2.24
## $ XOM : num 11.1
## NULL
## [1] "Mean MSE"
## [1] 8.725331
We note that SVR seems to be performing the best based on MSE criteria. The MSE means for all the stocks for each method are as follows: Linear Regression: 26.42919, Decision Tree: 9.218837, SVR: 8.830263.
#create test set 2 (ONLY with last weeks data)
test2=split(dow,dow$date)
test2 = test2[[24]]
test2 = split(test2, test2$stock)
#svr loop on each stock, using train to train the model, and predicting on test2
pred.list = vector(mode="list")
for(i in names(train))
{
model = svm(formula = form1, data= train[[i]], gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
preds = predict(model, test2[[i]])
mse = mean((test[[i]]$percent_change_next_weeks_price-preds)^2)
pred.list[i]=preds
}
print("Percent Change Next Weeks Price Predictions")
print(str(pred.list))
pred.df=data.frame(pred.list)
pred.df=t(pred.df)
## [1] "Percent Change Next Weeks Price Predictions"
## List of 30
## $ AA : num 0.617
## $ AXP : num -0.209
## $ BA : num 0.262
## $ BAC : num -0.712
## $ CAT : num 2.39
## $ CSCO: num -0.944
## $ CVX : num 2.31
## $ DD : num 1.28
## $ DIS : num 0.568
## $ GE : num 0.0798
## $ HD : num 0.205
## $ HPQ : num -0.51
## $ IBM : num 0.458
## $ INTC: num -0.159
## $ JNJ : num 0.116
## $ JPM : num 0.246
## $ KO : num 1.28
## $ KRFT: num 0.613
## $ MCD : num 0.331
## $ MMM : num 0.362
## $ MRK : num -0.591
## $ MSFT: num -0.915
## $ PFE : num 0.115
## $ PG : num -0.87
## $ T : num 0.438
## $ TRV : num 0.953
## $ UTX : num 0.405
## $ VZ : num 0.567
## $ WMT : num -0.0635
## $ XOM : num 0.306
## NULL
According to our SVR model and running our model only on the last week of data available. We conclude that the stock CAT is most likely to produce the greatest rate of return in the following week.
#import data
dow2=read.table("dow_jones_index.data", sep = ',', header=TRUE)
dow2=na.omit(dow2)
dow2$date <- as.Date(dow$date, format = "%m/%d/%y")
dow2$open = as.numeric(gsub("[\\$,]", "", dow2$open))
dow2$high = as.numeric(gsub("[\\$,]", "", dow2$high))
dow2$low = as.numeric(gsub("[\\$,]", "", dow2$low))
dow2$close = as.numeric(gsub("[\\$,]", "", dow2$close))
dow2$next_weeks_open = as.numeric(gsub("[\\$,]", "", dow2$next_weeks_open))
dow2$next_weeks_close = as.numeric(gsub("[\\$,]", "", dow2$next_weeks_close))
dow2 = split(dow2, dow2$stock)
SP500 <- read.csv("INDEX_US_S&P US_SPX.csv",)
SP500 = na.omit(SP500)
SP500$Close = as.numeric(gsub("[\\$,]", "", SP500$Close))
SP500<- SP500[seq(dim(SP500)[1],1),]
SP500 = SP500[-1,]
#generate return variables
ReturnSP500 = na.omit(Delt(SP500[,5])); ReturnSP500
ReturnAA = na.omit(Delt(dow2$AA[,7])); ReturnAA
ReturnAXP = na.omit(Delt(dow2$AXP[,7])); ReturnAXP
ReturnBA = na.omit(Delt(dow2$BA[,7])); ReturnBA
ReturnBAC = na.omit(Delt(dow2$BAC[,7])); ReturnBAC
ReturnCAT = na.omit(Delt(dow2$CAT[,7])); ReturnCAT
ReturnCSCO = na.omit(Delt(dow2$CSCO[,7])); ReturnCSCO
ReturnCVX = na.omit(Delt(dow2$CVX[,7])); ReturnCVX
ReturnDD = na.omit(Delt(dow2$DD[,7])); ReturnDD
ReturnDIS = na.omit(Delt(dow2$DIS[,7])); ReturnDIS
ReturnGE = na.omit(Delt(dow2$GE[,7])); ReturnGE
ReturnHD = na.omit(Delt(dow2$HD[,7])); ReturnHD
ReturnHPQ = na.omit(Delt(dow2$HPQ[,7])); ReturnHPQ
ReturnIBM = na.omit(Delt(dow2$IBM[,7])); ReturnIBM
ReturnINTC = na.omit(Delt(dow2$INTC[,7])); ReturnINTC
ReturnJNJ = na.omit(Delt(dow2$JNJ[,7])); ReturnJNJ
ReturnJPM = na.omit(Delt(dow2$JPM[,7])); ReturnJPM
ReturnKO = na.omit(Delt(dow2$KO[,7])); ReturnKO
ReturnKRFT = na.omit(Delt(dow2$KRFT[,7])); ReturnKRFT
ReturnMCD = na.omit(Delt(dow2$MCD[,7])); ReturnMCD
ReturnMMM = na.omit(Delt(dow2$MMM[,7])); ReturnMMM
ReturnMRK = na.omit(Delt(dow2$MRK[,7])); ReturnMRK
ReturnMSFT = na.omit(Delt(dow2$MSFT[,7])); ReturnMSFT
ReturnPFE = na.omit(Delt(dow2$PFE[,7])); ReturnPFE
ReturnPG = na.omit(Delt(dow2$PG[,7])); ReturnPG
ReturnT = na.omit(Delt(dow2$T[,7])); ReturnT
ReturnTRV = na.omit(Delt(dow2$TRV[,7])); ReturnTRV
ReturnUTX = na.omit(Delt(dow2$UTX[,7])); ReturnUTX
ReturnWMT = na.omit(Delt(dow2$WMT[,7])); ReturnWMT
ReturnVZ = na.omit(Delt(dow2$VZ[,7])); ReturnVZ
ReturnXOM = na.omit(Delt(dow2$XOM[,7])); ReturnXOM
#combine return variables in dataframe
MyData = cbind(ReturnSP500,ReturnAA,ReturnAXP,ReturnBA,ReturnBAC,ReturnCAT,ReturnCSCO,ReturnCVX,ReturnDD,ReturnDIS,ReturnGE,ReturnHD,ReturnHPQ,ReturnIBM,ReturnINTC,ReturnJNJ,ReturnJPM,ReturnKO,ReturnKRFT,ReturnMCD,ReturnMMM,ReturnMRK,ReturnMSFT,ReturnPFE,ReturnPG,ReturnT,ReturnTRV,ReturnUTX,ReturnWMT,ReturnVZ,ReturnXOM)
colnames(MyData) = c("SP500", "AA", "AXP", "BA", "BAC", "CAT", "CSCO", "CVX", "DD", "DIS", "GE", "HD", "HPQ", "IBM", "INTC", "JNJ", "JPM", "KO", "KRFT", "MCD", "MMM", "MRK", "MSFT", "PFE", "PG", "T", "TRV", "UTX", "WMT", "VZ", "XOM")
head(MyData)
#obtain mean and sd for each stock
DataMean=apply(MyData, 2, mean)
DataSD=apply(MyData, 2, sd)
cbind(DataMean,DataSD)
#lm on each stock to generate beta
lm.AA <- lm(AA~ SP500, data = as.data.frame(MyData))
lm.AXP <- lm(AXP~ SP500, data = as.data.frame(MyData))
lm.BA <- lm(BA~ SP500, data = as.data.frame(MyData))
lm.BAC <- lm(BAC~ SP500, data = as.data.frame(MyData))
lm.CAT <- lm(CAT~ SP500, data = as.data.frame(MyData))
lm.CSCO <- lm(CSCO~ SP500, data = as.data.frame(MyData))
lm.CVX <- lm(CVX~ SP500, data = as.data.frame(MyData))
lm.DD <- lm(DD~ SP500, data = as.data.frame(MyData))
lm.DIS <- lm(DIS~ SP500, data = as.data.frame(MyData))
lm.GE <- lm(GE~ SP500, data = as.data.frame(MyData))
lm.HD <- lm(HD~ SP500, data = as.data.frame(MyData))
lm.HPQ <- lm(HPQ~ SP500, data = as.data.frame(MyData))
lm.IBM <- lm(IBM~ SP500, data = as.data.frame(MyData))
lm.INTC <- lm(INTC~ SP500, data = as.data.frame(MyData))
lm.JNJ <- lm(JNJ~ SP500, data = as.data.frame(MyData))
lm.JPM <- lm(JPM~ SP500, data = as.data.frame(MyData))
lm.KO <- lm(KO~ SP500, data = as.data.frame(MyData))
lm.KRFT <- lm(KRFT~ SP500, data = as.data.frame(MyData))
lm.MCD <- lm(MCD~ SP500, data = as.data.frame(MyData))
lm.MMM <- lm(MMM~ SP500, data = as.data.frame(MyData))
lm.MRK <- lm(MRK~ SP500, data = as.data.frame(MyData))
lm.MSFT <- lm(MSFT~ SP500, data = as.data.frame(MyData))
lm.PFE <- lm(PFE~ SP500, data = as.data.frame(MyData))
lm.PG <- lm(PG~ SP500, data = as.data.frame(MyData))
lm.T <- lm(T~ SP500, data = as.data.frame(MyData))
lm.TRV <- lm(TRV~ SP500, data = as.data.frame(MyData))
lm.UTX <- lm(UTX~ SP500, data = as.data.frame(MyData))
lm.WMT <- lm(WMT~ SP500, data = as.data.frame(MyData))
lm.VZ <- lm(VZ~ SP500, data = as.data.frame(MyData))
lm.XOM <- lm(XOM~ SP500, data = as.data.frame(MyData))
#put betas into a dataframe
beta.list = vector(mode="list")
beta.list[1] = summary(lm.AA)$coefficient[2,1]
beta.list[2] = summary(lm.AXP)$coefficient[2,1]
beta.list[3] = summary(lm.BA)$coefficient[2,1]
beta.list[4] = summary(lm.BAC)$coefficient[2,1]
beta.list[5] = summary(lm.CAT)$coefficient[2,1]
beta.list[6] = summary(lm.CSCO)$coefficient[2,1]
beta.list[7] = summary(lm.CVX)$coefficient[2,1]
beta.list[8] = summary(lm.DD)$coefficient[2,1]
beta.list[9] = summary(lm.DIS)$coefficient[2,1]
beta.list[10] = summary(lm.GE)$coefficient[2,1]
beta.list[11] = summary(lm.HD)$coefficient[2,1]
beta.list[12] = summary(lm.HPQ)$coefficient[2,1]
beta.list[13] = summary(lm.IBM)$coefficient[2,1]
beta.list[14] = summary(lm.INTC)$coefficient[2,1]
beta.list[15] = summary(lm.JNJ)$coefficient[2,1]
beta.list[16] = summary(lm.JPM)$coefficient[2,1]
beta.list[17] = summary(lm.KO)$coefficient[2,1]
beta.list[18] = summary(lm.KRFT)$coefficient[2,1]
beta.list[19] = summary(lm.MCD)$coefficient[2,1]
beta.list[20] = summary(lm.MMM)$coefficient[2,1]
beta.list[21] = summary(lm.MRK)$coefficient[2,1]
beta.list[22] = summary(lm.MSFT)$coefficient[2,1]
beta.list[23] = summary(lm.PFE)$coefficient[2,1]
beta.list[24] = summary(lm.PG)$coefficient[2,1]
beta.list[25] = summary(lm.T)$coefficient[2,1]
beta.list[26] = summary(lm.TRV)$coefficient[2,1]
beta.list[27] = summary(lm.UTX)$coefficient[2,1]
beta.list[28] = summary(lm.WMT)$coefficient[2,1]
beta.list[29] = summary(lm.VZ)$coefficient[2,1]
beta.list[30] = summary(lm.XOM)$coefficient[2,1]
beta.df = data.frame(beta.list)
beta.df = t(beta.df)
#combine predictions and beta
output = cbind(pred.df,beta.df)
colnames(output) = c("Percent Change Next Weeks Price Predictions", "Beta")
output
## Percent Change Next Weeks Price Predictions Beta
## AA 0.6169720 1.4552380
## AXP -0.2090623 0.9274837
## BA 0.2617704 1.4731589
## BAC -0.7118690 0.7126986
## CAT 2.3934434 1.5635308
## CSCO -0.9436846 0.6966052
## CVX 2.3079950 0.8862385
## DD 1.2767755 1.2648750
## DIS 0.5684285 1.4666356
## GE 0.0798065 1.1416111
## HD 0.2046310 0.8143882
## HPQ -0.5104214 1.1254713
## IBM 0.4578060 0.8071604
## INTC -0.1590134 1.1726496
## JNJ 0.1163575 0.7436642
## JPM 0.2461876 0.7075853
## KO 1.2827774 0.6800020
## KRFT 0.6130615 0.2256578
## MCD 0.3306353 0.6895718
## MMM 0.3624885 1.0753253
## MRK -0.5910745 0.5914761
## MSFT -0.9151817 0.6697241
## PFE 0.1147030 0.7420847
## PG -0.8703965 0.3534124
## T 0.4383295 0.8389334
## TRV 0.9526584 0.9400376
## UTX 0.4048329 1.1004080
## VZ 0.5667790 0.4727700
## WMT -0.0634735 0.8047183
## XOM 0.3065000 1.1984868
Using the SP500 data as the market, we note that a beta greater than 1 indicates that the security’s price is more volatile than the market. We can see that CAT offers a high return but there is also high risk associated with it. A better stock to invest in would be CVX as it has a very high return and is less risky than the market.