Logistive Regression Model
study source: https://www.youtube.com/watch?v=mteljf020EE
head(Smarket)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
## 2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
## 3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
## 4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
## 5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
## 6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
tail(Smarket)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1245 2005 0.252 -0.024 -0.584 -0.285 -0.141 2.06517 0.422 Up
## 1246 2005 0.422 0.252 -0.024 -0.584 -0.285 1.88850 0.043 Up
## 1247 2005 0.043 0.422 0.252 -0.024 -0.584 1.28581 -0.955 Down
## 1248 2005 -0.955 0.043 0.422 0.252 -0.024 1.54047 0.130 Up
## 1249 2005 0.130 -0.955 0.043 0.422 0.252 1.42236 -0.298 Down
## 1250 2005 -0.298 0.130 -0.955 0.043 0.422 1.38254 -0.489 Down
#summary(Smarket)
Year:The year that the observation was recorded
Lag1:Percentage return for previous day
Lag2:Percentage return for 2 days previous
Lag3:Percentage return for 3 days previous
Lag4:Percentage return for 4 days previous
Lag5:Percentage return for 5 days previous
Volume:Volume of shares traded (number of daily shares traded in billions)
Today:Percentage return for today
Direction: A factor with levels Down and Up indicating whether the market had a positive or negative return on a given day
Observed the corelations are very samll, which means the stock return rate are very random.
#cor(Smarket[,-9])
pairs.panels(Smarket)
Holdout data : Year (2005), Trainging data : Year (2001-2004)
training=(Year<2005) #logical vetor find match of the data
holdout=!training
training_data=Smarket[training,]
holdout_data=Smarket[holdout,]
Direction_holdout=Direction[holdout]
#summary(holdout_data)
Specify data by using training_data
For logistic regression: family=binomial
stock_model5=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=training_data, family = binomial)
#summary(stock_model)
The coefficient for Lags means, market return rate and lag’s return rate having same direction when it is positive; and market return rate and lag’s return rate having opposit direction when it is negative.
compute the predictions for 2005 and compare them to the actual movements(holdout data) of the market over that time period
model_Training_probs: It is a probability outcome from Predictie model that market will go up
Creat a vetor having same size of Direction_hotout and fill in with value “Down”
Set model_Training_probs> 0.5 then replace “Down” in vector of model_Training_Direction
#for a default binomial model the default predictions are of log-odds (probabilities on logit scale) and type = "response" gives the predicted probabilities
model_Training_probs5 = predict(stock_model5,holdout_data,type="response")
Num_holdout<-length(Direction_holdout)
model_Training_Direction5=rep("Down",Num_holdout)
model_Training_Direction5[model_Training_probs5>0.5] = "UP"
t5<-table(Direction_holdout,model_Training_Direction5)
t5
## model_Training_Direction5
## Direction_holdout Down UP
## Down 77 34
## Up 97 44
TPRate=TP/(TP+FP) FPRate=FP/(TN+FN) Lags5(FPRate,TPRate)
Accuracy=(77+44)/252
TPRate=77/(77+44)
FPRate=97/(97+34)
Lags5=c(FPRate,TPRate,Accuracy)
Lags5
## [1] 0.7404580 0.6363636 0.4801587
stock_model4=glm(Direction~Lag1+Lag2+Lag3+Lag4+Volume,data=training_data, family = binomial)
model_Training_probs4 = predict(stock_model4,holdout_data,type="response")
Num_holdout<-length(Direction_holdout)
model_Training_Direction4=rep("Down",Num_holdout)
model_Training_Direction4[model_Training_probs4>0.5] = "UP"
t4<-table(Direction_holdout,model_Training_Direction4)
t4
## model_Training_Direction4
## Direction_holdout Down UP
## Down 77 34
## Up 97 44
Accuracy=(77+44)/252
TPRate=77/(77+44)
FPRate=97/(97+34)
Lags4=c(FPRate,TPRate,Accuracy)
stock_model3=glm(Direction~Lag1+Lag2+Lag3+Volume,data=training_data, family = binomial)
model_Training_probs3 = predict(stock_model3,holdout_data,type="response")
Num_holdout<-length(Direction_holdout)
model_Training_Direction3=rep("Down",Num_holdout)
model_Training_Direction3[model_Training_probs3>0.5] = "UP"
t3<-table(Direction_holdout,model_Training_Direction3)
t3
## model_Training_Direction3
## Direction_holdout Down UP
## Down 78 33
## Up 99 42
Accuracy=(78+42)/252
TPRate=78/(78+42)
FPRate=99/(99+33)
Lags3=c(FPRate,TPRate,Accuracy)
stock_model2=glm(Direction~Lag1+Lag2+Volume,data=training_data, family = binomial)
model_Training_probs2 = predict(stock_model2,holdout_data,type="response")
Num_holdout<-length(Direction_holdout)
model_Training_Direction2=rep("Down",Num_holdout)
model_Training_Direction2[model_Training_probs2>0.5] = "UP"
t2<-table(Direction_holdout,model_Training_Direction2)
t2
## model_Training_Direction2
## Direction_holdout Down UP
## Down 79 32
## Up 100 41
Accuracy=(79+41)/252
TPRate=79/(79+41)
FPRate=100/(100+32)
Lags2=c(FPRate,TPRate,Accuracy)
stock_model1=glm(Direction~Lag1+Volume,data=training_data, family = binomial)
model_Training_probs1 = predict(stock_model1,holdout_data,type="response")
Num_holdout<-length(Direction_holdout)
model_Training_Direction1=rep("Down",Num_holdout)
model_Training_Direction1[model_Training_probs1>0.5] = "UP"
t1<-table(Direction_holdout,model_Training_Direction1)
t1
## model_Training_Direction1
## Direction_holdout Down UP
## Down 84 27
## Up 95 46
Accuracy=(84+46)/252
TPRate=84/(84+46)
FPRate=95/(95+27)
Lags1=c(FPRate,TPRate,Accuracy)
p<-as.data.frame(rbind(Lags1,Lags2,Lags3,Lags4,Lags5))
colnames(p)<-c("FPRate","TPRate","Accuracy")
p
## FPRate TPRate Accuracy
## Lags1 0.7786885 0.6461538 0.5158730
## Lags2 0.7575758 0.6583333 0.4761905
## Lags3 0.7500000 0.6500000 0.4761905
## Lags4 0.7404580 0.6363636 0.4801587
## Lags5 0.7404580 0.6363636 0.4801587
plot(p$FPRate,p$TPRate,main="ROC",xlab="FP Rate", ylab="TP Rat", xlim=c(0, 1), ylim=c(0, 1))