Sameer Mathur
Logistic Regression, LDA, QDA, and KNN
Here we are dealing with The Stock Market Dataset Which is the Inbuilt dataset in ISLR Package.
# loading the package in library
library(ISLR)
# attaching the inbuilt dataset
attach(Smarket)
# dimentions of the dataset
dim(Smarket)
[1] 1250 9
# some rows of the data set
head(Smarket)
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 Up
2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 Up
3 2001 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 Down
4 2001 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 Up
5 2001 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 Up
6 2001 0.213 0.614 -0.623 1.032 0.959 1.3491 1.392 Up
# correlation matrix
mcor <- cor(Smarket[,-9])
# rounding for two decemal places
round(mcor,2)
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
Year 1.00 0.03 0.03 0.03 0.04 0.03 0.54 0.03
Lag1 0.03 1.00 -0.03 -0.01 0.00 -0.01 0.04 -0.03
Lag2 0.03 -0.03 1.00 -0.03 -0.01 0.00 -0.04 -0.01
Lag3 0.03 -0.01 -0.03 1.00 -0.02 -0.02 -0.04 0.00
Lag4 0.04 0.00 -0.01 -0.02 1.00 -0.03 -0.05 -0.01
Lag5 0.03 -0.01 0.00 -0.02 -0.03 1.00 -0.02 -0.03
Volume 0.54 0.04 -0.04 -0.04 -0.05 -0.02 1.00 0.01
Today 0.03 -0.03 -0.01 0.00 -0.01 -0.03 0.01 1.00
# fitting logistic regression model
glm.fits=glm(Direction ~
Lag1 + Lag2 + Lag3
+ Lag4 + Lag5 + Volume,
data = Smarket, family = binomial)
summary(glm.fits)
Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
Volume, family = binomial, data = Smarket)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.446 -1.203 1.065 1.145 1.326
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000 0.240736 -0.523 0.601
Lag1 -0.073074 0.050167 -1.457 0.145
Lag2 -0.042301 0.050086 -0.845 0.398
Lag3 0.011085 0.049939 0.222 0.824
Lag4 0.009359 0.049974 0.187 0.851
Lag5 0.010313 0.049511 0.208 0.835
Volume 0.135441 0.158360 0.855 0.392
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1731.2 on 1249 degrees of freedom
Residual deviance: 1727.6 on 1243 degrees of freedom
AIC: 1741.6
Number of Fisher Scoring iterations: 3
# coefficients
coef(glm.fits)
(Intercept) Lag1 Lag2 Lag3 Lag4
-0.126000257 -0.073073746 -0.042301344 0.011085108 0.009358938
Lag5 Volume
0.010313068 0.135440659
# Summary of coefficients
summary(glm.fits)$coef
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
Lag1 -0.073073746 0.05016739 -1.4565986 0.1452272
Lag2 -0.042301344 0.05008605 -0.8445733 0.3983491
Lag3 0.011085108 0.04993854 0.2219750 0.8243333
Lag4 0.009358938 0.04997413 0.1872757 0.8514445
Lag5 0.010313068 0.04951146 0.2082966 0.8349974
Volume 0.135440659 0.15835970 0.8552723 0.3924004
# Predicted Probabilities
glm.probs <- predict(glm.fits,type="response")
# assigning "Up" > 0.5 ,"Down" < 0.5
Direction_pred <- ifelse(glm.probs > 0.5, "Up", "Down")
# creating new dataframe with predicted values
pred_DF <- data.frame(Direction,glm.probs ,Direction_pred)
# some rows for predicted and actual Direction columns with their probabilities
head(pred_DF,10)
Direction glm.probs Direction_pred
1 Up 0.5070841 Up
2 Up 0.4814679 Down
3 Down 0.4811388 Down
4 Up 0.5152224 Up
5 Up 0.5107812 Up
6 Up 0.5069565 Up
7 Down 0.4926509 Down
8 Up 0.5092292 Up
9 Up 0.5176135 Up
10 Up 0.4888378 Down
# table for predicted and actual Direction
table(Direction, Direction_pred)
Direction_pred
Direction Down Up
Down 145 457
Up 141 507
# probability for correct prediction
(507+145)/1250
[1] 0.5216
# mean
mean(Direction_pred == Direction)
[1] 0.5216
# taking the training dataset of the years before 2005
train <- (Year<2005)
# creating a subset from Smarket dataset
Smarket.2005 <- Smarket[!train,]
# dimentions of the new data frame
dim(Smarket.2005)
[1] 252 9
# assigning the column name
Direction.2005 <- Direction[!train]
# viewing the training dataset
head(Smarket.2005)
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
999 2005 -0.134 0.008 -0.007 0.715 -0.431 0.7869 -0.812 Down
1000 2005 -0.812 -0.134 0.008 -0.007 0.715 1.5108 -1.167 Down
1001 2005 -1.167 -0.812 -0.134 0.008 -0.007 1.7210 -0.363 Down
1002 2005 -0.363 -1.167 -0.812 -0.134 0.008 1.7389 0.351 Up
1003 2005 0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143 Down
1004 2005 -0.143 0.351 -0.363 -1.167 -0.812 1.4779 0.342 Up
# fitting logistic Regression
glm.fits2 <- glm(Direction ~
Lag1 + Lag2 + Lag3 + Lag4 + Lag5
+ Volume,data = Smarket,family = binomial,subset = train)
# Predicted Probabilities
glm.probs2 <- predict(glm.fits,Smarket.2005,type="response")
# assigning "Up" > 0.5 ,"Down" < 0.5
Direction_pred2 <- ifelse(glm.probs2 > 0.5, "Up", "Down")
# creating new dataframe with predicted values
pred_DF2 <- data.frame(Direction.2005,glm.probs2 ,Direction_pred2)
# some rows for predicted and actual Direction columns with their probabilities
head(pred_DF2,10)
Direction.2005 glm.probs2 Direction_pred2
999 Down 0.4980502 Down
1000 Down 0.5376846 Up
1001 Down 0.5560722 Up
1002 Up 0.5436973 Up
1003 Down 0.5135738 Up
1004 Up 0.5116102 Up
1005 Down 0.5113422 Up
1006 Up 0.5259038 Up
1007 Down 0.5220768 Up
1008 Up 0.5299009 Up
# table for predicted and actual Direction
table(Direction.2005, Direction_pred2)
Direction_pred2
Direction.2005 Down Up
Down 4 107
Up 1 140
# probability for correct prediction
(4+140)/252
[1] 0.5714286
# mean
mean(Direction_pred2 == Direction.2005)
[1] 0.5714286
# taking only two independent variables
glm.fits3 <- glm(Direction~Lag1+Lag2,data=Smarket,
family=binomial,subset=train)
glm.probs3 <- predict(glm.fits3,Smarket.2005,type="response")
# assigning "Up" > 0.5 ,"Down" < 0.5
Direction_pred3 <- ifelse(glm.probs3 > 0.5, "Up", "Down")
# creating new dataframe with predicted values
pred_DF3 <- data.frame(Direction.2005,glm.probs3 ,Direction_pred3)
# some rows for predicted and actual Direction columns with their probabilities
head(pred_DF3,10)
Direction.2005 glm.probs3 Direction_pred3
999 Down 0.5098275 Up
1000 Down 0.5208237 Up
1001 Down 0.5332635 Up
1002 Up 0.5260574 Up
1003 Down 0.5072103 Up
1004 Up 0.5061388 Up
1005 Down 0.5048890 Up
1006 Up 0.5127302 Up
1007 Down 0.5093032 Up
1008 Up 0.5156231 Up
# table for predicted and actual Direction
table(Direction.2005, Direction_pred3)
Direction_pred3
Direction.2005 Down Up
Down 35 76
Up 35 106
# probability for correct prediction
(106+35)/252
[1] 0.5595238
# mean
mean(Direction_pred3 == Direction.2005)
[1] 0.5595238
glm.pred3 <- rep("Down",252)
glm.pred3[glm.probs3>.5] = "Up"
table(glm.pred3,Direction.2005)
Direction.2005
glm.pred3 Down Up
Down 35 35
Up 76 106
mean(glm.pred3==Direction.2005)
[1] 0.5595238
106/(106+76)
[1] 0.5824176
predict(glm.fits3,newdata=data.frame(Lag1=c(1.2,1.5),
Lag2=c(1.1,-0.8)),type="response")
1 2
0.4791462 0.4960939
# loading package
library(MASS)
# fitting LDA model
lda.fit <- lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
# printing the results
lda.fit
Call:
lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
Lag1 Lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
Coefficients of linear discriminants:
LD1
Lag1 -0.6420190
Lag2 -0.5135293
# plotting
plot(lda.fit)
# predicting the model
lda.pred <- predict(lda.fit, Smarket.2005)
names(lda.pred)
[1] "class" "posterior" "x"
lda.class <- lda.pred$class
# table for correct prediction
table(lda.class,Direction.2005)
Direction.2005
lda.class Down Up
Down 35 35
Up 76 106
# mean
mean(lda.class==Direction.2005)
[1] 0.5595238
sum(lda.pred$posterior[,1]>=.5)
[1] 70
sum(lda.pred$posterior[,1]<.5)
[1] 182
lda.pred$posterior[1:20,1]
999 1000 1001 1002 1003 1004 1005
0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016
1006 1007 1008 1009 1010 1011 1012
0.4872861 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761
1013 1014 1015 1016 1017 1018
0.4744593 0.4799583 0.4935775 0.5030894 0.4978806 0.4886331
lda.class[1:20]
[1] Up Up Up Up Up Up Up Up Up Up Up Down Up Up
[15] Up Up Up Down Up Up
Levels: Down Up
sum(lda.pred$posterior[,1]>.9)
[1] 0
# fitting QDA model
qda.fit <- qda(Direction ~ Lag1+Lag2,data=Smarket,subset = train)
# printing the results
qda.fit
Call:
qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
Prior probabilities of groups:
Down Up
0.491984 0.508016
Group means:
Lag1 Lag2
Down 0.04279022 0.03389409
Up -0.03954635 -0.03132544
# predicting the model
qda.class <- predict(qda.fit,Smarket.2005)$class
# table for correct prediction
table(qda.class,Direction.2005)
Direction.2005
qda.class Down Up
Down 30 20
Up 81 121
# mean
mean(qda.class==Direction.2005)
[1] 0.5992063
library(class)
# creating the training set
train.X <- cbind(Lag1,Lag2)[train,]
# creating test set
test.X <- cbind(Lag1,Lag2)[!train,]
# taking subset of dependent variable
train.Direction <- Direction[train]
set.seed(1)
# predicting KNN model
knn.pred <- knn(train.X,test.X,train.Direction,k=1)
# table for correct prediction
table(knn.pred,Direction.2005)
Direction.2005
knn.pred Down Up
Down 43 58
Up 68 83
(83+43)/252
[1] 0.5
# prediction for k =3
knn.pred <- knn(train.X,test.X,train.Direction,k=3)
# table for correct prediction
table(knn.pred,Direction.2005)
Direction.2005
knn.pred Down Up
Down 48 54
Up 63 87
# mean
mean(knn.pred==Direction.2005)
[1] 0.5357143
# dimentions of the data frame
dim(Caravan)
[1] 5822 86
# name of the columns
colnames(Caravan)
[1] "MOSTYPE" "MAANTHUI" "MGEMOMV" "MGEMLEEF" "MOSHOOFD" "MGODRK"
[7] "MGODPR" "MGODOV" "MGODGE" "MRELGE" "MRELSA" "MRELOV"
[13] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
[19] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
[25] "MSKA" "MSKB1" "MSKB2" "MSKC" "MSKD" "MHHUUR"
[31] "MHKOOP" "MAUT1" "MAUT2" "MAUT0" "MZFONDS" "MZPART"
[37] "MINKM30" "MINK3045" "MINK4575" "MINK7512" "MINK123M" "MINKGEM"
[43] "MKOOPKLA" "PWAPART" "PWABEDR" "PWALAND" "PPERSAUT" "PBESAUT"
[49] "PMOTSCO" "PVRAAUT" "PAANHANG" "PTRACTOR" "PWERKT" "PBROM"
[55] "PLEVEN" "PPERSONG" "PGEZONG" "PWAOREG" "PBRAND" "PZEILPL"
[61] "PPLEZIER" "PFIETS" "PINBOED" "PBYSTAND" "AWAPART" "AWABEDR"
[67] "AWALAND" "APERSAUT" "ABESAUT" "AMOTSCO" "AVRAAUT" "AAANHANG"
[73] "ATRACTOR" "AWERKT" "ABROM" "ALEVEN" "APERSONG" "AGEZONG"
[79] "AWAOREG" "ABRAND" "AZEILPL" "APLEZIER" "AFIETS" "AINBOED"
[85] "ABYSTAND" "Purchase"
attach(Caravan)
summary(Purchase)
No Yes
5474 348
# feature Scaling
standardized.X <- scale(Caravan[,-86])
# variance in 1st column
var(Caravan[,1])
[1] 165.0378
# variance in 2nd column
var(Caravan[,2])
[1] 0.1647078
# variablity after Standardization in 1st column
var(standardized.X[,1])
[1] 1
# variability after Standardization in 2nd column
var(standardized.X[,2])
[1] 1
# taking test set from 1:1000 observation
test <- 1:1000
# creating training set
train.X <- standardized.X[-test,]
dim(train.X)
[1] 4822 85
# creating test set
test.X <- standardized.X[test,]
dim(test.X)
[1] 1000 85
# training set of dependent variable
train.Y <- Caravan$Purchase[-test]
length(train.Y)
[1] 4822
# test set of dependent variable
test.Y <- Caravan$Purchase[test]
length(test.Y)
[1] 1000
# fitting KNN model
knn.pred <- knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
[1] 0.116
# mean
mean(test.Y!="No")
[1] 0.059
# table of correct prediction
table(knn.pred,test.Y)
test.Y
knn.pred No Yes
No 874 49
Yes 67 10
# for k = 3
knn.pred <- knn(train.X,test.X,train.Y,k=3)
# table of correct prediction
table(knn.pred,test.Y)
test.Y
knn.pred No Yes
No 920 54
Yes 21 5
# for k = 5
knn.pred <- knn(train.X,test.X,train.Y,k=5)
# table of correct prediction
table(knn.pred,test.Y)
test.Y
knn.pred No Yes
No 930 55
Yes 11 4
# fitting GLM
glm.fits4 <- glm(Purchase~.,data = Caravan,family = binomial,subset = -test)
# predicting the model
glm.probs4 <- predict(glm.fits4,Caravan[test,],type="response")
# replacing with "No" to All observation
glm.pred4 <- rep("No",1000)
# assigning "Yes" to > .5
glm.pred4[glm.probs4 >.5]="Yes"
# table of correct prediction
table(glm.pred4,test.Y)
test.Y
glm.pred4 No Yes
No 934 59
Yes 7 0
# replacing with "No" to All observation
glm.pred5 <- rep("No",1000)
# assigning "Yes" to > .25
glm.pred5[glm.probs4 >.25]="Yes"
# table of correct prediction
table(glm.pred5,test.Y)
test.Y
glm.pred5 No Yes
No 919 48
Yes 22 11