ISLR: Chapter 4 Lab

Sameer Mathur

Logistic Regression, LDA, QDA, and KNN

About the Dataset

Here we are dealing with The Stock Market Dataset Which is the Inbuilt dataset in ISLR Package.

Loading and Viewing the Dataset

# loading the package in library
library(ISLR)
# attaching the inbuilt dataset
attach(Smarket)
# dimentions of the dataset
dim(Smarket)

[1] 1250    9

# some rows of the data set
head(Smarket)

  Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up

Scatter Plot matrix of the Variables

plot of chunk unnamed-chunk-2

Correlation Table

# correlation matrix
mcor <- cor(Smarket[,-9])
# rounding for two decemal places
round(mcor,2)

       Year  Lag1  Lag2  Lag3  Lag4  Lag5 Volume Today
Year   1.00  0.03  0.03  0.03  0.04  0.03   0.54  0.03
Lag1   0.03  1.00 -0.03 -0.01  0.00 -0.01   0.04 -0.03
Lag2   0.03 -0.03  1.00 -0.03 -0.01  0.00  -0.04 -0.01
Lag3   0.03 -0.01 -0.03  1.00 -0.02 -0.02  -0.04  0.00
Lag4   0.04  0.00 -0.01 -0.02  1.00 -0.03  -0.05 -0.01
Lag5   0.03 -0.01  0.00 -0.02 -0.03  1.00  -0.02 -0.03
Volume 0.54  0.04 -0.04 -0.04 -0.05 -0.02   1.00  0.01
Today  0.03 -0.03 -0.01  0.00 -0.01 -0.03   0.01  1.00

Trend For Volume(number of daily shares traded in billions)

plot of chunk unnamed-chunk-4

Logistic Regression

# fitting logistic regression model
glm.fits=glm(Direction ~
              Lag1 + Lag2 + Lag3
             + Lag4 + Lag5 + Volume,
             data = Smarket, family = binomial)
summary(glm.fits)


Call:
glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
    Volume, family = binomial, data = Smarket)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.446  -1.203   1.065   1.145   1.326  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.126000   0.240736  -0.523    0.601
Lag1        -0.073074   0.050167  -1.457    0.145
Lag2        -0.042301   0.050086  -0.845    0.398
Lag3         0.011085   0.049939   0.222    0.824
Lag4         0.009359   0.049974   0.187    0.851
Lag5         0.010313   0.049511   0.208    0.835
Volume       0.135441   0.158360   0.855    0.392

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1731.2  on 1249  degrees of freedom
Residual deviance: 1727.6  on 1243  degrees of freedom
AIC: 1741.6

Number of Fisher Scoring iterations: 3

# coefficients
coef(glm.fits)

 (Intercept)         Lag1         Lag2         Lag3         Lag4 
-0.126000257 -0.073073746 -0.042301344  0.011085108  0.009358938 
        Lag5       Volume 
 0.010313068  0.135440659

# Summary of coefficients
summary(glm.fits)$coef

                Estimate Std. Error    z value  Pr(>|z|)
(Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
Lag1        -0.073073746 0.05016739 -1.4565986 0.1452272
Lag2        -0.042301344 0.05008605 -0.8445733 0.3983491
Lag3         0.011085108 0.04993854  0.2219750 0.8243333
Lag4         0.009358938 0.04997413  0.1872757 0.8514445
Lag5         0.010313068 0.04951146  0.2082966 0.8349974
Volume       0.135440659 0.15835970  0.8552723 0.3924004

# Predicted Probabilities
glm.probs <- predict(glm.fits,type="response")

# assigning "Up" > 0.5 ,"Down" < 0.5
Direction_pred <- ifelse(glm.probs > 0.5, "Up", "Down")

# creating new dataframe with predicted values
pred_DF <- data.frame(Direction,glm.probs ,Direction_pred)
# some rows for predicted and actual Direction columns with their probabilities
head(pred_DF,10)

   Direction glm.probs Direction_pred
1         Up 0.5070841             Up
2         Up 0.4814679           Down
3       Down 0.4811388           Down
4         Up 0.5152224             Up
5         Up 0.5107812             Up
6         Up 0.5069565             Up
7       Down 0.4926509           Down
8         Up 0.5092292             Up
9         Up 0.5176135             Up
10        Up 0.4888378           Down

# table for predicted and actual Direction 
table(Direction, Direction_pred)

         Direction_pred
Direction Down  Up
     Down  145 457
     Up    141 507

# probability for correct prediction
(507+145)/1250

[1] 0.5216

# mean 
mean(Direction_pred == Direction)

[1] 0.5216

Creating the Training Set

# taking the training dataset of the years before 2005
train <- (Year<2005)
# creating a subset from Smarket dataset
Smarket.2005 <- Smarket[!train,]
# dimentions of the new data frame
dim(Smarket.2005)

[1] 252   9

# assigning the column name 
Direction.2005 <- Direction[!train]
# viewing the training dataset
head(Smarket.2005)

     Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
999  2005 -0.134  0.008 -0.007  0.715 -0.431 0.7869 -0.812      Down
1000 2005 -0.812 -0.134  0.008 -0.007  0.715 1.5108 -1.167      Down
1001 2005 -1.167 -0.812 -0.134  0.008 -0.007 1.7210 -0.363      Down
1002 2005 -0.363 -1.167 -0.812 -0.134  0.008 1.7389  0.351        Up
1003 2005  0.351 -0.363 -1.167 -0.812 -0.134 1.5691 -0.143      Down
1004 2005 -0.143  0.351 -0.363 -1.167 -0.812 1.4779  0.342        Up

Fitting Logistic Regression with Training dataset

# fitting logistic Regression
glm.fits2 <- glm(Direction ~   
               Lag1 + Lag2 + Lag3 + Lag4 + Lag5
             + Volume,data = Smarket,family = binomial,subset = train)
# Predicted Probabilities
glm.probs2 <- predict(glm.fits,Smarket.2005,type="response")

# assigning "Up" > 0.5 ,"Down" < 0.5
Direction_pred2 <- ifelse(glm.probs2 > 0.5, "Up", "Down")

# creating new dataframe with predicted values
pred_DF2 <- data.frame(Direction.2005,glm.probs2 ,Direction_pred2)
# some rows for predicted and actual Direction columns with their probabilities
head(pred_DF2,10)

     Direction.2005 glm.probs2 Direction_pred2
999            Down  0.4980502            Down
1000           Down  0.5376846              Up
1001           Down  0.5560722              Up
1002             Up  0.5436973              Up
1003           Down  0.5135738              Up
1004             Up  0.5116102              Up
1005           Down  0.5113422              Up
1006             Up  0.5259038              Up
1007           Down  0.5220768              Up
1008             Up  0.5299009              Up

# table for predicted and actual Direction 
table(Direction.2005, Direction_pred2)

              Direction_pred2
Direction.2005 Down  Up
          Down    4 107
          Up      1 140

# probability for correct prediction
(4+140)/252

[1] 0.5714286

# mean 
mean(Direction_pred2 == Direction.2005)

[1] 0.5714286

# taking only two independent variables
glm.fits3 <- glm(Direction~Lag1+Lag2,data=Smarket,
                 family=binomial,subset=train)
glm.probs3 <- predict(glm.fits3,Smarket.2005,type="response")

# assigning "Up" > 0.5 ,"Down" < 0.5
Direction_pred3 <- ifelse(glm.probs3 > 0.5, "Up", "Down")

# creating new dataframe with predicted values
pred_DF3 <- data.frame(Direction.2005,glm.probs3 ,Direction_pred3)
# some rows for predicted and actual Direction columns with their probabilities
head(pred_DF3,10)

     Direction.2005 glm.probs3 Direction_pred3
999            Down  0.5098275              Up
1000           Down  0.5208237              Up
1001           Down  0.5332635              Up
1002             Up  0.5260574              Up
1003           Down  0.5072103              Up
1004             Up  0.5061388              Up
1005           Down  0.5048890              Up
1006             Up  0.5127302              Up
1007           Down  0.5093032              Up
1008             Up  0.5156231              Up

# table for predicted and actual Direction 
table(Direction.2005, Direction_pred3)

              Direction_pred3
Direction.2005 Down  Up
          Down   35  76
          Up     35 106

# probability for correct prediction
(106+35)/252

[1] 0.5595238

# mean 
mean(Direction_pred3 == Direction.2005)

[1] 0.5595238

glm.pred3 <- rep("Down",252)
glm.pred3[glm.probs3>.5] = "Up"
table(glm.pred3,Direction.2005)

         Direction.2005
glm.pred3 Down  Up
     Down   35  35
     Up     76 106

mean(glm.pred3==Direction.2005)

[1] 0.5595238

106/(106+76)

[1] 0.5824176

predict(glm.fits3,newdata=data.frame(Lag1=c(1.2,1.5),
                                     Lag2=c(1.1,-0.8)),type="response")

        1         2 
0.4791462 0.4960939

Linear Discriminant Analysis

# loading package
library(MASS)
# fitting LDA model
lda.fit <- lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
# printing the results
lda.fit

Call:
lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)

Prior probabilities of groups:
    Down       Up 
0.491984 0.508016 

Group means:
            Lag1        Lag2
Down  0.04279022  0.03389409
Up   -0.03954635 -0.03132544

Coefficients of linear discriminants:
            LD1
Lag1 -0.6420190
Lag2 -0.5135293

# plotting
plot(lda.fit)

plot of chunk unnamed-chunk-18

# predicting the model
lda.pred <- predict(lda.fit, Smarket.2005)
names(lda.pred)

[1] "class"     "posterior" "x"

lda.class <- lda.pred$class
# table for correct prediction
table(lda.class,Direction.2005)

         Direction.2005
lda.class Down  Up
     Down   35  35
     Up     76 106

# mean
mean(lda.class==Direction.2005)

[1] 0.5595238

sum(lda.pred$posterior[,1]>=.5)

[1] 70

sum(lda.pred$posterior[,1]<.5)

[1] 182

lda.pred$posterior[1:20,1]

      999      1000      1001      1002      1003      1004      1005 
0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 
     1006      1007      1008      1009      1010      1011      1012 
0.4872861 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 
     1013      1014      1015      1016      1017      1018 
0.4744593 0.4799583 0.4935775 0.5030894 0.4978806 0.4886331

lda.class[1:20]

 [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up  
[15] Up   Up   Up   Down Up   Up  
Levels: Down Up

sum(lda.pred$posterior[,1]>.9)

[1] 0

Quadratic Discriminant Analysis

# fitting QDA model
qda.fit <- qda(Direction ~ Lag1+Lag2,data=Smarket,subset = train)
# printing the results
qda.fit

Call:
qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)

Prior probabilities of groups:
    Down       Up 
0.491984 0.508016 

Group means:
            Lag1        Lag2
Down  0.04279022  0.03389409
Up   -0.03954635 -0.03132544

# predicting the model
qda.class <- predict(qda.fit,Smarket.2005)$class
# table for correct prediction 
table(qda.class,Direction.2005)

         Direction.2005
qda.class Down  Up
     Down   30  20
     Up     81 121

# mean
mean(qda.class==Direction.2005)

[1] 0.5992063

K-Nearest Neighbors

library(class)
# creating the training set
train.X <- cbind(Lag1,Lag2)[train,]
# creating test set
test.X <- cbind(Lag1,Lag2)[!train,]
# taking subset of dependent variable
train.Direction <- Direction[train]
set.seed(1)
# predicting KNN model
knn.pred <- knn(train.X,test.X,train.Direction,k=1)

# table for correct prediction
table(knn.pred,Direction.2005)

        Direction.2005
knn.pred Down Up
    Down   43 58
    Up     68 83

(83+43)/252

[1] 0.5

# prediction for k =3
knn.pred <- knn(train.X,test.X,train.Direction,k=3)
# table for correct prediction
table(knn.pred,Direction.2005)

        Direction.2005
knn.pred Down Up
    Down   48 54
    Up     63 87

# mean
mean(knn.pred==Direction.2005)

[1] 0.5357143

An Application to Caravan Insurance Data

# dimentions of the data frame
dim(Caravan)

[1] 5822   86

# name of the columns
colnames(Caravan)

 [1] "MOSTYPE"  "MAANTHUI" "MGEMOMV"  "MGEMLEEF" "MOSHOOFD" "MGODRK"  
 [7] "MGODPR"   "MGODOV"   "MGODGE"   "MRELGE"   "MRELSA"   "MRELOV"  
[13] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
[19] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
[25] "MSKA"     "MSKB1"    "MSKB2"    "MSKC"     "MSKD"     "MHHUUR"  
[31] "MHKOOP"   "MAUT1"    "MAUT2"    "MAUT0"    "MZFONDS"  "MZPART"  
[37] "MINKM30"  "MINK3045" "MINK4575" "MINK7512" "MINK123M" "MINKGEM" 
[43] "MKOOPKLA" "PWAPART"  "PWABEDR"  "PWALAND"  "PPERSAUT" "PBESAUT" 
[49] "PMOTSCO"  "PVRAAUT"  "PAANHANG" "PTRACTOR" "PWERKT"   "PBROM"   
[55] "PLEVEN"   "PPERSONG" "PGEZONG"  "PWAOREG"  "PBRAND"   "PZEILPL" 
[61] "PPLEZIER" "PFIETS"   "PINBOED"  "PBYSTAND" "AWAPART"  "AWABEDR" 
[67] "AWALAND"  "APERSAUT" "ABESAUT"  "AMOTSCO"  "AVRAAUT"  "AAANHANG"
[73] "ATRACTOR" "AWERKT"   "ABROM"    "ALEVEN"   "APERSONG" "AGEZONG" 
[79] "AWAOREG"  "ABRAND"   "AZEILPL"  "APLEZIER" "AFIETS"   "AINBOED" 
[85] "ABYSTAND" "Purchase"

attach(Caravan)
summary(Purchase)

  No  Yes 
5474  348

Standardization

# feature Scaling
standardized.X <- scale(Caravan[,-86])
# variance in 1st column
var(Caravan[,1])

[1] 165.0378

# variance in 2nd column
var(Caravan[,2])

[1] 0.1647078

# variablity after Standardization in 1st column
var(standardized.X[,1])

[1] 1

# variability after Standardization in 2nd column
var(standardized.X[,2])

[1] 1

Creating Test set and Training set

# taking test set from 1:1000 observation
test <- 1:1000
# creating training set
train.X <- standardized.X[-test,]
dim(train.X)

[1] 4822   85

# creating test set
test.X <- standardized.X[test,]
dim(test.X)

[1] 1000   85

# training set of dependent variable
train.Y <- Caravan$Purchase[-test]
length(train.Y)

[1] 4822

# test set of dependent variable
test.Y <- Caravan$Purchase[test]
length(test.Y)

[1] 1000

# fitting KNN model
knn.pred <- knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)

[1] 0.116

# mean
mean(test.Y!="No")

[1] 0.059

# table of correct prediction
table(knn.pred,test.Y)

        test.Y
knn.pred  No Yes
     No  874  49
     Yes  67  10

# for k = 3
knn.pred <- knn(train.X,test.X,train.Y,k=3)
# table of correct prediction
table(knn.pred,test.Y)

        test.Y
knn.pred  No Yes
     No  920  54
     Yes  21   5

# for k = 5
knn.pred <- knn(train.X,test.X,train.Y,k=5)
# table of correct prediction
table(knn.pred,test.Y)

        test.Y
knn.pred  No Yes
     No  930  55
     Yes  11   4

Using Logistic Regression

# fitting GLM
glm.fits4 <- glm(Purchase~.,data = Caravan,family = binomial,subset = -test)
# predicting the model
glm.probs4 <- predict(glm.fits4,Caravan[test,],type="response")
# replacing with "No" to All observation
glm.pred4 <- rep("No",1000)
# assigning "Yes" to > .5 
glm.pred4[glm.probs4 >.5]="Yes"

# table of correct prediction
table(glm.pred4,test.Y)

         test.Y
glm.pred4  No Yes
      No  934  59
      Yes   7   0

# replacing with "No" to All observation
glm.pred5 <- rep("No",1000)
# assigning "Yes" to > .25 
glm.pred5[glm.probs4 >.25]="Yes"
# table of correct prediction
table(glm.pred5,test.Y)

         test.Y
glm.pred5  No Yes
      No  919  48
      Yes  22  11