Classification

library(ElemStatLearn)
data(SAheart)
heart <-SAheart[,c(1:3,5,7:10)]
heartfit<-glm( chd~.,data=heart, family = binomial)
summary(heartfit)
## 
## Call:
## glm(formula = chd ~ ., family = binomial, data = heart)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.752  -0.838  -0.455   0.929   2.443  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.129600   0.964156   -4.28  1.8e-05 ***
## sbp             0.005761   0.005633    1.02   0.3064    
## tobacco         0.079526   0.026215    3.03   0.0024 ** 
## ldl             0.184779   0.057412    3.22   0.0013 ** 
## famhistPresent  0.939185   0.224869    4.18  3.0e-05 ***
## obesity        -0.034543   0.029105   -1.19   0.2353    
## alcohol         0.000607   0.004455    0.14   0.8917    
## age             0.042541   0.010175    4.18  2.9e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 596.11  on 461  degrees of freedom
## Residual deviance: 483.17  on 454  degrees of freedom
## AIC: 499.2
## 
## Number of Fisher Scoring iterations: 4
require(ISLR)
## Loading required package: ISLR
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
summary(Smarket)
##       Year           Lag1             Lag2             Lag3       
##  Min.   :2001   Min.   :-4.922   Min.   :-4.922   Min.   :-4.922  
##  1st Qu.:2002   1st Qu.:-0.640   1st Qu.:-0.640   1st Qu.:-0.640  
##  Median :2003   Median : 0.039   Median : 0.039   Median : 0.038  
##  Mean   :2003   Mean   : 0.004   Mean   : 0.004   Mean   : 0.002  
##  3rd Qu.:2004   3rd Qu.: 0.597   3rd Qu.: 0.597   3rd Qu.: 0.597  
##  Max.   :2005   Max.   : 5.733   Max.   : 5.733   Max.   : 5.733  
##       Lag4             Lag5            Volume          Today       
##  Min.   :-4.922   Min.   :-4.922   Min.   :0.356   Min.   :-4.922  
##  1st Qu.:-0.640   1st Qu.:-0.640   1st Qu.:1.257   1st Qu.:-0.640  
##  Median : 0.038   Median : 0.038   Median :1.423   Median : 0.038  
##  Mean   : 0.002   Mean   : 0.006   Mean   :1.478   Mean   : 0.003  
##  3rd Qu.: 0.597   3rd Qu.: 0.597   3rd Qu.:1.642   3rd Qu.: 0.597  
##  Max.   : 5.733   Max.   : 5.733   Max.   :3.152   Max.   : 5.733  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
## 
pairs(Smarket,col=Smarket$Direction)

plot of chunk unnamed-chunk-2

Logistic regression

glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -1.45   -1.20    1.07    1.15    1.33  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12600    0.24074   -0.52     0.60
## Lag1        -0.07307    0.05017   -1.46     0.15
## Lag2        -0.04230    0.05009   -0.84     0.40
## Lag3         0.01109    0.04994    0.22     0.82
## Lag4         0.00936    0.04997    0.19     0.85
## Lag5         0.01031    0.04951    0.21     0.83
## Volume       0.13544    0.15836    0.86     0.39
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1742
## 
## Number of Fisher Scoring iterations: 3
glm.probs=predict(glm.fit,type="response") 
glm.probs[1:5]
##      1      2      3      4      5 
## 0.5071 0.4815 0.4811 0.5152 0.5108
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)
##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507
mean(glm.pred==Direction)
## [1] 0.5216

Make training and test set

train = Year<2005
Smarket.2005 =Smarket[!train,]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
Direction.2005=Smarket$Direction[!train]
table(glm.pred,Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
mean(glm.pred==Direction.2005)
## [1] 0.4802

Fit smaller model

glm.fit=glm(Direction~Lag1+Lag2,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
table(glm.pred,Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595
106/(76+106)
## [1] 0.5824

Linear Discriminant Analysis

library(MASS)
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##  Down    Up 
## 0.492 0.508 
## 
## Group means:
##          Lag1     Lag2
## Down  0.04279  0.03389
## Up   -0.03955 -0.03133
## 
## Coefficients of linear discriminants:
##          LD1
## Lag1 -0.6420
## Lag2 -0.5135
plot(lda.fit)

plot of chunk unnamed-chunk-6

lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class=lda.pred$class
table(lda.class,Direction.2005)
##          Direction.2005
## lda.class Down  Up
##      Down   35  35
##      Up     76 106
mean(lda.class==Direction.2005)
## [1] 0.5595
sum(lda.pred$posterior[,1]>=.5)
## [1] 70
sum(lda.pred$posterior[,1]<.5)
## [1] 182
lda.pred$posterior[1:20,1]
##    999   1000   1001   1002   1003   1004   1005   1006   1007   1008 
## 0.4902 0.4792 0.4668 0.4740 0.4928 0.4939 0.4951 0.4873 0.4907 0.4844 
##   1009   1010   1011   1012   1013   1014   1015   1016   1017   1018 
## 0.4907 0.5120 0.4895 0.4707 0.4745 0.4800 0.4936 0.5031 0.4979 0.4886
lda.class[1:20]
##  [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up  
## [15] Up   Up   Up   Down Up   Up  
## Levels: Down Up
sum(lda.pred$posterior[,1]>.9)
## [1] 0

Quadratic Discriminant Analysis

qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##  Down    Up 
## 0.492 0.508 
## 
## Group means:
##          Lag1     Lag2
## Down  0.04279  0.03389
## Up   -0.03955 -0.03133
qda.class=predict(qda.fit,Smarket.2005)$class
table(qda.class,Direction.2005)
##          Direction.2005
## qda.class Down  Up
##      Down   30  20
##      Up     81 121
mean(qda.class==Direction.2005)
## [1] 0.5992

K-Nearest Neighbors

library(class)
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   43 58
##     Up     68 83
(83+43)/252
## [1] 0.5
knn.pred=knn(train.X,test.X,train.Direction,k=3)
table(knn.pred,Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   48 54
##     Up     63 87
mean(knn.pred==Direction.2005)
## [1] 0.5357

Insurance data

dim(Caravan)
## [1] 5822   86
attach(Caravan)
summary(Purchase)
##   No  Yes 
## 5474  348
348/5822
## [1] 0.05977
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
## [1] 165
var(Caravan[,2])
## [1] 0.1647
var(standardized.X[,1])
## [1] 1
var(standardized.X[,2])
## [1] 1
test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
## [1] 0.118
mean(test.Y!="No")
## [1] 0.059
table(knn.pred,test.Y)
##         test.Y
## knn.pred  No Yes
##      No  873  50
##      Yes  68   9
9/(68+9)
## [1] 0.1169
knn.pred=knn(train.X,test.X,train.Y,k=3)
table(knn.pred,test.Y)
##         test.Y
## knn.pred  No Yes
##      No  920  54
##      Yes  21   5
5/26
## [1] 0.1923
knn.pred=knn(train.X,test.X,train.Y,k=5)
table(knn.pred,test.Y)
##         test.Y
## knn.pred  No Yes
##      No  930  55
##      Yes  11   4
4/15
## [1] 0.2667
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.5]="Yes"
table(glm.pred,test.Y)
##         test.Y
## glm.pred  No Yes
##      No  934  59
##      Yes   7   0
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.Y)
##         test.Y
## glm.pred  No Yes
##      No  919  48
##      Yes  22  11
11/(22+11)
## [1] 0.3333