library(ElemStatLearn)
data(SAheart)
heart <-SAheart[,c(1:3,5,7:10)]
heartfit<-glm( chd~.,data=heart, family = binomial)
summary(heartfit)
##
## Call:
## glm(formula = chd ~ ., family = binomial, data = heart)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.752 -0.838 -0.455 0.929 2.443
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.129600 0.964156 -4.28 1.8e-05 ***
## sbp 0.005761 0.005633 1.02 0.3064
## tobacco 0.079526 0.026215 3.03 0.0024 **
## ldl 0.184779 0.057412 3.22 0.0013 **
## famhistPresent 0.939185 0.224869 4.18 3.0e-05 ***
## obesity -0.034543 0.029105 -1.19 0.2353
## alcohol 0.000607 0.004455 0.14 0.8917
## age 0.042541 0.010175 4.18 2.9e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 596.11 on 461 degrees of freedom
## Residual deviance: 483.17 on 454 degrees of freedom
## AIC: 499.2
##
## Number of Fisher Scoring iterations: 4
require(ISLR)
## Loading required package: ISLR
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922 Min. :-4.922 Min. :-4.922
## 1st Qu.:2002 1st Qu.:-0.640 1st Qu.:-0.640 1st Qu.:-0.640
## Median :2003 Median : 0.039 Median : 0.039 Median : 0.038
## Mean :2003 Mean : 0.004 Mean : 0.004 Mean : 0.002
## 3rd Qu.:2004 3rd Qu.: 0.597 3rd Qu.: 0.597 3rd Qu.: 0.597
## Max. :2005 Max. : 5.733 Max. : 5.733 Max. : 5.733
## Lag4 Lag5 Volume Today
## Min. :-4.922 Min. :-4.922 Min. :0.356 Min. :-4.922
## 1st Qu.:-0.640 1st Qu.:-0.640 1st Qu.:1.257 1st Qu.:-0.640
## Median : 0.038 Median : 0.038 Median :1.423 Median : 0.038
## Mean : 0.002 Mean : 0.006 Mean :1.478 Mean : 0.003
## 3rd Qu.: 0.597 3rd Qu.: 0.597 3rd Qu.:1.642 3rd Qu.: 0.597
## Max. : 5.733 Max. : 5.733 Max. :3.152 Max. : 5.733
## Direction
## Down:602
## Up :648
##
##
##
##
pairs(Smarket,col=Smarket$Direction)
Logistic regression
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.45 -1.20 1.07 1.15 1.33
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12600 0.24074 -0.52 0.60
## Lag1 -0.07307 0.05017 -1.46 0.15
## Lag2 -0.04230 0.05009 -0.84 0.40
## Lag3 0.01109 0.04994 0.22 0.82
## Lag4 0.00936 0.04997 0.19 0.85
## Lag5 0.01031 0.04951 0.21 0.83
## Volume 0.13544 0.15836 0.86 0.39
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1742
##
## Number of Fisher Scoring iterations: 3
glm.probs=predict(glm.fit,type="response")
glm.probs[1:5]
## 1 2 3 4 5
## 0.5071 0.4815 0.4811 0.5152 0.5108
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 145 141
## Up 457 507
mean(glm.pred==Direction)
## [1] 0.5216
Make training and test set
train = Year<2005
Smarket.2005 =Smarket[!train,]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
Direction.2005=Smarket$Direction[!train]
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
mean(glm.pred==Direction.2005)
## [1] 0.4802
Fit smaller model
glm.fit=glm(Direction~Lag1+Lag2,
data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595
106/(76+106)
## [1] 0.5824
Linear Discriminant Analysis
library(MASS)
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.492 0.508
##
## Group means:
## Lag1 Lag2
## Down 0.04279 0.03389
## Up -0.03955 -0.03133
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420
## Lag2 -0.5135
plot(lda.fit)
lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
table(lda.class,Direction.2005)
## Direction.2005
## lda.class Down Up
## Down 35 35
## Up 76 106
mean(lda.class==Direction.2005)
## [1] 0.5595
sum(lda.pred$posterior[,1]>=.5)
## [1] 70
sum(lda.pred$posterior[,1]<.5)
## [1] 182
lda.pred$posterior[1:20,1]
## 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
## 0.4902 0.4792 0.4668 0.4740 0.4928 0.4939 0.4951 0.4873 0.4907 0.4844
## 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
## 0.4907 0.5120 0.4895 0.4707 0.4745 0.4800 0.4936 0.5031 0.4979 0.4886
lda.class[1:20]
## [1] Up Up Up Up Up Up Up Up Up Up Up Down Up Up
## [15] Up Up Up Down Up Up
## Levels: Down Up
sum(lda.pred$posterior[,1]>.9)
## [1] 0
Quadratic Discriminant Analysis
qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.492 0.508
##
## Group means:
## Lag1 Lag2
## Down 0.04279 0.03389
## Up -0.03955 -0.03133
qda.class=predict(qda.fit,Smarket.2005)$class
table(qda.class,Direction.2005)
## Direction.2005
## qda.class Down Up
## Down 30 20
## Up 81 121
mean(qda.class==Direction.2005)
## [1] 0.5992
library(class)
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 43 58
## Up 68 83
(83+43)/252
## [1] 0.5
knn.pred=knn(train.X,test.X,train.Direction,k=3)
table(knn.pred,Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 48 54
## Up 63 87
mean(knn.pred==Direction.2005)
## [1] 0.5357
Insurance data
dim(Caravan)
## [1] 5822 86
attach(Caravan)
summary(Purchase)
## No Yes
## 5474 348
348/5822
## [1] 0.05977
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
## [1] 165
var(Caravan[,2])
## [1] 0.1647
var(standardized.X[,1])
## [1] 1
var(standardized.X[,2])
## [1] 1
test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Y,k=1)
mean(test.Y!=knn.pred)
## [1] 0.118
mean(test.Y!="No")
## [1] 0.059
table(knn.pred,test.Y)
## test.Y
## knn.pred No Yes
## No 873 50
## Yes 68 9
9/(68+9)
## [1] 0.1169
knn.pred=knn(train.X,test.X,train.Y,k=3)
table(knn.pred,test.Y)
## test.Y
## knn.pred No Yes
## No 920 54
## Yes 21 5
5/26
## [1] 0.1923
knn.pred=knn(train.X,test.X,train.Y,k=5)
table(knn.pred,test.Y)
## test.Y
## knn.pred No Yes
## No 930 55
## Yes 11 4
4/15
## [1] 0.2667
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.5]="Yes"
table(glm.pred,test.Y)
## test.Y
## glm.pred No Yes
## No 934 59
## Yes 7 0
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.Y)
## test.Y
## glm.pred No Yes
## No 919 48
## Yes 22 11
11/(22+11)
## [1] 0.3333