Introduction to Statistical Learning, Chapter 4: Classification
Russ Conte
April 15, 2017

library(ISLR)
attach(Default)
dim(Default)
## [1] 10000     4
head(Default)
##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559
plot(balance, income)

boxplot(Default$balance)

boxplot(balance~default, data=Default, col=c("blue", "red"), xlab="Default", ylab="Balance")

boxplot(income~default, data=Default, col=c("blue", "red"), xlab="Default", ylab="Income")

options(scipen=999)
glm.fit1=glm(default~balance,data=Default, family = binomial)
summary(glm.fit1)
## 
## Call:
## glm(formula = default ~ balance, family = binomial, data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2697  -0.1465  -0.0589  -0.0221   3.7589  
## 
## Coefficients:
##                Estimate  Std. Error z value            Pr(>|z|)    
## (Intercept) -10.6513306   0.3611574  -29.49 <0.0000000000000002 ***
## balance       0.0054989   0.0002204   24.95 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1596.5  on 9998  degrees of freedom
## AIC: 1600.5
## 
## Number of Fisher Scoring iterations: 8
glm.fit2=glm(default~student, data=Default, family=binomial)
summary(glm.fit2)
## 
## Call:
## glm(formula = default ~ student, family = binomial, data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2970  -0.2970  -0.2434  -0.2434   2.6585  
## 
## Coefficients:
##             Estimate Std. Error z value             Pr(>|z|)    
## (Intercept) -3.50413    0.07071  -49.55 < 0.0000000000000002 ***
## studentYes   0.40489    0.11502    3.52             0.000431 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 2908.7  on 9998  degrees of freedom
## AIC: 2912.7
## 
## Number of Fisher Scoring iterations: 6
glm.fit3=glm(default~., data=Default, family=binomial)
summary(glm.fit3)
## 
## Call:
## glm(formula = default ~ ., family = binomial, data = Default)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4691  -0.1418  -0.0557  -0.0203   3.7383  
## 
## Coefficients:
##                  Estimate    Std. Error z value             Pr(>|z|)    
## (Intercept) -10.869045196   0.492255516 -22.080 < 0.0000000000000002 ***
## studentYes   -0.646775807   0.236252529  -2.738              0.00619 ** 
## balance       0.005736505   0.000231895  24.738 < 0.0000000000000002 ***
## income        0.000003033   0.000008203   0.370              0.71152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1571.5  on 9996  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 8
str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
boxplot(balance~student, data=Default, col=c("blue", "red"), xlab="Student", ylab="Credit Card Balance")

Chapter 4 Lab

library(ISLR)
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
class(Smarket)
## [1] "data.frame"
head(Smarket)
##   Year   Lag1   Lag2   Lag3   Lag4   Lag5 Volume  Today Direction
## 1 2001  0.381 -0.192 -2.624 -1.055  5.010 1.1913  0.959        Up
## 2 2001  0.959  0.381 -0.192 -2.624 -1.055 1.2965  1.032        Up
## 3 2001  1.032  0.959  0.381 -0.192 -2.624 1.4112 -0.623      Down
## 4 2001 -0.623  1.032  0.959  0.381 -0.192 1.2760  0.614        Up
## 5 2001  0.614 -0.623  1.032  0.959  0.381 1.2057  0.213        Up
## 6 2001  0.213  0.614 -0.623  1.032  0.959 1.3491  1.392        Up
dim(Smarket)
## [1] 1250    9
class(Smarket$Direction)
## [1] "factor"
summary(Smarket)
##       Year           Lag1                Lag2          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500  
##  Median :2003   Median : 0.039000   Median : 0.039000  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000  
##       Lag3                Lag4                Lag5         
##  Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200  
##  1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000  
##  Median : 0.038500   Median : 0.038500   Median : 0.03850  
##  Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561  
##  3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700  
##  Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300  
##      Volume           Today           Direction 
##  Min.   :0.3561   Min.   :-4.922000   Down:602  
##  1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648  
##  Median :1.4229   Median : 0.038500             
##  Mean   :1.4783   Mean   : 0.003138             
##  3rd Qu.:1.6417   3rd Qu.: 0.596750             
##  Max.   :3.1525   Max.   : 5.733000
pairs(Smarket)

cor(Smarket[,-c(9)])
##              Year         Lag1         Lag2         Lag3         Lag4
## Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                Lag5      Volume        Today
## Year    0.029787995  0.53900647  0.030095229
## Lag1   -0.005674606  0.04090991 -0.026155045
## Lag2   -0.003557949 -0.04338321 -0.010250033
## Lag3   -0.018808338 -0.04182369 -0.002447647
## Lag4   -0.027083641 -0.04841425 -0.006899527
## Lag5    1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315  1.00000000  0.014591823
## Today  -0.034860083  0.01459182  1.000000000
attach(Smarket)
plot(Volume)

glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Smarket, family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
coef(glm.fit)
##  (Intercept)         Lag1         Lag2         Lag3         Lag4 
## -0.126000257 -0.073073746 -0.042301344  0.011085108  0.009358938 
##         Lag5       Volume 
##  0.010313068  0.135440659
summary(glm.fit)$coef[,4]
## (Intercept)        Lag1        Lag2        Lag3        Lag4        Lag5 
##   0.6006983   0.1452272   0.3983491   0.8243333   0.8514445   0.8349974 
##      Volume 
##   0.3924004
glm.probs=predict(glm.fit, type="response")
glm.probs[1:10]
##         1         2         3         4         5         6         7 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 
##         8         9        10 
## 0.5092292 0.5176135 0.4888378
contrasts(Direction)
##      Up
## Down  0
## Up    1
glm.pred=rep("Down", 1250)
glm.pred[glm.probs>0.5]="Up"
table(glm.pred, Direction)
##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507
(507+145)/1250
## [1] 0.5216
mean(glm.pred==Direction)
## [1] 0.5216
train=(Year<2005)
Smarket.2005=Smarket[!train,] # note the comma: select the 2005 rows, keep all columns
dim(Smarket.2005)
## [1] 252   9
Direction.2005=Direction[!train]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Smarket, family=binomial, subset = train)
glm.probs=predict(glm.fit, Smarket.2005, type="response")
glm.pred=rep("Down", 252)
glm.pred[glm.probs>0.5]="Up"
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
mean(glm.pred==Direction.2005) # % of correct guesses in the test data set
## [1] 0.4801587
mean(glm.pred!=Direction.2005) # % of incorrect guesses in the test data set
## [1] 0.5198413
glm.fit=glm(Direction~Lag1+Lag2, data=Smarket, family=binomial, subset=train)
glm.probs=predict(glm.fit, Smarket.2005, type="response")
glm.pred=rep("Down", 252)
glm.pred[glm.probs>0.5]="Up"
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595238
mean(glm.pred!=Direction.2005)
## [1] 0.4404762
predict(glm.fit,newdata = data.frame(Lag1=c(1.2, 1.5), Lag2=c(1.1, -0.8)),type = "response")
##         1         2 
## 0.4791462 0.4960939

4.6.3 Linear Discriminant Analysis

library(MASS)
lda.fit=lda(Direction~Lag1+Lag2, data=Smarket, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
plot(lda.fit)

lda.pred=predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.pred$class
##   [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up  
##  [15] Up   Up   Up   Down Up   Up   Down Down Down Up   Down Down Up   Up  
##  [29] Up   Down Down Up   Up   Up   Up   Up   Up   Down Down Up   Up   Up  
##  [43] Up   Down Down Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up  
##  [57] Up   Up   Up   Up   Down Down Up   Up   Down Down Down Up   Up   Up  
##  [71] Up   Up   Up   Up   Down Up   Down Down Up   Up   Up   Up   Up   Down
##  [85] Up   Down Down Up   Up   Up   Up   Up   Up   Down Down Down Down Up  
##  [99] Up   Up   Up   Up   Down Up   Up   Down Up   Up   Up   Up   Up   Up  
## [113] Up   Up   Up   Up   Down Up   Up   Up   Up   Up   Up   Down Down Up  
## [127] Up   Down Up   Up   Down Down Down Up   Up   Up   Up   Up   Down Up  
## [141] Up   Up   Up   Down Down Up   Up   Down Down Up   Up   Up   Up   Up  
## [155] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up  
## [169] Down Down Up   Down Down Up   Up   Up   Up   Up   Up   Down Up   Up  
## [183] Up   Up   Up   Up   Up   Up   Down Down Up   Up   Up   Up   Up   Up  
## [197] Up   Up   Up   Down Down Up   Down Up   Up   Down Down Up   Up   Down
## [211] Down Up   Down Down Up   Up   Up   Up   Down Down Up   Up   Up   Down
## [225] Down Down Down Down Up   Up   Up   Up   Down Down Up   Up   Up   Up  
## [239] Up   Up   Down Down Up   Up   Up   Up   Up   Down Up   Up   Up   Up  
## Levels: Down Up
lda.pred$posterior
##           Down        Up
## 999  0.4901792 0.5098208
## 1000 0.4792185 0.5207815
## 1001 0.4668185 0.5331815
## 1002 0.4740011 0.5259989
## 1003 0.4927877 0.5072123
## 1004 0.4938562 0.5061438
## 1005 0.4951016 0.5048984
## 1006 0.4872861 0.5127139
## 1007 0.4907013 0.5092987
## 1008 0.4844026 0.5155974
## 1009 0.4906963 0.5093037
## 1010 0.5119988 0.4880012
## 1011 0.4895152 0.5104848
## 1012 0.4706761 0.5293239
## 1013 0.4744593 0.5255407
## 1014 0.4799583 0.5200417
## 1015 0.4935775 0.5064225
## 1016 0.5030894 0.4969106
## 1017 0.4978806 0.5021194
## 1018 0.4886331 0.5113669
## 1019 0.5006568 0.4993432
## 1020 0.5108735 0.4891265
## 1021 0.5039925 0.4960075
## 1022 0.4916335 0.5083665
## 1023 0.5041772 0.4958228
## 1024 0.5026751 0.4973249
## 1025 0.4914043 0.5085957
## 1026 0.4805964 0.5194036
## 1027 0.4882718 0.5117282
## 1028 0.5062187 0.4937813
## 1029 0.5005996 0.4994004
## 1030 0.4972965 0.5027035
## 1031 0.4958546 0.5041454
## 1032 0.4811777 0.5188223
## 1033 0.4841417 0.5158583
## 1034 0.4726388 0.5273612
## 1035 0.4836418 0.5163582
## 1036 0.5091007 0.4908993
## 1037 0.5135941 0.4864059
## 1038 0.4933839 0.5066161
## 1039 0.4926856 0.5073144
## 1040 0.4978472 0.5021528
## 1041 0.4920914 0.5079086
## 1042 0.5056346 0.4943654
## 1043 0.5062288 0.4937712
## 1044 0.4881894 0.5118106
## 1045 0.4725293 0.5274707
## 1046 0.4832339 0.5167661
## 1047 0.4835086 0.5164914
## 1048 0.4913334 0.5086666
## 1049 0.4877566 0.5122434
## 1050 0.4724386 0.5275614
## 1051 0.4854877 0.5145123
## 1052 0.4932911 0.5067089
## 1053 0.4845973 0.5154027
## 1054 0.4723718 0.5276282
## 1055 0.4816170 0.5183830
## 1056 0.4914067 0.5085933
## 1057 0.4942755 0.5057245
## 1058 0.4841232 0.5158768
## 1059 0.5026064 0.4973936
## 1060 0.5062557 0.4937443
## 1061 0.4821800 0.5178200
## 1062 0.4885263 0.5114737
## 1063 0.5011825 0.4988175
## 1064 0.5000595 0.4999405
## 1065 0.5027377 0.4972623
## 1066 0.4870086 0.5129914
## 1067 0.4827213 0.5172787
## 1068 0.4996501 0.5003499
## 1069 0.4818079 0.5181921
## 1070 0.4651057 0.5348943
## 1071 0.4577867 0.5422133
## 1072 0.4775004 0.5224996
## 1073 0.5034250 0.4965750
## 1074 0.4801664 0.5198336
## 1075 0.5046171 0.4953829
## 1076 0.5044752 0.4955248
## 1077 0.4964663 0.5035337
## 1078 0.4892965 0.5107035
## 1079 0.4876236 0.5123764
## 1080 0.4805625 0.5194375
## 1081 0.4958518 0.5041482
## 1082 0.5115212 0.4884788
## 1083 0.4958572 0.5041428
## 1084 0.5082871 0.4917129
## 1085 0.5022091 0.4977909
## 1086 0.4875892 0.5124108
## 1087 0.4995948 0.5004052
## 1088 0.4841917 0.5158083
## 1089 0.4858843 0.5141157
## 1090 0.4826969 0.5173031
## 1091 0.4745012 0.5254988
## 1092 0.5008540 0.4991460
## 1093 0.5127766 0.4872234
## 1094 0.5135472 0.4864528
## 1095 0.5095127 0.4904873
## 1096 0.4950201 0.5049799
## 1097 0.4956088 0.5043912
## 1098 0.4964643 0.5035357
## 1099 0.4874363 0.5125637
## 1100 0.4970339 0.5029661
## 1101 0.5003751 0.4996249
## 1102 0.4846137 0.5153863
## 1103 0.4976914 0.5023086
## 1104 0.5043081 0.4956919
## 1105 0.4843366 0.5156634
## 1106 0.4860664 0.5139336
## 1107 0.4930417 0.5069583
## 1108 0.4887219 0.5112781
## 1109 0.4968147 0.5031853
## 1110 0.4944989 0.5055011
## 1111 0.4924743 0.5075257
## 1112 0.4980141 0.5019859
## 1113 0.4978727 0.5021273
## 1114 0.4994390 0.5005610
## 1115 0.5028317 0.4971683
## 1116 0.4964503 0.5035497
## 1117 0.4883202 0.5116798
## 1118 0.4899801 0.5100199
## 1119 0.4771957 0.5228043
## 1120 0.4694030 0.5305970
## 1121 0.4824692 0.5175308
## 1122 0.5037943 0.4962057
## 1123 0.5000974 0.4999026
## 1124 0.4805303 0.5194697
## 1125 0.4876953 0.5123047
## 1126 0.5070782 0.4929218
## 1127 0.4901776 0.5098224
## 1128 0.4860999 0.5139001
## 1129 0.5108497 0.4891503
## 1130 0.5135547 0.4864453
## 1131 0.5020218 0.4979782
## 1132 0.4956830 0.5043170
## 1133 0.4965536 0.5034464
## 1134 0.4964590 0.5035410
## 1135 0.4855719 0.5144281
## 1136 0.4951439 0.5048561
## 1137 0.5060048 0.4939952
## 1138 0.4880643 0.5119357
## 1139 0.4921175 0.5078825
## 1140 0.4927195 0.5072805
## 1141 0.4901661 0.5098339
## 1142 0.5001986 0.4998014
## 1143 0.5047746 0.4952254
## 1144 0.4875267 0.5124733
## 1145 0.4847648 0.5152352
## 1146 0.5028405 0.4971595
## 1147 0.5008435 0.4991565
## 1148 0.4825591 0.5174409
## 1149 0.4732124 0.5267876
## 1150 0.4797731 0.5202269
## 1151 0.4983172 0.5016828
## 1152 0.4968824 0.5031176
## 1153 0.4997031 0.5002969
## 1154 0.4914721 0.5085279
## 1155 0.4892300 0.5107700
## 1156 0.4787694 0.5212306
## 1157 0.4799234 0.5200766
## 1158 0.4913818 0.5086182
## 1159 0.4916287 0.5083713
## 1160 0.4948795 0.5051205
## 1161 0.4890900 0.5109100
## 1162 0.4790944 0.5209056
## 1163 0.4878531 0.5121469
## 1164 0.4861838 0.5138162
## 1165 0.4935558 0.5064442
## 1166 0.4941329 0.5058671
## 1167 0.5020762 0.4979238
## 1168 0.5043051 0.4956949
## 1169 0.4890430 0.5109570
## 1170 0.5062006 0.4937994
## 1171 0.5092767 0.4907233
## 1172 0.4893670 0.5106330
## 1173 0.4987776 0.5012224
## 1174 0.4997456 0.5002544
## 1175 0.4806852 0.5193148
## 1176 0.4790536 0.5209464
## 1177 0.4889496 0.5110504
## 1178 0.5039466 0.4960534
## 1179 0.4934174 0.5065826
## 1180 0.4748985 0.5251015
## 1181 0.4706261 0.5293739
## 1182 0.4868978 0.5131022
## 1183 0.4967554 0.5032446
## 1184 0.4929449 0.5070551
## 1185 0.4922853 0.5077147
## 1186 0.4933690 0.5066310
## 1187 0.5053601 0.4946399
## 1188 0.5030552 0.4969448
## 1189 0.4905837 0.5094163
## 1190 0.4762390 0.5237610
## 1191 0.4603392 0.5396608
## 1192 0.4697932 0.5302068
## 1193 0.4925300 0.5074700
## 1194 0.4861143 0.5138857
## 1195 0.4811376 0.5188624
## 1196 0.4812474 0.5187526
## 1197 0.4842383 0.5157617
## 1198 0.5026218 0.4973782
## 1199 0.5052312 0.4947688
## 1200 0.4813184 0.5186816
## 1201 0.5015397 0.4984603
## 1202 0.4877161 0.5122839
## 1203 0.4774171 0.5225829
## 1204 0.5168827 0.4831173
## 1205 0.5072640 0.4927360
## 1206 0.4833515 0.5166485
## 1207 0.4726701 0.5273299
## 1208 0.5032667 0.4967333
## 1209 0.5202350 0.4797650
## 1210 0.4950279 0.5049721
## 1211 0.5018767 0.4981233
## 1212 0.5089142 0.4910858
## 1213 0.4968911 0.5031089
## 1214 0.4951595 0.5048405
## 1215 0.4895942 0.5104058
## 1216 0.4904653 0.5095347
## 1217 0.5055318 0.4944682
## 1218 0.5055416 0.4944584
## 1219 0.4942470 0.5057530
## 1220 0.4857495 0.5142505
## 1221 0.4901606 0.5098394
## 1222 0.5069730 0.4930270
## 1223 0.5084764 0.4915236
## 1224 0.5041288 0.4958712
## 1225 0.5048299 0.4951701
## 1226 0.5023879 0.4976121
## 1227 0.4986903 0.5013097
## 1228 0.4824758 0.5175242
## 1229 0.4825469 0.5174531
## 1230 0.4831600 0.5168400
## 1231 0.5017497 0.4982503
## 1232 0.5058708 0.4941292
## 1233 0.4890321 0.5109679
## 1234 0.4911052 0.5088948
## 1235 0.4864250 0.5135750
## 1236 0.4847062 0.5152938
## 1237 0.4944890 0.5055110
## 1238 0.4962261 0.5037739
## 1239 0.5005702 0.4994298
## 1240 0.5039068 0.4960932
## 1241 0.4946376 0.5053624
## 1242 0.4864366 0.5135634
## 1243 0.4807022 0.5192978
## 1244 0.4851439 0.5148561
## 1245 0.4951734 0.5048266
## 1246 0.5005893 0.4994107
## 1247 0.4972210 0.5027790
## 1248 0.4791988 0.5208012
## 1249 0.4831673 0.5168327
## 1250 0.4892591 0.5107409
lda.pred$x
##               LD1
## 999   0.082930955
## 1000  0.591141023
## 1001  1.167230633
## 1002  0.833350217
## 1003 -0.037928918
## 1004 -0.087431416
## 1005 -0.145127188
## 1006  0.217013241
## 1007  0.058737918
## 1008  0.350686419
## 1009  0.058972979
## 1010 -0.927941342
## 1011  0.113701897
## 1012  0.987838737
## 1013  0.812068621
## 1014  0.556813626
## 1015 -0.074523144
## 1016 -0.515140289
## 1017 -0.273862313
## 1018  0.154583117
## 1019 -0.402459508
## 1020 -0.875788251
## 1021 -0.556975090
## 1022  0.015545602
## 1023 -0.565532775
## 1024 -0.495947609
## 1025  0.026166415
## 1026  0.527211570
## 1027  0.171326741
## 1028 -0.660106384
## 1029 -0.399808484
## 1030 -0.246804695
## 1031 -0.180012366
## 1032  0.500244192
## 1033  0.362782493
## 1034  0.896631218
## 1035  0.385966937
## 1036 -0.793634311
## 1037 -1.001885690
## 1038 -0.065552914
## 1039 -0.033201873
## 1040 -0.272314419
## 1041 -0.005670681
## 1042 -0.633046617
## 1043 -0.660573497
## 1044  0.175146637
## 1045  0.901720095
## 1046  0.404879430
## 1047  0.392142626
## 1048  0.029449128
## 1049  0.195203503
## 1050  0.905934040
## 1051  0.300376866
## 1052 -0.061251737
## 1053  0.341659902
## 1054  0.909037994
## 1055  0.479867168
## 1056  0.026053397
## 1057 -0.106858727
## 1058  0.363641966
## 1059 -0.492769332
## 1060 -0.661821863
## 1061  0.453754535
## 1062  0.159531470
## 1063 -0.426809386
## 1064 -0.374790801
## 1065 -0.498847876
## 1066  0.229875540
## 1067  0.428650036
## 1068 -0.355825460
## 1069  0.471014282
## 1070  1.246938110
## 1071  1.587993753
## 1072  0.670875991
## 1073 -0.530686262
## 1074  0.547161007
## 1075 -0.585910659
## 1076 -0.579335302
## 1077 -0.208347590
## 1078  0.123837117
## 1079  0.201370997
## 1080  0.528782243
## 1081 -0.179881743
## 1082 -0.905805001
## 1083 -0.180129682
## 1084 -0.755939119
## 1085 -0.474363477
## 1086  0.202965738
## 1087 -0.353266839
## 1088  0.360465824
## 1089  0.281992515
## 1090  0.429784970
## 1091  0.810123224
## 1092 -0.411592166
## 1093 -0.963987656
## 1094 -0.999710819
## 1095 -0.812728572
## 1096 -0.141351130
## 1097 -0.168625772
## 1098 -0.208256477
## 1099  0.210051585
## 1100 -0.234641576
## 1101 -0.389412417
## 1102  0.340901856
## 1103 -0.265096229
## 1104 -0.571594987
## 1105  0.353748684
## 1106  0.273550871
## 1107 -0.049700122
## 1108  0.150468866
## 1109 -0.224487016
## 1110 -0.117206229
## 1111 -0.023408285
## 1112 -0.280047873
## 1113 -0.273496614
## 1114 -0.346047774
## 1115 -0.503201916
## 1116 -0.207605002
## 1117  0.169083120
## 1118  0.092157721
## 1119  0.685017616
## 1120  1.047021374
## 1121  0.440340886
## 1122 -0.547795600
## 1123 -0.376548399
## 1124  0.530276844
## 1125  0.198047455
## 1126 -0.699925318
## 1127  0.083006167
## 1128  0.271997390
## 1129 -0.874684276
## 1130 -1.000055459
## 1131 -0.465685483
## 1132 -0.172060184
## 1133 -0.212390928
## 1134 -0.208010242
## 1135  0.296475772
## 1136 -0.147088487
## 1137 -0.650197630
## 1138  0.180943289
## 1139 -0.006880812
## 1140 -0.034769522
## 1141  0.083539881
## 1142 -0.381234629
## 1143 -0.593204905
## 1144  0.205860848
## 1145  0.333893785
## 1146 -0.503610163
## 1147 -0.411106558
## 1148  0.436175505
## 1149  0.869982263
## 1150  0.565406109
## 1151 -0.294086350
## 1152 -0.227620610
## 1153 -0.358280948
## 1154  0.023026374
## 1155  0.126921286
## 1156  0.611978284
## 1157  0.558436717
## 1158  0.027209375
## 1159  0.015766480
## 1160 -0.134837692
## 1161  0.133407233
## 1162  0.596901573
## 1163  0.190732993
## 1164  0.268108329
## 1165 -0.073517145
## 1166 -0.100250737
## 1167 -0.468206298
## 1168 -0.571459191
## 1169  0.135584682
## 1170 -0.659268844
## 1171 -0.801791899
## 1172  0.120573299
## 1173 -0.315410923
## 1174 -0.360251258
## 1175  0.523092162
## 1176  0.598791959
## 1177  0.139913365
## 1178 -0.554847495
## 1179 -0.067102527
## 1180  0.791671412
## 1181  0.990164016
## 1182  0.235010388
## 1183 -0.221740596
## 1184 -0.045212006
## 1185 -0.014654220
## 1186 -0.064862344
## 1187 -0.620328709
## 1188 -0.513557583
## 1189  0.064191220
## 1190  0.729428658
## 1191  1.468963681
## 1192  1.028881539
## 1193 -0.025991404
## 1194  0.271330458
## 1195  0.502107071
## 1196  0.497014757
## 1197  0.358304261
## 1198 -0.493480531
## 1199 -0.614359730
## 1200  0.493719580
## 1201 -0.443354925
## 1202  0.197081440
## 1203  0.674742718
## 1204 -1.154355765
## 1205 -0.708534977
## 1206  0.399425284
## 1207  0.895175712
## 1208 -0.523354194
## 1209 -1.309851988
## 1210 -0.141714681
## 1211 -0.458964060
## 1212 -0.784993693
## 1213 -0.228027139
## 1214 -0.147810001
## 1215  0.110042296
## 1216  0.069675020
## 1217 -0.628283901
## 1218 -0.628739409
## 1219 -0.105540307
## 1220  0.288241253
## 1221  0.083796001
## 1222 -0.695053021
## 1223 -0.764710791
## 1224 -0.563288279
## 1225 -0.595766965
## 1226 -0.482644843
## 1227 -0.311368001
## 1228  0.440039224
## 1229  0.436738015
## 1230  0.408305690
## 1231 -0.453081897
## 1232 -0.643987573
## 1233  0.136092195
## 1234  0.040023110
## 1235  0.256928430
## 1236  0.336613129
## 1237 -0.116748141
## 1238 -0.197222689
## 1239 -0.398448390
## 1240 -0.553006090
## 1241 -0.123635444
## 1242  0.256391693
## 1243  0.522303602
## 1244  0.316318193
## 1245 -0.148455458
## 1246 -0.399332776
## 1247 -0.243307536
## 1248  0.592055064
## 1249  0.407966622
## 1250  0.125571506
lda.class=lda.pred$class
table(lda.class, Direction.2005)
##          Direction.2005
## lda.class Down  Up
##      Down   35  35
##      Up     76 106
mean(lda.class==Direction.2005)
## [1] 0.5595238
sum(lda.pred$posterior[,1]>=0.5)
## [1] 70
sum(lda.pred$posterior[,1]<0.5)
## [1] 182
lda.pred$posterior[1:20,1]
##       999      1000      1001      1002      1003      1004      1005 
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 
##      1006      1007      1008      1009      1010      1011      1012 
## 0.4872861 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 
##      1013      1014      1015      1016      1017      1018 
## 0.4744593 0.4799583 0.4935775 0.5030894 0.4978806 0.4886331
lda.class[1:20]
##  [1] Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Up   Down Up   Up  
## [15] Up   Up   Up   Down Up   Up  
## Levels: Down Up
sum(lda.pred$posterior[,1]>0.5)
## [1] 70

4.6.4 Quadratic Discriminant Analysis

qda.fit=qda(Direction~Lag1+Lag2, data=Smarket, subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
qda.class=predict(qda.fit, Smarket.2005)$class
table(qda.class,Direction.2005)
##          Direction.2005
## qda.class Down  Up
##      Down   30  20
##      Up     81 121
mean(qda.class==Direction.2005)
## [1] 0.5992063

4.6.5 K-Nearest Neighbors

library(class)
train.X=cbind(Lag1, Lag2)[train,]
test.X=cbind(Lag1, Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   43 58
##     Up     68 83
(83+43)/252
## [1] 0.5

Try with k=3

knn.pred=knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, Direction.2005)
##         Direction.2005
## knn.pred Down Up
##     Down   48 54
##     Up     63 87
mean(knn.pred==Direction.2005)
## [1] 0.5357143

The results have improved slightly, but not by much. Of the methods tried so far, QDA gives the highest accuracy on the 2005 test data (about 60%).
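
As a quick side-by-side check, here is a small sketch (not part of the lab) that collects the 2005 test-set accuracies in one place. It assumes the objects fitted above are still in the workspace: glm.fit is the Lag1+Lag2 logistic fit, lda.fit and qda.fit are the discriminant fits, and knn.pred holds the k=3 predictions.

glm.probs=predict(glm.fit, Smarket.2005, type="response")
glm.pred=ifelse(glm.probs>0.5, "Up", "Down") # same 0.5 cutoff as before
sapply(list(Logistic=glm.pred,
            LDA=predict(lda.fit, Smarket.2005)$class,
            QDA=predict(qda.fit, Smarket.2005)$class,
            KNN.k3=knn.pred),
       function(p) mean(p==Direction.2005)) # accuracy of each method on the 2005 test data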

Let’s try KNN on the Caravan insurance data:

dim(Caravan)
## [1] 5822   86
attach(Caravan)
summary(Purchase)
##   No  Yes 
## 5474  348
head(Caravan)
##   MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE
## 1      33        1       3        2        8      0      5      1      3
## 2      37        1       2        2        8      1      4      1      4
## 3      37        1       2        2        8      0      4      2      4
## 4       9        1       3        3        3      2      3      2      4
## 5      40        1       4        2       10      1      4      1      4
## 6      23        1       2        1        5      0      5      0      5
##   MRELGE MRELSA MRELOV MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD
## 1      7      0      2        1        2        6        1        2
## 2      6      2      2        0        4        5        0        5
## 3      3      2      4        4        4        2        0        5
## 4      5      2      2        2        3        4        3        4
## 5      7      1      2        2        4        4        5        4
## 6      0      6      3        3        5        2        0        5
##   MOPLLAAG MBERHOOG MBERZELF MBERBOER MBERMIDD MBERARBG MBERARBO MSKA
## 1        7        1        0        1        2        5        2    1
## 2        4        0        0        0        5        0        4    0
## 3        4        0        0        0        7        0        2    0
## 4        2        4        0        0        3        1        2    3
## 5        0        0        5        4        0        0        0    9
## 6        4        2        0        0        4        2        2    2
##   MSKB1 MSKB2 MSKC MSKD MHHUUR MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS MZPART
## 1     1     2    6    1      1      8     8     0     1       8      1
## 2     2     3    5    0      2      7     7     1     2       6      3
## 3     5     0    4    0      7      2     7     0     2       9      0
## 4     2     1    4    0      5      4     9     0     0       7      2
## 5     0     0    0    0      4      5     6     2     1       5      4
## 6     2     2    4    2      9      0     5     3     3       9      0
##   MINKM30 MINK3045 MINK4575 MINK7512 MINK123M MINKGEM MKOOPKLA PWAPART
## 1       0        4        5        0        0       4        3       0
## 2       2        0        5        2        0       5        4       2
## 3       4        5        0        0        0       3        4       2
## 4       1        5        3        0        0       4        4       0
## 5       0        0        9        0        0       6        3       0
## 6       5        2        3        0        0       3        3       0
##   PWABEDR PWALAND PPERSAUT PBESAUT PMOTSCO PVRAAUT PAANHANG PTRACTOR
## 1       0       0        6       0       0       0        0        0
## 2       0       0        0       0       0       0        0        0
## 3       0       0        6       0       0       0        0        0
## 4       0       0        6       0       0       0        0        0
## 5       0       0        0       0       0       0        0        0
## 6       0       0        6       0       0       0        0        0
##   PWERKT PBROM PLEVEN PPERSONG PGEZONG PWAOREG PBRAND PZEILPL PPLEZIER
## 1      0     0      0        0       0       0      5       0        0
## 2      0     0      0        0       0       0      2       0        0
## 3      0     0      0        0       0       0      2       0        0
## 4      0     0      0        0       0       0      2       0        0
## 5      0     0      0        0       0       0      6       0        0
## 6      0     0      0        0       0       0      0       0        0
##   PFIETS PINBOED PBYSTAND AWAPART AWABEDR AWALAND APERSAUT ABESAUT AMOTSCO
## 1      0       0        0       0       0       0        1       0       0
## 2      0       0        0       2       0       0        0       0       0
## 3      0       0        0       1       0       0        1       0       0
## 4      0       0        0       0       0       0        1       0       0
## 5      0       0        0       0       0       0        0       0       0
## 6      0       0        0       0       0       0        1       0       0
##   AVRAAUT AAANHANG ATRACTOR AWERKT ABROM ALEVEN APERSONG AGEZONG AWAOREG
## 1       0        0        0      0     0      0        0       0       0
## 2       0        0        0      0     0      0        0       0       0
## 3       0        0        0      0     0      0        0       0       0
## 4       0        0        0      0     0      0        0       0       0
## 5       0        0        0      0     0      0        0       0       0
## 6       0        0        0      0     0      0        0       0       0
##   ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND Purchase
## 1      1       0        0      0       0        0       No
## 2      1       0        0      0       0        0       No
## 3      1       0        0      0       0        0       No
## 4      1       0        0      0       0        0       No
## 5      1       0        0      0       0        0       No
## 6      0       0        0      0       0        0       No
standardized.X=scale(Caravan[,-86])
var(Caravan[,1])
## [1] 165.0378
var(Caravan[,2])
## [1] 0.1647078
var(standardized.X[,1])
## [1] 1

Now the variance is 1, because the column has been standardized.

var(standardized.X[,2])
## [1] 1

Same as above: the variance is 1 after standardization.
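
As a sanity check (a sketch, not from the book), scale() simply subtracts each column's mean and divides by its standard deviation, so the first standardized column can be reproduced by hand:

# Standardize the first Caravan column manually and compare with scale()'s result
by.hand=(Caravan[,1]-mean(Caravan[,1]))/sd(Caravan[,1])
all.equal(as.numeric(standardized.X[,1]), by.hand) # should be TRUE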

test=1:1000
train.X=standardized.X[-test,]
test.X=standardized.X[test,]
train.Y=Purchase[-test]
test.Y=Purchase[test]
set.seed(1)
knn.pred=knn(train.X, test.X, train.Y, k=1)
mean(test.Y!=knn.pred)
## [1] 0.118
mean(test.Y!="No")
## [1] 0.059
table(knn.pred, test.Y)
##         test.Y
## knn.pred  No Yes
##      No  873  50
##      Yes  68   9
knn.pred=knn(train.X, test.X, train.Y, k=3) # knn() takes the training and test X matrices plus the training labels
table(knn.pred, test.Y) # test.Y is used only to evaluate the predictions
##         test.Y
## knn.pred  No Yes
##      No  920  54
##      Yes  21   5
knn.pred=knn(train.X, test.X, train.Y, k=5)
table(knn.pred, test.Y)
##         test.Y
## knn.pred  No Yes
##      No  930  55
##      Yes  11   4

With k=5, 15 customers are predicted to purchase and 4 of them actually do (about 27%), a better success rate than with k=3 (5 of 26, about 19%) or k=1 (9 of 77, about 12%).
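
Since the quantity we care about here is the success rate among customers predicted to buy, a small helper makes the comparison explicit. This is a hypothetical function (precision.yes is not part of the lab), and because knn() breaks ties at random the exact numbers can vary slightly with the seed:

# Fraction of predicted "Yes" customers who actually purchase, for a given k
precision.yes=function(k) {
  set.seed(1)
  pred=knn(train.X, test.X, train.Y, k=k)
  tab=table(pred, test.Y)
  tab["Yes","Yes"]/sum(tab["Yes",])
}
sapply(c(1, 3, 5), precision.yes)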

Let’s fit a logistic regression model to the same data and see how well it does.

glm.fit=glm(Purchase~., data=Caravan, family=binomial, subset=-test)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

The expected warning appears: some fitted probabilities are numerically 0 or 1. The model can still be used for prediction, so let’s carry on.
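
If you want to see how extreme the fitted probabilities are, a quick optional check (a sketch, not required for what follows) counts the training-set fitted values that are numerically 0 or 1:

range(fitted(glm.fit)) # fitted probabilities on the training set
sum(fitted(glm.fit) < 1e-8 | fitted(glm.fit) > 1-1e-8) # how many are numerically 0 or 1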

glm.probs=predict(glm.fit, Caravan[test,], type="response")
glm.pred=rep("No", 1000)
glm.pred[glm.probs>0.5]="Yes"
table(glm.pred, test.Y)
##         test.Y
## glm.pred  No Yes
##      No  934  59
##      Yes   7   0

Not so good after all: at the 0.5 cutoff only 7 customers are predicted to purchase, and all 7 of them are wrong (none actually buys insurance). Let’s lower the cutoff to 0.25 and try again.

glm.pred=rep("No", 1000)
glm.pred[glm.probs>0.25]="Yes"
table(glm.pred, test.Y)
##         test.Y
## glm.pred  No Yes
##      No  919  48
##      Yes  22  11

At the 0.25 cutoff, 33 customers are predicted to purchase and 11 of them actually do, so the model is right about the Yes response 33% of the time, roughly five times better than the overall purchase rate of about 6%.
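
That 33% comes straight from the confusion matrix above; as a one-line check (a sketch):

tab=table(glm.pred, test.Y)
tab["Yes","Yes"]/sum(tab["Yes",]) # 11/(22+11), about 0.333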