This question should be answered using the [Weekly](https://rdrr.io/cran/ISLR/man/Weekly.html) data set, which is part of the ISLR package. This data is similar in nature to the [Smarket](https://rdrr.io/cran/ISLR/man/Smarket.html) data from this chapter's lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
(a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
As seen from the summaries and graphs below, there do not appear to be any major patterns in the data. The only relationship I was able to pick out is that Volume increases as Year increases, meaning the two have a positive correlation.
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
pairs(Weekly)
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
attach(Weekly)
plot(Volume) # Volume trends upward over time, consistent with its strong positive correlation with Year
(b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
The only predictor that appears to be significant is Lag2, which has a p-value of 0.0296.
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly, family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
(c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
The confusion matrix compares the classifications produced by our glm.fit with the actual data in order to see how well our model did at predicting the classification. Overall, our model was about 56% accurate, which can be calculated as (54+557)/1089. If we look at Down and Up separately, our model predicted Down correctly only 11% of the time, as calculated by 54/(54+430), while it predicted Up correctly 92% of the time, as calculated by 557/(48+557).
glm.probs = predict(glm.fit, type='response')
contrasts(Direction)
## Up
## Down 0
## Up 1
#we want to assign each observation a classification of Up or Down
glm.pred=rep("Down",1089) # creates a vector with Down 1089 times
glm.pred[glm.probs>0.5]="Up" #if glm.probs is greater than 0.5, the obs gets reassigned as Up
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 54 48
## Up 430 557
mean(glm.pred==Direction) #outputs the overall percentage of correct predictions
## [1] 0.5610652
mean(glm.pred!=Direction) #outputs the misclassification rate
## [1] 0.4389348
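The rates quoted above can also be pulled directly from the table object rather than typed by hand; a small sketch using the objects already created (the numbers match the 56%, 11%, and 92% figures):
conf = table(glm.pred, Direction)                 # confusion matrix saved as an object
sum(diag(conf))/sum(conf)                         # overall accuracy: (54+557)/1089 = 0.561
conf["Down","Down"]/sum(Direction=="Down")        # true Down weeks caught: 54/484 = 0.112
conf["Up","Up"]/sum(Direction=="Up")              # true Up weeks caught: 557/605 = 0.921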
(d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
Once the model was refit with Lag2 as the only predictor, we got a higher percentage of correct classifications, 62.5%, on the held-out test data.
#create an index for the training period (1990-2008)
train =(Year<2009)
head(train)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE
tail(train)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
#to check numbers in train versus original
Weekly.2009=Weekly[!train,] #this is our testing dataset
dim(Weekly)
## [1] 1089 9
dim(Weekly.2009)
## [1] 104 9
#gets Direction for the testing dataset
Direction.2009=Direction[!train]
glm.fits=glm(Direction~Lag2, data=Weekly, subset=train, family=binomial) #fits model using only `Lag2`
glm.probs=predict(glm.fits, Weekly.2009, type='response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred==Direction.2009) #did what I guessed match what really happened?
## [1] 0.625
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.375
(e) Repeat (d) using LDA.
As seen in the confusion matrix below, LDA produces the same correct classification rate of 62.5% as our logistic regression did.
##LDA Model - will this help us to get a better fit?
library(MASS)
lda.fit=lda(Direction~Lag2, data = Weekly, subset = train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
plot(lda.fit)
lda.pred=predict(lda.fit, Weekly.2009)
lda.class=lda.pred$class
table(lda.class, Direction.2009)
## Direction.2009
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class==Direction.2009)
## [1] 0.625
(f) Repeat (d) using QDA.
The percentage of correct classifications went down with QDA, at 58.65 percent. One thing I found interesting is that, looking at the confusion matrix, QDA picked Up every time.
qda.fit=qda(Direction~Lag2, data = Weekly, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class=predict(qda.fit, Weekly.2009)$class
table(qda.class, Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==Direction.2009)
## [1] 0.5865385
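Because every prediction is Up, that 58.65% accuracy is simply the share of Up weeks in the 2009-2010 test set (61 of 104). A quick check, reusing the objects above:
all(qda.class == "Up")        # TRUE: QDA predicts Up for every test week
mean(Direction.2009 == "Up")  # 0.5865, the proportion of Up weeks in the test set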
(g) Repeat (d) using KNN with K = 1.
KNN with K = 1 classified only 50% of the test observations correctly.
library(class)
train.X=as.matrix(Lag2[train])
test.X=as.matrix(Lag2[!train])
train.Direction=Direction[train]
dim(train.X)
## [1] 985 1
dim(test.X)
## [1] 104 1
dim(train.Direction)
## NULL
length(train.Direction)
## [1] 985
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred==Direction.2009)
## [1] 0.5
(h) Which of these methods appears to provide the best results on this data?
Logistic regression and LDA yielded the highest percentage of correct classifications, at 62.5%.
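For a side-by-side comparison, the held-out accuracies from (d)-(g) can be collected into one small data frame (values copied from the outputs above):
data.frame(method = c("Logistic (Lag2)", "LDA", "QDA", "KNN, K=1"),
           accuracy = c(0.625, 0.625, 0.5865, 0.500))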
(i) Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
I ran several different models below and found that the logistic regression and LDA models with Lag1^2 and Lag2 gave the best results, at 64.42% correct on the held-out data.
#taking the square root of Lag2 was not beneficial
glm.fit2=glm(Direction~(sqrt(Lag2)), data=Weekly, family=binomial)
## Warning in sqrt(Lag2): NaNs produced
glm.probs=predict(glm.fit2, Weekly.2009, type='response')
## Warning in sqrt(Lag2): NaNs produced
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 21 28
## Up 22 33
mean(glm.pred==Direction.2009) #correctly predicted
## [1] 0.5192308
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.4807692
#Lag1 and Lag2 and the interaction effect
glm.fit3=glm(Direction~Lag1*Lag2, data=Weekly, family=binomial)
glm.probs=predict(glm.fit3, Weekly.2009, type='response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 5 6
## Up 38 55
mean(glm.pred==Direction.2009) #correctly predicted
## [1] 0.5769231
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.4230769
#Lag1^2 and Lag2
glm.fit4=glm(Direction~Lag2+I(Lag1^2), data=Weekly, family=binomial)
glm.probs=predict(glm.fit4, Weekly.2009, type='response')
glm.pred=rep('Down', 104)
glm.pred[glm.probs>0.5]='Up'
table(glm.pred, Direction.2009)
## Direction.2009
## glm.pred Down Up
## Down 8 2
## Up 35 59
mean(glm.pred==Direction.2009) #correctly predicted
## [1] 0.6442308
mean(glm.pred!=Direction.2009) #misclassification rate
## [1] 0.3557692
#LDA with Lag1^2 and Lag2
lda.fit2=lda(Direction~Lag2 + I(Lag1^2), data = Weekly, subset = train)
lda.fit2
## Call:
## lda(Direction ~ Lag2 + I(Lag1^2), data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 I(Lag1^2)
## Down -0.03568254 4.964397
## Up 0.26036581 5.318940
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.45006708
## I(Lag1^2) 0.02767883
lda.pred2=predict(lda.fit2, Weekly.2009)
lda.class2=lda.pred2$class
table(lda.class2, Direction.2009)
## Direction.2009
## lda.class2 Down Up
## Down 8 2
## Up 35 59
mean(lda.class2==Direction.2009)
## [1] 0.6442308
mean(lda.class2!=Direction.2009)
## [1] 0.3557692
#QDA with Lag1^2 and Lag2
qda.fit2=qda(Direction~Lag2 + I(Lag1^2), data = Weekly, subset = train)
qda.fit2
## Call:
## qda(Direction ~ Lag2 + I(Lag1^2), data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 I(Lag1^2)
## Down -0.03568254 4.964397
## Up 0.26036581 5.318940
qda.pred=predict(qda.fit2, Weekly.2009)
qda.class=qda.pred$class
table(qda.class, Direction.2009)
## Direction.2009
## qda.class Down Up
## Down 32 40
## Up 11 21
mean(qda.class==Direction.2009)
## [1] 0.5096154
mean(qda.class!=Direction.2009)
## [1] 0.4903846
#KNN with different values of K
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k=5)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 16 21
## Up 27 40
mean(knn.pred==Direction.2009)
## [1] 0.5384615
mean(knn.pred!=Direction.2009)
## [1] 0.4615385
set.seed(1)
knn.pred=knn(train.X, test.X, train.Direction, k=10)
table(knn.pred, Direction.2009)
## Direction.2009
## knn.pred Down Up
## Down 17 21
## Up 26 40
mean(knn.pred==Direction.2009)
## [1] 0.5480769
mean(knn.pred!=Direction.2009)
## [1] 0.4519231
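For reference, the held-out accuracies from these experiments, gathered into one table (values copied from the outputs above):
data.frame(model = c("logistic: sqrt(Lag2)", "logistic: Lag1*Lag2", "logistic: Lag2 + Lag1^2",
                     "LDA: Lag2 + Lag1^2", "QDA: Lag2 + Lag1^2", "KNN, K=5", "KNN, K=10"),
           accuracy = c(0.519, 0.577, 0.644, 0.644, 0.510, 0.538, 0.548))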
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the [Auto](https://rdrr.io/cran/ISLR/man/Auto.html) data set.
(a) Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.
library(ISLR)
summary(Auto)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name
## amc matador : 5
## ford pinto : 5
## toyota corolla : 5
## amc gremlin : 4
## amc hornet : 4
## chevrolet chevette: 4
## (Other) :365
detach(Weekly)
attach(Auto)
mpg01=rep(0, length(mpg)) #creates a vector `mpg01` filled with 0s, one per observation
mpg01[mpg > median(mpg)] = 1 #any row with `mpg` greater than the median is reassigned a value of 1
Auto2 = data.frame(Auto, mpg01) # creates new dataset `Auto2` with added column of `mpg01`
summary(Auto2)
## mpg cylinders displacement horsepower
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0
##
## weight acceleration year origin
## Min. :1613 Min. : 8.00 Min. :70.00 Min. :1.000
## 1st Qu.:2225 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000
## Median :2804 Median :15.50 Median :76.00 Median :1.000
## Mean :2978 Mean :15.54 Mean :75.98 Mean :1.577
## 3rd Qu.:3615 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000
## Max. :5140 Max. :24.80 Max. :82.00 Max. :3.000
##
## name mpg01
## amc matador : 5 Min. :0.0
## ford pinto : 5 1st Qu.:0.0
## toyota corolla : 5 Median :0.5
## amc gremlin : 4 Mean :0.5
## amc hornet : 4 3rd Qu.:1.0
## chevrolet chevette: 4 Max. :1.0
## (Other) :365
(b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
Using the correlation values and comparing them to the boxplots, the predictors that look like they may be most useful when predicting mpg01 are cylinders, displacement, horsepower, and weight.
pairs(Auto2[,-9]) #not very helpful since mpg01 is binary
cor(Auto2[,-9]) #looks like `cylinders`, `displacement`, `horsepower`, and `weight` may be correlated to `mpg01`
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
par(mfrow=c(2,2))
boxplot(cylinders~mpg01)
boxplot(displacement~mpg01)
boxplot(horsepower~mpg01)
boxplot(weight~mpg01)
(c) Split the data into a training set and a test set.
set.seed(1)
inTrain=sample(1:nrow(Auto2), nrow(Auto2)*0.7) #inTrain is a random 70% of the row indices of Auto2
inTrain
## [1] 324 167 129 299 270 187 307 85 277 362 330 263 329 79 213 37 105
## [18] 217 366 165 290 383 89 289 340 326 382 42 111 20 44 343 70 121
## [35] 40 172 25 248 198 39 298 280 160 14 130 45 22 206 230 193 104
## [52] 367 255 341 342 103 331 13 296 375 176 279 110 84 29 141 252 221
## [69] 108 304 33 347 149 287 102 145 118 323 107 64 224 337 51 325 372
## [86] 138 390 389 282 143 285 170 48 204 295 24 181 214 225 163 43 1
## [103] 328 78 284 116 233 61 86 374 49 242 246 247 239 219 135 364 363
## [120] 310 53 348 65 376 124 77 218 98 194 19 31 174 237 75 16 358
## [137] 9 50 92 122 152 386 207 244 229 350 355 391 223 373 309 140 126
## [154] 349 344 319 258 15 271 388 195 201 318 17 212 127 133 41 384 392
## [171] 159 117 72 36 315 294 157 378 313 306 272 106 185 88 281 228 238
## [188] 368 80 30 93 234 220 240 369 164 168 243 200 184 260 100 113 359
## [205] 73 27 333 235 38 62 134 132 35 125 99 267 269 71 153 262 377
## [222] 28 183 148 308 227 365 60 171 354 173 12 202 305 371 265 26 322
## [239] 334 208 288 297 357 249 210 278 82 97 264 250 56 216 101 336 259
## [256] 2 192 131 275 169 292 291 317 83 90 23 68 222 190 91 57 311
## [273] 345 52
training = Auto2[inTrain, -1] #training dataset: the rows in inTrain (random 70%) and all columns except mpg (first column)
summary(training)
## cylinders displacement horsepower weight
## Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1649
## 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 78.0 1st Qu.:2224
## Median :4.000 Median :151.0 Median : 93.5 Median :2790
## Mean :5.464 Mean :193.2 Mean :103.8 Mean :2967
## 3rd Qu.:8.000 3rd Qu.:261.5 3rd Qu.:123.8 3rd Qu.:3612
## Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin
## Min. : 8.00 Min. :70.00 Min. :1.00
## 1st Qu.:14.00 1st Qu.:73.00 1st Qu.:1.00
## Median :15.50 Median :76.00 Median :1.00
## Mean :15.66 Mean :76.06 Mean :1.54
## 3rd Qu.:17.27 3rd Qu.:79.00 3rd Qu.:2.00
## Max. :24.80 Max. :82.00 Max. :3.00
##
## name mpg01
## amc gremlin : 4 Min. :0.0000
## amc hornet : 4 1st Qu.:0.0000
## amc matador : 3 Median :1.0000
## chevrolet caprice classic: 3 Mean :0.5073
## chevrolet chevette : 3 3rd Qu.:1.0000
## chevrolet impala : 3 Max. :1.0000
## (Other) :254
dim(training)
## [1] 274 9
testing = Auto2[-inTrain, -1] #testing dataset - everything not in training dataset
dim(testing)
## [1] 118 9
dim(Auto2)
## [1] 392 10
test.mpg01=mpg01[-inTrain] #vector of mpg01 for the test dataset
train.mpg01=mpg01[inTrain] #vector of mpg01 for the training dataset
length(test.mpg01)
## [1] 118
length(train.mpg01)
## [1] 274
(d) Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
This LDA model yields an 11.86% misclassification rate, which means it predicted about 88% of the test observations correctly, which seems pretty good.
library(MASS)
lda.fit=lda(mpg01~cylinders+displacement+horsepower+weight, data = Auto2, subset=inTrain)
lda.fit
## Call:
## lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2,
## subset = inTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4927007 0.5072993
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.777778 271.9333 129.13333 3611.052
## 1 4.187050 116.8129 79.27338 2342.165
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.3962357999
## displacement -0.0047630097
## horsepower 0.0061919395
## weight -0.0008321338
plot(lda.fit)
lda.pred=predict(lda.fit, testing)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class #assigns classification for our predictions
table(lda.class, test.mpg01) #compares our predictions with the real results for our test data
## test.mpg01
## lda.class 0 1
## 0 50 3
## 1 11 54
mean(lda.class==test.mpg01) #correctly predicted
## [1] 0.8813559
mean(lda.class!=test.mpg01) #misclassification rate
## [1] 0.1186441
(e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
The results from QDA were very much like LDA, with the same 11.86% misclassification rate (though the confusion matrices differ slightly).
qda.fit=qda(mpg01~cylinders+displacement+horsepower+weight, data = Auto2, subset=inTrain)
qda.fit
## Call:
## qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2,
## subset = inTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4927007 0.5072993
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.777778 271.9333 129.13333 3611.052
## 1 4.187050 116.8129 79.27338 2342.165
qda.class=predict(qda.fit,testing)$class
#names(qda.pred)
table(qda.class, test.mpg01)
## test.mpg01
## qda.class 0 1
## 0 52 5
## 1 9 52
mean(qda.class==test.mpg01) #correctly predicted
## [1] 0.8813559
mean(qda.class!=test.mpg01) #misclassification rate
## [1] 0.1186441
(f) Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
Logistic regression resulted in a 9.3% misclassification rate, which means about 91% were predicted correctly - this is the best so far.
glm.fit=glm(mpg01~cylinders + displacement + horsepower + weight, data = Auto2, subset=inTrain, family=binomial)
glm.probs=predict(glm.fit, testing, type='response')
glm.pred=rep(0,length(glm.probs))
glm.pred[glm.probs>0.5]=1
table(glm.pred, test.mpg01)
## test.mpg01
## glm.pred 0 1
## 0 53 3
## 1 8 54
mean(glm.pred==test.mpg01) #correctly predicted
## [1] 0.9067797
mean(glm.pred!=test.mpg01) #misclassification rate
## [1] 0.09322034
(g) Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? Which value of K seems to perform the best on this data set?
I ran KNN for several values of K, with the best result being K = 5 at a 12.71% misclassification rate. K = 1 gave a 13.56% misclassification rate, K = 10 gave 14.41%, and K = 100 gave 15.25%.
train.X=cbind(training$cylinders, training$weight, training$displacement, training$horsepower)
test.X=cbind(testing$cylinders, testing$weight, testing$displacement, testing$horsepower)
#k=1
knn.pred=knn(train.X, test.X, train.mpg01)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8644068
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1355932
#k=5
knn.pred=knn(train.X, test.X, train.mpg01, k=5)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8728814
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1271186
#k=10
knn.pred=knn(train.X, test.X, train.mpg01, k=10)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8559322
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1440678
#k=100
knn.pred=knn(train.X, test.X, train.mpg01, k=100)
mean(knn.pred==test.mpg01) #correctly predicted
## [1] 0.8474576
mean(knn.pred!=test.mpg01) #misclassification rate
## [1] 0.1525424
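The same sweep over K can be written more compactly as a loop; this is just a sketch reusing train.X, test.X, and train.mpg01 from above (without a fixed seed the errors may differ slightly from run to run, since ties in KNN are broken at random):
ks = c(1, 5, 10, 100)
err = sapply(ks, function(k) {
  pred = knn(train.X, test.X, train.mpg01, k = k)  # refit KNN for each K
  mean(pred != test.mpg01)                         # test misclassification rate
})
data.frame(K = ks, error = err)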
Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
Below are the results for logistic regression, LDA, QDA, and KNN models. I found that LDA and KNN with K = 1 yielded the lowest error rate of 13.82%, followed by QDA at 14.47%, logistic regression at 15.79%, and then by the other KNN models with various values for K.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## ptratio black lstat medv
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046
## zn -0.3916785 0.17552032 -0.4129946 0.3604453
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252
## chas -0.1215152 0.04878848 -0.0539293 0.1752602
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208
## rm -0.3555015 0.12806864 -0.6138083 0.6953599
## age 0.2615150 -0.27353398 0.6023385 -0.3769546
## dis -0.2324705 0.29151167 -0.4969958 0.2499287
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867
## black -0.1773833 1.00000000 -0.3660869 0.3334608
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627
## medv -0.5077867 0.33346082 -0.7376627 1.0000000
Setting up the new variable crim_ind and the new Boston2 dataset.
attach(Boston)
crim_ind=rep(0, length(crim)) #create a binary crime indicator
crim_ind[crim>median(crim)]=1 # reassign rows with crime rate greater than the median to 1
Boston2 = data.frame(Boston, crim_ind) #Boston2 dataset has the new crim_ind variable added
summary(Boston2)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim_ind
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
**Create training and testing datasets.**
set.seed(1)
train = sample(1:nrow(Boston2), 0.7*nrow(Boston2)) #train is a random 70% of the Boston2 row indices, used for training
Boston2.train = Boston2[train,] #training dataset
Boston2.test = Boston2[-train,] #testing dataset
dim(Boston2.train)
## [1] 354 15
dim(Boston2.test)
## [1] 152 15
crim_ind.test=crim_ind[-train] # create the crim indicator vector for the Boston2.test testing dataset
crim_ind.test
## [1] 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1
## [71] 1 1 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 0 0 1 0 0 0
Logistic Regression with all predictors
#fit for all predictors
glm.fit=glm(crim_ind~zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+black+lstat+medv, data=Boston2, subset=train, family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = crim_ind ~ zn + indus + chas + nox + rm + age +
## dis + rad + tax + ptratio + black + lstat + medv, family = binomial,
## data = Boston2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1363 -0.1504 -0.0009 0.0018 3.5797
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.731502 8.009551 -5.335 9.55e-08 ***
## zn -0.105323 0.045585 -2.310 0.020862 *
## indus -0.066308 0.057649 -1.150 0.250057
## chas 0.187896 0.810675 0.232 0.816711
## nox 56.344292 10.073420 5.593 2.23e-08 ***
## rm -0.234402 0.900292 -0.260 0.794585
## age 0.022052 0.014304 1.542 0.123141
## dis 1.143918 0.303909 3.764 0.000167 ***
## rad 0.657976 0.184373 3.569 0.000359 ***
## tax -0.006299 0.003239 -1.944 0.051849 .
## ptratio 0.292158 0.155755 1.876 0.060689 .
## black -0.008703 0.005212 -1.670 0.094933 .
## lstat 0.112414 0.060168 1.868 0.061715 .
## medv 0.191745 0.087816 2.184 0.028999 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.65 on 353 degrees of freedom
## Residual deviance: 140.41 on 340 degrees of freedom
## AIC: 168.41
##
## Number of Fisher Scoring iterations: 9
Logistic Regression with significant predictors
# fit with only the predictors that are significant - zn, nox, dis, rad and medv
glm.fit2=glm(crim_ind~ zn+nox+dis+rad+medv, data=Boston2, subset=train, family=binomial)
summary(glm.fit2)
##
## Call:
## glm(formula = crim_ind ~ zn + nox + dis + rad + medv, family = binomial,
## data = Boston2, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0766 -0.3226 -0.0034 0.0069 3.2715
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -30.10487 4.91615 -6.124 9.14e-10 ***
## zn -0.09780 0.03537 -2.765 0.005698 **
## nox 42.65837 7.07436 6.030 1.64e-09 ***
## dis 0.84586 0.23756 3.561 0.000370 ***
## rad 0.49353 0.13194 3.741 0.000184 ***
## medv 0.08028 0.02906 2.763 0.005730 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 490.65 on 353 degrees of freedom
## Residual deviance: 164.99 on 348 degrees of freedom
## AIC: 176.99
##
## Number of Fisher Scoring iterations: 8
glm.probs=predict(glm.fit2, Boston2.test, type='response')
glm.pred=rep(0, length(glm.probs))
glm.pred[glm.probs > 0.5] = 1
table(glm.pred, crim_ind.test)
## crim_ind.test
## glm.pred 0 1
## 0 60 11
## 1 13 68
mean(glm.pred == crim_ind.test)
## [1] 0.8421053
mean(glm.pred != crim_ind.test)
## [1] 0.1578947
LDA
#LDA with the significant predictors
lda.fit=lda(crim_ind~zn+nox+dis+rad+medv, data=Boston2, subset=train)
lda.fit
## Call:
## lda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5084746 0.4915254
##
## Group means:
## zn nox dis rad medv
## 0 21.188889 0.4695150 5.070315 4.127778 25.37056
## 1 1.172414 0.6377989 2.537681 14.873563 20.44770
##
## Coefficients of linear discriminants:
## LD1
## zn -0.010534928
## nox 9.134260498
## dis 0.008656437
## rad 0.076273497
## medv 0.029024616
lda.pred = predict(lda.fit, Boston2.test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
table(lda.class, crim_ind.test)
## crim_ind.test
## lda.class 0 1
## 0 71 19
## 1 2 60
mean(lda.class == crim_ind.test)
## [1] 0.8618421
mean(lda.class != crim_ind.test)
## [1] 0.1381579
QDA
#QDA
qda.fit=qda(crim_ind~zn+nox+dis+rad+medv, data=Boston2, subset=train)
qda.fit
## Call:
## qda(crim_ind ~ zn + nox + dis + rad + medv, data = Boston2, subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.5084746 0.4915254
##
## Group means:
## zn nox dis rad medv
## 0 21.188889 0.4695150 5.070315 4.127778 25.37056
## 1 1.172414 0.6377989 2.537681 14.873563 20.44770
qda.pred = predict(qda.fit, Boston2.test)
names(qda.pred)
## [1] "class" "posterior"
qda.class=qda.pred$class
table(qda.class, crim_ind.test)
## crim_ind.test
## qda.class 0 1
## 0 66 15
## 1 7 64
mean(qda.class== crim_ind.test)
## [1] 0.8552632
mean(qda.class!=crim_ind.test)
## [1] 0.1447368
#KNN classification
library(class)
train.X=cbind(nox,rad,ptratio,medv)[train,]
test.X=cbind(nox,rad,ptratio,medv)[-train,]
train.crim_ind = crim_ind[train]
dim(train.X)
## [1] 354 4
dim(test.X)
## [1] 152 4
dim(train.crim_ind)
## NULL
set.seed(1)
#KNN=1
knn.pred=knn(train.X, test.X, train.crim_ind, k=1)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 58 6
## 1 15 73
mean(knn.pred== crim_ind.test)
## [1] 0.8618421
mean(knn.pred!=crim_ind.test)
## [1] 0.1381579
#KNN=5
knn.pred=knn(train.X, test.X, train.crim_ind, k=5)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 61 12
## 1 12 67
mean(knn.pred== crim_ind.test)
## [1] 0.8421053
mean(knn.pred!=crim_ind.test)
## [1] 0.1578947
#KNN=10
knn.pred=knn(train.X, test.X, train.crim_ind, k=10)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 67 20
## 1 6 59
mean(knn.pred== crim_ind.test)
## [1] 0.8289474
mean(knn.pred!=crim_ind.test)
## [1] 0.1710526
#KNN=100
knn.pred=knn(train.X, test.X, train.crim_ind, k=100)
table(knn.pred, crim_ind.test)
## crim_ind.test
## knn.pred 0 1
## 0 73 39
## 1 0 40
mean(knn.pred== crim_ind.test)
## [1] 0.7434211
mean(knn.pred!=crim_ind.test)
## [1] 0.2565789
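To summarize, the test error rates for the Boston crime models above, collected into one table (values copied from the outputs):
data.frame(model = c("logistic (zn, nox, dis, rad, medv)", "LDA", "QDA",
                     "KNN, K=1", "KNN, K=5", "KNN, K=10", "KNN, K=100"),
           test.error = c(0.1579, 0.1382, 0.1447, 0.1382, 0.1579, 0.1711, 0.2566))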