Loading the libraries
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.2
library(class)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##10. This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1, 089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010
Looking at the Weekly dataset
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
Visualizing the data set
pairs(Weekly)
Correlation among the variables, take out Direction, it is qualitative The only correlation worth noting is between Year and Volume =0.8419
cor(Weekly [,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Plot Volume, from 1990 - 2010, weekly trading volume has increased
plot(Weekly$Volume)
glm.fits=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial )
summary (glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Predict the probabilites the volume will go up
glm.probs=predict(glm.fits,type="response")
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
In order to make a prediction as to whether the market will go up or down on a particular day, we must convert these predicted probabilities into class labels, Up or Down. The following two commands create a vector of class predictions based on whether the predicted probability of a market increase is greater than or less than 0.5. The first command creates a vector of 1,250 Down elements. The second line transforms to Up all of the elements for which the predicted probability of a market increase exceeds 0.5.
glm.pred=rep("Down", 1089)
glm.pred[glm.probs >.5]="Up"
Confusion matrix Given these predictions, the table() function can be used to produce a confusion matrix in order to determine how many observations were correctly or incorrectly classified. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression. The model correctly predicted 557 weeks the market would go up and 54 weeks the market would go down. The model incorrectly predicted 48 weeks the market would go up but it went down and 430 weeks the market would go down and it went up. Model accuracy rate: 0.561
table(glm.pred ,Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
Accuracy rate
accuracy_rate <- (557+54)/1089
accuracy_rate
## [1] 0.5610652
Review Weekly data set to make sure the years are correct: 1990 - 2010
table(Weekly$Year)
##
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005
## 47 52 52 52 52 52 53 52 52 52 52 52 52 52 52 52
## 2006 2007 2008 2009 2010
## 52 53 52 52 52
Create a vector corresponding to the observations from 1990-2008 (Weekly.train). Vector to create a held out data set of observations from 2009 - 2010 (Weekly.test).
Weekly.train <- (Weekly$Year<2009) ##Boolean vector (T or F) 1089 elements True before 2009, False 2009-2010
Weekly.test <-Weekly[!Weekly.train ,] ##Get all the observations "rows" that are not Weekly.train from Weekly (subset of Weekly dataset)
Direction.test <- Weekly$Direction[!Weekly.train]
dim(Weekly.test)
## [1] 104 9
Fit the logistic regression model to the training subset
glm.fits2 = glm(Direction~Lag2, data = Weekly, family = binomial, subset = Weekly.train)
Test the model on the test set
glm.probs2 = predict(glm.fits2, Weekly.test, type = "response")
Compute predictions of new model
glm.pred2=rep("Down", 104)
glm.pred2[glm.probs2 >.5]="Up"
table(glm.pred2, Direction.test)
## Direction.test
## glm.pred2 Down Up
## Down 9 5
## Up 34 56
Accuracy rate
accuracy_rate2 <- (56+9)/104
accuracy_rate2
## [1] 0.625
lda.fit = lda(Direction~Lag2, data = Weekly, subset = Weekly.train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = Weekly.train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
lda.pred=predict (lda.fit , Weekly.test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class=lda.pred$class
table(lda.class, Direction.test)
## Direction.test
## lda.class Down Up
## Down 9 5
## Up 34 56
Accuracy rate
accuracy_ratelda <- (56+9)/104
accuracy_ratelda
## [1] 0.625
qda.fit = qda(Direction~Lag2, data = Weekly, subset = Weekly.train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = Weekly.train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.pred=predict (qda.fit , Weekly.test)
names(qda.pred)
## [1] "class" "posterior"
qda.class=qda.pred$class
table(qda.class, Direction.test)
## Direction.test
## qda.class Down Up
## Down 0 0
## Up 43 61
Accuracy rate qda - predicted everything was Up (overfitting?)
accuracy_rateqda <- (61+0)/104
accuracy_rateqda
## [1] 0.5865385
train.X = as.matrix(Weekly$Lag2[Weekly.train]) ##A matrix containing the predictors associated with the training data
test.X = as.matrix(Weekly$Lag2[!Weekly.train]) ##A matrix containing the predictors associated with the data for which we wish to make predictions
train.Direction = Weekly$Direction [Weekly.train] ##A vector containing the class labels for the training observations, labeled
dim(train.X)
## [1] 985 1
set seed, fit KNN model
set.seed(22)
knn.pred=knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, Direction.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual Down Up
## Down 21 29
## Up 22 32
accuracy_rateknn <- (32+21)/104
accuracy_rateknn
## [1] 0.5096154
Logistic regression model with Lag 2, Lag 1 * Lag2 (interaction effect)
glm.fits3 = glm(Direction~Lag2 + Lag2*Lag1, data = Weekly, family = binomial, subset = Weekly.train)
Test the model on the test set
glm.probs3 = predict(glm.fits3, Weekly.test, type = "response")
Compute predictions of new model
glm.pred3=rep("Down", 104)
glm.pred3[glm.probs3 >.5]="Up"
table(glm.pred3, Direction.test)
## Direction.test
## glm.pred3 Down Up
## Down 7 8
## Up 36 53
accuracy_rateknn <- (53+7)/104
accuracy_rateknn
## [1] 0.5769231
lda.fit2 = lda(Direction~Lag2 + Lag3 + Lag4, data = Weekly, subset = Weekly.train)
lda.fit2
## Call:
## lda(Direction ~ Lag2 + Lag3 + Lag4, data = Weekly, subset = Weekly.train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Lag3 Lag4
## Down -0.03568254 0.17080045 0.15925624
## Up 0.26036581 0.08404044 0.09220956
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.41403813
## Lag3 -0.09567686
## Lag4 -0.13077102
lda.pred2=predict (lda.fit2 , Weekly.test)
names(lda.pred2)
## [1] "class" "posterior" "x"
lda.class2=lda.pred2$class
table(lda.class2, Direction.test)
## Direction.test
## lda.class2 Down Up
## Down 8 5
## Up 35 56
Accuracy rate
accuracy_ratelda2 <- (56+8)/104
accuracy_ratelda2
## [1] 0.6153846
qda.fit2 = qda(Direction~Lag2 + Lag3 + Lag2 * Lag3, data = Weekly, subset = Weekly.train)
qda.fit2
## Call:
## qda(Direction ~ Lag2 + Lag3 + Lag2 * Lag3, data = Weekly, subset = Weekly.train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Lag3 Lag2:Lag3
## Down -0.03568254 0.17080045 -0.1937158
## Up 0.26036581 0.08404044 -0.6405132
qda.pred2=predict (qda.fit2 , Weekly.test)
names(qda.pred2)
## [1] "class" "posterior"
qda.class2=qda.pred2$class
table(qda.class2, Direction.test)
## Direction.test
## qda.class2 Down Up
## Down 9 10
## Up 34 51
Accuracy rate qda - predicted everything was Up (overfitting?)
accuracy_rateqda2 <- (51+9)/104
accuracy_rateqda2
## [1] 0.5769231
train.X2 = cbind(Weekly$Lag2, Weekly$Lag3)[Weekly.train ,] ##A matrix containing the predictors associated with the training data
test.X2 = cbind(Weekly$Lag2, Weekly$Lag3)[!Weekly.train ,] ##A matrix containing the predictors associated with the data for which we wish to make predictions
train.Direction = Weekly$Direction [Weekly.train] ##A vector containing the class labels for the training observations, labeled
dim(train.X2)
## [1] 985 2
set seed, fit KNN model
set.seed(22)
knn.pred2=knn(train.X2, test.X2, train.Direction, k=3)
table(knn.pred2, Direction.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual Down Up
## Down 16 20
## Up 27 41
accuracy_rateknn <- (41+16)/104
accuracy_rateknn
## [1] 0.5480769
##11. In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
Look at the Auto data set
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median.
Auto2 = Auto
med_mpg = median(Auto2$mpg)
Auto2 = mutate(Auto2, mpg01 = ifelse(mpg > med_mpg, 1,0))
str(Auto2)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpg01 : num 0 0 0 0 0 0 0 0 0 0 ...
11 (b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
pairs(Auto2) ##nothing really useful from this plot
From the correlation matrix, cylinders, displacement, horsepower, weight have negative correlations to mpg01.
mpg has a positive correlation to mpg01
cor(Auto2[,-9]) ##take out name, qualitative variable
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
Set mpg01 to factor variable
Auto2$mpg01 = as.factor(Auto2$mpg01)
set.seed(22)
tr_ind = sample(nrow(Auto2), 0.7*nrow(Auto2), replace = F) ##using a 70/30 train/test split
Auto2train = Auto2[tr_ind,]
Auto2test = Auto2[-tr_ind,]
Looking at Auto2train 274 obs, 10 variables, 274 is 70% of 392
str(Auto2train)
## 'data.frame': 274 obs. of 10 variables:
## $ mpg : num 14 29.8 34.2 38 19.1 19 25.4 22 40.8 31 ...
## $ cylinders : num 8 4 4 4 6 6 6 6 4 4 ...
## $ displacement: num 302 89 105 91 225 156 168 232 85 91 ...
## $ horsepower : num 137 62 70 67 90 108 116 112 65 68 ...
## $ weight : num 4042 1845 2200 1995 3381 ...
## $ acceleration: num 14.5 15.3 13.2 16.2 18.7 15.5 12.6 14.7 19.2 17.6 ...
## $ year : num 73 80 79 82 80 76 81 82 80 82 ...
## $ origin : num 1 2 1 3 1 3 3 1 3 3 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 142 281 225 83 98 276 275 147 79 178 ...
## $ mpg01 : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 2 1 2 2 ...
Looking at Auto2test 118 obs, 10 variables, 118 is 30% of 392
str(Auto2test)
## 'data.frame': 118 obs. of 10 variables:
## $ mpg : num 18 17 14 14 15 15 14 21 25 24 ...
## $ cylinders : num 8 8 8 8 8 8 8 6 4 4 ...
## $ displacement: num 318 302 454 440 390 383 455 200 110 107 ...
## $ horsepower : num 150 140 220 215 190 170 225 85 87 90 ...
## $ weight : num 3436 3449 4354 4312 3850 ...
## $ acceleration: num 11 10.5 9 8.5 8.5 10 10 16 17.5 14.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 2 2 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 231 161 54 223 2 101 30 150 211 16 ...
## $ mpg01 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 2 ...
Creating the labels
mpg01.test = Auto2$mpg01[-tr_ind]
#mpg01.test = Auto2test$mpg01 another way to do it
str(mpg01.test)
## Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 2 ...
Fit the LDA model on the training set
lda.fitAuto = lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
lda.fitAuto
## Call:
## lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
##
## Prior probabilities of groups:
## 0 1
## 0.4781022 0.5218978
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.664122 266.7099 126.71756 3606.679
## 1 4.195804 114.6538 77.75524 2306.245
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.192661950
## displacement -0.003520215
## horsepower 0.002627014
## weight -0.001160826
lda.predAuto=predict (lda.fitAuto , Auto2test)
names(lda.predAuto)
## [1] "class" "posterior" "x"
lda.classAuto=lda.predAuto$class
table(lda.classAuto, mpg01.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 57 2
## 1 8 51
Accuracy rate
accuracy_rateldaAuto <- (51+57)/118
accuracy_rateldaAuto
## [1] 0.9152542
Accuracy rate another way
mean(lda.classAuto==mpg01.test)
## [1] 0.9152542
11 (d) LDA (b). What is the test error of the model obtained? Error rate 0.084
mean(lda.classAuto!=mpg01.test)
## [1] 0.08474576
11 (e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained? 0.076
qda.fitAuto = qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
qda.fitAuto
## Call:
## qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
##
## Prior probabilities of groups:
## 0 1
## 0.4781022 0.5218978
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.664122 266.7099 126.71756 3606.679
## 1 4.195804 114.6538 77.75524 2306.245
qda.predAuto=predict (qda.fitAuto , Auto2test)
names(qda.predAuto)
## [1] "class" "posterior"
qda.classAuto=qda.predAuto$class
table(qda.classAuto, mpg01.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 61 5
## 1 4 48
Accuracy rate
mean(qda.classAuto==mpg01.test)
## [1] 0.9237288
11 (e) QDA (b). What is the test error of the model obtained? Error rate
mean(qda.classAuto!=mpg01.test)
## [1] 0.07627119
glm.fitAuto = glm(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train, family = binomial)
glm.probsAuto = predict(glm.fitAuto, Auto2test, type = "response")
glm.predAuto= ifelse(glm.probsAuto >=0.5,1,0)
table(glm.predAuto, mpg01.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 58 7
## 1 7 46
Accuracy rate
mean(glm.predAuto==mpg01.test)
## [1] 0.8813559
11 (f) Logistic Regression (b). What is the test error of the model obtained? Error rate
mean(glm.predAuto!=mpg01.test)
## [1] 0.1186441
11 (g) Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? K=1: 0.11 K=3: 0.11 (different confusion matrix than K=1, same error rate) K=5: 0.09
Which value of K seems to perform the best on this data set? K=5
train.XAuto2 = cbind(Auto2$cylinders, Auto2$displacement, Auto2$horsepower, Auto2$weight)[tr_ind,] ##A matrix containing the predictors associated with the training data
test.XAuto2 = cbind(Auto2$cylinders, Auto2$displacement, Auto2$horsepower, Auto2$weight)[-tr_ind,] ##A matrix containing the predictors associated with the data for which we wish to make predictions
train.mpg01 = Auto2$mpg01 [tr_ind] ##A vector containing the class labels for the training observations, labeled
dim(train.XAuto2)
## [1] 274 4
set seed, fit KNN model, k=1
set.seed(22)
knn.predAuto=knn(train.XAuto2, test.XAuto2, train.mpg01, k=1)
table(knn.predAuto, mpg01.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 56 5
## 1 9 48
mean(knn.predAuto==mpg01.test)
## [1] 0.8813559
mean(knn.predAuto!=mpg01.test)
## [1] 0.1186441
set seed, fit KNN model, k=3
set.seed(22)
knn.predAuto3=knn(train.XAuto2, test.XAuto2, train.mpg01, k=3)
table(knn.predAuto3, mpg01.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 57 6
## 1 8 47
mean(knn.predAuto3==mpg01.test)
## [1] 0.8813559
mean(knn.predAuto3!=mpg01.test)
## [1] 0.1186441
set seed, fit KNN model, k=5
set.seed(22)
knn.predAuto5=knn(train.XAuto2, test.XAuto2, train.mpg01, k=5)
table(knn.predAuto5, mpg01.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 58 4
## 1 7 49
mean(knn.predAuto5==mpg01.test)
## [1] 0.9067797
mean(knn.predAuto5!=mpg01.test)
## [1] 0.09322034
##13. Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.
Below are the errors rates of 3 different versions of three different models (9 models) (Logestic regerssion, LDA, KNN). Top 3: KNN K=3 predictors nox, rad, dis had the lowest error rate; error rate 0.0592 Logestic regression predictors nox, rad, dis, age, tax, indus; error rate 0.1052 Logestic regression predictors nox, rad, dis, black; error rate 0.14473
Appears logistic regression, a simpler model would be the preferred model to go with.
#Logistic regression (3 models) glm.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1052 glm.fitBoston2: nox + rad+ dis Error: 0.15789 glm.fitBoston3: nox + rad + dis + black Error: 0.14473
#LDA (3 models) lda.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1644 lda.fitBoston2: nox + rad+ dis Error: 0.1644 lda.fitBoston3: nox + rad + dis + black Error: 0.1513
#KNN (3 models) KNN = 1 (nox, rad, dis, age, indus) Error: 0.1579 KNN = 3 (nox, rad, dis) Error: 0.0592 KNN = 5 (nox, rad, dis, black) Error: 0.1710
Looking at the Boston dataset
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Create Boston 2 dataset to add binary variable
Boston2 = Boston
Create a binary variable, crim02, that contains a 1 if crime contains a value above its median, and a 0 if crime contains a value below its median.
med_crim = median(Boston2$crim)
Boston2 = mutate(Boston2, crim02 = ifelse(crim > med_crim, 1,0))
str(Boston2)
## 'data.frame': 506 obs. of 15 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ crim02 : num 0 0 0 0 0 0 0 0 0 0 ...
summary(Boston2$crim02)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.5 0.5 1.0 1.0
Let’s see what correlates with crim02
pairs(Boston2)
Correlation matrix on crim02 nox 0.72, rad 0.62, dis -0.62, age 0.61, tax 0.61, indus 0.60,
cor(Boston2)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim02 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim02 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv crim02
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim02 -0.35121093 0.4532627 -0.2630167 1.00000000
Make crim02 a factor variable
Boston2$crim02 = as.factor(Boston2$crim02)
str(Boston2)
## 'data.frame': 506 obs. of 15 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ crim02 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
Create train test datasets Boston2train Boston2test
set.seed(22)
trb_ind = sample(nrow(Boston2), 0.7*nrow(Boston2), replace = F) ##using a 70/30 train/test split
Boston2train = Boston2[trb_ind,]
Boston2test = Boston2[-trb_ind,]
Create labels crim02.test
crim02.test = Boston2$crim02[-trb_ind]
Look at Boston2Train 354/15 - Boston2Train 354/15 + Boston2Test 152/15 = Boston2 505/15
str(Boston2train)
## 'data.frame': 354 obs. of 15 variables:
## $ crim : num 3.6737 11.5779 0.0715 0.0672 15.0234 ...
## $ zn : num 0 0 0 0 0 70 0 0 0 0 ...
## $ indus : num 18.1 18.1 4.49 3.24 18.1 ...
## $ chas : int 0 0 0 0 0 0 0 0 1 1 ...
## $ nox : num 0.583 0.7 0.449 0.46 0.614 0.4 0.671 0.544 0.489 0.77 ...
## $ rm : num 6.31 5.04 6.12 6.33 5.3 ...
## $ age : num 51.9 97 56.8 17.2 97.3 10 100 87.3 59.1 97.4 ...
## $ dis : num 3.99 1.77 3.75 5.21 2.1 ...
## $ rad : int 24 24 3 4 24 5 24 4 4 24 ...
## $ tax : num 666 666 247 430 666 358 666 304 277 666 ...
## $ ptratio: num 20.2 20.2 18.5 16.9 20.2 14.8 20.2 18.4 18.6 20.2 ...
## $ black : num 389 397 395 375 349 ...
## $ lstat : num 10.58 25.68 8.44 7.34 24.91 ...
## $ medv : num 21.2 9.7 22.2 22.6 12 29 10.2 23.8 24.4 17.8 ...
## $ crim02 : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 2 1 2 ...
Look at Boston2Test 152/15
str(Boston2test)
## 'data.frame': 152 obs. of 15 variables:
## $ crim : num 0.0273 0.069 0.0883 0.2112 0.17 ...
## $ zn : num 0 0 12.5 12.5 12.5 12.5 12.5 0 0 0 ...
## $ indus : num 7.07 2.18 7.87 7.87 7.87 7.87 7.87 8.14 8.14 8.14 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.469 0.458 0.524 0.524 0.524 0.524 0.524 0.538 0.538 0.538 ...
## $ rm : num 7.18 7.15 6.01 5.63 6 ...
## $ age : num 61.1 54.2 66.6 100 85.9 94.3 39 84.5 81.7 89.2 ...
## $ dis : num 4.97 6.06 5.56 6.08 6.59 ...
## $ rad : int 2 3 5 5 5 5 5 4 4 4 ...
## $ tax : num 242 222 311 311 311 311 311 307 307 307 ...
## $ ptratio: num 17.8 18.7 15.2 15.2 15.2 15.2 15.2 21 21 21 ...
## $ black : num 393 397 396 387 387 ...
## $ lstat : num 4.03 5.33 12.43 29.93 17.1 ...
## $ medv : num 34.7 36.2 22.9 16.5 18.9 15 21.7 18.2 17.5 19.6 ...
## $ crim02 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...
Looking at the class labels
summary(crim02.test)
## 0 1
## 76 76
#Logistic regression (3 models) glm.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1052 glm.fitBoston2: nox + rad+ dis Error: 0.15789 glm.fitBoston3: nox + rad + dis + black Error: 0.14473
glm.fitBoston = glm(crim02 ~ nox + rad + dis + age + tax + indus, data = Boston2train, family = binomial)
glm.probsBoston = predict(glm.fitBoston, Boston2test, type = "response" )
glm.predBoston = ifelse(glm.probsBoston >=0.5,1,0)
table(glm.predBoston,crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 69 9
## 1 7 67
Accuracy rate for Boston logistic regression
mean(glm.predBoston==crim02.test)
## [1] 0.8947368
Error rate for Boston logistic regression
mean(glm.predBoston!=crim02.test)
## [1] 0.1052632
glm.fitBoston2 = glm(crim02 ~ nox + rad + dis, data = Boston2train, family = binomial)
glm.probsBoston2 = predict(glm.fitBoston2, Boston2test, type = "response" )
glm.predBoston2 = ifelse(glm.probsBoston2 >=0.5,1,0)
table(glm.predBoston2,crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 69 17
## 1 7 59
Accuracy rate for Boston logistic regression
mean(glm.predBoston2==crim02.test)
## [1] 0.8421053
Error rate for Boston logistic regression
mean(glm.predBoston2!=crim02.test)
## [1] 0.1578947
glm.fitBoston3 = glm(crim02 ~ nox + rad + dis + black, data = Boston2train, family = binomial)
glm.probsBoston3 = predict(glm.fitBoston3, Boston2test, type = "response" )
glm.predBoston3 = ifelse(glm.probsBoston3 >=0.5,1,0)
table(glm.predBoston3,crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 69 15
## 1 7 61
Accuracy rate for Boston logistic regression
mean(glm.predBoston3==crim02.test)
## [1] 0.8552632
Error rate for Boston logistic regression
mean(glm.predBoston3!=crim02.test)
## [1] 0.1447368
#LDA (3 models) lda.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1644 lda.fitBoston2: nox + rad+ dis Error: 0.1644 lda.fitBoston3: nox + rad + dis + black Error: 0.1513
lda.fitBoston = lda(crim02 ~ nox + rad + dis + age + tax + indus, data = Boston2train)
lda.predBoston = predict(lda.fitBoston,Boston2test)
lda.classBoston=lda.predBoston$class
table(lda.classBoston, crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 76 25
## 1 0 51
Accuracy lda.fitBoston
mean(lda.classBoston==crim02.test)
## [1] 0.8355263
Error lda.fitboston
mean(lda.classBoston!=crim02.test)
## [1] 0.1644737
lda.fitBoston2 = lda(crim02 ~ nox + rad + dis, data = Boston2train)
lda.predBoston2 = predict(lda.fitBoston,Boston2test)
lda.classBoston2=lda.predBoston2$class
table(lda.classBoston2, crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 76 25
## 1 0 51
Accuracy lda.fitBoston2
mean(lda.classBoston2==crim02.test)
## [1] 0.8355263
Error lda.fitboston
mean(lda.classBoston2!=crim02.test)
## [1] 0.1644737
lda.fitBoston3 = lda(crim02 ~ nox + rad + dis + black, data = Boston2train)
lda.predBoston3 = predict(lda.fitBoston3,Boston2test)
lda.classBoston3=lda.predBoston3$class
table(lda.classBoston3, crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 75 22
## 1 1 54
Accuracy lda.fitBoston3
mean(lda.classBoston3==crim02.test)
## [1] 0.8486842
Error lda.fitboston3
mean(lda.classBoston3!=crim02.test)
## [1] 0.1513158
#KNN (3 models) KNN = 1 (nox, rad, dis, age, indus) Error: 0.1579 KNN = 3 (nox, rad, dis) Error: 0.0592 KNN = 5 (nox, rad, dis, black) Error: 0.1710
train.XBoston = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$age, Boston2$indus)[trb_ind,]
test.XBoston = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$age, Boston2$indus)[-trb_ind,]
train.crim02 = Boston2$crim02[trb_ind]
dim(test.XBoston)
## [1] 152 5
set seed, fit knn model, k=1
set.seed(22)
knn.predBoston=knn(train.XBoston, test.XBoston, train.crim02, k=1)
table(knn.predBoston, crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 62 10
## 1 14 66
Accuracy knn.predBoston
mean(knn.predBoston==crim02.test)
## [1] 0.8421053
Error knn.predBoston
mean(knn.predBoston!=crim02.test)
## [1] 0.1578947
train.XBoston2 = cbind(Boston2$nox, Boston2$rad, Boston2$dis)[trb_ind,]
test.XBoston2 = cbind(Boston2$nox, Boston2$rad, Boston2$dis)[-trb_ind,]
train.crim02 = Boston2$crim02[trb_ind]
dim(test.XBoston)
## [1] 152 5
set seed, fit knn model, k=3
set.seed(22)
knn.predBoston2=knn(train.XBoston2, test.XBoston2, train.crim02, k=3)
table(knn.predBoston2, crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 75 8
## 1 1 68
Accuracy knn.predBoston2
mean(knn.predBoston2==crim02.test)
## [1] 0.9407895
Error knn.predBoston2
mean(knn.predBoston2!=crim02.test)
## [1] 0.05921053
train.XBoston3 = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$black)[trb_ind,]
test.XBoston3 = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$black)[-trb_ind,]
train.crim02 = Boston2$crim02[trb_ind]
dim(test.XBoston)
## [1] 152 5
set seed, fit knn model, k=5
set.seed(22)
knn.predBoston3=knn(train.XBoston3, test.XBoston3, train.crim02, k=5)
table(knn.predBoston3, crim02.test, dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 67 17
## 1 9 59
Accuracy knn.predBoston3
mean(knn.predBoston3==crim02.test)
## [1] 0.8289474
Error knn.predBoston3
mean(knn.predBoston3!=crim02.test)
## [1] 0.1710526