Loading the libraries

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
library(MASS)
## Warning: package 'MASS' was built under R version 4.1.2
library(class)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

##10. This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1, 089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010

Looking at the Weekly dataset

str(Weekly)
## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
    1. Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
summary(Weekly)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 

Visualizing the data set

pairs(Weekly)

Correlation among the variables, take out Direction, it is qualitative The only correlation worth noting is between Year and Volume =0.8419

cor(Weekly [,-9])
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

Plot Volume, from 1990 - 2010, weekly trading volume has increased

plot(Weekly$Volume)

    1. Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones? Lag2: p-value = 0.0296
glm.fits=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial )
summary (glm.fits)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4
    1. Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.

Predict the probabilites the volume will go up

glm.probs=predict(glm.fits,type="response")
contrasts(Weekly$Direction)
##      Up
## Down  0
## Up    1

In order to make a prediction as to whether the market will go up or down on a particular day, we must convert these predicted probabilities into class labels, Up or Down. The following two commands create a vector of class predictions based on whether the predicted probability of a market increase is greater than or less than 0.5. The first command creates a vector of 1,250 Down elements. The second line transforms to Up all of the elements for which the predicted probability of a market increase exceeds 0.5.

glm.pred=rep("Down", 1089)
glm.pred[glm.probs >.5]="Up"

Confusion matrix Given these predictions, the table() function can be used to produce a confusion matrix in order to determine how many observations were correctly or incorrectly classified. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression. The model correctly predicted 557 weeks the market would go up and 54 weeks the market would go down. The model incorrectly predicted 48 weeks the market would go up but it went down and 430 weeks the market would go down and it went up. Model accuracy rate: 0.561

table(glm.pred ,Weekly$Direction)
##         
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557

Accuracy rate

accuracy_rate <- (557+54)/1089
accuracy_rate
## [1] 0.5610652
    1. Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).

Review Weekly data set to make sure the years are correct: 1990 - 2010

table(Weekly$Year)
## 
## 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 
##   47   52   52   52   52   52   53   52   52   52   52   52   52   52   52   52 
## 2006 2007 2008 2009 2010 
##   52   53   52   52   52

Create a vector corresponding to the observations from 1990-2008 (Weekly.train). Vector to create a held out data set of observations from 2009 - 2010 (Weekly.test).

Weekly.train <- (Weekly$Year<2009) ##Boolean vector (T or F) 1089 elements True before 2009, False 2009-2010
Weekly.test <-Weekly[!Weekly.train ,]  ##Get all the observations "rows" that are not Weekly.train from Weekly (subset of Weekly dataset)
Direction.test <- Weekly$Direction[!Weekly.train]
dim(Weekly.test)
## [1] 104   9

Fit the logistic regression model to the training subset

glm.fits2 = glm(Direction~Lag2, data = Weekly, family = binomial, subset = Weekly.train)

Test the model on the test set

glm.probs2 = predict(glm.fits2, Weekly.test, type = "response")

Compute predictions of new model

glm.pred2=rep("Down", 104)
glm.pred2[glm.probs2 >.5]="Up"
table(glm.pred2, Direction.test)
##          Direction.test
## glm.pred2 Down Up
##      Down    9  5
##      Up     34 56

Accuracy rate

accuracy_rate2 <- (56+9)/104
accuracy_rate2
## [1] 0.625
    1. Repeat (d) using LDA.
lda.fit = lda(Direction~Lag2, data = Weekly, subset = Weekly.train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = Weekly.train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
lda.pred=predict (lda.fit , Weekly.test)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class=lda.pred$class
table(lda.class, Direction.test)
##          Direction.test
## lda.class Down Up
##      Down    9  5
##      Up     34 56

Accuracy rate

accuracy_ratelda <- (56+9)/104
accuracy_ratelda
## [1] 0.625
    1. Repeat (d) using QDA.
qda.fit = qda(Direction~Lag2, data = Weekly, subset = Weekly.train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = Weekly.train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
qda.pred=predict (qda.fit , Weekly.test)
names(qda.pred)
## [1] "class"     "posterior"
qda.class=qda.pred$class
table(qda.class, Direction.test)
##          Direction.test
## qda.class Down Up
##      Down    0  0
##      Up     43 61

Accuracy rate qda - predicted everything was Up (overfitting?)

accuracy_rateqda <- (61+0)/104
accuracy_rateqda
## [1] 0.5865385
    1. Repeat (d) using KNN with K = 1. Create Train, Test and class labels
train.X = as.matrix(Weekly$Lag2[Weekly.train]) ##A matrix containing the predictors associated with the training data
test.X = as.matrix(Weekly$Lag2[!Weekly.train]) ##A matrix containing the predictors associated with the data for which we wish to make predictions
train.Direction = Weekly$Direction [Weekly.train] ##A vector containing the class labels for the training observations, labeled 
dim(train.X)
## [1] 985   1

set seed, fit KNN model

set.seed(22)
knn.pred=knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, Direction.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual Down Up
##   Down   21 29
##   Up     22 32
accuracy_rateknn <- (32+21)/104
accuracy_rateknn
## [1] 0.5096154
    1. Which of these methods appears to provide the best results on this data? Logistical Regression and LDA Summary of results Log Reg: 0.625 LDA: 0.625 QDA: 0.58 KNN: 0.51
    1. Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.

Logistic regression model with Lag 2, Lag 1 * Lag2 (interaction effect)

glm.fits3 = glm(Direction~Lag2 + Lag2*Lag1, data = Weekly, family = binomial, subset = Weekly.train)

Test the model on the test set

glm.probs3 = predict(glm.fits3, Weekly.test, type = "response")

Compute predictions of new model

glm.pred3=rep("Down", 104)
glm.pred3[glm.probs3 >.5]="Up"
table(glm.pred3, Direction.test)
##          Direction.test
## glm.pred3 Down Up
##      Down    7  8
##      Up     36 53
accuracy_rateknn <- (53+7)/104
accuracy_rateknn
## [1] 0.5769231
    1. additional LDA model (Lag2, Lag3, Lag4)
lda.fit2 = lda(Direction~Lag2 + Lag3 + Lag4, data = Weekly, subset = Weekly.train)
lda.fit2
## Call:
## lda(Direction ~ Lag2 + Lag3 + Lag4, data = Weekly, subset = Weekly.train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2       Lag3       Lag4
## Down -0.03568254 0.17080045 0.15925624
## Up    0.26036581 0.08404044 0.09220956
## 
## Coefficients of linear discriminants:
##              LD1
## Lag2  0.41403813
## Lag3 -0.09567686
## Lag4 -0.13077102
lda.pred2=predict (lda.fit2 , Weekly.test)
names(lda.pred2)
## [1] "class"     "posterior" "x"
lda.class2=lda.pred2$class
table(lda.class2, Direction.test)
##           Direction.test
## lda.class2 Down Up
##       Down    8  5
##       Up     35 56

Accuracy rate

accuracy_ratelda2 <- (56+8)/104
accuracy_ratelda2
## [1] 0.6153846
    1. Additional QDA model (Lag2, Lag3, Lag2 * Lag3)
qda.fit2 = qda(Direction~Lag2 + Lag3 + Lag2 * Lag3, data = Weekly, subset = Weekly.train)
qda.fit2
## Call:
## qda(Direction ~ Lag2 + Lag3 + Lag2 * Lag3, data = Weekly, subset = Weekly.train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2       Lag3  Lag2:Lag3
## Down -0.03568254 0.17080045 -0.1937158
## Up    0.26036581 0.08404044 -0.6405132
qda.pred2=predict (qda.fit2 , Weekly.test)
names(qda.pred2)
## [1] "class"     "posterior"
qda.class2=qda.pred2$class
table(qda.class2, Direction.test)
##           Direction.test
## qda.class2 Down Up
##       Down    9 10
##       Up     34 51

Accuracy rate qda - predicted everything was Up (overfitting?)

accuracy_rateqda2 <- (51+9)/104
accuracy_rateqda2
## [1] 0.5769231
    1. Additional KNN model Create Train, Test and class labels
train.X2 = cbind(Weekly$Lag2, Weekly$Lag3)[Weekly.train ,] ##A matrix containing the predictors associated with the training data
test.X2 = cbind(Weekly$Lag2, Weekly$Lag3)[!Weekly.train ,] ##A matrix containing the predictors associated with the data for which we wish to make predictions
train.Direction = Weekly$Direction [Weekly.train] ##A vector containing the class labels for the training observations, labeled 
dim(train.X2)
## [1] 985   2

set seed, fit KNN model

set.seed(22)
knn.pred2=knn(train.X2, test.X2, train.Direction, k=3)
table(knn.pred2, Direction.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual Down Up
##   Down   16 20
##   Up     27 41
accuracy_rateknn <- (41+16)/104
accuracy_rateknn
## [1] 0.5480769
    1. Additional variables and interaction effects for the models Summary of results Log Reg: 0.625 (variable = Lag2) Lob Reg2: 0.577 (variable = Lag1, Lag2, Lag1 * Lag2) LDA: 0.625 (variable = Lag2) LDA2: 0.615 (variable = Lag 2, Lag3, Lag4) QDA: 0.58 (variable = Lag2) QDA2: 0.577 (variable = Lag2, Lag3, Lag2 * Lag3) KNN: 0.51 (variable = Lag 2, KNN = 1) KNN2: 0.55 (variable = Lag2, Lag3)

##11. In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.

  1. Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.

Look at the Auto data set

str(Auto)
## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median.

Auto2 = Auto
med_mpg = median(Auto2$mpg)
Auto2 = mutate(Auto2, mpg01 = ifelse(mpg > med_mpg, 1,0))
str(Auto2)
## 'data.frame':    392 obs. of  10 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  $ mpg01       : num  0 0 0 0 0 0 0 0 0 0 ...

11 (b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.

pairs(Auto2)  ##nothing really useful from this plot

From the correlation matrix, cylinders, displacement, horsepower, weight have negative correlations to mpg01.
mpg has a positive correlation to mpg01

cor(Auto2[,-9]) ##take out name, qualitative variable
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000
    1. Split the data into a training set and a test set.

Set mpg01 to factor variable

Auto2$mpg01 = as.factor(Auto2$mpg01)
set.seed(22)
tr_ind = sample(nrow(Auto2), 0.7*nrow(Auto2), replace = F)  ##using a 70/30 train/test split

Auto2train = Auto2[tr_ind,]
Auto2test = Auto2[-tr_ind,]

Looking at Auto2train 274 obs, 10 variables, 274 is 70% of 392

str(Auto2train)
## 'data.frame':    274 obs. of  10 variables:
##  $ mpg         : num  14 29.8 34.2 38 19.1 19 25.4 22 40.8 31 ...
##  $ cylinders   : num  8 4 4 4 6 6 6 6 4 4 ...
##  $ displacement: num  302 89 105 91 225 156 168 232 85 91 ...
##  $ horsepower  : num  137 62 70 67 90 108 116 112 65 68 ...
##  $ weight      : num  4042 1845 2200 1995 3381 ...
##  $ acceleration: num  14.5 15.3 13.2 16.2 18.7 15.5 12.6 14.7 19.2 17.6 ...
##  $ year        : num  73 80 79 82 80 76 81 82 80 82 ...
##  $ origin      : num  1 2 1 3 1 3 3 1 3 3 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 142 281 225 83 98 276 275 147 79 178 ...
##  $ mpg01       : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 2 1 2 2 ...

Looking at Auto2test 118 obs, 10 variables, 118 is 30% of 392

str(Auto2test)
## 'data.frame':    118 obs. of  10 variables:
##  $ mpg         : num  18 17 14 14 15 15 14 21 25 24 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 6 4 4 ...
##  $ displacement: num  318 302 454 440 390 383 455 200 110 107 ...
##  $ horsepower  : num  150 140 220 215 190 170 225 85 87 90 ...
##  $ weight      : num  3436 3449 4354 4312 3850 ...
##  $ acceleration: num  11 10.5 9 8.5 8.5 10 10 16 17.5 14.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 2 2 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 231 161 54 223 2 101 30 150 211 16 ...
##  $ mpg01       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 2 ...

Creating the labels

mpg01.test = Auto2$mpg01[-tr_ind]
#mpg01.test = Auto2test$mpg01 another way to do it
str(mpg01.test)
##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 2 ...
    1. Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained? 0.084

Fit the LDA model on the training set

lda.fitAuto = lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
lda.fitAuto
## Call:
## lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4781022 0.5218978 
## 
## Group means:
##   cylinders displacement horsepower   weight
## 0  6.664122     266.7099  126.71756 3606.679
## 1  4.195804     114.6538   77.75524 2306.245
## 
## Coefficients of linear discriminants:
##                       LD1
## cylinders    -0.192661950
## displacement -0.003520215
## horsepower    0.002627014
## weight       -0.001160826
lda.predAuto=predict (lda.fitAuto , Auto2test)
names(lda.predAuto)
## [1] "class"     "posterior" "x"
lda.classAuto=lda.predAuto$class
table(lda.classAuto, mpg01.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 57  2
##      1  8 51

Accuracy rate

accuracy_rateldaAuto <- (51+57)/118
accuracy_rateldaAuto
## [1] 0.9152542

Accuracy rate another way

mean(lda.classAuto==mpg01.test)
## [1] 0.9152542

11 (d) LDA (b). What is the test error of the model obtained? Error rate 0.084

mean(lda.classAuto!=mpg01.test)
## [1] 0.08474576

11 (e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained? 0.076

qda.fitAuto = qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
qda.fitAuto
## Call:
## qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4781022 0.5218978 
## 
## Group means:
##   cylinders displacement horsepower   weight
## 0  6.664122     266.7099  126.71756 3606.679
## 1  4.195804     114.6538   77.75524 2306.245
qda.predAuto=predict (qda.fitAuto , Auto2test)
names(qda.predAuto)
## [1] "class"     "posterior"
qda.classAuto=qda.predAuto$class
table(qda.classAuto, mpg01.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 61  5
##      1  4 48

Accuracy rate

mean(qda.classAuto==mpg01.test)
## [1] 0.9237288

11 (e) QDA (b). What is the test error of the model obtained? Error rate

mean(qda.classAuto!=mpg01.test)
## [1] 0.07627119
  1. Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained? 0.11
glm.fitAuto = glm(mpg01 ~ cylinders + displacement + horsepower + weight, data = Auto2train, family = binomial)
glm.probsAuto = predict(glm.fitAuto, Auto2test, type = "response")
glm.predAuto= ifelse(glm.probsAuto >=0.5,1,0)
table(glm.predAuto, mpg01.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 58  7
##      1  7 46

Accuracy rate

mean(glm.predAuto==mpg01.test)
## [1] 0.8813559

11 (f) Logistic Regression (b). What is the test error of the model obtained? Error rate

mean(glm.predAuto!=mpg01.test)
## [1] 0.1186441

11 (g) Perform KNN on the training data, with several values of K, in order to predict mpg01. Use only the variables that seemed most associated with mpg01 in (b). What test errors do you obtain? K=1: 0.11 K=3: 0.11 (different confusion matrix than K=1, same error rate) K=5: 0.09

Which value of K seems to perform the best on this data set? K=5

train.XAuto2 = cbind(Auto2$cylinders, Auto2$displacement, Auto2$horsepower, Auto2$weight)[tr_ind,] ##A matrix containing the predictors associated with the training data
test.XAuto2 = cbind(Auto2$cylinders, Auto2$displacement, Auto2$horsepower, Auto2$weight)[-tr_ind,] ##A matrix containing the predictors associated with the data for which we wish to make predictions
train.mpg01 = Auto2$mpg01 [tr_ind] ##A vector containing the class labels for the training observations, labeled 
dim(train.XAuto2)
## [1] 274   4

set seed, fit KNN model, k=1

set.seed(22)
knn.predAuto=knn(train.XAuto2, test.XAuto2, train.mpg01, k=1)
table(knn.predAuto, mpg01.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 56  5
##      1  9 48
    1. (b). What test errors do you obtain? Which value of K seems to perform the best on this data set? K=1 Accuracy rate
mean(knn.predAuto==mpg01.test)
## [1] 0.8813559
    1. (b). What test errors do you obtain? Which value of K seems to perform the best on this data set? K=1 Error rate
mean(knn.predAuto!=mpg01.test)
## [1] 0.1186441

set seed, fit KNN model, k=3

set.seed(22)
knn.predAuto3=knn(train.XAuto2, test.XAuto2, train.mpg01, k=3)
table(knn.predAuto3, mpg01.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 57  6
##      1  8 47
    1. (b). What test errors do you obtain? Which value of K seems to perform the best on this data set? K=3 Accuracy rate
mean(knn.predAuto3==mpg01.test)
## [1] 0.8813559
    1. (b). What test errors do you obtain? Which value of K seems to perform the best on this data set? K=3 Error rate
mean(knn.predAuto3!=mpg01.test)
## [1] 0.1186441

set seed, fit KNN model, k=5

set.seed(22)
knn.predAuto5=knn(train.XAuto2, test.XAuto2, train.mpg01, k=5)
table(knn.predAuto5, mpg01.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 58  4
##      1  7 49
    1. (b). What test errors do you obtain? Which value of K seems to perform the best on this data set? K=5 Accuracy rate
mean(knn.predAuto5==mpg01.test)
## [1] 0.9067797
    1. (b). What test errors do you obtain? Which value of K seems to perform the best on this data set? K=5 Error rate
mean(knn.predAuto5!=mpg01.test)
## [1] 0.09322034

##13. Using the Boston data set, fit classification models in order to predict whether a given suburb has a crime rate above or below the median. Explore logistic regression, LDA, and KNN models using various subsets of the predictors. Describe your findings.

Below are the errors rates of 3 different versions of three different models (9 models) (Logestic regerssion, LDA, KNN). Top 3: KNN K=3 predictors nox, rad, dis had the lowest error rate; error rate 0.0592 Logestic regression predictors nox, rad, dis, age, tax, indus; error rate 0.1052 Logestic regression predictors nox, rad, dis, black; error rate 0.14473

Appears logistic regression, a simpler model would be the preferred model to go with.

#Logistic regression (3 models) glm.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1052 glm.fitBoston2: nox + rad+ dis Error: 0.15789 glm.fitBoston3: nox + rad + dis + black Error: 0.14473

#LDA (3 models) lda.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1644 lda.fitBoston2: nox + rad+ dis Error: 0.1644 lda.fitBoston3: nox + rad + dis + black Error: 0.1513

#KNN (3 models) KNN = 1 (nox, rad, dis, age, indus) Error: 0.1579 KNN = 3 (nox, rad, dis) Error: 0.0592 KNN = 5 (nox, rad, dis, black) Error: 0.1710

Looking at the Boston dataset

str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

Create Boston 2 dataset to add binary variable

Boston2 = Boston

Create a binary variable, crim02, that contains a 1 if crime contains a value above its median, and a 0 if crime contains a value below its median.

med_crim = median(Boston2$crim)
Boston2 = mutate(Boston2, crim02 = ifelse(crim > med_crim, 1,0))
str(Boston2)
## 'data.frame':    506 obs. of  15 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
##  $ crim02 : num  0 0 0 0 0 0 0 0 0 0 ...
summary(Boston2$crim02)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.5     0.5     1.0     1.0

Let’s see what correlates with crim02

pairs(Boston2)

Correlation matrix on crim02 nox 0.72, rad 0.62, dis -0.62, age 0.61, tax 0.61, indus 0.60,

cor(Boston2)
##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
## crim02   0.40939545 -0.43615103  0.60326017  0.070096774  0.72323480
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim02  -0.15637178  0.61393992 -0.61634164  0.619786249  0.60874128  0.2535684
##               black      lstat       medv      crim02
## crim    -0.38506394  0.4556215 -0.3883046  0.40939545
## zn       0.17552032 -0.4129946  0.3604453 -0.43615103
## indus   -0.35697654  0.6037997 -0.4837252  0.60326017
## chas     0.04878848 -0.0539293  0.1752602  0.07009677
## nox     -0.38005064  0.5908789 -0.4273208  0.72323480
## rm       0.12806864 -0.6138083  0.6953599 -0.15637178
## age     -0.27353398  0.6023385 -0.3769546  0.61393992
## dis      0.29151167 -0.4969958  0.2499287 -0.61634164
## rad     -0.44441282  0.4886763 -0.3816262  0.61978625
## tax     -0.44180801  0.5439934 -0.4685359  0.60874128
## ptratio -0.17738330  0.3740443 -0.5077867  0.25356836
## black    1.00000000 -0.3660869  0.3334608 -0.35121093
## lstat   -0.36608690  1.0000000 -0.7376627  0.45326273
## medv     0.33346082 -0.7376627  1.0000000 -0.26301673
## crim02  -0.35121093  0.4532627 -0.2630167  1.00000000

Make crim02 a factor variable

Boston2$crim02 = as.factor(Boston2$crim02)
str(Boston2)
## 'data.frame':    506 obs. of  15 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
##  $ crim02 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...

Create train test datasets Boston2train Boston2test

set.seed(22)
trb_ind = sample(nrow(Boston2), 0.7*nrow(Boston2), replace = F)  ##using a 70/30 train/test split
Boston2train = Boston2[trb_ind,]
Boston2test = Boston2[-trb_ind,]

Create labels crim02.test

crim02.test = Boston2$crim02[-trb_ind]

Look at Boston2Train 354/15 - Boston2Train 354/15 + Boston2Test 152/15 = Boston2 505/15

str(Boston2train)
## 'data.frame':    354 obs. of  15 variables:
##  $ crim   : num  3.6737 11.5779 0.0715 0.0672 15.0234 ...
##  $ zn     : num  0 0 0 0 0 70 0 0 0 0 ...
##  $ indus  : num  18.1 18.1 4.49 3.24 18.1 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ nox    : num  0.583 0.7 0.449 0.46 0.614 0.4 0.671 0.544 0.489 0.77 ...
##  $ rm     : num  6.31 5.04 6.12 6.33 5.3 ...
##  $ age    : num  51.9 97 56.8 17.2 97.3 10 100 87.3 59.1 97.4 ...
##  $ dis    : num  3.99 1.77 3.75 5.21 2.1 ...
##  $ rad    : int  24 24 3 4 24 5 24 4 4 24 ...
##  $ tax    : num  666 666 247 430 666 358 666 304 277 666 ...
##  $ ptratio: num  20.2 20.2 18.5 16.9 20.2 14.8 20.2 18.4 18.6 20.2 ...
##  $ black  : num  389 397 395 375 349 ...
##  $ lstat  : num  10.58 25.68 8.44 7.34 24.91 ...
##  $ medv   : num  21.2 9.7 22.2 22.6 12 29 10.2 23.8 24.4 17.8 ...
##  $ crim02 : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 2 1 2 ...

Look at Boston2Test 152/15

str(Boston2test)
## 'data.frame':    152 obs. of  15 variables:
##  $ crim   : num  0.0273 0.069 0.0883 0.2112 0.17 ...
##  $ zn     : num  0 0 12.5 12.5 12.5 12.5 12.5 0 0 0 ...
##  $ indus  : num  7.07 2.18 7.87 7.87 7.87 7.87 7.87 8.14 8.14 8.14 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.469 0.458 0.524 0.524 0.524 0.524 0.524 0.538 0.538 0.538 ...
##  $ rm     : num  7.18 7.15 6.01 5.63 6 ...
##  $ age    : num  61.1 54.2 66.6 100 85.9 94.3 39 84.5 81.7 89.2 ...
##  $ dis    : num  4.97 6.06 5.56 6.08 6.59 ...
##  $ rad    : int  2 3 5 5 5 5 5 4 4 4 ...
##  $ tax    : num  242 222 311 311 311 311 311 307 307 307 ...
##  $ ptratio: num  17.8 18.7 15.2 15.2 15.2 15.2 15.2 21 21 21 ...
##  $ black  : num  393 397 396 387 387 ...
##  $ lstat  : num  4.03 5.33 12.43 29.93 17.1 ...
##  $ medv   : num  34.7 36.2 22.9 16.5 18.9 15 21.7 18.2 17.5 19.6 ...
##  $ crim02 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...

Looking at the class labels

summary(crim02.test)
##  0  1 
## 76 76

#Logistic regression (3 models) glm.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1052 glm.fitBoston2: nox + rad+ dis Error: 0.15789 glm.fitBoston3: nox + rad + dis + black Error: 0.14473

glm.fitBoston = glm(crim02 ~ nox + rad + dis + age + tax + indus, data = Boston2train, family = binomial)
glm.probsBoston = predict(glm.fitBoston, Boston2test, type = "response" )
glm.predBoston = ifelse(glm.probsBoston >=0.5,1,0)
table(glm.predBoston,crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 69  9
##      1  7 67

Accuracy rate for Boston logistic regression

mean(glm.predBoston==crim02.test)
## [1] 0.8947368

Error rate for Boston logistic regression

mean(glm.predBoston!=crim02.test)
## [1] 0.1052632
glm.fitBoston2 = glm(crim02 ~ nox + rad + dis, data = Boston2train, family = binomial)
glm.probsBoston2 = predict(glm.fitBoston2, Boston2test, type = "response" )
glm.predBoston2 = ifelse(glm.probsBoston2 >=0.5,1,0)
table(glm.predBoston2,crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 69 17
##      1  7 59

Accuracy rate for Boston logistic regression

mean(glm.predBoston2==crim02.test)
## [1] 0.8421053

Error rate for Boston logistic regression

mean(glm.predBoston2!=crim02.test)
## [1] 0.1578947
glm.fitBoston3 = glm(crim02 ~ nox + rad + dis + black, data = Boston2train, family = binomial)
glm.probsBoston3 = predict(glm.fitBoston3, Boston2test, type = "response" )
glm.predBoston3 = ifelse(glm.probsBoston3 >=0.5,1,0)
table(glm.predBoston3,crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 69 15
##      1  7 61

Accuracy rate for Boston logistic regression

mean(glm.predBoston3==crim02.test)
## [1] 0.8552632

Error rate for Boston logistic regression

mean(glm.predBoston3!=crim02.test)
## [1] 0.1447368

#LDA (3 models) lda.fitBoston: nox + rad + dis + age + tax + indus Error: 0.1644 lda.fitBoston2: nox + rad+ dis Error: 0.1644 lda.fitBoston3: nox + rad + dis + black Error: 0.1513

lda.fitBoston = lda(crim02 ~ nox + rad + dis + age + tax + indus, data = Boston2train)
lda.predBoston = predict(lda.fitBoston,Boston2test)
lda.classBoston=lda.predBoston$class
table(lda.classBoston, crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 76 25
##      1  0 51

Accuracy lda.fitBoston

mean(lda.classBoston==crim02.test)
## [1] 0.8355263

Error lda.fitboston

mean(lda.classBoston!=crim02.test)
## [1] 0.1644737
lda.fitBoston2 = lda(crim02 ~ nox + rad + dis, data = Boston2train)
lda.predBoston2 = predict(lda.fitBoston,Boston2test)
lda.classBoston2=lda.predBoston2$class
table(lda.classBoston2, crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 76 25
##      1  0 51

Accuracy lda.fitBoston2

mean(lda.classBoston2==crim02.test)
## [1] 0.8355263

Error lda.fitboston

mean(lda.classBoston2!=crim02.test)
## [1] 0.1644737
lda.fitBoston3 = lda(crim02 ~ nox + rad + dis + black, data = Boston2train)
lda.predBoston3 = predict(lda.fitBoston3,Boston2test)
lda.classBoston3=lda.predBoston3$class
table(lda.classBoston3, crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 75 22
##      1  1 54

Accuracy lda.fitBoston3

mean(lda.classBoston3==crim02.test)
## [1] 0.8486842

Error lda.fitboston3

mean(lda.classBoston3!=crim02.test)
## [1] 0.1513158

#KNN (3 models) KNN = 1 (nox, rad, dis, age, indus) Error: 0.1579 KNN = 3 (nox, rad, dis) Error: 0.0592 KNN = 5 (nox, rad, dis, black) Error: 0.1710

train.XBoston = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$age, Boston2$indus)[trb_ind,]
test.XBoston = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$age, Boston2$indus)[-trb_ind,]
train.crim02 = Boston2$crim02[trb_ind]
dim(test.XBoston)
## [1] 152   5

set seed, fit knn model, k=1

set.seed(22)
knn.predBoston=knn(train.XBoston, test.XBoston, train.crim02, k=1)
table(knn.predBoston, crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 62 10
##      1 14 66

Accuracy knn.predBoston

mean(knn.predBoston==crim02.test)
## [1] 0.8421053

Error knn.predBoston

mean(knn.predBoston!=crim02.test)
## [1] 0.1578947
train.XBoston2 = cbind(Boston2$nox, Boston2$rad, Boston2$dis)[trb_ind,]
test.XBoston2 = cbind(Boston2$nox, Boston2$rad, Boston2$dis)[-trb_ind,]
train.crim02 = Boston2$crim02[trb_ind]
dim(test.XBoston)
## [1] 152   5

set seed, fit knn model, k=3

set.seed(22)
knn.predBoston2=knn(train.XBoston2, test.XBoston2, train.crim02, k=3)
table(knn.predBoston2, crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 75  8
##      1  1 68

Accuracy knn.predBoston2

mean(knn.predBoston2==crim02.test)
## [1] 0.9407895

Error knn.predBoston2

mean(knn.predBoston2!=crim02.test)
## [1] 0.05921053
train.XBoston3 = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$black)[trb_ind,]
test.XBoston3 = cbind(Boston2$nox, Boston2$rad, Boston2$dis, Boston2$black)[-trb_ind,]
train.crim02 = Boston2$crim02[trb_ind]
dim(test.XBoston)
## [1] 152   5

set seed, fit knn model, k=5

set.seed(22)
knn.predBoston3=knn(train.XBoston3, test.XBoston3, train.crim02, k=5)
table(knn.predBoston3, crim02.test, dnn = c("Actual", "Predicted"))
##       Predicted
## Actual  0  1
##      0 67 17
##      1  9 59

Accuracy knn.predBoston3

mean(knn.predBoston3==crim02.test)
## [1] 0.8289474

Error knn.predBoston3

mean(knn.predBoston3!=crim02.test)
## [1] 0.1710526