library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(readxl)
library(tinytex)
library(ISLR)
library(ISLR2)
## 
## Attaching package: 'ISLR2'
## 
## The following objects are masked from 'package:ISLR':
## 
##     Auto, Credit
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:ISLR2':
## 
##     Boston
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(class)
library(e1071)

Chapter 4

PROBLEM 13

This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.

  (a) Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
pairs(Weekly)

Weekly data information:

A data frame with 1089 observations on the following 9 variables.

Year The year that the observation was recorded

Lag1 Percentage return for previous week

Lag2 Percentage return for 2 weeks previous

Lag3 Percentage return for 3 weeks previous

Lag4 Percentage return for 4 weeks previous

Lag5 Percentage return for 5 weeks previous

Volume Volume of shares traded (average number of daily shares traded in billions)

Today Percentage return for this week

Direction A factor with levels Down and Up indicating whether the market had a positive or negative return on a given week
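
Before the regressions below, plain numerical summaries are a reasonable first step for part (a). A minimal base-R sketch (output not shown):

str(Weekly)       # variable types and a preview of the first values
summary(Weekly)   # five-number summaries per column, plus Up/Down counts for Direction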

Weekly_NQ <- Weekly %>% select_if(is.numeric)
names(Weekly_NQ)
## [1] "Year"   "Lag1"   "Lag2"   "Lag3"   "Lag4"   "Lag5"   "Volume" "Today"
lm.Weekly <- lm(Volume~.,data=Weekly_NQ)
summary(lm.Weekly)
## 
## Call:
## lm(formula = Volume ~ ., data = Weekly_NQ)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7831 -0.6264 -0.1795  0.4426  5.7537 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.648e+02  9.045e+00 -51.390  < 2e-16 ***
## Year         2.332e-01  4.522e-03  51.570  < 2e-16 ***
## Lag1        -3.143e-02  1.165e-02  -2.699  0.00707 ** 
## Lag2        -4.610e-02  1.166e-02  -3.954 8.18e-05 ***
## Lag3        -3.427e-02  1.164e-02  -2.944  0.00331 ** 
## Lag4        -2.933e-02  1.163e-02  -2.522  0.01180 *  
## Lag5        -2.728e-02  1.160e-02  -2.351  0.01888 *  
## Today       -6.298e-03  1.162e-02  -0.542  0.58794    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8969 on 1081 degrees of freedom
## Multiple R-squared:  0.7191, Adjusted R-squared:  0.7173 
## F-statistic: 395.3 on 7 and 1081 DF,  p-value: < 2.2e-16
cor(Weekly_NQ)
##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000
lm.Year_Volume <- lm(Year~Volume, data=Weekly)
summary(lm.Year_Volume)
## 
## Call:
## lm(formula = Year ~ Volume, data = Weekly)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3999  -2.2495   0.5893   2.6037   7.4944 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 1.995e+03  1.350e-01 14775.21   <2e-16 ***
## Volume      3.012e+00  5.854e-02    51.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.257 on 1087 degrees of freedom
## Multiple R-squared:  0.7089, Adjusted R-squared:  0.7086 
## F-statistic:  2647 on 1 and 1087 DF,  p-value: < 2.2e-16
Year <- Weekly$Year
Volume <- Weekly$Volume
plot(Year,Volume)

Looking at the above analysis, we can see a clear pattern between Volume and Year: trading volume increases as the years go on. The lag variables, by contrast, show only weak correlations with each other and with Today. The trend in Volume is easy to visualize directly, as sketched below.
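
Since the tidyverse is already loaded, a ggplot2 sketch of the Volume trend over time may be clearer than the base plot() call above (a minimal sketch; output not shown):

ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_point(alpha = 0.3) +                    # weekly observations
  geom_smooth(method = "loess", se = FALSE)    # smoothed trend in volume over time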

  (b) Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
glm.direction <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial)
summary (glm.direction)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Only one predictor appears to be statistically significant at the 5% level: Lag2 (p ≈ 0.03). An odds-ratio interpretation of that coefficient is sketched below.
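
To put the Lag2 coefficient on a more natural scale, it can be exponentiated into an odds ratio. A minimal sketch (confint() on a glm uses the profile-likelihood method from MASS, which is loaded above; output not shown):

exp(coef(glm.direction)["Lag2"])     # multiplicative change in the odds of an Up week per 1% increase in Lag2
exp(confint(glm.direction, "Lag2"))  # corresponding 95% confidence interval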

  (c) Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
glm.probs <- predict(glm.direction,type="response")
glm.probs [1:10]
##         1         2         3         4         5         6         7         8 
## 0.6086249 0.6010314 0.5875699 0.4816416 0.6169013 0.5684190 0.5786097 0.5151972 
##         9        10 
## 0.5715200 0.5554287
contrasts(Weekly$Direction)
##      Up
## Down  0
## Up    1
glm.pred=rep("Down",1089)
glm.pred[glm.probs >.5]= "Up"
table(glm.pred, Weekly$Direction)
##         
## glm.pred Down  Up
##     Down   54  48
##     Up    430 557
#Misclassified: false negatives (Up weeks predicted Down) plus false positives (Down weeks predicted Up)
(48+430)/1089
## [1] 0.4389348
(48+430)
## [1] 478
#Correctly classified: true negatives (Down predicted Down) plus true positives (Up predicted Up)
(54+557)/1089
## [1] 0.5610652
(54+557)
## [1] 611
mean(glm.pred==Weekly$Direction)
## [1] 0.5610652
1-((54+557)/1089)
## [1] 0.4389348

Since the diagonal elements of the confusion matrix show the correct predictions, we can see that 54 Down weeks and 557 Up weeks were correctly predicted. Using the mean function we can see the model was correct on about 56% of the weeks. The matrix also shows that most of the mistakes are Down weeks being predicted Up (430 of them), compared with only 48 Up weeks predicted Down, as the sketch below makes explicit.
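
A minimal sketch decomposing the confusion matrix into the two error rates (cm is an illustrative name; output not shown):

cm <- table(glm.pred, Weekly$Direction)
cm["Up", "Down"] / sum(cm[, "Down"])   # share of Down weeks incorrectly predicted Up
cm["Down", "Up"] / sum(cm[, "Up"])     # share of Up weeks incorrectly predicted Down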

This result can be misleading because we trained and tested the model on the same set of 1,089 weekly observations, so the 43.89% figure is a training error rate, which typically understates the error we would see on new data.

In order to better test the model, we should split the data into two parts: one to train the model and one to hold out for testing. This lets us assess how well the model predicts weeks it has not seen, which is what matters for future predictions.

  (d) Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
train <- Weekly$Year<=2008
Weekly_2008 <- Weekly[!train,]
Direction_2008 <- Weekly$Direction[!train]
dim(Weekly_2008)
## [1] 104   9
glm.lag2 <- glm(Direction~Lag2, family=binomial,data=Weekly,subset=train)
glm.probs <- predict(glm.lag2,Weekly_2008,type="response")
glm.pred <- rep("Down",104)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction_2008)
##         Direction_2008
## glm.pred Down Up
##     Down    9  5
##     Up     34 56
mean(glm.pred==Direction_2008)
## [1] 0.625
1-mean(glm.pred==Direction_2008)
## [1] 0.375

The model makes correct predictions on 62.5% of the held-out weeks, for a test error rate of 37.5%.

Linear Discriminant Analysis

  (e) Repeat (d) using LDA.
lda.fit <-  lda(Direction~Lag2, data=Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
plot(lda.fit)

lda.pred <- predict(lda.fit, Weekly_2008)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class <- lda.pred$class
table(lda.class,Direction_2008)
##          Direction_2008
## lda.class Down Up
##      Down    9  5
##      Up     34 56
mean(lda.class==Direction_2008)
## [1] 0.625

LDA produced exactly the same confusion matrix and test accuracy (62.5%) as logistic regression. One way to see how close the two methods are is to compare their predicted probabilities, as sketched below.
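
A minimal sketch comparing the held-out logistic probabilities (glm.probs from part (d)) with the LDA posterior probability of Up; the points should lie very close to the 45-degree line (output not shown):

plot(glm.probs, lda.pred$posterior[, "Up"],
     xlab = "Logistic regression P(Up)",
     ylab = "LDA posterior P(Up)")
abline(0, 1, col = "red")   # reference line: identical probabilities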

Quadratic Discriminant Analysis

  (f) Repeat (d) using QDA.
qda.fit=qda(Direction~Lag2 ,data=Weekly ,subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
qda.class <- predict(qda.fit, Weekly_2008)$class
table(qda.class, Direction_2008)
##          Direction_2008
## qda.class Down Up
##      Down    0  0
##      Up     43 61
mean(qda.class==Direction_2008)
## [1] 0.5865385

QDA gets 58.65% of the held-out weeks correct, but the confusion matrix shows it predicted Up for every week in the test period, so this accuracy simply reflects the proportion of Up weeks.

K Nearest Neighbors

  (g) Repeat (d) using KNN with K = 1.
train.X <- Weekly[train,"Lag2",drop=F]
test.X <- Weekly[!train,"Lag2",drop=F]
train.Direction <- Weekly[train,"Direction",drop=T]
set.seed(1)
knn.pred <- knn(train.X,test.X,train.Direction ,k=1)
table(knn.pred, Direction_2008)
##         Direction_2008
## knn.pred Down Up
##     Down   21 30
##     Up     22 31
mean(knn.pred==Direction_2008)
## [1] 0.5

KNN with K = 1 is correct on only 50% of the held-out weeks, no better than random guessing.

Naive Bayes

  (h) Repeat (d) using naive Bayes.
nb_model <- naiveBayes(Direction~Lag2,data=Weekly, subset = train)
pred <- predict(nb_model,Weekly_2008)
table(pred,Direction_2008)
##       Direction_2008
## pred   Down Up
##   Down    0  0
##   Up     43 61
mean(pred==Direction_2008)
## [1] 0.5865385

Naive Bayes yielded 58.65% accuracy; like QDA, it predicted Up for every week in the test period.

  (i) Which of these methods appears to provide the best results on this data?

In this case, logistic regression is the simplest model and ties with LDA (which produces identical predictions here) for the best test accuracy, at 62.5%.

  (j) Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.

KNN

set.seed(1)
knn.pred <- knn(train.X,test.X,train.Direction ,k=4)
table(knn.pred, Direction_2008)
##         Direction_2008
## knn.pred Down Up
##     Down   20 17
##     Up     23 44
mean(knn.pred==Direction_2008)
## [1] 0.6153846

Naive Bayes

nb_model <- naiveBayes(Direction~Lag1+Lag2+Lag3,data=Weekly, subset = train)
pred <- predict(nb_model,Weekly_2008)
table(pred,Direction_2008)
##       Direction_2008
## pred   Down Up
##   Down    5 10
##   Up     38 51
mean(pred==Direction_2008)
## [1] 0.5384615

QDA

qda.fit=qda(Direction~Lag1+Lag2+Lag3 ,data=Weekly ,subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2 + Lag3, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##              Lag1        Lag2       Lag3
## Down  0.289444444 -0.03568254 0.17080045
## Up   -0.009213235  0.26036581 0.08404044
qda.class <- predict(qda.fit, Weekly_2008)$class
table(qda.class, Direction_2008)
##          Direction_2008
## qda.class Down Up
##      Down    6 10
##      Up     37 51
mean(qda.class==Direction_2008)
## [1] 0.5480769

LDA

lda.fit <-  lda(Direction~Lag1+Lag2+Lag3, data=Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag3, data = Weekly, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##              Lag1        Lag2       Lag3
## Down  0.289444444 -0.03568254 0.17080045
## Up   -0.009213235  0.26036581 0.08404044
## 
## Coefficients of linear discriminants:
##              LD1
## Lag1 -0.29658609
## Lag2  0.29258490
## Lag3 -0.04766747
plot(lda.fit)

lda.pred <- predict(lda.fit, Weekly_2008)
names(lda.pred)
## [1] "class"     "posterior" "x"
lda.class <- lda.pred$class
table(lda.class,Direction_2008)
##          Direction_2008
## lda.class Down Up
##      Down    8  9
##      Up     35 52
mean(lda.class==Direction_2008)
## [1] 0.5769231

Experimenting with extra predictors did not improve accuracy over the single-predictor (Lag2) logistic regression and LDA models. Increasing K to 4 in KNN did raise accuracy from 50% to about 61.5%, though a larger K smooths the decision boundary and so trades variance for bias. A more systematic sweep over K is sketched below.
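
Rather than trying K values one at a time, a small loop makes the comparison systematic. A minimal sketch reusing the train.X, test.X, and train.Direction objects from part (g); k_grid and acc are illustrative names (output not shown):

set.seed(1)
k_grid <- 1:10
acc <- sapply(k_grid, function(k)
  mean(knn(train.X, test.X, train.Direction, k = k) == Direction_2008))
data.frame(K = k_grid, accuracy = acc)   # held-out accuracy for each K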

PROBLEM 14

In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.

  (a) Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.
#Creates Boolean variable for Median MPG 1 for over, 0 for under
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
#Combines mpg01 to Auto
Auto_V <- data.frame(Auto, mpg01)
  (b) Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
Auto_NQ <- Auto_V %>% select_if(is.numeric)
names(Auto_V)
##  [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
##  [6] "acceleration" "year"         "origin"       "name"         "mpg01"
names(Auto_NQ)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"      
## [6] "acceleration" "year"         "origin"       "mpg01"
cor(Auto_NQ)
##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000
plot(Auto_NQ)

Since mpg01 is a binary variable, an ordinary linear regression is not well suited to it, so we turn to logistic regression to see which predictors matter. Boxplots of candidate predictors split by mpg01 (sketched below) are another useful graphical check.
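
Part (b) also suggests boxplots. A minimal base-graphics sketch for three predictors the correlation matrix flags as strongly related to mpg01 (output not shown):

par(mfrow = c(1, 3))
boxplot(weight ~ mpg01, data = Auto_V, main = "weight by mpg01")
boxplot(horsepower ~ mpg01, data = Auto_V, main = "horsepower by mpg01")
boxplot(displacement ~ mpg01, data = Auto_V, main = "displacement by mpg01")
par(mfrow = c(1, 1))   # reset the plotting layout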

glm.fits <- glm(mpg01~displacement+horsepower+weight+acceleration+year, data=Auto_NQ,family = binomial)
summary(glm.fits)
## 
## Call:
## glm(formula = mpg01 ~ displacement + horsepower + weight + acceleration + 
##     year, family = binomial, data = Auto_NQ)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1982  -0.1129   0.0114   0.2243   3.3045  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -15.844178   5.636740  -2.811 0.004941 ** 
## displacement  -0.007024   0.006519  -1.078 0.281237    
## horsepower    -0.035762   0.023176  -1.543 0.122826    
## weight        -0.003984   0.001084  -3.676 0.000237 ***
## acceleration   0.008144   0.141248   0.058 0.954019    
## year           0.414101   0.072623   5.702 1.18e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 543.43  on 391  degrees of freedom
## Residual deviance: 159.34  on 386  degrees of freedom
## AIC: 171.34
## 
## Number of Fisher Scoring iterations: 8

We can see that weight and year have statistically significant relationships with mpg01, so those two are used as predictors in the models below.

  (c) Split the data into a training set and a test set.
set.seed(1)
sample1 <- sample(c(TRUE, FALSE), nrow(Auto_NQ), replace=TRUE, prob = c(.6,.4))
train <- Auto_NQ[sample1, ]
test <- Auto_NQ[!sample1, ]
dim(train)
## [1] 246   9
dim(test)
## [1] 146   9

LDA Auto

  (d) Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
lda.fit.auto <- lda(mpg01 ~ weight+year, data=train)
lda.fit.auto
## Call:
## lda(mpg01 ~ weight + year, data = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.504065 0.495935 
## 
## Group means:
##     weight     year
## 0 3620.040 74.70968
## 1 2315.344 77.63115
## 
## Coefficients of linear discriminants:
##                LD1
## weight -0.00176319
## year    0.11400180
lda.class.auto <- predict(lda.fit.auto,test)$class
table(lda.class.auto,test$mpg01)
##               
## lda.class.auto  0  1
##              0 62  5
##              1 10 69
mean(lda.class.auto==test$mpg01)
## [1] 0.8972603
1-mean(lda.class.auto==test$mpg01)
## [1] 0.1027397

Using LDA with weight and year as predictors, the model classified 89.72% of the test observations correctly, which is a pretty impressive result. Roughly 60% of the data was used for training and the remaining 40% for testing.

Test error rate: 10.27%.

QDA Auto

  (e) Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
qda.fit.auto <-  qda(mpg01 ~ weight+year, data=train)
qda.fit.auto
## Call:
## qda(mpg01 ~ weight + year, data = train)
## 
## Prior probabilities of groups:
##        0        1 
## 0.504065 0.495935 
## 
## Group means:
##     weight     year
## 0 3620.040 74.70968
## 1 2315.344 77.63115
qda.class.auto <- predict(qda.fit.auto,test)$class
table(qda.class.auto,test$mpg01)
##               
## qda.class.auto  0  1
##              0 61  6
##              1 11 68
mean(qda.class.auto==test$mpg01)
## [1] 0.8835616
1-mean(qda.class.auto==test$mpg01)
## [1] 0.1164384

Using QDA we got slightly fewer correct, at 88.36% on the test data, but still a good result. Test error rate: 11.64%.

Logistic Regression

  (f) Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
glm.auto <-  glm(mpg01~weight+year, family=binomial, data=train)
summary(glm.auto)
## 
## Call:
## glm(formula = mpg01 ~ weight + year, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9209  -0.1360  -0.0007   0.2159   3.1868  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.082e+01  6.708e+00  -3.104  0.00191 ** 
## weight      -5.946e-03  8.835e-04  -6.730 1.69e-11 ***
## year         4.929e-01  1.063e-01   4.636 3.55e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 341.01  on 245  degrees of freedom
## Residual deviance:  93.95  on 243  degrees of freedom
## AIC: 99.95
## 
## Number of Fisher Scoring iterations: 7
glm.probs.auto <- predict(glm.auto,test,type="response")
glm.pred.auto <- rep(0,nrow(test))
glm.pred.auto[glm.probs.auto > 0.50]=1
table(glm.pred.auto,test$mpg01)
##              
## glm.pred.auto  0  1
##             0 64  9
##             1  8 65
mean(glm.pred.auto==test$mpg01)
## [1] 0.8835616
1-mean(glm.pred.auto==test$mpg01)
## [1] 0.1164384

Using logistic regression the model achieved an 88.36% success rate, not quite as good as LDA but respectable.

Test error rate: 11.64%, the same as QDA.

Naive Bayes

  (g) Perform naive Bayes on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
nb_model.auto <- naiveBayes(mpg01~weight+year,data=train)
summary(nb_model.auto)
##           Length Class  Mode     
## apriori   2      table  numeric  
## tables    2      -none- list     
## levels    2      -none- character
## isnumeric 2      -none- logical  
## call      4      -none- call
prob.auto <- predict(nb_model.auto,test)
pred.auto <- rep(0,nrow(test))
pred.auto[prob.auto > 0.50]=1
table(pred.auto,test$mpg01)
##          
## pred.auto  0  1
##         0 72 74
mean(pred.auto==test$mpg01)
## [1] 0.4931507
1-mean(pred.auto==test$mpg01)
## [1] 0.5068493

Naive Bayes appears to do very poorly here, with only 49.31% correct (a 50.68% error rate), but the confusion matrix shows that every observation was predicted as 0. The likely culprit is the thresholding step: predict() on a naiveBayes fit returns class labels by default rather than probabilities, so comparing prob.auto to 0.50 never flips any prediction to 1. A corrected comparison is sketched below.
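
A minimal corrected sketch, comparing the predicted class labels to the test response directly and, alternatively, requesting posterior probabilities with type = "raw" before thresholding; nb_class.auto and nb_post are illustrative names (output not shown):

nb_class.auto <- predict(nb_model.auto, test)          # factor of predicted classes "0"/"1"
table(nb_class.auto, test$mpg01)
mean(nb_class.auto == test$mpg01)                      # test accuracy

nb_post <- predict(nb_model.auto, test, type = "raw")  # posterior probability matrix
mean(ifelse(nb_post[, "1"] > 0.5, 1, 0) == test$mpg01) # same idea via an explicit 0.5 threshold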

PROBLEM 16

Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings. Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.

#Creates Boolean variable for Median crim 1 for over, 0 for under
crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
#Combines crim01 to Boston
Boston_V <- data.frame(Boston, crim01)
cor(Boston_V)
##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
## crim01   0.40939545 -0.43615103  0.60326017  0.070096774  0.72323480
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01  -0.15637178  0.61393992 -0.61634164  0.619786249  0.60874128  0.2535684
##               black      lstat       medv      crim01
## crim    -0.38506394  0.4556215 -0.3883046  0.40939545
## zn       0.17552032 -0.4129946  0.3604453 -0.43615103
## indus   -0.35697654  0.6037997 -0.4837252  0.60326017
## chas     0.04878848 -0.0539293  0.1752602  0.07009677
## nox     -0.38005064  0.5908789 -0.4273208  0.72323480
## rm       0.12806864 -0.6138083  0.6953599 -0.15637178
## age     -0.27353398  0.6023385 -0.3769546  0.61393992
## dis      0.29151167 -0.4969958  0.2499287 -0.61634164
## rad     -0.44441282  0.4886763 -0.3816262  0.61978625
## tax     -0.44180801  0.5439934 -0.4685359  0.60874128
## ptratio -0.17738330  0.3740443 -0.5077867  0.25356836
## black    1.00000000 -0.3660869  0.3334608 -0.35121093
## lstat   -0.36608690  1.0000000 -0.7376627  0.45326273
## medv     0.33346082 -0.7376627  1.0000000 -0.26301673
## crim01  -0.35121093  0.4532627 -0.2630167  1.00000000

Looking at the correlation matrix with respect to crim01, we take the following predictors for further analysis (a quick way to rank candidates by their correlation with crim01 is sketched after the list of definitions):

indus proportion of non-retail business acres per town.

nox nitrogen oxides concentration (parts per 10 million).

age proportion of owner-occupied units built prior to 1940.

rad index of accessibility to radial highways.

tax full-value property-tax rate per $10,000.

lstat lower status of the population (percent).
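
To make the selection above reproducible, the candidates can be ranked by the absolute value of their correlation with crim01. A one-line sketch (output not shown):

sort(abs(cor(Boston_V)[, "crim01"]), decreasing = TRUE)   # crim01 and crim will naturally rank near the top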

Train/Test

set.seed(1)
sample_b <- sample(c(TRUE, FALSE), nrow(Boston_V), replace=TRUE, prob = c(.6,.4))
trainb <- Boston_V[sample_b, ]
testb <- Boston_V[!sample_b, ]
dim(trainb)
## [1] 314  15
dim(testb)
## [1] 192  15

Logistic Regression on Boston Crim01

glm.bos <-  glm(crim01~indus+nox+age+rad+tax+lstat, family=binomial, data=trainb)
summary(glm.bos)
## 
## Call:
## glm(formula = crim01 ~ indus + nox + age + rad + tax + lstat, 
##     family = binomial, data = trainb)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.08917  -0.19479   0.00026   0.01065   2.74919  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -22.535448   3.975270  -5.669 1.44e-08 ***
## indus        -0.072520   0.056568  -1.282 0.199841    
## nox          40.095414   8.392236   4.778 1.77e-06 ***
## age           0.022459   0.012367   1.816 0.069362 .  
## rad           0.576305   0.151603   3.801 0.000144 ***
## tax          -0.008295   0.003199  -2.593 0.009521 ** 
## lstat        -0.011124   0.043571  -0.255 0.798480    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 435.25  on 313  degrees of freedom
## Residual deviance: 139.35  on 307  degrees of freedom
## AIC: 153.35
## 
## Number of Fisher Scoring iterations: 8
glm.probs.b <- predict(glm.bos,testb,type="response")
glm.pred.b <- rep(0,nrow(testb))
glm.pred.b[glm.probs.b > 0.50]=1
table(glm.pred.b,testb$crim01)
##           
## glm.pred.b  0  1
##          0 88 13
##          1 10 81
mean(glm.pred.b==testb$crim01)
## [1] 0.8802083
1-mean(glm.pred.b==testb$crim01)
## [1] 0.1197917

With a logistic regression model on the test set we were able to correctly predict 88.02% of the tracts, for a test error rate of 11.98%.

LDA on Boston Crim01

lda.fit.b <- lda(crim01~indus+nox+age+rad+tax+lstat, data=trainb)
lda.fit.b
## Call:
## lda(crim01 ~ indus + nox + age + rad + tax + lstat, data = trainb)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4936306 0.5063694 
## 
## Group means:
##       indus       nox      age       rad      tax     lstat
## 0  6.663226 0.4648245 50.03871  4.096774 303.8065  9.240387
## 1 15.310189 0.6426730 86.79308 14.610063 506.4969 15.832264
## 
## Coefficients of linear discriminants:
##                LD1
## indus  0.019009728
## nox    7.126273863
## age    0.018582977
## rad    0.089442271
## tax   -0.001811084
## lstat -0.022915042
lda.class.b <- predict(lda.fit.b,testb)$class
table(lda.class.b,testb$crim01)
##            
## lda.class.b  0  1
##           0 90 23
##           1  8 71
mean(lda.class.b==testb$crim01)
## [1] 0.8385417
1-mean(lda.class.b==testb$crim01)
## [1] 0.1614583

With LDA we get a lower success rate of 83.85%; the test error rate is 16.15%.

Naive Bayes

nb_model.b <- naiveBayes(crim01~indus+nox+age+rad+tax+lstat, data=trainb)
summary(nb_model.b)
##           Length Class  Mode     
## apriori   2      table  numeric  
## tables    6      -none- list     
## levels    2      -none- character
## isnumeric 6      -none- logical  
## call      4      -none- call
prob.b <- predict(nb_model.b,testb)
pred.b <- rep(0,nrow(testb))
pred.b[prob.b > 0.50]=1
table(pred.b,testb$crim01)
##       
## pred.b  0  1
##      0 98 94
mean(pred.b==testb$crim01)
## [1] 0.5104167
1-mean(pred.b==testb$crim01)
## [1] 0.4895833

Naive Bayes again looks weak, with 51.04% accuracy (a 48.96% error rate), but as in Problem 14(g) every observation was predicted as 0 because class-label predictions were compared against a 0.50 threshold. A corrected comparison is sketched below.
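
The same fix as in Problem 14(g) applies; a minimal sketch with the illustrative name nb_class.b (output not shown):

nb_class.b <- predict(nb_model.b, testb)   # factor of predicted classes "0"/"1"
table(nb_class.b, testb$crim01)
mean(nb_class.b == testb$crim01)           # test accuracy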

KNN

set.seed(1)

train.Boston = trainb[,c("indus","nox","age","rad","tax","lstat")]
test.Boston =  testb[,c("indus","nox","age","rad","tax","lstat")]

knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=1)

table(knn.pred,testb$crim01)
##         
## knn.pred  0  1
##        0 91  7
##        1  7 87
mean(knn.pred==testb$crim01)
## [1] 0.9270833
1-mean(knn.pred==testb$crim01)
## [1] 0.07291667
knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=2)

table(knn.pred,testb$crim01)
##         
## knn.pred  0  1
##        0 86 12
##        1 12 82
mean(knn.pred==testb$crim01)
## [1] 0.875
knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=3)

table(knn.pred,testb$crim01)
##         
## knn.pred  0  1
##        0 91 10
##        1  7 84
mean(knn.pred==testb$crim01)
## [1] 0.9114583
knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=4)

table(knn.pred,testb$crim01)
##         
## knn.pred  0  1
##        0 90 11
##        1  8 83
mean(knn.pred==testb$crim01)
## [1] 0.9010417

KNN with K = 1 provided the best result of the four values tried, with a success rate of 92.71% and a test error rate of 7.29%.

KNN was the best model for this data set for predicting crim01 with the predictors indus, nox, age, rad, tax, and lstat. One caveat, sketched below, is that KNN is distance-based, so predictors on very different scales (such as tax versus nox) can dominate the distance calculation; standardizing them first is worth checking.
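
A minimal sketch of the standardized-predictor variant, scaling the test set with the training means and standard deviations; the *.std names are illustrative (output not shown):

train.X.std <- scale(train.Boston)                            # center and scale using training data
test.X.std  <- scale(test.Boston,
                     center = attr(train.X.std, "scaled:center"),
                     scale  = attr(train.X.std, "scaled:scale"))
set.seed(1)
knn.pred.std <- knn(train.X.std, test.X.std, trainb$crim01, k = 1)
mean(knn.pred.std == testb$crim01)                            # held-out accuracy after standardizing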