Data

auto_all_vars <- read.csv("~/Documents/Documents - Joanna’s MacBook Pro/School/ISYE7406/Auto.csv")

library(dplyr)
auto_all_vars <- auto_all_vars %>% 
  mutate(y = as.factor(if_else(mpg >= median(auto_all_vars$mpg), 1, 0)))

auto <- auto_all_vars[,-1]

library(caret)
trainIndex <- createDataPartition(auto$y, p = .8, 
                                  list = FALSE, 
                                  times = 1)
train <- auto[trainIndex,]
test <- auto[-trainIndex,]
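
Since createDataPartition() samples within the levels of y, the 50/50 class split should carry over to both subsets; a quick check (a sketch, not part of the original run) is below. Adding a set.seed() call before the partition would also make the split reproducible.

prop.table(table(train$y))   # expect roughly 0.50 / 0.50
prop.table(table(test$y))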

TrainErr <- NULL;
TestErr  <- NULL;

EDA

Summary Statistics
Variable       N    Mean      Std. Dev.  Min    25th Pctl.  75th Pctl.  Max
mpg            392  23.446    7.805      9      17          29          46.6
cylinders      392  5.472     1.706      3      4           8           8
displacement   392  194.412   104.644    68     105         275.75      455
horsepower     392  104.469   38.491     46     75          126         230
weight         392  2977.584  849.403    1613   2225.25     3614.75     5140
acceleration   392  15.541    2.759      8      13.775      17.025      24.8
year           392  75.98     3.684      70     73          79          82
origin         392  1.577     0.806      1      1           2           3
y              392
  0            196  50%
  1            196  50%
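
The table above appears without the code that produced it; comparable statistics can be reproduced in base R (a sketch, assuming the CSV has already been reduced to the 392 complete rows shown):

num_vars <- auto_all_vars[, sapply(auto_all_vars, is.numeric)]
round(data.frame(
  N    = sapply(num_vars, length),
  Mean = sapply(num_vars, mean),
  SD   = sapply(num_vars, sd),
  Min  = sapply(num_vars, min),
  P25  = sapply(num_vars, quantile, probs = 0.25),
  P75  = sapply(num_vars, quantile, probs = 0.75),
  Max  = sapply(num_vars, max)
), 3)
table(auto_all_vars$y)   # counts of the binary response (196 / 196)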

Variable Selection

# select variables with high correlation coefficients for the reduced logistic regression and KNN models
select_vars <- c("cylinders" , "weight", "displacement", "horsepower")
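
The correlation screen that the comment above refers to is not shown; it can be reproduced with something like the following sketch, which ranks the predictors by the absolute value of their correlation with mpg:

cors <- cor(auto_all_vars[, c("cylinders", "displacement", "horsepower",
                              "weight", "acceleration", "year", "origin")],
            auto_all_vars$mpg)
cors[order(abs(cors), decreasing = TRUE), ]   # sorted by |correlation|; the four selected variables should rank highest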

LDA Model

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
mod1 <- lda(train[,-8], train[,8]); 

## training error 
pred1 <- predict(mod1,train[,-8])$class; 
TrainErr <- c(TrainErr, mean( pred1  != train$y)); 
TrainErr; 
## [1] 0.08280255
## testing error 
pred1test <- predict(mod1,test[,-8])$class; 
TestErr <- c(TestErr,mean(pred1test != test$y));  
TestErr;
## [1] 0.1153846
table(pred1test,test$y) 
##          
## pred1test  0  1
##         0 31  1
##         1  8 38
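
Since caret is already loaded, the same test-set comparison can be summarized with accuracy, sensitivity, and specificity (a sketch; these metrics are not in the original output):

confusionMatrix(pred1test, test$y, positive = "1")   # treats class 1 (above-median mpg) as the positive class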

QDA Model

mod2 <- qda(train[,-8], train[,8]);

## Training Error 
pred2 <- predict(mod2,train[,-8])$class;
TrainErr <- c(TrainErr, mean( pred2!= train$y))
TrainErr
## [1] 0.08280255 0.08917197
##  Testing Error 
TestErr <- c(TestErr, mean( predict(mod2,test[,-8])$class != test$y))
TestErr
## [1] 0.11538462 0.08974359

Naive Bayes Model

library(e1071)
mod3 <- naiveBayes(train[,-8], train[,8])

## Training Error
pred3 <- predict(mod3, train[,-8])
TrainErr <- c(TrainErr, mean( pred3 != train$y))
TrainErr 
## [1] 0.08280255 0.08917197 0.09872611
## Testing Error 
TestErr <- c(TestErr,  mean( predict(mod3,test[,-8]) != test$y))
TestErr
## [1] 0.11538462 0.08974359 0.08974359
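
For a side-by-side comparison with the LDA table above, the test-set confusion matrices for QDA and naive Bayes can be produced the same way (a sketch, not part of the original output):

table(predict(mod2, test[,-8])$class, test$y)   # QDA
table(predict(mod3, test[,-8]), test$y)         # naive Bayes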

Logistic Regression Full Model

mod4 <-  glm(y ~ .,family = binomial(link='logit'), data=train)
summary(mod4);
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.47488  -0.08039   0.00480   0.17368   2.34395  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -17.015318   6.924549  -2.457 0.014001 *  
## cylinders     -0.080081   0.505770  -0.158 0.874193    
## displacement   0.002643   0.014672   0.180 0.857040    
## horsepower    -0.044899   0.028853  -1.556 0.119677    
## weight        -0.005063   0.001421  -3.563 0.000367 ***
## acceleration   0.004924   0.185078   0.027 0.978776    
## year           0.457514   0.091505   5.000 5.74e-07 ***
## origin         0.471550   0.445382   1.059 0.289712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 435.30  on 313  degrees of freedom
## Residual deviance: 114.91  on 306  degrees of freedom
## AIC: 130.91
## 
## Number of Fisher Scoring iterations: 8
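
Since the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are easier to interpret; for example, the year coefficient of 0.4575 corresponds to exp(0.4575) ≈ 1.58, i.e., each additional model year multiplies the odds of above-median mpg by about 1.58, holding the other predictors fixed. A short sketch (not part of the original output):

exp(coef(mod4))             # odds ratios
exp(confint.default(mod4))  # Wald confidence intervals on the odds-ratio scale
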
pred4 = round(predict(mod4, train[,-8], type="response"))
TrainErr <- c(TrainErr, mean(pred4 != train$y))
TrainErr
## [1] 0.08280255 0.08917197 0.09872611 0.08917197
## Testing Error of logistic regression
predtest4 = round(predict(mod4, test[,-8], type="response"))
TestErr <- c(TestErr, mean(predtest4 != test$y) )
TestErr
## [1] 0.11538462 0.08974359 0.08974359 0.10256410
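
Note that round() on the fitted probabilities amounts to classifying at a 0.5 cutoff; writing the threshold explicitly makes that assumption visible and easy to change (a sketch under that assumption):

prob4   <- predict(mod4, test[,-8], type = "response")
cutoff  <- 0.5
pred_05 <- ifelse(prob4 >= cutoff, 1, 0)
mean(pred_05 != test$y)   # matches the rounded version except at exact 0.5 ties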

Logistic Regression with Reduced Predictor Variables

mod5 <-  glm(y ~ cylinders + weight + displacement + horsepower, 
             family=binomial(link='logit'), data=train)


## Training Error of reduced logistic regression
pred5 = round(predict(mod5, train[,-8], type="response"))
TrainErr <- c(TrainErr, mean(pred5 != train$y))
TrainErr
## [1] 0.08280255 0.08917197 0.09872611 0.08917197 0.08917197
## Testing Error of reduced logistic regression
predtest5 = round(predict(mod5, test[,-8], type="response"))
TestErr <- c(TestErr, mean(predtest5 != test$y) )
TestErr
## [1] 0.11538462 0.08974359 0.08974359 0.10256410 0.10256410
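
Because the reduced model is nested in the full model, a likelihood-ratio test can check whether dropping acceleration, year, and origin significantly worsens the fit (a sketch; this comparison is not part of the original analysis). Given how significant year is in the full-model summary, the reduced model would be expected to fit noticeably worse:

anova(mod5, mod4, test = "Chisq")   # likelihood-ratio test: reduced vs. full logistic model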

KNN Model: LOOCV to Determine the Optimal k

library(kknn)
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
loocv_model = train.kknn(y ~ cylinders + weight + displacement + horsepower,
                   train,
                   ks=(1:25),#Vector containing values of k train.kknn will iterate through
                   kernel = "optimal",
                   scale=TRUE) #scales the data

loocv_model
## 
## Call:
## train.kknn(formula = y ~ cylinders + weight + displacement +     horsepower, data = train, ks = (1:25), kernel = "optimal",     scale = TRUE)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.08280255
## Best kernel: optimal
## Best k: 13
plot(loocv_model)
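
The LOOCV-selected k can also be read off the fitted object rather than hard-coded, which keeps the final fit in sync with this result; the model below uses k = 12 even though the output above reports a best k of 13. A sketch:

best_k <- loocv_model$best.parameters$k   # k minimizing LOOCV misclassification (13 here)
best_k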

KNN Model with k = 12

mod6 <- kknn(y~ cylinders + weight + displacement + horsepower,
             train = train,
             test = test,
             k =  12,
             kernel = "optimal",
             scale = TRUE)
mod6
## 
## Call:
## kknn(formula = y ~ cylinders + weight + displacement + horsepower,     train = train, test = test, k = 12, kernel = "optimal", scale = TRUE)
## 
## Response: "nominal"
## Testing Error
pred6 <- predict(mod6)
TestErr <- c(TestErr, mean(pred6 != test$y) )
TestErr
## [1] 0.11538462 0.08974359 0.08974359 0.10256410 0.10256410 0.10256410
TrainErr <- c(TrainErr, NA)  # kknn predicts the test set directly, so no training error is recorded
TrainErr
## [1] 0.08280255 0.08917197 0.09872611 0.08917197 0.08917197         NA

All Training and Test Errors

Model <- c("LDA (all vars)", "QDA (all vars)", "Naive Bayes (all vars)", "Logistic Regression (all vars)", "Logistic Regression (best subset)", "KNN (best subset, k=12)")

error_df <- tibble(Model, TrainErr, TestErr)
error_df

4. Monte Carlo Cross-Validation

n1 = 314;   # training set sample size
n2 = 78;     # testing set sample size
n = dim(auto)[1];    ## the total sample size

set.seed(7406);  

B= 100;
CVALL = NULL;
for (b in 1:B){
  ### randomly select n1 observations as a new training  subset in each loop
  flag <- sort(sample(1:n, n1));
  train <- auto[flag,];  ## temp training set for CV
  test  <- auto[-flag,]; ## temp testing set for CV
  
  # LDA model
  mod1 <- lda(train[,-8], train[,8]);
  err1 <- mean(predict(mod1,test[,-8])$class != test$y)
  
  # QDA model
  mod2 <- qda(train[,-8], train[,8]);
  err2 <- mean(predict(mod2,test[,-8])$class != test$y)
  
  #Naive Bayes
  mod3 <- naiveBayes(train[,-8], train[,8])
  err3 <- mean(predict(mod3,test[,-8]) != test$y)
  
  #Logistic Regression
  mod4 <-  glm(y ~ .,family = binomial(link='logit'), data=train)
  pred4 = round(predict(mod4, test[,-8], type="response"))
  err4 <- mean(pred4 != test$y)
  
  #Logistic Regression Reduced
  mod5 <-  glm(y ~ cylinders + weight + displacement + horsepower, 
             family=binomial(link='logit'), data=train)
  pred5 = round(predict(mod5, test[,-8], type="response"))
  err5 <- mean(pred5 != test$y)
  
  #KNN
  mod6 <- kknn(y~ cylinders + weight + displacement + horsepower,
             train = train,
             test = test,
             k =  12,
             kernel = "optimal",
             scale = TRUE)
  pred6 <- predict(mod6)
  err6 <- mean(pred6 != test$y)
  
  cverror <- cbind(err1, err2, err3, err4, err5, err6)
  CVALL <- rbind(CVALL, cverror);
}
colnames(CVALL) <- Model

CVerr <- apply(CVALL, 2, mean);
CVvar <- apply(CVALL, 2, var);
all_errors_df <- tibble(Model, TrainErr, TestErr, CVerr, CVvar)

colnames(all_errors_df)[2] = "Training Error"
colnames(all_errors_df)[3] = "Test Error"
colnames(all_errors_df)[4] = "Cross Validation Error"
colnames(all_errors_df)[5] = "Cross Validation Variance"

all_errors_df
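
To see the variability behind the averaged cross-validation errors, the per-split test errors stored in CVALL can be plotted (a sketch; this figure is not part of the original report):

boxplot(CVALL, las = 2, cex.axis = 0.7,
        ylab = "Test error over 100 random splits",
        main = "Monte Carlo cross-validation error by model")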