auto_all_vars <- read.csv("~/Documents/Documents - Joanna’s MacBook Pro/School/ISYE7406/Auto.csv")
library(dplyr)
auto_all_vars <- auto_all_vars %>%
mutate(y = as.factor(if_else(mpg >= median(mpg), 1, 0)))
auto <- auto_all_vars[,-1]  # drop mpg itself; y now encodes above/below-median mpg
library(caret)
trainIndex <- createDataPartition(auto$y, p = 0.8,
                                  list = FALSE,
                                  times = 1)
train <- auto[trainIndex,]
test <- auto[-trainIndex,]
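# Sanity check (not part of the original script): createDataPartition()
# samples within each level of y, so the 50/50 class balance should carry
# over to both subsets.
prop.table(table(train$y))
prop.table(table(test$y))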
TrainErr <- NULL;
TestErr <- NULL;
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| mpg | 392 | 23.446 | 7.805 | 9 | 17 | 29 | 46.6 |
| cylinders | 392 | 5.472 | 1.706 | 3 | 4 | 8 | 8 |
| displacement | 392 | 194.412 | 104.644 | 68 | 105 | 275.75 | 455 |
| horsepower | 392 | 104.469 | 38.491 | 46 | 75 | 126 | 230 |
| weight | 392 | 2977.584 | 849.403 | 1613 | 2225.25 | 3614.75 | 5140 |
| acceleration | 392 | 15.541 | 2.759 | 8 | 13.775 | 17.025 | 24.8 |
| year | 392 | 75.98 | 3.684 | 70 | 73 | 79 | 82 |
| origin | 392 | 1.577 | 0.806 | 1 | 1 | 2 | 3 |
| y | 392 | | | | | | |
| … 0 | 196 | 50% | | | | | |
| … 1 | 196 | 50% | | | | | |
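# A summary table like the one above can be produced with, e.g., the vtable
# package (an assumption -- the tool actually used to build it is not shown):
library(vtable)
sumtable(auto_all_vars)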
# select variables with high correlation coefficients for logistic regression
select_vars <- c("cylinders" , "weight", "displacement", "horsepower")
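# The four predictors above are presumably those most strongly correlated
# with mpg. A quick check along these lines (a sketch, not from the original
# script; assumes the first eight columns are the numeric variables)
# supports that choice:
round(cor(auto_all_vars[, 1:8])[, "mpg"], 2)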
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
mod1 <- lda(train[,-8], train[,8]);  # column 8 of train is the response y
## training error
pred1 <- predict(mod1,train[,-8])$class;
TrainErr <- c(TrainErr, mean( pred1 != train$y));
TrainErr;
## [1] 0.08280255
## testing error
pred1test <- predict(mod1,test[,-8])$class;
TestErr <- c(TestErr,mean(pred1test != test$y));
TestErr;
## [1] 0.1153846
table(pred1test,test$y)
##
## pred1test 0 1
## 0 31 1
## 1 8 38
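# Beyond the raw counts, caret's confusionMatrix() reports per-class metrics
# such as sensitivity and specificity for the same predictions (a sketch,
# not part of the original analysis):
confusionMatrix(pred1test, test$y)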
mod2 <- qda(train[,-8], train[,8]);
## Training Error
pred2 <- predict(mod2,train[,-8])$class;
TrainErr <- c(TrainErr, mean( pred2!= train$y))
TrainErr
## [1] 0.08280255 0.08917197
## Testing Error
TestErr <- c(TestErr, mean( predict(mod2,test[,-8])$class != test$y))
TestErr
## [1] 0.11538462 0.08974359
library(e1071)
mod3 <- naiveBayes(train[,-8], train[,8])  # Gaussian class-conditional densities for the numeric predictors
## Training Error
pred3 <- predict(mod3, train[,-8])
TrainErr <- c(TrainErr, mean( pred3 != train$y))
TrainErr
## [1] 0.08280255 0.08917197 0.09872611
## Testing Error
TestErr <- c(TestErr, mean( predict(mod3,test[,-8]) != test$y))
TestErr
## [1] 0.11538462 0.08974359 0.08974359
mod4 <- glm(y ~ .,family = binomial(link='logit'), data=train)
summary(mod4);
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.47488 -0.08039 0.00480 0.17368 2.34395
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -17.015318 6.924549 -2.457 0.014001 *
## cylinders -0.080081 0.505770 -0.158 0.874193
## displacement 0.002643 0.014672 0.180 0.857040
## horsepower -0.044899 0.028853 -1.556 0.119677
## weight -0.005063 0.001421 -3.563 0.000367 ***
## acceleration 0.004924 0.185078 0.027 0.978776
## year 0.457514 0.091505 5.000 5.74e-07 ***
## origin 0.471550 0.445382 1.059 0.289712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 435.30 on 313 degrees of freedom
## Residual deviance: 114.91 on 306 degrees of freedom
## AIC: 130.91
##
## Number of Fisher Scoring iterations: 8
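# Only weight and year are significant at the 0.05 level. Exponentiating the
# coefficients turns them into odds ratios (an interpretation aid, not in the
# original script): e.g., each additional model year multiplies the odds of
# above-median mpg by about exp(0.4575) ~ 1.58.
exp(cbind(OddsRatio = coef(mod4), confint.default(mod4)))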
pred4 <- round(predict(mod4, train[,-8], type="response"))  # round() implements a 0.5 threshold
TrainErr <- c(TrainErr, mean(pred4 != train$y))
TrainErr
## [1] 0.08280255 0.08917197 0.09872611 0.08917197
## Testing Error of logistic regression
predtest4 = round(predict(mod4, test[,-8], type="response"))
TestErr <- c(TestErr, mean(predtest4 != test$y) )
TestErr
## [1] 0.11538462 0.08974359 0.08974359 0.10256410
mod5 <- glm(y ~ cylinders + weight + displacement + horsepower,
family=binomial(link='logit'), data=train)
## Training Error of reduced logistic regression
pred5 <- round(predict(mod5, train[,-8], type="response"))
TrainErr <- c(TrainErr, mean(pred5 != train$y))
TrainErr
## [1] 0.08280255 0.08917197 0.09872611 0.08917197 0.08917197
## Testing Error of reduced logistic regression
predtest5 <- round(predict(mod5, test[,-8], type="response"))
TestErr <- c(TestErr, mean(predtest5 != test$y))
TestErr
## [1] 0.11538462 0.08974359 0.08974359 0.10256410 0.10256410
# KNN model: LOOCV to determine the optimal value of k
library(kknn)
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
loocv_model = train.kknn(y ~ cylinders + weight + displacement + horsepower,
                         train,
                         ks = 1:25,         # candidate values of k to evaluate
                         kernel = "optimal",
                         scale = TRUE)      # standardize the predictors
loocv_model
##
## Call:
## train.kknn(formula = y ~ cylinders + weight + displacement + horsepower, data = train, ks = (1:25), kernel = "optimal", scale = TRUE)
##
## Type of response variable: nominal
## Minimal misclassification: 0.08280255
## Best kernel: optimal
## Best k: 13
plot(loocv_model)
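# The selected k can also be read off programmatically instead of from the
# printout (a sketch; the original hard-codes k in the next step):
loocv_model$best.parameters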
# KNN model with the chosen value of k (k = 12 is used below, although the LOOCV above selected k = 13)
mod6 <- kknn(y~ cylinders + weight + displacement + horsepower,
train = train,
test = test,
k = 12,
kernel = "optimal",
scale = TRUE)
mod6
##
## Call:
## kknn(formula = y ~ cylinders + weight + displacement + horsepower, train = train, test = test, k = 12, kernel = "optimal", scale = TRUE)
##
## Response: "nominal"
## Testing Error
pred6 <- predict(mod6)
TestErr <- c(TestErr, mean(pred6 != test$y) )
TestErr
## [1] 0.11538462 0.08974359 0.08974359 0.10256410 0.10256410 0.10256410
TrainErr <- c(TrainErr, NA)  # KNN has no separate training-error step here; NA keeps the vector numeric
TrainErr
## [1] 0.08280255 0.08917197 0.09872611 0.08917197 0.08917197         NA
Model <- c("LDA (all vars)", "QDA (all vars)", "Naive Bayes (all vars)", "Logistic Regression (all vars)", "Logistic Regression (best subset)", "KNN (best subset, k=12)")
error_df <- tibble(Model, TrainErr, TestErr)
error_df
n1 = 314; # training set sample size
n2 = 78; # testing set sample size
n = dim(auto)[1]; ## the total sample size
set.seed(7406);
B = 100; ## number of Monte Carlo cross-validation replications
CVALL = NULL;
for (b in 1:B){
### randomly select n1 observations as a new training subset in each loop
flag <- sort(sample(1:n, n1));
train <- auto[flag,]; ## temp training set for CV
test <- auto[-flag,]; ## temp testing set for CV
# LDA model
mod1 <- lda(train[,-8], train[,8]);
err1 <- mean(predict(mod1,test[,-8])$class != test$y)
# QDA model
mod2 <- qda(train[,-8], train[,8]);
err2 <- mean(predict(mod2,test[,-8])$class != test$y)
#Naive Bayes
mod3 <- naiveBayes(train[,-8], train[,8])
err3 <- mean(predict(mod3,test[,-8]) != test$y)
#Logistic Regression
mod4 <- glm(y ~ .,family = binomial(link='logit'), data=train)
pred4 = round(predict(mod4, test[,-8], type="response"))
err4 <- mean(pred4 != test$y)
#Logistic Regression Reduced
mod5 <- glm(y ~ cylinders + weight + displacement + horsepower,
family=binomial(link='logit'), data=train)
pred5 <- round(predict(mod5, test[,-8], type="response"))
err5 <- mean(pred5 != test$y)
#KNN
mod6 <- kknn(y~ cylinders + weight + displacement + horsepower,
train = train,
test = test,
k = 12,
kernel = "optimal",
scale = TRUE)
pred6 <- predict(mod6)
err6 <- mean(pred6 != test$y)
cverror <- cbind(err1, err2, err3, err4, err5, err6)
CVALL <- rbind(CVALL, cverror);
}
colnames(CVALL) <- Model
CVerr <- apply(CVALL, 2, mean);
CVvar <- apply(CVALL, 2, var);
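# The B = 100 per-split errors can also be compared visually (a sketch, not
# part of the original analysis):
boxplot(CVALL, las = 2, ylab = "Misclassification error",
        main = "Monte Carlo CV test errors (B = 100)")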
all_errors_df <- tibble(Model, TrainErr, TestErr, CVerr, CVvar)
colnames(all_errors_df)[2] ="Training Error"
colnames(all_errors_df)[3] ="Test Error"
colnames(all_errors_df)[4] ="Cross Validation Error"
colnames(all_errors_df)[5] ="CV Error Variance"
all_errors_df
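# Since all six models are evaluated on the same 100 random splits, a paired
# t-test can check whether a CV-error difference is larger than the
# split-to-split noise (a sketch, not part of the original analysis), e.g.
# QDA vs. the full logistic regression:
t.test(CVALL[, "QDA (all vars)"],
       CVALL[, "Logistic Regression (all vars)"], paired = TRUE)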