#Load the library
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## âś” ggplot2 3.4.0 âś” purrr 1.0.1
## âś” tibble 3.1.8 âś” dplyr 1.0.10
## âś” tidyr 1.2.1 âś” stringr 1.5.0
## âś” readr 2.1.3 âś” forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(MASS)
##
## Attaching package: 'MASS'
##
## The following objects are masked from 'package:openintro':
##
## housing, mammals
##
## The following object is masked from 'package:dplyr':
##
## select
library(corrplot)
## corrplot 0.92 loaded
library(ISLR)
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
WeeklyNum = select_if(Weekly, is.numeric)
M1 = cor(WeeklyNum)
corrplot(M1, method = "number")
pairs(WeeklyNum)
Volume and year has a high positive correlation of 0.84. Besides that, no correlation between different lags and other variables.
attach(Weekly)
Weekly.fit = glm (Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data = Weekly, family = binomial)
summary(Weekly.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only significant variable of this model is Lag 2 with p-value < 0.05. All other variables has p-value greater than 0.05 and are not significant
Weeklyprob=predict.glm(Weekly.fit, newdata = Weekly, type="response")
predWeekly = ifelse(Weeklyprob>= 0.5, "Up", "Down")
predWeekly <- as.factor(predWeekly)
caret::confusionMatrix(as.factor(predWeekly), as.factor(Weekly$Direction), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 48
## Up 430 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.5556
## P-Value [Acc > NIR] : 0.369
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9207
## Specificity : 0.1116
## Pos Pred Value : 0.5643
## Neg Pred Value : 0.5294
## Prevalence : 0.5556
## Detection Rate : 0.5115
## Detection Prevalence : 0.9063
## Balanced Accuracy : 0.5161
##
## 'Positive' Class : Up
##
The confusion matrix is telling me that model accuracy overall is 56.11%. It can accurately predicts the “Up” with 92.07%. However, when predicting “Down”, the result is only 11.16% correctly
train <- subset(Weekly, Year<2009)
Weekly2009 <- subset(Weekly, Year >=2009)
Weekly.fit = glm (Direction ~ Lag2, data = train, family = binomial)
summary(Weekly.fit)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
Direction2009 <- Weekly2009$Direction
Weeklyprob <- predict.glm(Weekly.fit, newdata = Weekly2009, type="response")
predWeekly <- ifelse(Weeklyprob>= 0.5, "Up", "Down")
predWeekly <- as.factor(predWeekly)
caret::confusionMatrix(as.factor(predWeekly), as.factor(Weekly2009$Direction), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
The new linear regression model has a higher accuracy than the old one, after creating the model on train dataset and predict it on Weekly2009 data. Overall accuracy is now 62.5%. Even though predicting Up trends with decrease a small amount to 91.80%, the model increases its accuracy for predicting Down trends to 20.93%
library(MASS)
Weeklylda <- lda(Direction~Lag2, data=train)
Weeklylda_pred <- predict(Weeklylda, newdata=Weekly2009)
caret::confusionMatrix(as.factor(Weeklylda_pred$class), as.factor(Weekly2009$Direction), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 5
## Up 34 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.2439
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.9180
## Specificity : 0.2093
## Pos Pred Value : 0.6222
## Neg Pred Value : 0.6429
## Prevalence : 0.5865
## Detection Rate : 0.5385
## Detection Prevalence : 0.8654
## Balanced Accuracy : 0.5637
##
## 'Positive' Class : Up
##
The LDA model has similar accuracy to the logistc result in d). Overall accuracy is now 62.5%. Predicting Up trends has an accuracy of 91.80%, and prediction accuracy for Down trends is 20.93%
Weeklyqda = qda(Direction ~ Lag2, data = train)
Weeklyqda_pred = predict(Weeklyqda, newdata=Weekly2009)
caret::confusionMatrix(as.factor(Weeklyqda_pred$class), as.factor(Weekly2009$Direction), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 0
## Up 43 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : 0.5419
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5865
## Neg Pred Value : NaN
## Prevalence : 0.5865
## Detection Rate : 0.5865
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Up
##
The QDA model has a significant lower accuracy than logistic regression and LDA, only 58.65%. It can only predict the Up trend with 58.65% accuracy but totally disregard the down trend.
library(class)
Week.train=as.matrix(train$Lag2)
Week.test=as.matrix(Weekly2009$Lag2)
train.Direction = train$Direction
set.seed(1)
Weekknn.pred=knn(Week.train,Week.test,train.Direction,k=1)
Corr_pred <- mean(Weekknn.pred == Direction2009)
Corr_pred
## [1] 0.5
library (e1071)
nb.fit <- naiveBayes(Direction ~ Lag2 , data = train)
nb.fit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
nb.class <- predict (nb.fit , Weekly2009)
mean (nb.class == Weekly2009$Direction)
## [1] 0.5865385
The methods that have the highest accuracy rates are the Logistic Regression and Linear Discriminant Analysis; both having rates of 62.5%.
library (ISLR)
str(Auto)
## 'data.frame': 392 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
Auto$mpg01 <- ifelse(Auto$mpg > median(Auto$mpg),1,0)
str(Auto)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : num 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : num 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : num 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : num 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpg01 : num 0 0 0 0 0 0 0 0 0 0 ...
corrplot(cor(Auto[,-9]), method="number")
Cylinders, displacement, horsepower, weight, and origin are highly correlated with mpg01 and are useful to investigate mpg01
set.seed(1)
tr_ind = sample(nrow(Auto), 0.8*nrow(Auto), replace = F)
AutoTrain = Auto[tr_ind,]
AutoTest = Auto[-tr_ind,]
Autolda = lda(mpg01 ~ cylinders + displacement + horsepower + origin + weight, data = AutoTrain)
Autolda.pred = predict(Autolda, newdata = AutoTest)
caret::confusionMatrix(as.factor(Autolda.pred$class), as.factor(AutoTest$mpg01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 35 0
## 1 7 37
##
## Accuracy : 0.9114
## 95% CI : (0.8259, 0.9636)
## No Information Rate : 0.5316
## P-Value [Acc > NIR] : 2.819e-13
##
## Kappa : 0.8241
##
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 0.8333
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8409
## Prevalence : 0.5316
## Detection Rate : 0.4430
## Detection Prevalence : 0.4430
## Balanced Accuracy : 0.9167
##
## 'Positive' Class : 0
##
test_error <- mean(Autolda.pred$class != AutoTest$mpg01)
print(paste("LDA test error is ", test_error))
## [1] "LDA test error is 0.0886075949367089"
The test error of LDA model is 8.86%
Autoqda = qda(mpg01 ~ cylinders + displacement + horsepower + origin + weight, data = AutoTrain)
Autoqda_pred = predict(Autoqda, newdata=AutoTest)
caret::confusionMatrix(as.factor(Autoqda_pred$class), as.factor(AutoTest$mpg01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36 2
## 1 6 35
##
## Accuracy : 0.8987
## 95% CI : (0.8102, 0.9553)
## No Information Rate : 0.5316
## P-Value [Acc > NIR] : 2.278e-12
##
## Kappa : 0.798
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.8571
## Specificity : 0.9459
## Pos Pred Value : 0.9474
## Neg Pred Value : 0.8537
## Prevalence : 0.5316
## Detection Rate : 0.4557
## Detection Prevalence : 0.4810
## Balanced Accuracy : 0.9015
##
## 'Positive' Class : 0
##
QDAtest_error <- mean(Autoqda_pred$class != AutoTest$mpg01)
print(paste("QDA test error is ", QDAtest_error))
## [1] "QDA test error is 0.10126582278481"
QDA test error is 10.13%
Autologreg <- glm(mpg01 ~ cylinders + displacement + horsepower + origin + weight, data = AutoTrain, family = binomial)
predprob = predict.glm(Autologreg, newdata = AutoTest, type = "response")
Autologreg.pred <- ifelse(predprob>= 0.5, 1, 0)
caret::confusionMatrix(as.factor(Autologreg.pred), as.factor(AutoTest$mpg01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38 1
## 1 4 36
##
## Accuracy : 0.9367
## 95% CI : (0.8584, 0.9791)
## No Information Rate : 0.5316
## P-Value [Acc > NIR] : 2.725e-15
##
## Kappa : 0.8735
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.9048
## Specificity : 0.9730
## Pos Pred Value : 0.9744
## Neg Pred Value : 0.9000
## Prevalence : 0.5316
## Detection Rate : 0.4810
## Detection Prevalence : 0.4937
## Balanced Accuracy : 0.9389
##
## 'Positive' Class : 0
##
LogRegError <- mean(Autologreg.pred != AutoTest$mpg01)
print(paste("Log Reg test error is ", LogRegError))
## [1] "Log Reg test error is 0.0632911392405063"
Logistic Regression test error is 6.33%
library (e1071)
Autonb.fit <- naiveBayes(mpg01 ~ cylinders + displacement + horsepower + origin + weight, data = AutoTrain)
Autonb.fit
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.4920128 0.5079872
##
## Conditional probabilities:
## cylinders
## Y [,1] [,2]
## 0 6.753247 1.4294370
## 1 4.207547 0.7297951
##
## displacement
## Y [,1] [,2]
## 0 269.6558 85.82353
## 1 117.5723 40.76848
##
## horsepower
## Y [,1] [,2]
## 0 128.0844 35.44949
## 1 79.8239 16.41647
##
## origin
## Y [,1] [,2]
## 0 1.162338 0.4913737
## 1 1.949686 0.8553535
##
## weight
## Y [,1] [,2]
## 0 3598.026 668.4909
## 1 2350.283 401.2710
Autonb.class <- predict (Autonb.fit , AutoTest)
mean (Autonb.class != AutoTest$mpg01)
## [1] 0.08860759
Test error of naive Bayes is 8.86%
library(class)
Auto.train=as.matrix(AutoTrain[, c("cylinders", "displacement", "horsepower", "origin", "weight")])
Auto.test=as.matrix(AutoTest[, c("cylinders", "displacement", "horsepower", "origin", "weight")])
set.seed(1)
Autoknn.pred=knn(Auto.train,Auto.test,AutoTrain$mpg01,k=1)
mean(Autoknn.pred != AutoTest$mpg01)
## [1] 0.1265823
Using k=1, KNN produce error rate of 12.68%
set.seed(1)
Autoknn.pred=knn(Auto.train,Auto.test,AutoTrain$mpg01,k=5)
mean(Autoknn.pred != AutoTest$mpg01)
## [1] 0.1012658
Using k=5, KNN produce error rate of 10.12%
set.seed(1)
Autoknn.pred=knn(Auto.train,Auto.test,AutoTrain$mpg01,k=10)
mean(Autoknn.pred != AutoTest$mpg01)
## [1] 0.1139241
Using k=10, KNN produce error rate of 11.39%. We can see that
increasing K value decrease the test error rate. E
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
# Create categorical for crime
Boston$crime01 <- ifelse(Boston$crim > median(Boston$crim),1,0)
Boston$crime01 <- factor(Boston$crime01)
class(Boston$crime01)
## [1] "factor"
str(Boston)
## 'data.frame': 506 obs. of 15 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## $ crime01: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
# Create correlation plot
BosCorr = cor(Boston[,-15])
corrplot(BosCorr, method = "number")
The correlation between variables are below 0.75 so we don’t need to remove any variables
# Create Training and Testing dataset
set.seed(1)
tr_ind = sample(nrow(Boston), 0.8*nrow(Boston), replace = F)
BosTrain = Boston[tr_ind,]
BosTest = Boston[-tr_ind,]
BosTrain$crime01 <- factor(BosTrain$crime01)
BosTest$crime01 <- factor(BosTest$crime01)
# Logistic Regression Model
Bos.log = glm(crime01 ~ zn+indus+nox+age+dis+rad+tax+lstat, data = BosTrain, family=binomial)
summary(Bos.log)
##
## Call:
## glm(formula = crime01 ~ zn + indus + nox + age + dis + rad +
## tax + lstat, family = binomial, data = BosTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0836 -0.2279 -0.0004 0.0037 3.2642
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -30.869254 4.895094 -6.306 2.86e-10 ***
## zn -0.090201 0.034931 -2.582 0.009815 **
## indus -0.059719 0.047023 -1.270 0.204079
## nox 50.330509 8.423706 5.975 2.30e-09 ***
## age 0.015232 0.010737 1.419 0.156002
## dis 0.812129 0.233470 3.479 0.000504 ***
## rad 0.698118 0.146170 4.776 1.79e-06 ***
## tax -0.008689 0.002833 -3.067 0.002162 **
## lstat 0.001393 0.037640 0.037 0.970486
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.06 on 403 degrees of freedom
## Residual deviance: 178.60 on 395 degrees of freedom
## AIC: 196.6
##
## Number of Fisher Scoring iterations: 9
Bospred = predict.glm(Bos.log, newdata = BosTest, type = "response")
Boston.pred = rep(0, length(Bospred))
Boston.pred[Bospred > 0.5] = 1
caret::confusionMatrix(as.factor(Boston.pred), as.factor(BosTest$crime01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 40 5
## 1 11 46
##
## Accuracy : 0.8431
## 95% CI : (0.7578, 0.9076)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 4.567e-13
##
## Kappa : 0.6863
##
## Mcnemar's Test P-Value : 0.2113
##
## Sensitivity : 0.7843
## Specificity : 0.9020
## Pos Pred Value : 0.8889
## Neg Pred Value : 0.8070
## Prevalence : 0.5000
## Detection Rate : 0.3922
## Detection Prevalence : 0.4412
## Balanced Accuracy : 0.8431
##
## 'Positive' Class : 0
##
mean(Boston.pred != BosTest$crime01)
## [1] 0.1568627
Accuracy for logistic regression is 84.31% while Sensitivity is 78.43% and specificity 90.20%. Test error is 15.68%
#LDA Model
Bos.lda = lda(crime01 ~ zn+indus+nox+age+dis+rad+tax+lstat, data = BosTrain)
Bos.lda
## Call:
## lda(crime01 ~ zn + indus + nox + age + dis + rad + tax + lstat,
## data = BosTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## zn indus nox age dis rad tax lstat
## 0 21.631188 7.165396 0.4704243 51.33861 5.036025 4.148515 309.4109 9.359851
## 1 1.108911 15.358267 0.6356634 85.71931 2.532918 15.099010 513.8812 15.722376
##
## Coefficients of linear discriminants:
## LD1
## zn -0.004914918
## indus 0.012609004
## nox 7.619751717
## age 0.012408717
## dis 0.027780021
## rad 0.101695112
## tax -0.002116586
## lstat -0.015897450
predclass_lda = predict(Bos.lda, newdata = BosTest)
caret::confusionMatrix(as.factor(predclass_lda$class), as.factor(BosTest$crime01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 50 12
## 1 1 39
##
## Accuracy : 0.8725
## 95% CI : (0.7919, 0.9304)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 2.151e-15
##
## Kappa : 0.7451
##
## Mcnemar's Test P-Value : 0.005546
##
## Sensitivity : 0.9804
## Specificity : 0.7647
## Pos Pred Value : 0.8065
## Neg Pred Value : 0.9750
## Prevalence : 0.5000
## Detection Rate : 0.4902
## Detection Prevalence : 0.6078
## Balanced Accuracy : 0.8725
##
## 'Positive' Class : 0
##
mean(predclass_lda != BosTest$crime01)
## [1] 1
Accuracy for lda is 87.25% while Sensitivity is 98.04% and specificity 76.47%
#Naive Bayes
library (e1071)
Bosnb<- naiveBayes(crime01 ~ zn+indus+nox+age+dis+rad+tax+lstat, data = BosTrain)
Bosnb.class <- predict(Bosnb , BosTest)
caret::confusionMatrix(as.factor(Bosnb.class), as.factor(BosTest$crime01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 15
## 1 6 36
##
## Accuracy : 0.7941
## 95% CI : (0.7027, 0.8678)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 8.549e-10
##
## Kappa : 0.5882
##
## Mcnemar's Test P-Value : 0.08086
##
## Sensitivity : 0.8824
## Specificity : 0.7059
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.8571
## Prevalence : 0.5000
## Detection Rate : 0.4412
## Detection Prevalence : 0.5882
## Balanced Accuracy : 0.7941
##
## 'Positive' Class : 0
##
mean(Bosnb.class != BosTest$crime01)
## [1] 0.2058824
Accuracy for naive Bayes model is 79.41% while Sensitivity is 88.24% and specificity 70.59%. Test error is 20.58%
library(class)
set.seed(1)
Bosknn <- knn(BosTrain[, c("zn", "indus", "nox", "age", "dis", "rad", "tax", "lstat")],
BosTest[, c("zn", "indus", "nox", "age", "dis", "rad", "tax", "lstat")],
cl = BosTrain$crime01,
k = 1)
caret::confusionMatrix(as.factor(Bosknn), as.factor(BosTest$crime01))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 48 4
## 1 3 47
##
## Accuracy : 0.9314
## 95% CI : (0.8637, 0.972)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8627
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9412
## Specificity : 0.9216
## Pos Pred Value : 0.9231
## Neg Pred Value : 0.9400
## Prevalence : 0.5000
## Detection Rate : 0.4706
## Detection Prevalence : 0.5098
## Balanced Accuracy : 0.9314
##
## 'Positive' Class : 0
##
mean(Bosknn != BosTest$crime01)
## [1] 0.06862745
Accuracy for knn model is 91.18% while Sensitivity is 92.16% and specificity 90.20%. Test error is 6.86%
After reviewing the results of each classification method, knn has the highest accuracy, specificity, and sensitivity and lowest test error. Closely reviewing the logistic regression model, the variables that were non significant were: indus, age, and lstat. Next best fit model is logististic regression with test accuracy of 84.31%, Sensitivity of 78.43%, and specificity of 90.20%. Test error is 15.68% for logistic regression model