library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
week = Weekly
summary(week)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
hist(week$Lag1)
hist(week$Lag2)
hist(week$Lag3)
hist(week$Lag4)
hist(week$Lag5)
hist(week$Volume)
hist(week$Today)
pairs(week[1:8])
The only pattern that stands out in the scatterplot matrix is the relationship between Year and Volume: trading volume grows steadily over time. Beyond that, no clear pairwise patterns are visible.
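This visual impression can be checked numerically; as a quick sketch, the correlation matrix of the quantitative columns (Direction, a factor, is excluded) should show a large Year-Volume correlation and little else:
# Correlations among the quantitative variables (columns 1 to 8)
round(cor(week[, 1:8]), 2)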
week.glm = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = week, family = binomial)
summary(week.glm)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = week)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Among the predictors, only Lag2 is statistically significant at the .05 level (p = .0296).
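The p-values can also be pulled out of the fit programmatically rather than read off the printout; a small sketch using the model above:
# Column 4 of the coefficient matrix holds Pr(>|z|)
coef(summary(week.glm))[, 4]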
#Way 1
week.probs = predict(week.glm, type = "response")
contrasts(week$Direction)
## Up
## Down 0
## Up 1
week.preds = rep("Down", 1089)
week.preds[week.probs>.5] = "Up"
table(week.preds, week$Direction)
##
## week.preds Down Up
## Down 54 48
## Up 430 557
(557+54)/1089
## [1] 0.5610652
#Way 2
week$PredProb = predict.glm(week.glm, newdata = week, type = "response")
week$PredSur = ifelse(week$PredProb >= .5,"Up","Down")
caret::confusionMatrix(as.factor(week$Direction), as.factor(week$PredSur), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 54 430
## Up 48 557
##
## Accuracy : 0.5611
## 95% CI : (0.531, 0.5908)
## No Information Rate : 0.9063
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5643
## Specificity : 0.5294
## Pos Pred Value : 0.9207
## Neg Pred Value : 0.1116
## Prevalence : 0.9063
## Detection Rate : 0.5115
## Detection Prevalence : 0.5556
## Balanced Accuracy : 0.5469
##
## 'Positive' Class : Up
##
The overall accuracy of the model is .5611, only slightly better than always guessing "Up" (605/1089 = .5556). The model also predicts "Up" far more readily than "Down": from the table above it correctly identifies 557 of the 605 Up weeks (about 92%) but only 54 of the 484 Down weeks (about 11%).
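One caveat on the caret call: confusionMatrix() expects the predicted classes as its first argument (data) and the observed classes as its second (reference). With the arguments reversed, as here and in the later calls below, accuracy and kappa are unchanged, but sensitivity and specificity are computed from the transposed table; the .9207 labeled "Pos Pred Value" above is really the sensitivity for "Up" (557/605). A call in caret's documented order would be:
# data = predicted classes, reference = observed classes
caret::confusionMatrix(as.factor(week$PredSur), week$Direction, positive = "Up")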
set.seed(1)
train =(week$Year<2009)
week.test = week[!train,1:8]
week.train = week[train,]
direction.test = week$Direction[!train]
week.glm2 = glm(Direction ~ Lag2, data = week, family = binomial, subset = train)
week.test$PredProb = predict.glm(week.glm2, newdata = week.test, type = "response")
week.test$PredSur = ifelse(week.test$PredProb >= .5,"Up","Down")
caret::confusionMatrix(as.factor(direction.test), as.factor(week.test$PredSur), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 34
## Up 5 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.8654
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.6222
## Specificity : 0.6429
## Pos Pred Value : 0.9180
## Neg Pred Value : 0.2093
## Prevalence : 0.8654
## Detection Rate : 0.5385
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.6325
##
## 'Positive' Class : Up
##
library(MASS)
week.lda = lda(Direction ~ Lag2, data = week, subset = train)
week.lda
## Call:
## lda(Direction ~ Lag2, data = week, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
lda.pred = predict(week.lda, week.test)
lda.class= lda.pred$class
caret::confusionMatrix(as.factor(direction.test), as.factor(lda.class), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 9 34
## Up 5 56
##
## Accuracy : 0.625
## 95% CI : (0.5247, 0.718)
## No Information Rate : 0.8654
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1414
##
## Mcnemar's Test P-Value : 7.34e-06
##
## Sensitivity : 0.6222
## Specificity : 0.6429
## Pos Pred Value : 0.9180
## Neg Pred Value : 0.2093
## Prevalence : 0.8654
## Detection Rate : 0.5385
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.6325
##
## 'Positive' Class : Up
##
week.qda = qda(Direction ~ Lag2, data = week, subset = train)
week.qda
## Call:
## qda(Direction ~ Lag2, data = week, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.pred = predict(week.qda, week.test)
qda.class = qda.pred$class
caret::confusionMatrix(as.factor(direction.test), as.factor(qda.class), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 0 43
## Up 0 61
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 0.5865
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1.0000
## Detection Rate : 0.5865
## Detection Prevalence : 0.5865
## Balanced Accuracy : NA
##
## 'Positive' Class : Up
##
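Note that the QDA model predicts "Up" for every week in the test set — the Down column of the table above is empty — which is why several of the statistics are NA. This can be confirmed directly:
# Distribution of QDA's predicted classes on the test set
table(qda.class)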
library(class)
train.x = data.frame(week.train$Lag2)
test.x = data.frame(week.test$Lag2)
train.direction = week.train$Direction
train.direction = as.character(train.direction)
set.seed(1)
knn.pred = knn(train.x, test.x, train.direction, k=1)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 22
## Up 30 31
##
## Accuracy : 0.5
## 95% CI : (0.4003, 0.5997)
## No Information Rate : 0.5096
## P-Value [Acc > NIR] : 0.6158
##
## Kappa : -0.0033
##
## Mcnemar's Test P-Value : 0.3317
##
## Sensitivity : 0.5849
## Specificity : 0.4118
## Pos Pred Value : 0.5082
## Neg Pred Value : 0.4884
## Prevalence : 0.5096
## Detection Rate : 0.2981
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.4983
##
## 'Positive' Class : Up
##
The best models so far are logistic regression and LDA, each with a test accuracy of .625.
#Logistic with interactions
week.glm3 = glm(Direction ~ Lag1*Lag2 + Lag1*Lag3 + Lag1*Lag4 + Lag1*Lag5 + Lag1*Volume + Lag1*Today + Lag2*Lag3 + Lag2*Lag4 + Lag2*Lag5 + Lag2*Volume + Lag2*Today + Lag3*Lag4 + Lag3*Lag5 + Lag3*Volume + Lag3*Today + Lag4*Lag5 + Lag4*Volume + Lag4*Today + Lag5*Volume + Lag5*Today + Volume*Today, data = week, family = binomial, subset = train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
week.test$PredProb2 = predict.glm(week.glm3, newdata = week.test, type = "response")
week.test$PredSur2 = ifelse(week.test$PredProb2 >= .5,"Up","Down")
caret::confusionMatrix(as.factor(direction.test), as.factor(week.test$PredSur2), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 41 2
## Up 2 59
##
## Accuracy : 0.9615
## 95% CI : (0.9044, 0.9894)
## No Information Rate : 0.5865
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9207
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9672
## Specificity : 0.9535
## Pos Pred Value : 0.9672
## Neg Pred Value : 0.9535
## Prevalence : 0.5865
## Detection Rate : 0.5673
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.9604
##
## 'Positive' Class : Up
##
#KNN
knn.pred5 = knn(train.x, test.x, train.direction, k=5)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred5), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 15 28
## Up 20 41
##
## Accuracy : 0.5385
## 95% CI : (0.438, 0.6367)
## No Information Rate : 0.6635
## P-Value [Acc > NIR] : 0.9970
##
## Kappa : 0.0216
##
## Mcnemar's Test P-Value : 0.3123
##
## Sensitivity : 0.5942
## Specificity : 0.4286
## Pos Pred Value : 0.6721
## Neg Pred Value : 0.3488
## Prevalence : 0.6635
## Detection Rate : 0.3942
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.5114
##
## 'Positive' Class : Up
##
knn.pred10 = knn(train.x, test.x, train.direction, k=10)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred10), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 17 26
## Up 19 42
##
## Accuracy : 0.5673
## 95% CI : (0.4665, 0.6641)
## No Information Rate : 0.6538
## P-Value [Acc > NIR] : 0.9734
##
## Kappa : 0.0859
##
## Mcnemar's Test P-Value : 0.3711
##
## Sensitivity : 0.6176
## Specificity : 0.4722
## Pos Pred Value : 0.6885
## Neg Pred Value : 0.3953
## Prevalence : 0.6538
## Detection Rate : 0.4038
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.5449
##
## 'Positive' Class : Up
##
knn.pred15 = knn(train.x, test.x, train.direction, k=15)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred15), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 20 23
## Up 20 41
##
## Accuracy : 0.5865
## 95% CI : (0.4858, 0.6823)
## No Information Rate : 0.6154
## P-Value [Acc > NIR] : 0.7609
##
## Kappa : 0.1387
##
## Mcnemar's Test P-Value : 0.7604
##
## Sensitivity : 0.6406
## Specificity : 0.5000
## Pos Pred Value : 0.6721
## Neg Pred Value : 0.4651
## Prevalence : 0.6154
## Detection Rate : 0.3942
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.5703
##
## 'Positive' Class : Up
##
knn.pred20 = knn(train.x, test.x, train.direction, k=20)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred20), positive = "Up")
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 21 22
## Up 20 41
##
## Accuracy : 0.5962
## 95% CI : (0.4954, 0.6913)
## No Information Rate : 0.6058
## P-Value [Acc > NIR] : 0.6207
##
## Kappa : 0.1616
##
## Mcnemar's Test P-Value : 0.8774
##
## Sensitivity : 0.6508
## Specificity : 0.5122
## Pos Pred Value : 0.6721
## Neg Pred Value : 0.4884
## Prevalence : 0.6058
## Detection Rate : 0.3942
## Detection Prevalence : 0.5865
## Balanced Accuracy : 0.5815
##
## 'Positive' Class : Up
##
The highest accuracy, .9615, came from the logistic model with all pairwise interactions, and its sensitivity and specificity were also greatly improved over the earlier models. That result should be treated with caution, however: the formula includes Today, and Direction is defined by the sign of Today, so the response leaks into the predictors. This leakage explains both the near-perfect accuracy and the glm.fit convergence warnings. The KNN model was also re-tested at several values of k; accuracy rose steadily from .5385 at k = 5 to .5962 at k = 20, and pushing k higher began to lower it again.
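Rather than repeating a block for each k, the search can be written as a loop; a sketch using the objects defined earlier (because knn() breaks distance ties at random, the numbers may differ slightly from the individual runs above):
set.seed(1)
for (k in c(1, 5, 10, 15, 20)) {
  pred = knn(train.x, test.x, train.direction, k = k)
  # Test accuracy: proportion of weeks classified correctly
  cat("k =", k, " accuracy =", mean(as.character(pred) == as.character(direction.test)), "\n")
}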
setwd("C:/Users/arami/Desktop/STAT 6543/HOMEWORK 3")
auto = read.csv("Auto.csv", na.strings = "?")
auto = na.omit(auto)
mpg.median = median(auto$mpg)
auto$mpg01 = auto$mpg
auto$mpg01[auto$mpg >= mpg.median] = 1
auto$mpg01[auto$mpg < mpg.median] = 0
str(auto)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
## $ mpg01 : num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
## ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
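As an aside, the three lines that build mpg01 above can be collapsed into one, since a logical comparison coerces to 0/1:
# Equivalent one-liner: TRUE/FALSE becomes 1/0
auto$mpg01 = as.numeric(auto$mpg >= mpg.median)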
pairs(auto[c(1:8, 10)])
par(mfrow=c(1,2))
boxplot(auto$cylinders ~ auto$mpg01)
boxplot(auto$displacement ~ auto$mpg01)
boxplot(auto$horsepower ~ auto$mpg01)
boxplot(auto$weight ~ auto$mpg01)
boxplot(auto$acceleration ~ auto$mpg01)
boxplot(auto$year ~ auto$mpg01)
Looking only at the scatterplot matrix, it is hard to see any correlation between mpg01 and the other variables. The boxplots make these relationships much easier to observe: more cylinders and higher displacement, horsepower, and weight are all associated with mpg01 being 0, while lower acceleration and earlier model years are also associated with mpg01 being 0.
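The boxplot impressions can be cross-checked numerically; a sketch correlating each quantitative predictor with mpg01 (name, a character column, is excluded; mpg01 trivially correlates 1 with itself):
# Correlation of each numeric predictor with mpg01 (column 10)
sort(cor(auto[, c(2:8, 10)])[, "mpg01"])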
set.seed(1)
train=sample(392,310)
auto.train = auto[train,]
auto.test.x = auto[-train,1:9]
auto.test.y = auto[-train, 10]
auto.lm = lm(mpg01 ~ cylinders + displacement + horsepower + weight + acceleration + year + origin, data = auto)
summary(auto.lm)
##
## Call:
## lm(formula = mpg01 ~ cylinders + displacement + horsepower +
## weight + acceleration + year + origin, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93858 -0.15035 0.06735 0.19175 0.90105
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.366e-01 4.177e-01 -1.524 0.1284
## cylinders -1.183e-01 2.908e-02 -4.067 5.78e-05 ***
## displacement 3.395e-04 6.760e-04 0.502 0.6158
## horsepower 2.130e-03 1.240e-03 1.718 0.0867 .
## weight -2.873e-04 5.865e-05 -4.899 1.43e-06 ***
## acceleration 2.305e-03 8.891e-03 0.259 0.7956
## year 2.949e-02 4.585e-03 6.433 3.73e-10 ***
## origin 4.683e-02 2.502e-02 1.872 0.0620 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2993 on 384 degrees of freedom
## Multiple R-squared: 0.649, Adjusted R-squared: 0.6426
## F-statistic: 101.4 on 7 and 384 DF, p-value: < 2.2e-16
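cylinders, weight, and year have by far the smallest p-values in the screen above, which is why only those three predictors are carried into the classifiers that follow. The screen can also be run programmatically; a sketch using the fit above:
# Predictors with p-values well below .05
which(coef(summary(auto.lm))[-1, 4] < 0.001)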
auto.lda = lda(mpg01 ~ cylinders + weight + year, data = auto.train)
auto.lda
## Call:
## lda(mpg01 ~ cylinders + weight + year, data = auto.train)
##
## Prior probabilities of groups:
## 0 1
## 0.4935484 0.5064516
##
## Group means:
## cylinders weight year
## 0 6.771242 3604.667 74.54248
## 1 4.203822 2346.051 77.59873
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.4212284939
## weight -0.0009673581
## year 0.1022662664
auto.lda.pred = predict(auto.lda, auto.test.x)
auto.lda.class= auto.lda.pred$class
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.lda.class), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36 7
## 1 0 39
##
## Accuracy : 0.9146
## 95% CI : (0.832, 0.965)
## No Information Rate : 0.561
## P-Value [Acc > NIR] : 2.002e-12
##
## Kappa : 0.8303
##
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 0.8478
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8372
## Prevalence : 0.5610
## Detection Rate : 0.4756
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.9239
##
## 'Positive' Class : 1
##
The model performs well, with an accuracy level of .9146.
auto.qda = qda(mpg01 ~ cylinders + weight + year, data = auto.train)
auto.qda
## Call:
## qda(mpg01 ~ cylinders + weight + year, data = auto.train)
##
## Prior probabilities of groups:
## 0 1
## 0.4935484 0.5064516
##
## Group means:
## cylinders weight year
## 0 6.771242 3604.667 74.54248
## 1 4.203822 2346.051 77.59873
auto.qda.pred = predict(auto.qda, auto.test.x)
auto.qda.class = auto.qda.pred$class
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.qda.class), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38 5
## 1 0 39
##
## Accuracy : 0.939
## 95% CI : (0.8634, 0.9799)
## No Information Rate : 0.5366
## P-Value [Acc > NIR] : 9.57e-16
##
## Kappa : 0.8785
##
## Mcnemar's Test P-Value : 0.07364
##
## Sensitivity : 0.8864
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8837
## Prevalence : 0.5366
## Detection Rate : 0.4756
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.9432
##
## 'Positive' Class : 1
##
The QDA model performs slightly better than the LDA model from part D), with a test accuracy of .939.
auto.log = glm(mpg01 ~ cylinders + weight + year, data = auto.train, family = binomial)
auto.PredProb = predict.glm(auto.log, newdata = auto.test.x, type = "response")
auto.PredSur = ifelse(auto.PredProb >= .5,1,0)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.PredSur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 39 4
## 1 3 36
##
## Accuracy : 0.9146
## 95% CI : (0.832, 0.965)
## No Information Rate : 0.5122
## P-Value [Acc > NIR] : 4.455e-15
##
## Kappa : 0.8291
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9000
## Specificity : 0.9286
## Pos Pred Value : 0.9231
## Neg Pred Value : 0.9070
## Prevalence : 0.4878
## Detection Rate : 0.4390
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.9143
##
## 'Positive' Class : 1
##
The accuracy of this model matches what was seen in part D): .9146.
auto.knn.train.x = auto.train[c(2,5,7)]
auto.knn.train.y = auto.train[,10]
auto.knn.test.x = auto.test.x[c(2,5,7)]
set.seed(1)
auto.knn.pred = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =1)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38 5
## 1 5 34
##
## Accuracy : 0.878
## 95% CI : (0.7871, 0.9399)
## No Information Rate : 0.5244
## P-Value [Acc > NIR] : 9.717e-12
##
## Kappa : 0.7555
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8718
## Specificity : 0.8837
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.8837
## Prevalence : 0.4756
## Detection Rate : 0.4146
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.8778
##
## 'Positive' Class : 1
##
auto.knn.pred5 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =5)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred5), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38 5
## 1 5 34
##
## Accuracy : 0.878
## 95% CI : (0.7871, 0.9399)
## No Information Rate : 0.5244
## P-Value [Acc > NIR] : 9.717e-12
##
## Kappa : 0.7555
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8718
## Specificity : 0.8837
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.8837
## Prevalence : 0.4756
## Detection Rate : 0.4146
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.8778
##
## 'Positive' Class : 1
##
auto.knn.pred10 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =10)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred10), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 38 5
## 1 5 34
##
## Accuracy : 0.878
## 95% CI : (0.7871, 0.9399)
## No Information Rate : 0.5244
## P-Value [Acc > NIR] : 9.717e-12
##
## Kappa : 0.7555
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8718
## Specificity : 0.8837
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.8837
## Prevalence : 0.4756
## Detection Rate : 0.4146
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.8778
##
## 'Positive' Class : 1
##
auto.knn.pred15 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =15)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred15), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 6
## 1 5 34
##
## Accuracy : 0.8659
## 95% CI : (0.7726, 0.9311)
## No Information Rate : 0.5122
## P-Value [Acc > NIR] : 1.449e-11
##
## Kappa : 0.7314
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8500
## Specificity : 0.8810
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.8605
## Prevalence : 0.4878
## Detection Rate : 0.4146
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.8655
##
## 'Positive' Class : 1
##
auto.knn.pred20 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =20)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred20), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36 7
## 1 5 34
##
## Accuracy : 0.8537
## 95% CI : (0.7583, 0.922)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 2.054e-11
##
## Kappa : 0.7073
##
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.8293
## Specificity : 0.8780
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.8372
## Prevalence : 0.5000
## Detection Rate : 0.4146
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.8537
##
## 'Positive' Class : 1
##
auto.knn.pred25 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =25)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred25), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 6
## 1 5 34
##
## Accuracy : 0.8659
## 95% CI : (0.7726, 0.9311)
## No Information Rate : 0.5122
## P-Value [Acc > NIR] : 1.449e-11
##
## Kappa : 0.7314
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8500
## Specificity : 0.8810
## Pos Pred Value : 0.8718
## Neg Pred Value : 0.8605
## Prevalence : 0.4878
## Detection Rate : 0.4146
## Detection Prevalence : 0.4756
## Balanced Accuracy : 0.8655
##
## 'Positive' Class : 1
##
The best KNN accuracy found was .878, achieved at k = 1, 5, and 10.
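One caveat on these KNN fits: knn() uses Euclidean distance on the raw columns, so weight (in the thousands) dominates cylinders and year. Standardizing the predictors is often worth trying; a sketch that scales the test set by the training-set means and standard deviations:
# Standardize using training-set centers and scales
train.sc = scale(auto.knn.train.x)
test.sc = scale(auto.knn.test.x,
                center = attr(train.sc, "scaled:center"),
                scale = attr(train.sc, "scaled:scale"))
set.seed(1)
pred.sc = knn(train.sc, test.sc, auto.knn.train.y, k = 5)
mean(as.character(pred.sc) == as.character(auto.test.y))  # test accuracy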
boston = Boston
crim.median = median(boston$crim)
boston$crim.above.med = boston$crim
boston$crim.above.med[boston$crim >= crim.median] = 1
boston$crim.above.med[boston$crim < crim.median] = 0
set.seed(1)
train = sample(506, 400)
boston.train = boston[train,]
boston.test.x = boston[-train,1:14]
boston.test.y = boston[-train, 15]
#Logistic
boston.log = glm(crim.above.med ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = boston.train, family = binomial)
summary(boston.log)
##
## Call:
## glm(formula = crim.above.med ~ zn + indus + chas + nox + rm +
## age + dis + rad + tax + ptratio + black + lstat + medv, family = binomial,
## data = boston.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0405 -0.1412 -0.0002 0.0017 3.6053
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.413075 7.663745 -5.534 3.13e-08 ***
## zn -0.105480 0.042747 -2.468 0.013605 *
## indus -0.064406 0.051401 -1.253 0.210203
## chas 0.471499 0.781593 0.603 0.546340
## nox 56.009198 9.235915 6.064 1.33e-09 ***
## rm -0.151684 0.850862 -0.178 0.858511
## age 0.021783 0.013687 1.591 0.111499
## dis 1.062174 0.277926 3.822 0.000132 ***
## rad 0.681962 0.175689 3.882 0.000104 ***
## tax -0.007108 0.003163 -2.248 0.024607 *
## ptratio 0.361211 0.147963 2.441 0.014637 *
## black -0.010394 0.005750 -1.808 0.070671 .
## lstat 0.080476 0.056675 1.420 0.155626
## medv 0.169017 0.080670 2.095 0.036155 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 554.52 on 399 degrees of freedom
## Residual deviance: 159.23 on 386 degrees of freedom
## AIC: 187.23
##
## Number of Fisher Scoring iterations: 9
boston.log.PredProb = predict.glm(boston.log, newdata = boston.test.x, type = "response")
boston.log.PredSur = ifelse(boston.log.PredProb >= .5,1,0)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.log.PredSur), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 8
## 1 5 48
##
## Accuracy : 0.8774
## 95% CI : (0.7994, 0.9331)
## No Information Rate : 0.5283
## P-Value [Acc > NIR] : 1.817e-14
##
## Kappa : 0.7547
##
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.8571
## Specificity : 0.9000
## Pos Pred Value : 0.9057
## Neg Pred Value : 0.8491
## Prevalence : 0.5283
## Detection Rate : 0.4528
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8786
##
## 'Positive' Class : 1
##
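The reduced model below drops the predictors whose p-values in the full fit exceed roughly .15 (indus, chas, rm, and lstat); that screen can be reproduced with a one-liner:
# Predictors in the full fit with p-values above .15
which(coef(summary(boston.log))[-1, 4] > 0.15)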
boston.log2 = glm(crim.above.med ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = boston.train, family = binomial)
summary(boston.log2)
##
## Call:
## glm(formula = crim.above.med ~ zn + nox + age + dis + rad + tax +
## ptratio + black + medv, family = binomial, data = boston.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9541 -0.1624 -0.0002 0.0015 3.6164
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -39.093913 7.230444 -5.407 6.41e-08 ***
## zn -0.104871 0.039008 -2.688 0.00718 **
## nox 50.727147 8.131533 6.238 4.42e-10 ***
## age 0.026352 0.011576 2.276 0.02282 *
## dis 0.999550 0.269695 3.706 0.00021 ***
## rad 0.759663 0.163470 4.647 3.37e-06 ***
## tax -0.008527 0.002964 -2.877 0.00402 **
## ptratio 0.350180 0.134863 2.597 0.00942 **
## black -0.010352 0.005822 -1.778 0.07538 .
## medv 0.128555 0.040044 3.210 0.00133 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 554.52 on 399 degrees of freedom
## Residual deviance: 163.07 on 390 degrees of freedom
## AIC: 183.07
##
## Number of Fisher Scoring iterations: 9
boston.log.PredProb2 = predict.glm(boston.log2, newdata = boston.test.x, type = "response")
boston.log.PredSur2 = ifelse(boston.log.PredProb2 >= .5,1,0)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.log.PredSur2), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 47 6
## 1 6 47
##
## Accuracy : 0.8868
## 95% CI : (0.8106, 0.9401)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7736
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8868
## Specificity : 0.8868
## Pos Pred Value : 0.8868
## Neg Pred Value : 0.8868
## Prevalence : 0.5000
## Detection Rate : 0.4434
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8868
##
## 'Positive' Class : 1
##
#LDA
boston.lda = lda(crim.above.med ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = boston.train)
boston.lda.pred = predict(boston.lda, boston.test.x)
boston.lda.class= boston.lda.pred$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.lda.class), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 1
## 1 11 42
##
## Accuracy : 0.8868
## 95% CI : (0.8106, 0.9401)
## No Information Rate : 0.5943
## P-Value [Acc > NIR] : 3.067e-11
##
## Kappa : 0.7736
##
## Mcnemar's Test P-Value : 0.009375
##
## Sensitivity : 0.9767
## Specificity : 0.8254
## Pos Pred Value : 0.7925
## Neg Pred Value : 0.9811
## Prevalence : 0.4057
## Detection Rate : 0.3962
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9011
##
## 'Positive' Class : 1
##
boston.lda2 = lda(crim.above.med ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = boston.train)
boston.lda.pred2 = predict(boston.lda2, boston.test.x)
boston.lda.class2= boston.lda.pred2$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.lda.class2), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 1
## 1 13 40
##
## Accuracy : 0.8679
## 95% CI : (0.7883, 0.9259)
## No Information Rate : 0.6132
## P-Value [Acc > NIR] : 6.666e-09
##
## Kappa : 0.7358
##
## Mcnemar's Test P-Value : 0.003283
##
## Sensitivity : 0.9756
## Specificity : 0.8000
## Pos Pred Value : 0.7547
## Neg Pred Value : 0.9811
## Prevalence : 0.3868
## Detection Rate : 0.3774
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8878
##
## 'Positive' Class : 1
##
#QDA
boston.qda = qda(crim.above.med ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = boston.train)
boston.qda.pred = predict(boston.qda, boston.test.x)
boston.qda.class = boston.qda.pred$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.qda.class), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 1
## 1 9 44
##
## Accuracy : 0.9057
## 95% CI : (0.8333, 0.9538)
## No Information Rate : 0.5755
## P-Value [Acc > NIR] : 6.437e-14
##
## Kappa : 0.8113
##
## Mcnemar's Test P-Value : 0.02686
##
## Sensitivity : 0.9778
## Specificity : 0.8525
## Pos Pred Value : 0.8302
## Neg Pred Value : 0.9811
## Prevalence : 0.4245
## Detection Rate : 0.4151
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9151
##
## 'Positive' Class : 1
##
boston.qda2 = qda(crim.above.med ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = boston.train)
boston.qda.pred2 = predict(boston.qda2, boston.test.x)
boston.qda.class2 = boston.qda.pred2$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.qda.class2), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 1
## 1 11 42
##
## Accuracy : 0.8868
## 95% CI : (0.8106, 0.9401)
## No Information Rate : 0.5943
## P-Value [Acc > NIR] : 3.067e-11
##
## Kappa : 0.7736
##
## Mcnemar's Test P-Value : 0.009375
##
## Sensitivity : 0.9767
## Specificity : 0.8254
## Pos Pred Value : 0.7925
## Neg Pred Value : 0.9811
## Prevalence : 0.4057
## Detection Rate : 0.3962
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9011
##
## 'Positive' Class : 1
##
#KNN
boston.knn.train.x = boston.train[c(2:13)]
boston.knn.train.y = boston.train[,15]
boston.knn.test.x = boston.test.x[,2:13]
set.seed(1)
boston.knn.pred = knn(boston.knn.train.x, boston.knn.test.x, boston.knn.train.y, k =1)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.knn.pred), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 50 3
## 1 4 49
##
## Accuracy : 0.934
## 95% CI : (0.8687, 0.973)
## No Information Rate : 0.5094
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8679
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9423
## Specificity : 0.9259
## Pos Pred Value : 0.9245
## Neg Pred Value : 0.9434
## Prevalence : 0.4906
## Detection Rate : 0.4623
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9341
##
## 'Positive' Class : 1
##
boston.knn.pred = knn(boston.knn.train.x, boston.knn.test.x, boston.knn.train.y, k =5)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.knn.pred), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 8
## 1 4 49
##
## Accuracy : 0.8868
## 95% CI : (0.8106, 0.9401)
## No Information Rate : 0.5377
## P-Value [Acc > NIR] : 1.155e-14
##
## Kappa : 0.7736
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.8596
## Specificity : 0.9184
## Pos Pred Value : 0.9245
## Neg Pred Value : 0.8491
## Prevalence : 0.5377
## Detection Rate : 0.4623
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8890
##
## 'Positive' Class : 1
##
boston.knn.pred = knn(boston.knn.train.x, boston.knn.test.x, boston.knn.train.y, k =10)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.knn.pred), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 44 9
## 1 4 49
##
## Accuracy : 0.8774
## 95% CI : (0.7994, 0.9331)
## No Information Rate : 0.5472
## P-Value [Acc > NIR] : 2.833e-13
##
## Kappa : 0.7547
##
## Mcnemar's Test P-Value : 0.2673
##
## Sensitivity : 0.8448
## Specificity : 0.9167
## Pos Pred Value : 0.9245
## Neg Pred Value : 0.8302
## Prevalence : 0.5472
## Detection Rate : 0.4623
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8807
##
## 'Positive' Class : 1
##
Every model was first run with all the variables as a baseline. A second, reduced model was then selected by backward elimination on the logistic fit. The reduced model improved performance only for logistic regression, raising its test accuracy to .8868. The best LDA model matched the logistic model at .8868, and the best QDA model returned an accuracy of .9057. The best overall result came from KNN with k = 1, at an accuracy of .934.
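For reference, the best test accuracy reported above for each method can be collected in one place:
# Best accuracies copied from the confusion matrices above
results = c(logistic = .8868, lda = .8868, qda = .9057, knn.k1 = .9340)
sort(results, decreasing = TRUE)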