Question 10

A)

library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
week = Weekly
summary(week)
##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
## 
hist(week$Lag1)

hist(week$Lag2)

hist(week$Lag3)

hist(week$Lag4)

hist(week$Lag5)

hist(week$Volume)

hist(week$Today)

pairs(week[1:8])

The only pattern readily visible in the pairs plot is between Year and Volume: trading volume grows steadily over time. Beyond that, it is hard to distinguish any other patterns.
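
As a quick numerical check, we can look at the correlation matrix of the quantitative variables (a minimal sketch; Direction is excluded because it is a factor):

#Year and Volume should show the only large off-diagonal correlation
round(cor(week[, 1:8]), 2)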

B)

week.glm = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = week, family = binomial)
summary(week.glm)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = week)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6949  -1.2565   0.9913   1.0849   1.4579  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

Only one predictor, Lag2, is statistically significant at the 5% level (p = 0.0296).
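
The significant rows can also be pulled out of the coefficient table programmatically (a small sketch; the 0.05 cutoff is an assumption):

week.coefs = coef(summary(week.glm))
week.coefs[week.coefs[, "Pr(>|z|)"] < .05, ] #returns the intercept and Lag2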

C)

#Way 1
week.probs = predict(week.glm, type = "response")
contrasts(week$Direction)
##      Up
## Down  0
## Up    1
week.preds = rep("Down", 1089)
week.preds[week.probs>.5] = "Up"
table(week.preds, week$Direction)
##           
## week.preds Down  Up
##       Down   54  48
##       Up    430 557
(557+54)/1089
## [1] 0.5610652
#Way 2
week$PredProb = predict.glm(week.glm, newdata = week, type = "response")
week$PredSur = ifelse(week$PredProb >= .5,"Up","Down")
caret::confusionMatrix(as.factor(week$Direction), as.factor(week$PredSur), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down  Up
##       Down   54 430
##       Up     48 557
##                                          
##                Accuracy : 0.5611         
##                  95% CI : (0.531, 0.5908)
##     No Information Rate : 0.9063         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.035          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.5643         
##             Specificity : 0.5294         
##          Pos Pred Value : 0.9207         
##          Neg Pred Value : 0.1116         
##              Prevalence : 0.9063         
##          Detection Rate : 0.5115         
##    Detection Prevalence : 0.5556         
##       Balanced Accuracy : 0.5469         
##                                          
##        'Positive' Class : Up             
## 

The overall accuracy of the model is 0.5611, only slightly better than random guessing. The model is also better at predicting true positives than true negatives, as can be seen by comparing the sensitivity (0.5643) with the specificity (0.5294) above.
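
These numbers can be reproduced by hand from the table above. One caveat: confusionMatrix() expects the predictions as its first argument, but Direction (the truth) was passed first, so the "Prediction" rows are really the true classes and the "Reference" columns are the model's predictions. The arithmetic below simply mirrors the counts as printed:

(54 + 557)/(54 + 48 + 430 + 557) #accuracy: 0.5611
557/(430 + 557) #sensitivity as reported: 0.5643
54/(54 + 48) #specificity as reported: 0.5294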

D)

set.seed(1)
train =(week$Year<2009)
week.test = week[!train,1:8]
week.train = week[train,]
direction.test = week$Direction[!train]

#Refit using only Lag2, training on 1990-2008 and testing on 2009-2010
week.glm2 = glm(Direction ~ Lag2, data = week, family = binomial, subset = train)

week.test$PredProb = predict.glm(week.glm2, newdata = week.test, type = "response")
week.test$PredSur = ifelse(week.test$PredProb >= .5,"Up","Down")
caret::confusionMatrix(as.factor(direction.test), as.factor(week.test$PredSur), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    9 34
##       Up      5 56
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.5247, 0.718)
##     No Information Rate : 0.8654         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1414         
##                                          
##  Mcnemar's Test P-Value : 7.34e-06       
##                                          
##             Sensitivity : 0.6222         
##             Specificity : 0.6429         
##          Pos Pred Value : 0.9180         
##          Neg Pred Value : 0.2093         
##              Prevalence : 0.8654         
##          Detection Rate : 0.5385         
##    Detection Prevalence : 0.5865         
##       Balanced Accuracy : 0.6325         
##                                          
##        'Positive' Class : Up             
## 

E)

library(MASS)
week.lda = lda(Direction ~ Lag2, data = week, subset = train)
week.lda
## Call:
## lda(Direction ~ Lag2, data = week, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
## 
## Coefficients of linear discriminants:
##            LD1
## Lag2 0.4414162
lda.pred = predict(week.lda, week.test)
lda.class= lda.pred$class
caret::confusionMatrix(as.factor(direction.test), as.factor(lda.class), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    9 34
##       Up      5 56
##                                          
##                Accuracy : 0.625          
##                  95% CI : (0.5247, 0.718)
##     No Information Rate : 0.8654         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1414         
##                                          
##  Mcnemar's Test P-Value : 7.34e-06       
##                                          
##             Sensitivity : 0.6222         
##             Specificity : 0.6429         
##          Pos Pred Value : 0.9180         
##          Neg Pred Value : 0.2093         
##              Prevalence : 0.8654         
##          Detection Rate : 0.5385         
##    Detection Prevalence : 0.5865         
##       Balanced Accuracy : 0.6325         
##                                          
##        'Positive' Class : Up             
## 

F)

week.qda = qda(Direction ~ Lag2, data = week, subset = train)
week.qda
## Call:
## qda(Direction ~ Lag2, data = week, subset = train)
## 
## Prior probabilities of groups:
##      Down        Up 
## 0.4477157 0.5522843 
## 
## Group means:
##             Lag2
## Down -0.03568254
## Up    0.26036581
qda.pred = predict(week.qda, week.test)
qda.class = qda.pred$class
caret::confusionMatrix(as.factor(direction.test), as.factor(qda.class), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down    0 43
##       Up      0 61
##                                           
##                Accuracy : 0.5865          
##                  95% CI : (0.4858, 0.6823)
##     No Information Rate : 1               
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.504e-10       
##                                           
##             Sensitivity : 0.5865          
##             Specificity :     NA          
##          Pos Pred Value :     NA          
##          Neg Pred Value :     NA          
##              Prevalence : 1.0000          
##          Detection Rate : 0.5865          
##    Detection Prevalence : 0.5865          
##       Balanced Accuracy :     NA          
##                                           
##        'Positive' Class : Up              
## 

G)

library(class)
train.x = data.frame(week.train$Lag2)
test.x = data.frame(week.test$Lag2)
train.direction = week.train$Direction
train.direction = as.character(train.direction)

set.seed(1)
knn.pred = knn(train.x, test.x, train.direction, k=1)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   21 22
##       Up     30 31
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.4003, 0.5997)
##     No Information Rate : 0.5096          
##     P-Value [Acc > NIR] : 0.6158          
##                                           
##                   Kappa : -0.0033         
##                                           
##  Mcnemar's Test P-Value : 0.3317          
##                                           
##             Sensitivity : 0.5849          
##             Specificity : 0.4118          
##          Pos Pred Value : 0.5082          
##          Neg Pred Value : 0.4884          
##              Prevalence : 0.5096          
##          Detection Rate : 0.2981          
##    Detection Prevalence : 0.5865          
##       Balanced Accuracy : 0.4983          
##                                           
##        'Positive' Class : Up              
## 

H)

The best models were logistic regression and LDA, both with a test accuracy of 0.625.
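
Collecting the test accuracies reported in parts D) through G) makes the comparison explicit (values copied from the confusion matrices above):

test.acc = c(logistic = .6250, lda = .6250, qda = .5865, knn1 = .5000)
sort(test.acc, decreasing = TRUE)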

I)

#Logistic with interactions
week.glm3 = glm(Direction ~ Lag1*Lag2 + Lag1*Lag3 + Lag1*Lag4 + Lag1*Lag5 + Lag1*Volume + Lag1*Today + Lag2*Lag3 + Lag2*Lag4 + Lag2*Lag5 + Lag2*Volume + Lag2*Today + Lag3*Lag4 + Lag3*Lag5 + Lag3*Volume + Lag3*Today + Lag4*Lag5 + Lag4*Volume + Lag4*Today + Lag5*Volume + Lag5*Today + Volume*Today, data = week, family = binomial, subset = train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
week.test$PredProb2 = predict.glm(week.glm3, newdata = week.test, type = "response")
week.test$PredSur2 = ifelse(week.test$PredProb2 >= .5,"Up","Down")
caret::confusionMatrix(as.factor(direction.test), as.factor(week.test$PredSur2), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   41  2
##       Up      2 59
##                                           
##                Accuracy : 0.9615          
##                  95% CI : (0.9044, 0.9894)
##     No Information Rate : 0.5865          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9207          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9672          
##             Specificity : 0.9535          
##          Pos Pred Value : 0.9672          
##          Neg Pred Value : 0.9535          
##              Prevalence : 0.5865          
##          Detection Rate : 0.5673          
##    Detection Prevalence : 0.5865          
##       Balanced Accuracy : 0.9604          
##                                           
##        'Positive' Class : Up              
## 
#KNN
knn.pred5 = knn(train.x, test.x, train.direction, k=5)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred5), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   15 28
##       Up     20 41
##                                          
##                Accuracy : 0.5385         
##                  95% CI : (0.438, 0.6367)
##     No Information Rate : 0.6635         
##     P-Value [Acc > NIR] : 0.9970         
##                                          
##                   Kappa : 0.0216         
##                                          
##  Mcnemar's Test P-Value : 0.3123         
##                                          
##             Sensitivity : 0.5942         
##             Specificity : 0.4286         
##          Pos Pred Value : 0.6721         
##          Neg Pred Value : 0.3488         
##              Prevalence : 0.6635         
##          Detection Rate : 0.3942         
##    Detection Prevalence : 0.5865         
##       Balanced Accuracy : 0.5114         
##                                          
##        'Positive' Class : Up             
## 
knn.pred10 = knn(train.x, test.x, train.direction, k=10)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred10), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   17 26
##       Up     19 42
##                                           
##                Accuracy : 0.5673          
##                  95% CI : (0.4665, 0.6641)
##     No Information Rate : 0.6538          
##     P-Value [Acc > NIR] : 0.9734          
##                                           
##                   Kappa : 0.0859          
##                                           
##  Mcnemar's Test P-Value : 0.3711          
##                                           
##             Sensitivity : 0.6176          
##             Specificity : 0.4722          
##          Pos Pred Value : 0.6885          
##          Neg Pred Value : 0.3953          
##              Prevalence : 0.6538          
##          Detection Rate : 0.4038          
##    Detection Prevalence : 0.5865          
##       Balanced Accuracy : 0.5449          
##                                           
##        'Positive' Class : Up              
## 
knn.pred15 = knn(train.x, test.x, train.direction, k=15)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred15), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   20 23
##       Up     20 41
##                                           
##                Accuracy : 0.5865          
##                  95% CI : (0.4858, 0.6823)
##     No Information Rate : 0.6154          
##     P-Value [Acc > NIR] : 0.7609          
##                                           
##                   Kappa : 0.1387          
##                                           
##  Mcnemar's Test P-Value : 0.7604          
##                                           
##             Sensitivity : 0.6406          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.6721          
##          Neg Pred Value : 0.4651          
##              Prevalence : 0.6154          
##          Detection Rate : 0.3942          
##    Detection Prevalence : 0.5865          
##       Balanced Accuracy : 0.5703          
##                                           
##        'Positive' Class : Up              
## 
knn.pred20 = knn(train.x, test.x, train.direction, k=20)
caret::confusionMatrix(as.factor(direction.test), as.factor(knn.pred20), positive = "Up")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down Up
##       Down   21 22
##       Up     20 41
##                                           
##                Accuracy : 0.5962          
##                  95% CI : (0.4954, 0.6913)
##     No Information Rate : 0.6058          
##     P-Value [Acc > NIR] : 0.6207          
##                                           
##                   Kappa : 0.1616          
##                                           
##  Mcnemar's Test P-Value : 0.8774          
##                                           
##             Sensitivity : 0.6508          
##             Specificity : 0.5122          
##          Pos Pred Value : 0.6721          
##          Neg Pred Value : 0.4884          
##              Prevalence : 0.6058          
##          Detection Rate : 0.3942          
##    Detection Prevalence : 0.5865          
##       Balanced Accuracy : 0.5815          
##                                           
##        'Positive' Class : Up              
## 

The best result came from the logistic model with all pairwise interactions, with a test accuracy of 0.9615; its sensitivity and specificity were also far better than in the earlier models. This is not a fair comparison, however: the formula includes Today, and Direction is defined by the sign of Today, so the near-perfect accuracy reflects information leakage. The glm.fit warnings about fitted probabilities of 0 or 1 are a symptom of the same problem. The KNN model was also re-tested at several values of k; accuracy improved steadily with k, reaching its best value of 0.5962 at k = 20.
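
The manual re-fitting of KNN at each k can also be condensed into a loop (a sketch using the objects from part G; results may differ slightly from the tables above because knn() breaks ties at random):

set.seed(1)
for (k in c(1, 5, 10, 15, 20)) {
  knn.k = knn(train.x, test.x, train.direction, k = k)
  cat("k =", k, "accuracy =", round(mean(as.character(knn.k) == as.character(direction.test)), 4), "\n")
}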

Question 11

A)

setwd("C:/Users/arami/Desktop/STAT 6543/HOMEWORK 3")
auto = read.csv("Auto.csv", na.strings = "?")
auto = na.omit(auto)

mpg.median = median(auto$mpg)
auto$mpg01 = auto$mpg
auto$mpg01[auto$mpg >= mpg.median] = 1
auto$mpg01[auto$mpg < mpg.median] = 0

B)

str(auto)
## 'data.frame':    392 obs. of  10 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : chr  "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
##  $ mpg01       : num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
##   ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
pairs(auto[c(1:8, 10)])

par(mfrow=c(1,2))
boxplot(auto$cylinders ~ auto$mpg01)
boxplot(auto$displacement ~ auto$mpg01)

boxplot(auto$horsepower ~ auto$mpg01)
boxplot(auto$weight ~ auto$mpg01)

boxplot(auto$acceleration ~ auto$mpg01)
boxplot(auto$year ~ auto$mpg01)

Looking only at the pairs plot, it is hard to see any relationship between mpg01 and the other variables. The boxplots make these relationships much easier to observe: more cylinders and higher displacement, horsepower, and weight all increase the likelihood of mpg01 being 0, while lower acceleration and earlier model years also tend to increase the likelihood of mpg01 being 0.
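
A quick numerical complement to the boxplots is to correlate each quantitative predictor with mpg01 (a sketch; mpg01 is treated as a numeric 0/1 variable here):

round(sapply(auto[, 2:8], cor, y = auto$mpg01), 2)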

C)

set.seed(1)
train=sample(392,310)
auto.train = auto[train,]
auto.test.x = auto[-train,1:9]
auto.test.y = auto[-train, 10]

D)

#Screen predictors with a quick linear model; the strongest ones (cylinders, weight, year) are then used below
auto.lm = lm(mpg01 ~ cylinders + displacement + horsepower + weight + acceleration + year + origin, data = auto)
summary(auto.lm)
## 
## Call:
## lm(formula = mpg01 ~ cylinders + displacement + horsepower + 
##     weight + acceleration + year + origin, data = auto)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93858 -0.15035  0.06735  0.19175  0.90105 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.366e-01  4.177e-01  -1.524   0.1284    
## cylinders    -1.183e-01  2.908e-02  -4.067 5.78e-05 ***
## displacement  3.395e-04  6.760e-04   0.502   0.6158    
## horsepower    2.130e-03  1.240e-03   1.718   0.0867 .  
## weight       -2.873e-04  5.865e-05  -4.899 1.43e-06 ***
## acceleration  2.305e-03  8.891e-03   0.259   0.7956    
## year          2.949e-02  4.585e-03   6.433 3.73e-10 ***
## origin        4.683e-02  2.502e-02   1.872   0.0620 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2993 on 384 degrees of freedom
## Multiple R-squared:  0.649,  Adjusted R-squared:  0.6426 
## F-statistic: 101.4 on 7 and 384 DF,  p-value: < 2.2e-16
auto.lda = lda(mpg01 ~ cylinders + weight + year, data = auto.train)
auto.lda
## Call:
## lda(mpg01 ~ cylinders + weight + year, data = auto.train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4935484 0.5064516 
## 
## Group means:
##   cylinders   weight     year
## 0  6.771242 3604.667 74.54248
## 1  4.203822 2346.051 77.59873
## 
## Coefficients of linear discriminants:
##                     LD1
## cylinders -0.4212284939
## weight    -0.0009673581
## year       0.1022662664
auto.lda.pred = predict(auto.lda, auto.test.x)
auto.lda.class= auto.lda.pred$class
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.lda.class), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 36  7
##          1  0 39
##                                         
##                Accuracy : 0.9146        
##                  95% CI : (0.832, 0.965)
##     No Information Rate : 0.561         
##     P-Value [Acc > NIR] : 2.002e-12     
##                                         
##                   Kappa : 0.8303        
##                                         
##  Mcnemar's Test P-Value : 0.02334       
##                                         
##             Sensitivity : 0.8478        
##             Specificity : 1.0000        
##          Pos Pred Value : 1.0000        
##          Neg Pred Value : 0.8372        
##              Prevalence : 0.5610        
##          Detection Rate : 0.4756        
##    Detection Prevalence : 0.4756        
##       Balanced Accuracy : 0.9239        
##                                         
##        'Positive' Class : 1             
## 

The model performs well, with a test accuracy of 0.9146.
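
The accuracy can be double-checked directly from the predicted classes (a one-line sketch; the factor/numeric comparison works because the levels are "0" and "1"):

mean(auto.lda.class == auto.test.y) #should reproduce the 0.9146 above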

E)

auto.qda = qda(mpg01 ~ cylinders + weight + year, data = auto.train)
auto.qda
## Call:
## qda(mpg01 ~ cylinders + weight + year, data = auto.train)
## 
## Prior probabilities of groups:
##         0         1 
## 0.4935484 0.5064516 
## 
## Group means:
##   cylinders   weight     year
## 0  6.771242 3604.667 74.54248
## 1  4.203822 2346.051 77.59873
auto.qda.pred = predict(auto.qda, auto.test.x)
auto.qda.class = auto.qda.pred$class
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.qda.class), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 38  5
##          1  0 39
##                                           
##                Accuracy : 0.939           
##                  95% CI : (0.8634, 0.9799)
##     No Information Rate : 0.5366          
##     P-Value [Acc > NIR] : 9.57e-16        
##                                           
##                   Kappa : 0.8785          
##                                           
##  Mcnemar's Test P-Value : 0.07364         
##                                           
##             Sensitivity : 0.8864          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8837          
##              Prevalence : 0.5366          
##          Detection Rate : 0.4756          
##    Detection Prevalence : 0.4756          
##       Balanced Accuracy : 0.9432          
##                                           
##        'Positive' Class : 1               
## 

This QDA model performs slightly better than the LDA model from part D), with a test accuracy of 0.939.

F)

auto.log = glm(mpg01 ~ cylinders + weight + year, data = auto.train, family = binomial)

auto.PredProb = predict.glm(auto.log, newdata = auto.test.x, type = "response")
auto.PredSur = ifelse(auto.PredProb >= .5,1,0)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.PredSur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 39  4
##          1  3 36
##                                         
##                Accuracy : 0.9146        
##                  95% CI : (0.832, 0.965)
##     No Information Rate : 0.5122        
##     P-Value [Acc > NIR] : 4.455e-15     
##                                         
##                   Kappa : 0.8291        
##                                         
##  Mcnemar's Test P-Value : 1             
##                                         
##             Sensitivity : 0.9000        
##             Specificity : 0.9286        
##          Pos Pred Value : 0.9231        
##          Neg Pred Value : 0.9070        
##              Prevalence : 0.4878        
##          Detection Rate : 0.4390        
##    Detection Prevalence : 0.4756        
##       Balanced Accuracy : 0.9143        
##                                         
##        'Positive' Class : 1             
## 

The accuracy of this model, 0.9146, is the same as the LDA model in part D).

G)

auto.knn.train.x = auto.train[c(2,5,7)]
auto.knn.train.y = auto.train[,10]
auto.knn.test.x = auto.test.x[c(2,5,7)]

set.seed(1)
auto.knn.pred = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =1)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 38  5
##          1  5 34
##                                           
##                Accuracy : 0.878           
##                  95% CI : (0.7871, 0.9399)
##     No Information Rate : 0.5244          
##     P-Value [Acc > NIR] : 9.717e-12       
##                                           
##                   Kappa : 0.7555          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8718          
##             Specificity : 0.8837          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.8837          
##              Prevalence : 0.4756          
##          Detection Rate : 0.4146          
##    Detection Prevalence : 0.4756          
##       Balanced Accuracy : 0.8778          
##                                           
##        'Positive' Class : 1               
## 
auto.knn.pred5 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =5)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred5), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 38  5
##          1  5 34
##                                           
##                Accuracy : 0.878           
##                  95% CI : (0.7871, 0.9399)
##     No Information Rate : 0.5244          
##     P-Value [Acc > NIR] : 9.717e-12       
##                                           
##                   Kappa : 0.7555          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8718          
##             Specificity : 0.8837          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.8837          
##              Prevalence : 0.4756          
##          Detection Rate : 0.4146          
##    Detection Prevalence : 0.4756          
##       Balanced Accuracy : 0.8778          
##                                           
##        'Positive' Class : 1               
## 
auto.knn.pred10 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =10)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred10), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 38  5
##          1  5 34
##                                           
##                Accuracy : 0.878           
##                  95% CI : (0.7871, 0.9399)
##     No Information Rate : 0.5244          
##     P-Value [Acc > NIR] : 9.717e-12       
##                                           
##                   Kappa : 0.7555          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8718          
##             Specificity : 0.8837          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.8837          
##              Prevalence : 0.4756          
##          Detection Rate : 0.4146          
##    Detection Prevalence : 0.4756          
##       Balanced Accuracy : 0.8778          
##                                           
##        'Positive' Class : 1               
## 
auto.knn.pred15 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =15)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred15), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 37  6
##          1  5 34
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.7726, 0.9311)
##     No Information Rate : 0.5122          
##     P-Value [Acc > NIR] : 1.449e-11       
##                                           
##                   Kappa : 0.7314          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8500          
##             Specificity : 0.8810          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.8605          
##              Prevalence : 0.4878          
##          Detection Rate : 0.4146          
##    Detection Prevalence : 0.4756          
##       Balanced Accuracy : 0.8655          
##                                           
##        'Positive' Class : 1               
## 
auto.knn.pred20 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =20)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred20), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 36  7
##          1  5 34
##                                          
##                Accuracy : 0.8537         
##                  95% CI : (0.7583, 0.922)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 2.054e-11      
##                                          
##                   Kappa : 0.7073         
##                                          
##  Mcnemar's Test P-Value : 0.7728         
##                                          
##             Sensitivity : 0.8293         
##             Specificity : 0.8780         
##          Pos Pred Value : 0.8718         
##          Neg Pred Value : 0.8372         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4146         
##    Detection Prevalence : 0.4756         
##       Balanced Accuracy : 0.8537         
##                                          
##        'Positive' Class : 1              
## 
auto.knn.pred25 = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k =25)
caret::confusionMatrix(as.factor(auto.test.y), as.factor(auto.knn.pred25), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 37  6
##          1  5 34
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.7726, 0.9311)
##     No Information Rate : 0.5122          
##     P-Value [Acc > NIR] : 1.449e-11       
##                                           
##                   Kappa : 0.7314          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8500          
##             Specificity : 0.8810          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.8605          
##              Prevalence : 0.4878          
##          Detection Rate : 0.4146          
##    Detection Prevalence : 0.4756          
##       Balanced Accuracy : 0.8655          
##                                           
##        'Positive' Class : 1               
## 

The best KNN models found have a test accuracy of 0.878, achieved at k = 1, 5, and 10.
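
The same search can be written as a single sapply() over k (a sketch; a fresh set.seed keeps the random tie-breaking in knn() reproducible):

set.seed(1)
k.values = c(1, 5, 10, 15, 20, 25)
k.acc = sapply(k.values, function(k) {
  pred = knn(auto.knn.train.x, auto.knn.test.x, auto.knn.train.y, k = k)
  mean(as.character(pred) == as.character(auto.test.y))
})
setNames(round(k.acc, 4), paste0("k=", k.values))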

Question 13

boston = Boston

crim.median = median(boston$crim)
boston$crim.above.med = boston$crim

boston$crim.above.med[boston$crim >= crim.median] = 1
boston$crim.above.med[boston$crim < crim.median] = 0

set.seed(1)
train = sample(506, 400)
boston.train = boston[train,]
boston.test.x = boston[-train,1:14]
boston.test.y = boston[-train, 15]
#Logistic
boston.log = glm(crim.above.med ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = boston.train, family = binomial)

summary(boston.log)
## 
## Call:
## glm(formula = crim.above.med ~ zn + indus + chas + nox + rm + 
##     age + dis + rad + tax + ptratio + black + lstat + medv, family = binomial, 
##     data = boston.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0405  -0.1412  -0.0002   0.0017   3.6053  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -42.413075   7.663745  -5.534 3.13e-08 ***
## zn           -0.105480   0.042747  -2.468 0.013605 *  
## indus        -0.064406   0.051401  -1.253 0.210203    
## chas          0.471499   0.781593   0.603 0.546340    
## nox          56.009198   9.235915   6.064 1.33e-09 ***
## rm           -0.151684   0.850862  -0.178 0.858511    
## age           0.021783   0.013687   1.591 0.111499    
## dis           1.062174   0.277926   3.822 0.000132 ***
## rad           0.681962   0.175689   3.882 0.000104 ***
## tax          -0.007108   0.003163  -2.248 0.024607 *  
## ptratio       0.361211   0.147963   2.441 0.014637 *  
## black        -0.010394   0.005750  -1.808 0.070671 .  
## lstat         0.080476   0.056675   1.420 0.155626    
## medv          0.169017   0.080670   2.095 0.036155 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 554.52  on 399  degrees of freedom
## Residual deviance: 159.23  on 386  degrees of freedom
## AIC: 187.23
## 
## Number of Fisher Scoring iterations: 9
boston.log.PredProb = predict.glm(boston.log, newdata = boston.test.x, type = "response")
boston.log.PredSur = ifelse(boston.log.PredProb >= .5,1,0)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.log.PredSur), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 45  8
##          1  5 48
##                                           
##                Accuracy : 0.8774          
##                  95% CI : (0.7994, 0.9331)
##     No Information Rate : 0.5283          
##     P-Value [Acc > NIR] : 1.817e-14       
##                                           
##                   Kappa : 0.7547          
##                                           
##  Mcnemar's Test P-Value : 0.5791          
##                                           
##             Sensitivity : 0.8571          
##             Specificity : 0.9000          
##          Pos Pred Value : 0.9057          
##          Neg Pred Value : 0.8491          
##              Prevalence : 0.5283          
##          Detection Rate : 0.4528          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8786          
##                                           
##        'Positive' Class : 1               
## 
#Reduced model: keep only the predictors that were significant (or nearly so) in the full fit
boston.log2 = glm(crim.above.med ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = boston.train, family = binomial)
summary(boston.log2)
## 
## Call:
## glm(formula = crim.above.med ~ zn + nox + age + dis + rad + tax + 
##     ptratio + black + medv, family = binomial, data = boston.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9541  -0.1624  -0.0002   0.0015   3.6164  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -39.093913   7.230444  -5.407 6.41e-08 ***
## zn           -0.104871   0.039008  -2.688  0.00718 ** 
## nox          50.727147   8.131533   6.238 4.42e-10 ***
## age           0.026352   0.011576   2.276  0.02282 *  
## dis           0.999550   0.269695   3.706  0.00021 ***
## rad           0.759663   0.163470   4.647 3.37e-06 ***
## tax          -0.008527   0.002964  -2.877  0.00402 ** 
## ptratio       0.350180   0.134863   2.597  0.00942 ** 
## black        -0.010352   0.005822  -1.778  0.07538 .  
## medv          0.128555   0.040044   3.210  0.00133 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 554.52  on 399  degrees of freedom
## Residual deviance: 163.07  on 390  degrees of freedom
## AIC: 183.07
## 
## Number of Fisher Scoring iterations: 9
boston.log.PredProb2 = predict.glm(boston.log2, newdata = boston.test.x, type = "response")
boston.log.PredSur2 = ifelse(boston.log.PredProb2 >= .5,1,0)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.log.PredSur2), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 47  6
##          1  6 47
##                                           
##                Accuracy : 0.8868          
##                  95% CI : (0.8106, 0.9401)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7736          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.8868          
##             Specificity : 0.8868          
##          Pos Pred Value : 0.8868          
##          Neg Pred Value : 0.8868          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4434          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8868          
##                                           
##        'Positive' Class : 1               
## 
#LDA
boston.lda = lda(crim.above.med ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = boston.train)

boston.lda.pred = predict(boston.lda, boston.test.x)
boston.lda.class= boston.lda.pred$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.lda.class), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 52  1
##          1 11 42
##                                           
##                Accuracy : 0.8868          
##                  95% CI : (0.8106, 0.9401)
##     No Information Rate : 0.5943          
##     P-Value [Acc > NIR] : 3.067e-11       
##                                           
##                   Kappa : 0.7736          
##                                           
##  Mcnemar's Test P-Value : 0.009375        
##                                           
##             Sensitivity : 0.9767          
##             Specificity : 0.8254          
##          Pos Pred Value : 0.7925          
##          Neg Pred Value : 0.9811          
##              Prevalence : 0.4057          
##          Detection Rate : 0.3962          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9011          
##                                           
##        'Positive' Class : 1               
## 
boston.lda2 = lda(crim.above.med ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = boston.train)

boston.lda.pred2 = predict(boston.lda2, boston.test.x)
boston.lda.class2= boston.lda.pred2$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.lda.class2), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 52  1
##          1 13 40
##                                           
##                Accuracy : 0.8679          
##                  95% CI : (0.7883, 0.9259)
##     No Information Rate : 0.6132          
##     P-Value [Acc > NIR] : 6.666e-09       
##                                           
##                   Kappa : 0.7358          
##                                           
##  Mcnemar's Test P-Value : 0.003283        
##                                           
##             Sensitivity : 0.9756          
##             Specificity : 0.8000          
##          Pos Pred Value : 0.7547          
##          Neg Pred Value : 0.9811          
##              Prevalence : 0.3868          
##          Detection Rate : 0.3774          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8878          
##                                           
##        'Positive' Class : 1               
## 
#QDA
boston.qda = qda(crim.above.med ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, data = boston.train)

boston.qda.pred = predict(boston.qda, boston.test.x)
boston.qda.class = boston.qda.pred$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.qda.class), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 52  1
##          1  9 44
##                                           
##                Accuracy : 0.9057          
##                  95% CI : (0.8333, 0.9538)
##     No Information Rate : 0.5755          
##     P-Value [Acc > NIR] : 6.437e-14       
##                                           
##                   Kappa : 0.8113          
##                                           
##  Mcnemar's Test P-Value : 0.02686         
##                                           
##             Sensitivity : 0.9778          
##             Specificity : 0.8525          
##          Pos Pred Value : 0.8302          
##          Neg Pred Value : 0.9811          
##              Prevalence : 0.4245          
##          Detection Rate : 0.4151          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9151          
##                                           
##        'Positive' Class : 1               
## 
boston.qda2 = qda(crim.above.med ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = boston.train)

boston.qda.pred2 = predict(boston.qda2, boston.test.x)
boston.qda.class2 = boston.qda.pred2$class
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.qda.class2), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 52  1
##          1 11 42
##                                           
##                Accuracy : 0.8868          
##                  95% CI : (0.8106, 0.9401)
##     No Information Rate : 0.5943          
##     P-Value [Acc > NIR] : 3.067e-11       
##                                           
##                   Kappa : 0.7736          
##                                           
##  Mcnemar's Test P-Value : 0.009375        
##                                           
##             Sensitivity : 0.9767          
##             Specificity : 0.8254          
##          Pos Pred Value : 0.7925          
##          Neg Pred Value : 0.9811          
##              Prevalence : 0.4057          
##          Detection Rate : 0.3962          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9011          
##                                           
##        'Positive' Class : 1               
## 
#KNN
boston.knn.train.x = boston.train[c(2:13)]
boston.knn.train.y = boston.train[,15]
boston.knn.test.x = boston.test.x[,2:13]

set.seed(1)
boston.knn.pred = knn(boston.knn.train.x, boston.knn.test.x, boston.knn.train.y, k =1)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.knn.pred), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 50  3
##          1  4 49
##                                          
##                Accuracy : 0.934          
##                  95% CI : (0.8687, 0.973)
##     No Information Rate : 0.5094         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.8679         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9423         
##             Specificity : 0.9259         
##          Pos Pred Value : 0.9245         
##          Neg Pred Value : 0.9434         
##              Prevalence : 0.4906         
##          Detection Rate : 0.4623         
##    Detection Prevalence : 0.5000         
##       Balanced Accuracy : 0.9341         
##                                          
##        'Positive' Class : 1              
## 
boston.knn.pred = knn(boston.knn.train.x, boston.knn.test.x, boston.knn.train.y, k =5)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.knn.pred), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 45  8
##          1  4 49
##                                           
##                Accuracy : 0.8868          
##                  95% CI : (0.8106, 0.9401)
##     No Information Rate : 0.5377          
##     P-Value [Acc > NIR] : 1.155e-14       
##                                           
##                   Kappa : 0.7736          
##                                           
##  Mcnemar's Test P-Value : 0.3865          
##                                           
##             Sensitivity : 0.8596          
##             Specificity : 0.9184          
##          Pos Pred Value : 0.9245          
##          Neg Pred Value : 0.8491          
##              Prevalence : 0.5377          
##          Detection Rate : 0.4623          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8890          
##                                           
##        'Positive' Class : 1               
## 
boston.knn.pred = knn(boston.knn.train.x, boston.knn.test.x, boston.knn.train.y, k =10)
caret::confusionMatrix(as.factor(boston.test.y), as.factor(boston.knn.pred), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 44  9
##          1  4 49
##                                           
##                Accuracy : 0.8774          
##                  95% CI : (0.7994, 0.9331)
##     No Information Rate : 0.5472          
##     P-Value [Acc > NIR] : 2.833e-13       
##                                           
##                   Kappa : 0.7547          
##                                           
##  Mcnemar's Test P-Value : 0.2673          
##                                           
##             Sensitivity : 0.8448          
##             Specificity : 0.9167          
##          Pos Pred Value : 0.9245          
##          Neg Pred Value : 0.8302          
##              Prevalence : 0.5472          
##          Detection Rate : 0.4623          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.8807          
##                                           
##        'Positive' Class : 1               
## 

Every model was first run with all of the variables as a baseline; the second, reduced model for each method used the predictors chosen by backward selection on the logistic regression. The reduced model improved performance only for logistic regression, raising its accuracy to 0.8868. The best LDA model matched that value (0.8868, using the full set of predictors), the best QDA model returned an accuracy of 0.9057, and the best overall result came from KNN with k = 1, at 0.934.
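
For reference, the test accuracies reported above, collected in one place (values copied from the confusion matrices):

boston.acc = c(logistic.full = .8774, logistic.reduced = .8868,
               lda.full = .8868, lda.reduced = .8679,
               qda.full = .9057, qda.reduced = .8868,
               knn.k1 = .9340, knn.k5 = .8868, knn.k10 = .8774)
sort(boston.acc, decreasing = TRUE)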