Assignment3

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(ISLR2)

## Warning: package 'ISLR2' was built under R version 4.3.3

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.3.3

## corrplot 0.95 loaded

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

library(class)
library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

library(caTools) # New package to note and explore

## Warning: package 'caTools' was built under R version 4.3.3

attach(Weekly)
head(Weekly)

##   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
## 1 1990  0.816  1.572 -3.936 -0.229 -3.484 0.1549760 -0.270      Down
## 2 1990 -0.270  0.816  1.572 -3.936 -0.229 0.1485740 -2.576      Down
## 3 1990 -2.576 -0.270  0.816  1.572 -3.936 0.1598375  3.514        Up
## 4 1990  3.514 -2.576 -0.270  0.816  1.572 0.1616300  0.712        Up
## 5 1990  0.712  3.514 -2.576 -0.270  0.816 0.1537280  1.178        Up
## 6 1990  1.178  0.712  3.514 -2.576 -0.270 0.1544440 -1.372      Down

str(Weekly)

## 'data.frame':    1089 obs. of  9 variables:
##  $ Year     : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ Lag1     : num  0.816 -0.27 -2.576 3.514 0.712 ...
##  $ Lag2     : num  1.572 0.816 -0.27 -2.576 3.514 ...
##  $ Lag3     : num  -3.936 1.572 0.816 -0.27 -2.576 ...
##  $ Lag4     : num  -0.229 -3.936 1.572 0.816 -0.27 ...
##  $ Lag5     : num  -3.484 -0.229 -3.936 1.572 0.816 ...
##  $ Volume   : num  0.155 0.149 0.16 0.162 0.154 ...
##  $ Today    : num  -0.27 -2.576 3.514 0.712 1.178 ...
##  $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...

# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?

summary(Weekly)

##       Year           Lag1               Lag2               Lag3         
##  Min.   :1990   Min.   :-18.1950   Min.   :-18.1950   Min.   :-18.1950  
##  1st Qu.:1995   1st Qu.: -1.1540   1st Qu.: -1.1540   1st Qu.: -1.1580  
##  Median :2000   Median :  0.2410   Median :  0.2410   Median :  0.2410  
##  Mean   :2000   Mean   :  0.1506   Mean   :  0.1511   Mean   :  0.1472  
##  3rd Qu.:2005   3rd Qu.:  1.4050   3rd Qu.:  1.4090   3rd Qu.:  1.4090  
##  Max.   :2010   Max.   : 12.0260   Max.   : 12.0260   Max.   : 12.0260  
##       Lag4               Lag5              Volume            Today         
##  Min.   :-18.1950   Min.   :-18.1950   Min.   :0.08747   Min.   :-18.1950  
##  1st Qu.: -1.1580   1st Qu.: -1.1660   1st Qu.:0.33202   1st Qu.: -1.1540  
##  Median :  0.2380   Median :  0.2340   Median :1.00268   Median :  0.2410  
##  Mean   :  0.1458   Mean   :  0.1399   Mean   :1.57462   Mean   :  0.1499  
##  3rd Qu.:  1.4090   3rd Qu.:  1.4050   3rd Qu.:2.05373   3rd Qu.:  1.4050  
##  Max.   : 12.0260   Max.   : 12.0260   Max.   :9.32821   Max.   : 12.0260  
##  Direction 
##  Down:484  
##  Up  :605  
##            
##            
##            
##

# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?

pairs(Weekly)

# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?

# Subset the data to exclude direction other wise it won't work cause it is a factor variable 

cor(Weekly[, -9])

##               Year         Lag1        Lag2        Lag3         Lag4
## Year    1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1   -0.03228927  1.000000000 -0.07485305  0.05863568 -0.071273876
## Lag2   -0.03339001 -0.074853051  1.00000000 -0.07572091  0.058381535
## Lag3   -0.03000649  0.058635682 -0.07572091  1.00000000 -0.075395865
## Lag4   -0.03112792 -0.071273876  0.05838153 -0.07539587  1.000000000
## Lag5   -0.03051910 -0.008183096 -0.07249948  0.06065717 -0.075675027
## Volume  0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today  -0.03245989 -0.075031842  0.05916672 -0.07124364 -0.007825873
##                Lag5      Volume        Today
## Year   -0.030519101  0.84194162 -0.032459894
## Lag1   -0.008183096 -0.06495131 -0.075031842
## Lag2   -0.072499482 -0.08551314  0.059166717
## Lag3    0.060657175 -0.06928771 -0.071243639
## Lag4   -0.075675027 -0.06107462 -0.007825873
## Lag5    1.000000000 -0.05851741  0.011012698
## Volume -0.058517414  1.00000000 -0.033077783
## Today   0.011012698 -0.03307778  1.000000000

# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?

numerical_variables = Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume", "Today","Year")]

cor_matrix = cor(numerical_variables, use = "complete.obs") # The use complete 
# handles missing variables

corrplot(cor_matrix, 
         method = "circle",           
         order = "hclust",            
         type = "upper",              
         tl.col = "black",            
         tl.srt = 45,                 
         tl.cex = 0.8,                
         col = colorRampPalette(c("blue", "white", "red"))(200),  
         addCoef.col = "black",      
         diag = FALSE,                
         number.cex = 0.7)

# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?

# Based on the numerical summaries and graphs produced above, the correlation
# between 'Volume' and 'Year' are the only patterns that are present.

# Question 13 Part (b)
# Use the full data set to perform a logistic regression with
# Direction as the response and the five lag variables plus Volume
# as predictors. Use the summary function to print the results. Do
# any of the predictors appear to be statistically significant? If so,
# which ones?

logit_weekly_model = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
                   data = Weekly, family = binomial)
summary(logit_weekly_model)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Weekly)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.26686    0.08593   3.106   0.0019 **
## Lag1        -0.04127    0.02641  -1.563   0.1181   
## Lag2         0.05844    0.02686   2.175   0.0296 * 
## Lag3        -0.01606    0.02666  -0.602   0.5469   
## Lag4        -0.02779    0.02646  -1.050   0.2937   
## Lag5        -0.01447    0.02638  -0.549   0.5833   
## Volume      -0.02274    0.03690  -0.616   0.5377   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1496.2  on 1088  degrees of freedom
## Residual deviance: 1486.4  on 1082  degrees of freedom
## AIC: 1500.4
## 
## Number of Fisher Scoring iterations: 4

# Question 13 Part (b)
# Do any of the predictors appear to be statistically significant? If so,
# which ones?

# From the results of the logistic regression Lag 2 is the only predictor
# that is statically significant as it has a p value that is less than 0.05.

# Question 13 Part (c)
# Compute the confusion matrix and overall fraction of correct
# predictions. Explain what the confusion matrix is telling you
# about the types of mistakes made by logistic regression.

weekly_prob = predict(logit_weekly_model, type='response')
weekly_pred = rep("Down",1089)
weekly_pred[weekly_prob>0.5] = "Up"
table(weekly_pred, Direction)

##            Direction
## weekly_pred Down  Up
##        Down   54  48
##        Up    430 557

# Calculate overall fraction of correct predictions(accuracy rate)

(557+54)/1089

## [1] 0.5610652

# Calculate when indication is Up. This is known as
# precision or positive predictive value 

557/(557+48)

## [1] 0.9206612

# Calculate when indication is down. This is for Negative Predictive Value

54/(54+430)

## [1] 0.1115702

# Calculate error rate

(48+430)/(1089)

## [1] 0.4389348

# Question 13 Part (c)
# Compute the confusion matrix and overall fraction of correct
# predictions. Explain what the confusion matrix is telling you
# about the types of mistakes made by logistic regression.

# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5610652 which indicates a accuracy of 56.1%. The model is able to 
# correctly predict up 92.07% of the time, however the model can only correctly
# predict down 11.16 % of the time. This logistic regression has
# an error rate of 43.89%.

# Question 13 Part (d)
# Now fit the logistic regression model using a training data period
# from 1990 to 2008, with Lag2 as the only predictor. Compute the
# confusion matrix and the overall fraction of correct predictions
# for the held out data (that is, the data from 2009 and 2010).

train = (Weekly$Year < 2009)
weekly_2009 = Weekly[!train,]
weekly_fits = glm(Direction ~ Lag2, data=Weekly, family=binomial, subset=train)
weekly_prob = predict(weekly_fits, weekly_2009, type = "response")
weekly_pred = rep("Down", length(weekly_prob))
weekly_pred[weekly_prob > 0.5] = "Up"
Direction_2009 = Direction[!train]
table(weekly_pred, Direction_2009)

##            Direction_2009
## weekly_pred Down Up
##        Down    9  5
##        Up     34 56

# Question 13 Part (d)
# Calculate overall fraction of correct predictions(accuracy rate)

(56+9)/104

## [1] 0.625

# Question 13 Part (d)
# Calculate when indication is Up (precision) or (positive predictive value)

56 / (56+5)

## [1] 0.9180328

# Question 13 Part (d)
# Calculate when indication is down (Negative Predictive Value)

9 / (9+34)

## [1] 0.2093023

# Question 13 Part (d) Calculate error rate

(5+34)/104

## [1] 0.375

# Question 13 Part (d)
# Compute the confusion matrix and the overall fraction of correct predictions
# for the held out data (that is, the data from 2009 and 2010).

# From the confusion matrix results, the overall fraction of correct predictions
# is 0.625 which indicates a accuracy of 62.50%. The model is able to 
# correctly predict up 91.80% of the time, however the model can only correctly
# predict down 20.93 % of the time. This logistic regression has
# an error rate of 37.50%.

# Question 13 Part (e)
# Repeat (d) using LDA.

weekly_lda_fit = lda(Direction~ Lag2, data=Weekly, subset=train)
weekly_lda_pred = predict(weekly_lda_fit, weekly_2009)
table(weekly_lda_pred$class, Direction_2009)

##       Direction_2009
##        Down Up
##   Down    9  5
##   Up     34 56

# Question 13 Part (e)
# Calculate overall fraction of correct predictions(accuracy rate)

(56 + 9) / 104

## [1] 0.625

# Question 13 Part (e)
# Calculate when indication is Up (precision) or (positive predictive value)

56 / (56 + 5)

## [1] 0.9180328

# Question 13 Part (e)
# Calculate when indication is down (Negative Predictive Value)

9 / (9 + 34)

## [1] 0.2093023

# Question 13 Part (e) Calculate error rate

(5 + 34) / 104

## [1] 0.375

# Question 13 Part (e)

# From the confusion matrix results, the overall fraction of correct predictions
# is 0.625 which indicates a accuracy of 62.50%. The model is able to 
# correctly predict up 91.80% of the time, however the model can only correctly
# predict down 20.93 % of the time. This LDA model has
# an error rate of 37.50%.

# Question 13 Part (f)
# Repeat (d) using QDA.

weekly_qda_fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
weekly_qda_pred = predict(weekly_qda_fit, weekly_2009)$class
table(weekly_qda_pred, Direction_2009)

##                Direction_2009
## weekly_qda_pred Down Up
##            Down    0  0
##            Up     43 61

# Question 13 Part (f)
# Calculate overall fraction of correct predictions (accuracy rate)
(61 + 0) / 104

## [1] 0.5865385

# Calculate when indication is Up (precision) or (positive predictive value)
61 / (61 + 0)

## [1] 1

# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)

## [1] 0

# Calculate error rate
(0 + 43) / 104

## [1] 0.4134615

# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5865385 which indicates a accuracy of 58.65%. The model is able to 
# correctly predict up 100% of the time, however the model can only correctly
# predict down 0% of the time. This QDA model has
# an error rate of 41.35%.

# Question 13 (g)
# Repeat (d) using KNN with K = 1.

week_train = as.matrix(Lag2[train])
week_test = as.matrix(Lag2[!train])
train_Direction = Direction[train]
set.seed(1)
week_knn_pred = knn(week_train,week_test,train_Direction, k=1)
table(week_knn_pred ,Direction_2009)

##              Direction_2009
## week_knn_pred Down Up
##          Down   21 30
##          Up     22 31

# Question 13 (g)
# Calculate overall fraction of correct predictions (accuracy rate)
(31 + 21) / 104

## [1] 0.5

# Calculate when indication is Up (precision) or (positive predictive value)
31 / (31 + 30)

## [1] 0.5081967

# Calculate when indication is down (Negative Predictive Value)
21 / (21 + 22)

## [1] 0.4883721

# Calculate error rate
(30 + 22) / 104

## [1] 0.5

# Question 13 (g)
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5 which indicates a accuracy of 50%. The model is able to 
# correctly predict up 50.82% of the time, however the model can only correctly
# predict down 48.83% of the time. This KNN model has
# an error rate of 50%.

# Question 13 (h)
# Repeat (d) using naive Bayes.

weekly_nbay_fit = naiveBayes(Direction ~ Lag2 ,data = Weekly ,subset = train)
weekly_nbay_class = predict(weekly_nbay_fit ,weekly_2009)
table(weekly_nbay_class ,Direction_2009)

##                  Direction_2009
## weekly_nbay_class Down Up
##              Down    0  0
##              Up     43 61

# Question 13 (h)
# Calculate overall fraction of correct predictions (accuracy rate)
(61 + 0) / 104

## [1] 0.5865385

# Calculate when indication is Up (precision) or (positive predictive value)
61 / (61 + 0)

## [1] 1

# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)

## [1] 0

# Calculate error rate
(0 + 43) / 104

## [1] 0.4134615

# Question 13 (h)
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5865385 which indicates a accuracy of 58.65%. The model is able to 
# correctly predict up 100% of the time, however the model correctly
# predicts down 0% of the time. This Naive Bayes model has
# an error rate of 41.35%.

# Question 13 (i)
# Which of these methods appears to provide the best results on this data?

# Based on the performance metrics, Logistic Regression appears to provide the 
# best overall results on this data. It has the highest accuracy of 62.5%, which
# means it makes the fewest overall mistakes and it has a high precision for 
# "Up" at 91.8%, meaning when it predicts "Up," it is correct most of the time.
# It also has the lowest error rate of 35.5% compared to the other models
# Despite the negative predictive value for "Down" being a low 20.9% overall, 
# the method that appears to provide the pest result of this data is logistic 
# regression part(d).

# Question 13 (j)
# Experiment with different combinations of predictors, including
# possible transformations and interactions, for each of the
# methods. Report the variables, method, and associated confusion
# matrix that appears to provide the best results on the held
# out data. Note that you should also experiment with values for
# K in the KNN classifier.


# Logistic Regression with new predictor of Lag1 excluding lag2
train = (Weekly$Year < 2009)
weekly_2009 = Weekly[!train,]
weekly_fits = glm(Direction ~ Lag1, data=Weekly, family=binomial, subset=train)
weekly_prob = predict(weekly_fits, weekly_2009, type = "response")
weekly_pred = rep("Down", length(weekly_prob))
weekly_pred[weekly_prob > 0.5] = "Up"
Direction_2009 = Direction[!train]
table(weekly_pred, Direction_2009)

##            Direction_2009
## weekly_pred Down Up
##        Down    4  6
##        Up     39 55

# Calculate overall fraction of correct predictions (accuracy rate)
(55 + 4) / 104

## [1] 0.5673077

# Calculate when indication is Up (precision) or (positive predictive value)
55 / (55 + 6)

## [1] 0.9016393

# Calculate when indication is down (Negative Predictive Value)
4 / (4 + 39)

## [1] 0.09302326

# Calculate error rate
(6 + 39) / 104

## [1] 0.4326923

# Question 13 (j)
# LDA model with interaction between lag 1 and lag 2 as predictor 
weekly_lda_fitq13 = lda(Direction~ Lag1:Lag2, data = Weekly, family = binomial, 
                    subset=train)
weekly_lda_predq13 = predict(weekly_lda_fitq13, weekly_2009)
table(weekly_lda_predq13$class, Direction_2009)

##       Direction_2009
##        Down Up
##   Down    0  1
##   Up     43 60

# Calculate overall fraction of correct predictions (accuracy rate)
(60 + 0) / 104

## [1] 0.5769231

# Calculate when indication is Up (precision) or (positive predictive value)
60 / (60 + 1)

## [1] 0.9836066

# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)

## [1] 0

# Calculate error rate
(1 + 43) / 104

## [1] 0.4230769

# Question 13 (j)
# QDA Model with Lag1 squared transformation 

weekly_qda_fitq13 = qda(Direction ~ Lag1^2, data = Weekly, subset = train)
weekly_qda_predq13 = predict(weekly_qda_fitq13, weekly_2009)$class
table(weekly_qda_predq13, Direction_2009)

##                   Direction_2009
## weekly_qda_predq13 Down Up
##               Down    0  0
##               Up     43 61

# Calculate overall fraction of correct predictions (accuracy rate)
(61 + 0) / 104

## [1] 0.5865385

# Calculate when indication is Up (precision) or (positive predictive value)
61 / (61 + 0)

## [1] 1

# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)

## [1] 0

# Calculate error rate
(0 + 43) / 104

## [1] 0.4134615

# Question 13 (j)
# KNN Model with k=10, Lag2
set.seed(1)
week_knn_predq13 = knn(week_train,week_test,train_Direction, k=10)
table(week_knn_predq13,Direction_2009)

##                 Direction_2009
## week_knn_predq13 Down Up
##             Down   17 21
##             Up     26 40

# Calculate overall fraction of correct predictions (accuracy rate)
(40 + 17) / 104

## [1] 0.5480769

# Calculate when indication is Up (precision) or (positive predictive value)
40 / (40 + 21)

## [1] 0.6557377

# Calculate when indication is down (Negative Predictive Value)
17 / (17 + 26)

## [1] 0.3953488

# Calculate error rate
(21 + 26) / 104

## [1] 0.4519231

# Question 13 (j)
# Naive Bayes with new predictor of lag 1 with lag 2 still included

weekly_nbay_fitq13j = naiveBayes(Direction ~ Lag1 + Lag2 ,data = Weekly ,subset = train)
weekly_nbay_classq13j = predict(weekly_nbay_fitq13j ,weekly_2009)
table(weekly_nbay_classq13j ,Direction_2009)

##                      Direction_2009
## weekly_nbay_classq13j Down Up
##                  Down    3  8
##                  Up     40 53

# Calculate overall fraction of correct predictions (accuracy rate)
(53 + 3) / 104

## [1] 0.5384615

# Calculate when indication is Up (precision) or (positive predictive value)
53 / (53 + 8)

## [1] 0.8688525

# Calculate when indication is down (Negative Predictive Value)
3 / (3 + 40)

## [1] 0.06976744

# Calculate error rate
(8 + 40) / 104

## [1] 0.4615385

# Question 13 (j)
# Report the variables, method, and associated confusion
# matrix that appears to provide the best results on the held
# out data. Note that you should also experiment with values for
# K in the KNN classifier.

# Based on the results the matrix that had the best result on the held out
# data was the QDA model because with a accuracy of 59.6% it was the highest 
# out of all the other models. The other statistics that can be determined
# from this matrix is that it has a 100% precision but a 0% negative predictive
# value, and a error rate of 41.3% which is the lowest error rate out of the 
# model. The predictive variable associated with this model is Lag1^2 
# transformation.

# Use detach() function to ensure that possible variable names between
# data sets do not interfere with other datasets that have overlapping variable
# names. 

detach(Weekly)

# Q14. In this problem, you will develop a model to predict whether a given
# car gets high or low gas mileage based on the Auto data set.

attach(Auto)

# Question 14 (a)
# Create a binary variable, mpg01, that contains a 1 if mpg contains
# a value above its median, and a 0 if mpg contains a value below
# its median. You can compute the median using the median()
# function. Note you may find it helpful to use the data.frame()
# function to create a single data set containing both mpg01 and
# the other Auto variables.

mpg01 = ifelse(Auto$mpg > median(Auto$mpg),1,0)
auto_mpg01 = data.frame(Auto, mpg01)

# Check Variables
str(auto_mpg01)

## 'data.frame':    392 obs. of  10 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : int  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : int  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : int  3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : int  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
##  $ mpg01       : num  0 0 0 0 0 0 0 0 0 0 ...

# Question 14 (b)
# Explore the data graphically in order to investigate the association
# between mpg01 and the other features. Which of the other
# features seem most likely to be useful in predicting mpg01? Scatterplots
# and boxplots may be useful tools to answer this question.
# Describe your findings.

# Need to get rid of name variable 
pairs(auto_mpg01[,-9])

cor(auto_mpg01[, -9])

##                     mpg  cylinders displacement horsepower     weight
## mpg           1.0000000 -0.7776175   -0.8051269 -0.7784268 -0.8322442
## cylinders    -0.7776175  1.0000000    0.9508233  0.8429834  0.8975273
## displacement -0.8051269  0.9508233    1.0000000  0.8972570  0.9329944
## horsepower   -0.7784268  0.8429834    0.8972570  1.0000000  0.8645377
## weight       -0.8322442  0.8975273    0.9329944  0.8645377  1.0000000
## acceleration  0.4233285 -0.5046834   -0.5438005 -0.6891955 -0.4168392
## year          0.5805410 -0.3456474   -0.3698552 -0.4163615 -0.3091199
## origin        0.5652088 -0.5689316   -0.6145351 -0.4551715 -0.5850054
## mpg01         0.8369392 -0.7591939   -0.7534766 -0.6670526 -0.7577566
##              acceleration       year     origin      mpg01
## mpg             0.4233285  0.5805410  0.5652088  0.8369392
## cylinders      -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement   -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower     -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight         -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration    1.0000000  0.2903161  0.2127458  0.3468215
## year            0.2903161  1.0000000  0.1815277  0.4299042
## origin          0.2127458  0.1815277  1.0000000  0.5136984
## mpg01           0.3468215  0.4299042  0.5136984  1.0000000

# Correlation matrix plot with values 
numerical_variables_auto = auto_mpg01[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "year","mpg01")]

# Calculate the correlation matrix
cor_matrix_auto = cor(numerical_variables_auto, use = "complete.obs")  

# Plot the correlation matrix
corrplot(cor_matrix_auto, 
         method = "circle",          
         order = "hclust",           
         type = "upper",            
         tl.col = "black",           
         tl.srt = 45,                
         tl.cex = 0.8,               
         col = colorRampPalette(c("blue", "white", "red"))(200),  
         addCoef.col = "black",      
         diag = FALSE,               
         number.cex = 0.7)

par(mfrow=c(2,3))
boxplot(cylinders ~ mpg01, data = auto_mpg01, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = auto_mpg01, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = auto_mpg01, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = auto_mpg01, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = auto_mpg01, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = auto_mpg01, main = "Year vs mpg01")

# Question 14 (b)

# From the various graphs generated, the features that are useful in 
# predicting 'mpg01' are 'Weight', 'Cylinders', and 'Displacement'. From the
# scatter plot matrix, correlation values with matrix these three variables 
# had a higher absolute value association compared to all other variables. 
# The box plot generated further demonstrates that these three features
# are useful for predicting 'mpg01' because their IQR range within each feature,
# which is illustrated as the box in the box plot, when comparing between
# 'mpg01' value of either 0 or 1, are the furthest away from each other compared
# to other variables.

# Question 14 (c) Split the data into a training set and a test set.
# 80-20 split  

set.seed(5)

# Randomly sample 80% of the row indices for the training set
train_x = sample(1:nrow(auto_mpg01), 0.8 * nrow(auto_mpg01))

# Create the training set using the sampled rows in the dataset
train_data = auto_mpg01[train_x, ]

# Create the test set using the remaining rows basically those not in train_x
test_data = auto_mpg01[-train_x, ]

# Question 14 (d)
# Perform LDA on the training data in order to predict mpg01
# using the variables that seemed most associated with mpg01 in
# (b). What is the test error of the model obtained?

lda_auto_fit = lda(mpg01 ~ weight + cylinders + displacement, data=train_data)
lda_auto_pred = predict(lda_auto_fit, test_data)$class
table(lda_auto_pred, test_data$mpg01)

##              
## lda_auto_pred  0  1
##             0 26  5
##             1  4 44

# Calculate Test error
9 / 79

## [1] 0.1139241

# Question 14 (d). What is the test error.
# The test error is 11.4%

# Question 14 (e)
# Perform QDA on the training data in order to predict mpg01
# using the variables that seemed most associated with mpg01 in
# (b). What is the test error of the model obtained?

qda_auto_fit = qda(mpg01 ~ weight + cylinders + displacement, data=train_data)
qda_auto_pred = predict(qda_auto_fit, test_data)$class
table(qda_auto_pred, test_data$mpg01)

##              
## qda_auto_pred  0  1
##             0 27  5
##             1  3 44

# Calculate Test error
8/79

## [1] 0.1012658

# Question 14 (e). What is the test error.
# The test error is 10.1%

# Question 14(f). 
# Perform logistic regression on the training data in order to predict
# mpg01 using the variables that seemed most associated with
# mpg01 in (b). What is the test error of the model obtained?

autoglm_fit = glm(mpg01 ~ weight + cylinders + displacement, family=binomial,
                  data = train_data)
glm_auto_probs = predict(autoglm_fit,test_data,type = "response")
glm_auto_pred = rep(0,nrow(test_data))
glm_auto_pred[glm_auto_probs > 0.50]=1
table(glm_auto_pred, test_data$mpg01)

##              
## glm_auto_pred  0  1
##             0 26  5
##             1  4 44

# Question 14(f).
# Calculate overall fraction of correct predictions (accuracy rate)
(26 + 44) / 79

## [1] 0.8860759

# Calculate when indication is Up (precision) or (positive predictive value)
44 / (44 + 5)

## [1] 0.8979592

# Calculate when indication is down (Negative Predictive Value)
26 / (26 + 4)

## [1] 0.8666667

# Calculate Test Error (incorrect predictions)
(5 + 4) / 79

## [1] 0.1139241

# Question 14(f).
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.8860759 which indicates a accuracy of 88.61%. The model is able to 
# correctly predict up 89.80% of the time, and the model correctly
# predicts down 86.67% of the time. This logistic regression has
# an test error of 11.39%.

# Question 14(g).
# Perform naive Bayes on the training data in order to predict
# mpg01 using the variables that seemed most associated with mpg01
# in (b). What is the test error of the model obtained?
nbay_auto_fit = naiveBayes(mpg01~ weight + cylinders + displacement, 
                         data = train_data)
nbay_auto_class = predict(nbay_auto_fit, test_data)
table(nbay_auto_class, test_data$mpg01)

##                
## nbay_auto_class  0  1
##               0 26  5
##               1  4 44

# Calculate test error

(5 + 4) / 79

## [1] 0.1139241

# Question 14(g)
# Test error is 11.40%

# Question 14(g)
# Perform KNN on the training data, with several values of K, in
# order to predict mpg01. Use only the variables that seemed most
# associated with mpg01 in (b). What test errors do you obtain?
# Which value of K seems to perform the best on this data set?


# Note you are going to have to make sure you use only the selected variables
# that are of interest and exlude the rest thus you must start from scratch
# when creating the data set. 
# Select relevant variables for KNN
knn_data = train_data[, c("mpg01", "weight", "cylinders", "displacement")]

# Split the data into a training set and a testing set
set.seed(5)  # Set seed for reproducibility
train_index_knn = sample(1:nrow(knn_data), 0.8 * nrow(knn_data))  # 80-20 Split
train_set_knn = knn_data[train_index_knn, ]
test_set_knn = knn_data[-train_index_knn, ]

# Separate predictors and target variable
train_x_knn = train_set_knn[, -1]  #predictors(weight, cylinders, displacement)
train_y_knn = train_set_knn$mpg01  # target variable (mpg01)
test_x_knn = test_set_knn[, -1]  # predictors for testing
test_y_knn = test_set_knn$mpg01  # target variable for testing

# Perform KNN with K = 10
predictions_knn = knn(train = train_x_knn, test = test_x_knn, cl = train_y_knn, 
                       k = 10)

# confusion table 
table(predictions_knn, test_y_knn)

##                test_y_knn
## predictions_knn  0  1
##               0 25  1
##               1  5 32

# Test error for k = 10 

6/63

## [1] 0.0952381

# Question 14(g)
# Perform KNN with K = 20
predictions_knn = knn(train = train_x_knn, test = test_x_knn, cl = train_y_knn, 
                       k = 20)

# confusion table 
table(predictions_knn, test_y_knn)

##                test_y_knn
## predictions_knn  0  1
##               0 26  1
##               1  4 32

# Test error for k = 20 

5/63

## [1] 0.07936508

# Question 14(h)
# Perform KNN with K = 5
predictions_knn = knn(train = train_x_knn, test = test_x_knn, cl = train_y_knn, 
                       k = 5)

# confusion table 
table(predictions_knn, test_y_knn)

##                test_y_knn
## predictions_knn  0  1
##               0 25  1
##               1  5 32

# Test error for k = 5 

6/63

## [1] 0.0952381

# Question 14(h)
# What test errors do you obtain?
# Which value of K seems to perform the best on this data set?

# For k = 5 a test error of 9.52% was obtain.
# For k = 10 a test error of 9.52% was obtain.
# For k = 20 a test error of 7.94% was obtain.
# Based on these test errors the value of k that seems to 
# perform the best on this data set is k = 20.

# Detach from Auto data set 
detach(Auto)

# Question 16 
# Using the Boston data set, fit classification models in order to predict
# whether a given census tract has a crime rate above or below the median.
# Explore logistic regression, LDA, naive Bayes, and KNN models
# using various subsets of the predictors. Describe your findings.
# Hint: You will have to create the response variable yourself, using the
# variables that are contained in the Boston data set.

attach(Boston)
str(Boston)

## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

cor(Boston)

##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
##                  rm         age         dis          rad         tax    ptratio
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431  0.2899456
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018  0.3832476
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320  0.1889327
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783 -0.3555015
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559  0.2615150
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819  0.4647412
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000  0.4608530
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304  1.0000000
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341  0.3740443
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593 -0.5077867
##               black      lstat       medv
## crim    -0.38506394  0.4556215 -0.3883046
## zn       0.17552032 -0.4129946  0.3604453
## indus   -0.35697654  0.6037997 -0.4837252
## chas     0.04878848 -0.0539293  0.1752602
## nox     -0.38005064  0.5908789 -0.4273208
## rm       0.12806864 -0.6138083  0.6953599
## age     -0.27353398  0.6023385 -0.3769546
## dis      0.29151167 -0.4969958  0.2499287
## rad     -0.44441282  0.4886763 -0.3816262
## tax     -0.44180801  0.5439934 -0.4685359
## ptratio -0.17738330  0.3740443 -0.5077867
## black    1.00000000 -0.3660869  0.3334608
## lstat   -0.36608690  1.0000000 -0.7376627
## medv     0.33346082 -0.7376627  1.0000000

# Question 16 
# Explore logistic regression, LDA, naive Bayes, and KNN models
# using various subsets of the predictors. Describe your findings.
# Hint: You will have to create the response variable yourself, using the
# variables that are contained in the Boston data set.


set.seed(100)

# Created response variable crime_rate_above_median
crime_rate_higher_median = ifelse(Boston$crim > median(Boston$crim), 1, 0)

# Add created response variable to the Boston data set 
Boston$crime_rate_higher_median = crime_rate_higher_median

# Old method of splitting data was very time consuming try finding 
# alternative to save time, and have the code a bit more tidy. 
# Split data into training and testing sets 80-20
split = sample.split(Boston$crime_rate_higher_median, SplitRatio = 0.8)

# Create training and testing datasets
Boston_train = subset(Boston, split == TRUE)  
Boston_test = subset(Boston, split == FALSE)  

# Create crime_rate_higher_median vector for the test set
crime_rate_higher_median_test = Boston_test$crime_rate_higher_median  


# Create Logistic Regression Model
logistic_model_q16 = glm(crime_rate_higher_median ~ nox + rad + indus + tax,
                         data = Boston_train, family = "binomial")
summary(logistic_model_q16)

## 
## Call:
## glm(formula = crime_rate_higher_median ~ nox + rad + indus + 
##     tax, family = "binomial", data = Boston_train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -21.899771   3.104184  -7.055 1.73e-12 ***
## nox          39.364719   6.381663   6.168 6.90e-10 ***
## rad           0.664634   0.137648   4.828 1.38e-06 ***
## indus        -0.047205   0.045552  -1.036  0.30007    
## tax          -0.006837   0.002542  -2.689  0.00716 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 560.06  on 403  degrees of freedom
## Residual deviance: 191.26  on 399  degrees of freedom
## AIC: 201.26
## 
## Number of Fisher Scoring iterations: 8

# Predict probabilities on the test set
glm_probs = predict(logistic_model_q16, Boston_test, type = "response")

# Convert probabilities to binary predictions (threshold = 0.5)
glm_pred = rep(0, length(glm_probs))
glm_pred[glm_probs > 0.5] = 1

table(glm_pred, crime_rate_higher_median_test)

##         crime_rate_higher_median_test
## glm_pred  0  1
##        0 48 12
##        1  3 39

# Calculate Accuracy 
87/102

## [1] 0.8529412

# Calculate test error (misclassification rate)
15/(102)

## [1] 0.1470588

# Question 16 LDA Model 
lda_model_q16 = lda(crime_rate_higher_median ~ nox + rad + indus + tax,
                    data = Boston)
lda_Boston_pred = predict(lda_model_q16, Boston_test)
table(lda_Boston_pred$class, crime_rate_higher_median_test)

##    crime_rate_higher_median_test
##      0  1
##   0 51 16
##   1  0 35

# Calculate Accuracy 
86/102

## [1] 0.8431373

# Calculate test error (misclassification rate)
16/102

## [1] 0.1568627

# Create the Naive Bayes model
nb_model_q16 = naiveBayes(crime_rate_higher_median ~ nox + rad + indus + tax,
                          data = Boston_train)
nb_Boston_pred = predict(nb_model_q16, Boston_test)
table(nb_Boston_pred, crime_rate_higher_median_test)

##               crime_rate_higher_median_test
## nb_Boston_pred  0  1
##              0 50 17
##              1  1 34

# Calculate Accuracy 
84/102

## [1] 0.8235294

# Calculate test error (misclassification rate)
18/102

## [1] 0.1764706

# KNN model with k = 10 

# Prepare the training and testing datasets
# We need to use only the predictor variables (nox, rad, indus, tax) for KNN
train_X = Boston_train[, c("nox", "rad", "indus", "tax")]
test_X = Boston_test[, c("nox", "rad", "indus", "tax")]

# Response variable (crime_rate_higher_median) for both training and testing datasets
train_y = Boston_train$crime_rate_higher_median
test_y = crime_rate_higher_median_test

# Train the KNN model with k=10
knn_Boston_pred = knn(train = train_X, test = test_X, cl = train_y, k = 10)

# Create the confusion matrix (use predicted class from knn_Boston_pred)
table(knn_Boston_pred, test_y)

##                test_y
## knn_Boston_pred  0  1
##               0 45  3
##               1  6 48

# Calculate Accuracy 
92/102

## [1] 0.9019608

# Calculate test error (misclassification rate)
9/102

## [1] 0.08823529

# Question 16 
# Explore logistic regression, LDA, naive Bayes, and KNN models
# using various subsets of the predictors. Describe your findings.

# Based on the different models the logistic regression has an accuracy of 
# 85.3% with an error rate of 14.7%. The LDA model had an accuracy of 
# 84.3% with an error rate of 15.7%. The Naive Bayes model had an accuracy
# of 82.4% and an error rate of 17.6%. The KNN model with k= 10 had an accuracy
# of 90.2% and an error rate of 8.8%  So using the subset predictors of 'nox'
# 'rad', 'indus', and 'tax', the model that is the best is the KNN model due
# to its higher accuracy and lowest error rate out of all the models.

Assignment3

Daniel O.

2025-03-03

R Markdown