This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(caTools) # New package to note and explore
## Warning: package 'caTools' was built under R version 4.3.3
attach(Weekly)
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
str(Weekly)
## 'data.frame': 1089 obs. of 9 variables:
## $ Year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ Lag1 : num 0.816 -0.27 -2.576 3.514 0.712 ...
## $ Lag2 : num 1.572 0.816 -0.27 -2.576 3.514 ...
## $ Lag3 : num -3.936 1.572 0.816 -0.27 -2.576 ...
## $ Lag4 : num -0.229 -3.936 1.572 0.816 -0.27 ...
## $ Lag5 : num -3.484 -0.229 -3.936 1.572 0.816 ...
## $ Volume : num 0.155 0.149 0.16 0.162 0.154 ...
## $ Today : num -0.27 -2.576 3.514 0.712 1.178 ...
## $ Direction: Factor w/ 2 levels "Down","Up": 1 1 2 2 2 1 2 2 2 1 ...
# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?
pairs(Weekly)
# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?
# Subset the data to exclude direction other wise it won't work cause it is a factor variable
cor(Weekly[, -9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?
numerical_variables = Weekly[, c("Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume", "Today","Year")]
cor_matrix = cor(numerical_variables, use = "complete.obs") # The use complete
# handles missing variables
corrplot(cor_matrix,
method = "circle",
order = "hclust",
type = "upper",
tl.col = "black",
tl.srt = 45,
tl.cex = 0.8,
col = colorRampPalette(c("blue", "white", "red"))(200),
addCoef.col = "black",
diag = FALSE,
number.cex = 0.7)
# Question 13 Part (a). Produce some numerical and graphical summaries of the
# Weekly data. Do there appear to be any patterns?
# Based on the numerical summaries and graphs produced above, the correlation
# between 'Volume' and 'Year' are the only patterns that are present.
# Question 13 Part (b)
# Use the full data set to perform a logistic regression with
# Direction as the response and the five lag variables plus Volume
# as predictors. Use the summary function to print the results. Do
# any of the predictors appear to be statistically significant? If so,
# which ones?
logit_weekly_model = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly, family = binomial)
summary(logit_weekly_model)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
# Question 13 Part (b)
# Do any of the predictors appear to be statistically significant? If so,
# which ones?
# From the results of the logistic regression Lag 2 is the only predictor
# that is statically significant as it has a p value that is less than 0.05.
# Question 13 Part (c)
# Compute the confusion matrix and overall fraction of correct
# predictions. Explain what the confusion matrix is telling you
# about the types of mistakes made by logistic regression.
weekly_prob = predict(logit_weekly_model, type='response')
weekly_pred = rep("Down",1089)
weekly_pred[weekly_prob>0.5] = "Up"
table(weekly_pred, Direction)
## Direction
## weekly_pred Down Up
## Down 54 48
## Up 430 557
# Calculate overall fraction of correct predictions(accuracy rate)
(557+54)/1089
## [1] 0.5610652
# Calculate when indication is Up. This is known as
# precision or positive predictive value
557/(557+48)
## [1] 0.9206612
# Calculate when indication is down. This is for Negative Predictive Value
54/(54+430)
## [1] 0.1115702
# Calculate error rate
(48+430)/(1089)
## [1] 0.4389348
# Question 13 Part (c)
# Compute the confusion matrix and overall fraction of correct
# predictions. Explain what the confusion matrix is telling you
# about the types of mistakes made by logistic regression.
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5610652 which indicates a accuracy of 56.1%. The model is able to
# correctly predict up 92.07% of the time, however the model can only correctly
# predict down 11.16 % of the time. This logistic regression has
# an error rate of 43.89%.
# Question 13 Part (d)
# Now fit the logistic regression model using a training data period
# from 1990 to 2008, with Lag2 as the only predictor. Compute the
# confusion matrix and the overall fraction of correct predictions
# for the held out data (that is, the data from 2009 and 2010).
train = (Weekly$Year < 2009)
weekly_2009 = Weekly[!train,]
weekly_fits = glm(Direction ~ Lag2, data=Weekly, family=binomial, subset=train)
weekly_prob = predict(weekly_fits, weekly_2009, type = "response")
weekly_pred = rep("Down", length(weekly_prob))
weekly_pred[weekly_prob > 0.5] = "Up"
Direction_2009 = Direction[!train]
table(weekly_pred, Direction_2009)
## Direction_2009
## weekly_pred Down Up
## Down 9 5
## Up 34 56
# Question 13 Part (d)
# Calculate overall fraction of correct predictions(accuracy rate)
(56+9)/104
## [1] 0.625
# Question 13 Part (d)
# Calculate when indication is Up (precision) or (positive predictive value)
56 / (56+5)
## [1] 0.9180328
# Question 13 Part (d)
# Calculate when indication is down (Negative Predictive Value)
9 / (9+34)
## [1] 0.2093023
# Question 13 Part (d) Calculate error rate
(5+34)/104
## [1] 0.375
# Question 13 Part (d)
# Compute the confusion matrix and the overall fraction of correct predictions
# for the held out data (that is, the data from 2009 and 2010).
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.625 which indicates a accuracy of 62.50%. The model is able to
# correctly predict up 91.80% of the time, however the model can only correctly
# predict down 20.93 % of the time. This logistic regression has
# an error rate of 37.50%.
# Question 13 Part (e)
# Repeat (d) using LDA.
weekly_lda_fit = lda(Direction~ Lag2, data=Weekly, subset=train)
weekly_lda_pred = predict(weekly_lda_fit, weekly_2009)
table(weekly_lda_pred$class, Direction_2009)
## Direction_2009
## Down Up
## Down 9 5
## Up 34 56
# Question 13 Part (e)
# Calculate overall fraction of correct predictions(accuracy rate)
(56 + 9) / 104
## [1] 0.625
# Question 13 Part (e)
# Calculate when indication is Up (precision) or (positive predictive value)
56 / (56 + 5)
## [1] 0.9180328
# Question 13 Part (e)
# Calculate when indication is down (Negative Predictive Value)
9 / (9 + 34)
## [1] 0.2093023
# Question 13 Part (e) Calculate error rate
(5 + 34) / 104
## [1] 0.375
# Question 13 Part (e)
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.625 which indicates a accuracy of 62.50%. The model is able to
# correctly predict up 91.80% of the time, however the model can only correctly
# predict down 20.93 % of the time. This LDA model has
# an error rate of 37.50%.
# Question 13 Part (f)
# Repeat (d) using QDA.
weekly_qda_fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
weekly_qda_pred = predict(weekly_qda_fit, weekly_2009)$class
table(weekly_qda_pred, Direction_2009)
## Direction_2009
## weekly_qda_pred Down Up
## Down 0 0
## Up 43 61
# Question 13 Part (f)
# Calculate overall fraction of correct predictions (accuracy rate)
(61 + 0) / 104
## [1] 0.5865385
# Calculate when indication is Up (precision) or (positive predictive value)
61 / (61 + 0)
## [1] 1
# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)
## [1] 0
# Calculate error rate
(0 + 43) / 104
## [1] 0.4134615
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5865385 which indicates a accuracy of 58.65%. The model is able to
# correctly predict up 100% of the time, however the model can only correctly
# predict down 0% of the time. This QDA model has
# an error rate of 41.35%.
# Question 13 (g)
# Repeat (d) using KNN with K = 1.
week_train = as.matrix(Lag2[train])
week_test = as.matrix(Lag2[!train])
train_Direction = Direction[train]
set.seed(1)
week_knn_pred = knn(week_train,week_test,train_Direction, k=1)
table(week_knn_pred ,Direction_2009)
## Direction_2009
## week_knn_pred Down Up
## Down 21 30
## Up 22 31
# Question 13 (g)
# Calculate overall fraction of correct predictions (accuracy rate)
(31 + 21) / 104
## [1] 0.5
# Calculate when indication is Up (precision) or (positive predictive value)
31 / (31 + 30)
## [1] 0.5081967
# Calculate when indication is down (Negative Predictive Value)
21 / (21 + 22)
## [1] 0.4883721
# Calculate error rate
(30 + 22) / 104
## [1] 0.5
# Question 13 (g)
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5 which indicates a accuracy of 50%. The model is able to
# correctly predict up 50.82% of the time, however the model can only correctly
# predict down 48.83% of the time. This KNN model has
# an error rate of 50%.
# Question 13 (h)
# Repeat (d) using naive Bayes.
weekly_nbay_fit = naiveBayes(Direction ~ Lag2 ,data = Weekly ,subset = train)
weekly_nbay_class = predict(weekly_nbay_fit ,weekly_2009)
table(weekly_nbay_class ,Direction_2009)
## Direction_2009
## weekly_nbay_class Down Up
## Down 0 0
## Up 43 61
# Question 13 (h)
# Calculate overall fraction of correct predictions (accuracy rate)
(61 + 0) / 104
## [1] 0.5865385
# Calculate when indication is Up (precision) or (positive predictive value)
61 / (61 + 0)
## [1] 1
# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)
## [1] 0
# Calculate error rate
(0 + 43) / 104
## [1] 0.4134615
# Question 13 (h)
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.5865385 which indicates a accuracy of 58.65%. The model is able to
# correctly predict up 100% of the time, however the model correctly
# predicts down 0% of the time. This Naive Bayes model has
# an error rate of 41.35%.
# Question 13 (i)
# Which of these methods appears to provide the best results on this data?
# Based on the performance metrics, Logistic Regression appears to provide the
# best overall results on this data. It has the highest accuracy of 62.5%, which
# means it makes the fewest overall mistakes and it has a high precision for
# "Up" at 91.8%, meaning when it predicts "Up," it is correct most of the time.
# It also has the lowest error rate of 35.5% compared to the other models
# Despite the negative predictive value for "Down" being a low 20.9% overall,
# the method that appears to provide the pest result of this data is logistic
# regression part(d).
# Question 13 (j)
# Experiment with different combinations of predictors, including
# possible transformations and interactions, for each of the
# methods. Report the variables, method, and associated confusion
# matrix that appears to provide the best results on the held
# out data. Note that you should also experiment with values for
# K in the KNN classifier.
# Logistic Regression with new predictor of Lag1 excluding lag2
train = (Weekly$Year < 2009)
weekly_2009 = Weekly[!train,]
weekly_fits = glm(Direction ~ Lag1, data=Weekly, family=binomial, subset=train)
weekly_prob = predict(weekly_fits, weekly_2009, type = "response")
weekly_pred = rep("Down", length(weekly_prob))
weekly_pred[weekly_prob > 0.5] = "Up"
Direction_2009 = Direction[!train]
table(weekly_pred, Direction_2009)
## Direction_2009
## weekly_pred Down Up
## Down 4 6
## Up 39 55
# Calculate overall fraction of correct predictions (accuracy rate)
(55 + 4) / 104
## [1] 0.5673077
# Calculate when indication is Up (precision) or (positive predictive value)
55 / (55 + 6)
## [1] 0.9016393
# Calculate when indication is down (Negative Predictive Value)
4 / (4 + 39)
## [1] 0.09302326
# Calculate error rate
(6 + 39) / 104
## [1] 0.4326923
# Question 13 (j)
# LDA model with interaction between lag 1 and lag 2 as predictor
weekly_lda_fitq13 = lda(Direction~ Lag1:Lag2, data = Weekly, family = binomial,
subset=train)
weekly_lda_predq13 = predict(weekly_lda_fitq13, weekly_2009)
table(weekly_lda_predq13$class, Direction_2009)
## Direction_2009
## Down Up
## Down 0 1
## Up 43 60
# Calculate overall fraction of correct predictions (accuracy rate)
(60 + 0) / 104
## [1] 0.5769231
# Calculate when indication is Up (precision) or (positive predictive value)
60 / (60 + 1)
## [1] 0.9836066
# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)
## [1] 0
# Calculate error rate
(1 + 43) / 104
## [1] 0.4230769
# Question 13 (j)
# QDA Model with Lag1 squared transformation
weekly_qda_fitq13 = qda(Direction ~ Lag1^2, data = Weekly, subset = train)
weekly_qda_predq13 = predict(weekly_qda_fitq13, weekly_2009)$class
table(weekly_qda_predq13, Direction_2009)
## Direction_2009
## weekly_qda_predq13 Down Up
## Down 0 0
## Up 43 61
# Calculate overall fraction of correct predictions (accuracy rate)
(61 + 0) / 104
## [1] 0.5865385
# Calculate when indication is Up (precision) or (positive predictive value)
61 / (61 + 0)
## [1] 1
# Calculate when indication is down (Negative Predictive Value)
0 / (0 + 43)
## [1] 0
# Calculate error rate
(0 + 43) / 104
## [1] 0.4134615
# Question 13 (j)
# KNN Model with k=10, Lag2
set.seed(1)
week_knn_predq13 = knn(week_train,week_test,train_Direction, k=10)
table(week_knn_predq13,Direction_2009)
## Direction_2009
## week_knn_predq13 Down Up
## Down 17 21
## Up 26 40
# Calculate overall fraction of correct predictions (accuracy rate)
(40 + 17) / 104
## [1] 0.5480769
# Calculate when indication is Up (precision) or (positive predictive value)
40 / (40 + 21)
## [1] 0.6557377
# Calculate when indication is down (Negative Predictive Value)
17 / (17 + 26)
## [1] 0.3953488
# Calculate error rate
(21 + 26) / 104
## [1] 0.4519231
# Question 13 (j)
# Naive Bayes with new predictor of lag 1 with lag 2 still included
weekly_nbay_fitq13j = naiveBayes(Direction ~ Lag1 + Lag2 ,data = Weekly ,subset = train)
weekly_nbay_classq13j = predict(weekly_nbay_fitq13j ,weekly_2009)
table(weekly_nbay_classq13j ,Direction_2009)
## Direction_2009
## weekly_nbay_classq13j Down Up
## Down 3 8
## Up 40 53
# Calculate overall fraction of correct predictions (accuracy rate)
(53 + 3) / 104
## [1] 0.5384615
# Calculate when indication is Up (precision) or (positive predictive value)
53 / (53 + 8)
## [1] 0.8688525
# Calculate when indication is down (Negative Predictive Value)
3 / (3 + 40)
## [1] 0.06976744
# Calculate error rate
(8 + 40) / 104
## [1] 0.4615385
# Question 13 (j)
# Report the variables, method, and associated confusion
# matrix that appears to provide the best results on the held
# out data. Note that you should also experiment with values for
# K in the KNN classifier.
# Based on the results the matrix that had the best result on the held out
# data was the QDA model because with a accuracy of 59.6% it was the highest
# out of all the other models. The other statistics that can be determined
# from this matrix is that it has a 100% precision but a 0% negative predictive
# value, and a error rate of 41.3% which is the lowest error rate out of the
# model. The predictive variable associated with this model is Lag1^2
# transformation.
# Use detach() function to ensure that possible variable names between
# data sets do not interfere with other datasets that have overlapping variable
# names.
detach(Weekly)
# Q14. In this problem, you will develop a model to predict whether a given
# car gets high or low gas mileage based on the Auto data set.
attach(Auto)
# Question 14 (a)
# Create a binary variable, mpg01, that contains a 1 if mpg contains
# a value above its median, and a 0 if mpg contains a value below
# its median. You can compute the median using the median()
# function. Note you may find it helpful to use the data.frame()
# function to create a single data set containing both mpg01 and
# the other Auto variables.
mpg01 = ifelse(Auto$mpg > median(Auto$mpg),1,0)
auto_mpg01 = data.frame(Auto, mpg01)
# Check Variables
str(auto_mpg01)
## 'data.frame': 392 obs. of 10 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
## $ mpg01 : num 0 0 0 0 0 0 0 0 0 0 ...
# Question 14 (b)
# Explore the data graphically in order to investigate the association
# between mpg01 and the other features. Which of the other
# features seem most likely to be useful in predicting mpg01? Scatterplots
# and boxplots may be useful tools to answer this question.
# Describe your findings.
# Need to get rid of name variable
pairs(auto_mpg01[,-9])
cor(auto_mpg01[, -9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
# Correlation matrix plot with values
numerical_variables_auto = auto_mpg01[, c("mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "year","mpg01")]
# Calculate the correlation matrix
cor_matrix_auto = cor(numerical_variables_auto, use = "complete.obs")
# Plot the correlation matrix
corrplot(cor_matrix_auto,
method = "circle",
order = "hclust",
type = "upper",
tl.col = "black",
tl.srt = 45,
tl.cex = 0.8,
col = colorRampPalette(c("blue", "white", "red"))(200),
addCoef.col = "black",
diag = FALSE,
number.cex = 0.7)
par(mfrow=c(2,3))
boxplot(cylinders ~ mpg01, data = auto_mpg01, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = auto_mpg01, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = auto_mpg01, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = auto_mpg01, main = "Weight vs mpg01")
boxplot(acceleration ~ mpg01, data = auto_mpg01, main = "Acceleration vs mpg01")
boxplot(year ~ mpg01, data = auto_mpg01, main = "Year vs mpg01")
# Question 14 (b)
# From the various graphs generated, the features that are useful in
# predicting 'mpg01' are 'Weight', 'Cylinders', and 'Displacement'. From the
# scatter plot matrix, correlation values with matrix these three variables
# had a higher absolute value association compared to all other variables.
# The box plot generated further demonstrates that these three features
# are useful for predicting 'mpg01' because their IQR range within each feature,
# which is illustrated as the box in the box plot, when comparing between
# 'mpg01' value of either 0 or 1, are the furthest away from each other compared
# to other variables.
# Question 14 (c) Split the data into a training set and a test set.
# 80-20 split
set.seed(5)
# Randomly sample 80% of the row indices for the training set
train_x = sample(1:nrow(auto_mpg01), 0.8 * nrow(auto_mpg01))
# Create the training set using the sampled rows in the dataset
train_data = auto_mpg01[train_x, ]
# Create the test set using the remaining rows basically those not in train_x
test_data = auto_mpg01[-train_x, ]
# Question 14 (d)
# Perform LDA on the training data in order to predict mpg01
# using the variables that seemed most associated with mpg01 in
# (b). What is the test error of the model obtained?
lda_auto_fit = lda(mpg01 ~ weight + cylinders + displacement, data=train_data)
lda_auto_pred = predict(lda_auto_fit, test_data)$class
table(lda_auto_pred, test_data$mpg01)
##
## lda_auto_pred 0 1
## 0 26 5
## 1 4 44
# Calculate Test error
9 / 79
## [1] 0.1139241
# Question 14 (d). What is the test error.
# The test error is 11.4%
# Question 14 (e)
# Perform QDA on the training data in order to predict mpg01
# using the variables that seemed most associated with mpg01 in
# (b). What is the test error of the model obtained?
qda_auto_fit = qda(mpg01 ~ weight + cylinders + displacement, data=train_data)
qda_auto_pred = predict(qda_auto_fit, test_data)$class
table(qda_auto_pred, test_data$mpg01)
##
## qda_auto_pred 0 1
## 0 27 5
## 1 3 44
# Calculate Test error
8/79
## [1] 0.1012658
# Question 14 (e). What is the test error.
# The test error is 10.1%
# Question 14(f).
# Perform logistic regression on the training data in order to predict
# mpg01 using the variables that seemed most associated with
# mpg01 in (b). What is the test error of the model obtained?
autoglm_fit = glm(mpg01 ~ weight + cylinders + displacement, family=binomial,
data = train_data)
glm_auto_probs = predict(autoglm_fit,test_data,type = "response")
glm_auto_pred = rep(0,nrow(test_data))
glm_auto_pred[glm_auto_probs > 0.50]=1
table(glm_auto_pred, test_data$mpg01)
##
## glm_auto_pred 0 1
## 0 26 5
## 1 4 44
# Question 14(f).
# Calculate overall fraction of correct predictions (accuracy rate)
(26 + 44) / 79
## [1] 0.8860759
# Calculate when indication is Up (precision) or (positive predictive value)
44 / (44 + 5)
## [1] 0.8979592
# Calculate when indication is down (Negative Predictive Value)
26 / (26 + 4)
## [1] 0.8666667
# Calculate Test Error (incorrect predictions)
(5 + 4) / 79
## [1] 0.1139241
# Question 14(f).
# From the confusion matrix results, the overall fraction of correct predictions
# is 0.8860759 which indicates a accuracy of 88.61%. The model is able to
# correctly predict up 89.80% of the time, and the model correctly
# predicts down 86.67% of the time. This logistic regression has
# an test error of 11.39%.
# Question 14(g).
# Perform naive Bayes on the training data in order to predict
# mpg01 using the variables that seemed most associated with mpg01
# in (b). What is the test error of the model obtained?
nbay_auto_fit = naiveBayes(mpg01~ weight + cylinders + displacement,
data = train_data)
nbay_auto_class = predict(nbay_auto_fit, test_data)
table(nbay_auto_class, test_data$mpg01)
##
## nbay_auto_class 0 1
## 0 26 5
## 1 4 44
# Calculate test error
(5 + 4) / 79
## [1] 0.1139241
# Question 14(g)
# Test error is 11.40%
# Question 14(g)
# Perform KNN on the training data, with several values of K, in
# order to predict mpg01. Use only the variables that seemed most
# associated with mpg01 in (b). What test errors do you obtain?
# Which value of K seems to perform the best on this data set?
# Note you are going to have to make sure you use only the selected variables
# that are of interest and exlude the rest thus you must start from scratch
# when creating the data set.
# Select relevant variables for KNN
knn_data = train_data[, c("mpg01", "weight", "cylinders", "displacement")]
# Split the data into a training set and a testing set
set.seed(5) # Set seed for reproducibility
train_index_knn = sample(1:nrow(knn_data), 0.8 * nrow(knn_data)) # 80-20 Split
train_set_knn = knn_data[train_index_knn, ]
test_set_knn = knn_data[-train_index_knn, ]
# Separate predictors and target variable
train_x_knn = train_set_knn[, -1] #predictors(weight, cylinders, displacement)
train_y_knn = train_set_knn$mpg01 # target variable (mpg01)
test_x_knn = test_set_knn[, -1] # predictors for testing
test_y_knn = test_set_knn$mpg01 # target variable for testing
# Perform KNN with K = 10
predictions_knn = knn(train = train_x_knn, test = test_x_knn, cl = train_y_knn,
k = 10)
# confusion table
table(predictions_knn, test_y_knn)
## test_y_knn
## predictions_knn 0 1
## 0 25 1
## 1 5 32
# Test error for k = 10
6/63
## [1] 0.0952381
# Question 14(g)
# Perform KNN with K = 20
predictions_knn = knn(train = train_x_knn, test = test_x_knn, cl = train_y_knn,
k = 20)
# confusion table
table(predictions_knn, test_y_knn)
## test_y_knn
## predictions_knn 0 1
## 0 26 1
## 1 4 32
# Test error for k = 20
5/63
## [1] 0.07936508
# Question 14(h)
# Perform KNN with K = 5
predictions_knn = knn(train = train_x_knn, test = test_x_knn, cl = train_y_knn,
k = 5)
# confusion table
table(predictions_knn, test_y_knn)
## test_y_knn
## predictions_knn 0 1
## 0 25 1
## 1 5 32
# Test error for k = 5
6/63
## [1] 0.0952381
# Question 14(h)
# What test errors do you obtain?
# Which value of K seems to perform the best on this data set?
# For k = 5 a test error of 9.52% was obtain.
# For k = 10 a test error of 9.52% was obtain.
# For k = 20 a test error of 7.94% was obtain.
# Based on these test errors the value of k that seems to
# perform the best on this data set is k = 20.
# Detach from Auto data set
detach(Auto)
# Question 16
# Using the Boston data set, fit classification models in order to predict
# whether a given census tract has a crime rate above or below the median.
# Explore logistic regression, LDA, naive Bayes, and KNN models
# using various subsets of the predictors. Describe your findings.
# Hint: You will have to create the response variable yourself, using the
# variables that are contained in the Boston data set.
attach(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
cor(Boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## black lstat medv
## crim -0.38506394 0.4556215 -0.3883046
## zn 0.17552032 -0.4129946 0.3604453
## indus -0.35697654 0.6037997 -0.4837252
## chas 0.04878848 -0.0539293 0.1752602
## nox -0.38005064 0.5908789 -0.4273208
## rm 0.12806864 -0.6138083 0.6953599
## age -0.27353398 0.6023385 -0.3769546
## dis 0.29151167 -0.4969958 0.2499287
## rad -0.44441282 0.4886763 -0.3816262
## tax -0.44180801 0.5439934 -0.4685359
## ptratio -0.17738330 0.3740443 -0.5077867
## black 1.00000000 -0.3660869 0.3334608
## lstat -0.36608690 1.0000000 -0.7376627
## medv 0.33346082 -0.7376627 1.0000000
# Question 16
# Explore logistic regression, LDA, naive Bayes, and KNN models
# using various subsets of the predictors. Describe your findings.
# Hint: You will have to create the response variable yourself, using the
# variables that are contained in the Boston data set.
set.seed(100)
# Created response variable crime_rate_above_median
crime_rate_higher_median = ifelse(Boston$crim > median(Boston$crim), 1, 0)
# Add created response variable to the Boston data set
Boston$crime_rate_higher_median = crime_rate_higher_median
# Old method of splitting data was very time consuming try finding
# alternative to save time, and have the code a bit more tidy.
# Split data into training and testing sets 80-20
split = sample.split(Boston$crime_rate_higher_median, SplitRatio = 0.8)
# Create training and testing datasets
Boston_train = subset(Boston, split == TRUE)
Boston_test = subset(Boston, split == FALSE)
# Create crime_rate_higher_median vector for the test set
crime_rate_higher_median_test = Boston_test$crime_rate_higher_median
# Create Logistic Regression Model
logistic_model_q16 = glm(crime_rate_higher_median ~ nox + rad + indus + tax,
data = Boston_train, family = "binomial")
summary(logistic_model_q16)
##
## Call:
## glm(formula = crime_rate_higher_median ~ nox + rad + indus +
## tax, family = "binomial", data = Boston_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.899771 3.104184 -7.055 1.73e-12 ***
## nox 39.364719 6.381663 6.168 6.90e-10 ***
## rad 0.664634 0.137648 4.828 1.38e-06 ***
## indus -0.047205 0.045552 -1.036 0.30007
## tax -0.006837 0.002542 -2.689 0.00716 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 560.06 on 403 degrees of freedom
## Residual deviance: 191.26 on 399 degrees of freedom
## AIC: 201.26
##
## Number of Fisher Scoring iterations: 8
# Predict probabilities on the test set
glm_probs = predict(logistic_model_q16, Boston_test, type = "response")
# Convert probabilities to binary predictions (threshold = 0.5)
glm_pred = rep(0, length(glm_probs))
glm_pred[glm_probs > 0.5] = 1
table(glm_pred, crime_rate_higher_median_test)
## crime_rate_higher_median_test
## glm_pred 0 1
## 0 48 12
## 1 3 39
# Calculate Accuracy
87/102
## [1] 0.8529412
# Calculate test error (misclassification rate)
15/(102)
## [1] 0.1470588
# Question 16 LDA Model
lda_model_q16 = lda(crime_rate_higher_median ~ nox + rad + indus + tax,
data = Boston)
lda_Boston_pred = predict(lda_model_q16, Boston_test)
table(lda_Boston_pred$class, crime_rate_higher_median_test)
## crime_rate_higher_median_test
## 0 1
## 0 51 16
## 1 0 35
# Calculate Accuracy
86/102
## [1] 0.8431373
# Calculate test error (misclassification rate)
16/102
## [1] 0.1568627
# Create the Naive Bayes model
nb_model_q16 = naiveBayes(crime_rate_higher_median ~ nox + rad + indus + tax,
data = Boston_train)
nb_Boston_pred = predict(nb_model_q16, Boston_test)
table(nb_Boston_pred, crime_rate_higher_median_test)
## crime_rate_higher_median_test
## nb_Boston_pred 0 1
## 0 50 17
## 1 1 34
# Calculate Accuracy
84/102
## [1] 0.8235294
# Calculate test error (misclassification rate)
18/102
## [1] 0.1764706
# KNN model with k = 10
# Prepare the training and testing datasets
# We need to use only the predictor variables (nox, rad, indus, tax) for KNN
train_X = Boston_train[, c("nox", "rad", "indus", "tax")]
test_X = Boston_test[, c("nox", "rad", "indus", "tax")]
# Response variable (crime_rate_higher_median) for both training and testing datasets
train_y = Boston_train$crime_rate_higher_median
test_y = crime_rate_higher_median_test
# Train the KNN model with k=10
knn_Boston_pred = knn(train = train_X, test = test_X, cl = train_y, k = 10)
# Create the confusion matrix (use predicted class from knn_Boston_pred)
table(knn_Boston_pred, test_y)
## test_y
## knn_Boston_pred 0 1
## 0 45 3
## 1 6 48
# Calculate Accuracy
92/102
## [1] 0.9019608
# Calculate test error (misclassification rate)
9/102
## [1] 0.08823529
# Question 16
# Explore logistic regression, LDA, naive Bayes, and KNN models
# using various subsets of the predictors. Describe your findings.
# Based on the different models the logistic regression has an accuracy of
# 85.3% with an error rate of 14.7%. The LDA model had an accuracy of
# 84.3% with an error rate of 15.7%. The Naive Bayes model had an accuracy
# of 82.4% and an error rate of 17.6%. The KNN model with k= 10 had an accuracy
# of 90.2% and an error rate of 8.8% So using the subset predictors of 'nox'
# 'rad', 'indus', and 'tax', the model that is the best is the KNN model due
# to its higher accuracy and lowest error rate out of all the models.