suppressMessages(library(ISLR))
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
weekly = Weekly
names(weekly) = tolower(names(weekly))
summary(weekly)
## year lag1 lag2 lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## lag4 lag5 volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## today direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
cor(weekly[,-9])
## year lag1 lag2 lag3 lag4
## year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## lag5 volume today
## year -0.030519101 0.84194162 -0.032459894
## lag1 -0.008183096 -0.06495131 -0.075031842
## lag2 -0.072499482 -0.08551314 0.059166717
## lag3 0.060657175 -0.06928771 -0.071243639
## lag4 -0.075675027 -0.06107462 -0.007825873
## lag5 1.000000000 -0.05851741 0.011012698
## volume -0.058517414 1.00000000 -0.033077783
## today 0.011012698 -0.03307778 1.000000000
pairs(weekly[,-9])
(Hint: Use function glm() and pass in the argument family=binomial to run logistic regression. Use Pr(>|z|) to find out which variables are significant.)
glm.fit = glm(direction ~ lag1+lag2+lag3+lag4+lag5+volume, data=weekly, family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = direction ~ lag1 + lag2 + lag3 + lag4 + lag5 +
## volume, family = binomial, data = weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## lag1 -0.04127 0.02641 -1.563 0.1181
## lag2 0.05844 0.02686 2.175 0.0296 *
## lag3 -0.01606 0.02666 -0.602 0.5469
## lag4 -0.02779 0.02646 -1.050 0.2937
## lag5 -0.01447 0.02638 -0.549 0.5833
## volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
based on p-values of individual predictor variables in summary output of the logistic regression model, lag2 is the only significant predictor of the “direction” as it has p-value less than significance level of 0.05.
(Hint: Use function predict() to make the predictions on the same data set you use to train a logistic regression model. Remember to pass in the argument type=response to obtain prediction probabilities. Let a predicted probability > 0.5 be “Up” and <=0.5 be “Down”. Then use function table() to obtain the confusion matrix. Calculate the overall fraction of correct predictions, sensitivity and specificity according to confusion matrix.)
glm.probs = predict(glm.fit, weekly, type="response")
contrasts(weekly$direction)
## Up
## Down 0
## Up 1
glm.pred = ifelse(glm.probs > 0.5, "Up","Down")
glm.confusion = table(glm.pred, weekly$direction)
glm.confusion
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
# Regard value "Down" as the positive class.
glm.accuracy.percentage = round(mean(glm.pred == weekly$direction)*100,2)
glm.sensitivity.percentage = round(length(which(glm.pred == "Down" & weekly$direction == "Down"))/length(which(weekly$direction == "Down"))*100,2)
glm.specificity.percentage = round(length(which(glm.pred == "Up" & weekly$direction == "Up"))/length(which(weekly$direction == "Up"))*100,2)
glm.evaluation = data.frame(Accuracy = glm.accuracy.percentage, Sensitivity = glm.sensitivity.percentage, Specificity = glm.specificity.percentage)
glm.evaluation
## Accuracy Sensitivity Specificity
## 1 56.11 11.16 92.07
dim(weekly)
## [1] 1089 9
train = weekly[weekly$year %in% c(1990:2008),]
dim(train)
## [1] 985 9
levels(as.factor(train$year)) # sanity check
## [1] "1990" "1991" "1992" "1993" "1994" "1995" "1996" "1997" "1998" "1999"
## [11] "2000" "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008"
test = weekly[weekly$year %in% c(2009:2010),]
dim(test)
## [1] 104 9
levels(as.factor(test$year)) # sanity check
## [1] "2009" "2010"
glm.fit2 = glm(direction ~ lag2, data=train, family=binomial)
summary(glm.fit2)
##
## Call:
## glm(formula = direction ~ lag2, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
glm.probs2 = predict(glm.fit2, test, type="response")
glm.pred2 = ifelse(glm.probs2 > 0.5, "Up","Down")
glm.confusion2 = table(glm.pred2, test$direction)
glm.confusion2
##
## glm.pred2 Down Up
## Down 9 5
## Up 34 56
# Regard value "Down" as the positive class.
glm.accuracy.percentage2 = round(mean(glm.pred2 == test$direction)*100,2)
glm.sensitivity.percentage2 = round(length(which(glm.pred2 == "Down" & test$direction == "Down"))/length(which(test$direction == "Down"))*100,2)
glm.specificity.percentage2 = round(length(which(glm.pred2 == "Up" & test$direction == "Up"))/length(which(test$direction == "Up"))*100,2)
glm.evaluation2 = data.frame(Accuracy = glm.accuracy.percentage2, Sensitivity = glm.sensitivity.percentage2, Specificity = glm.specificity.percentage2)
glm.evaluation2
## Accuracy Sensitivity Specificity
## 1 62.5 20.93 91.8
(Hint: Use function lda() to train a LDA model, which is part of the MASS package.)
suppressMessages(library(MASS))
lda.fit = lda(direction ~ lag2, data=train)
lda.fit
## Call:
## lda(direction ~ lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## lag2 0.4414162
summary(lda.fit)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 2 -none- numeric
## scaling 1 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
lda.pred = predict(lda.fit, test)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.pred$class
## [1] Up Up Down Down Up Up Up Down Down Down Down Up Up Up
## [15] Up Up Up Up Up Up Down Up Up Up Up Up Up Up
## [29] Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [43] Up Up Down Up Up Up Up Up Up Up Up Up Up Up
## [57] Down Up Up Up Up Up Up Up Up Up Up Up Up Up
## [71] Up Down Up Down Up Up Up Up Down Down Up Up Up Up
## [85] Up Down Up Up Up Up Up Up Up Up Up Up Up Up
## [99] Up Up Up Up Up Up
## Levels: Down Up
lda.confusion = table(lda.pred$class, test$direction)
lda.confusion
##
## Down Up
## Down 9 5
## Up 34 56
# Regard value "Down" as the positive class.
lda.accuracy.percentage = round(mean(lda.pred$class == test$direction)*100,2)
lda.sensitivity.percentage = round(length(which(lda.pred$class == "Down" & test$direction == "Down"))/length(which(test$direction == "Down"))*100,2)
lda.specificity.percentage = round(length(which(lda.pred$class == "Up" & test$direction == "Up"))/length(which(test$direction == "Up"))*100,2)
lda.evaluation = data.frame(Accuracy = lda.accuracy.percentage, Sensitivity = lda.sensitivity.percentage, Specificity = lda.specificity.percentage)
lda.evaluation
## Accuracy Sensitivity Specificity
## 1 62.5 20.93 91.8
(Hint: Use function qda() to train a QDA model, which is part of the MASS package.)
qda.fit = qda(direction ~ lag2, data=train)
qda.fit
## Call:
## qda(direction ~ lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## lag2
## Down -0.03568254
## Up 0.26036581
summary(qda.fit)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 2 -none- numeric
## scaling 2 -none- numeric
## ldet 2 -none- numeric
## lev 2 -none- character
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
qda.pred = predict(qda.fit, test)
names(qda.pred)
## [1] "class" "posterior"
qda.pred$class
## [1] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [24] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [47] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [70] Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up Up
## [93] Up Up Up Up Up Up Up Up Up Up Up Up
## Levels: Down Up
qda.confusion = table(qda.pred$class, test$direction)
qda.confusion
##
## Down Up
## Down 0 0
## Up 43 61
# Regard value "Down" as the positive class.
qda.accuracy.percentage = round(mean(qda.pred$class == test$direction)*100,2)
qda.sensitivity.percentage = round(length(which(qda.pred$class == "Down" & test$direction == "Down"))/length(which(test$direction == "Down"))*100,2)
qda.specificity.percentage = round(length(which(qda.pred$class == "Up" & test$direction == "Up"))/length(which(test$direction == "Up"))*100,2)
qda.evaluation = data.frame(Accuracy =qda.accuracy.percentage, Sensitivity = qda.sensitivity.percentage, Specificity = qda.specificity.percentage)
qda.evaluation
## Accuracy Sensitivity Specificity
## 1 58.65 0 100
(Hint: Use the overall fraction of correct predictions to determine which method(s) perform the best.)
model_comparison = rbind(glm.evaluation2, lda.evaluation, qda.evaluation)
model_comparison = cbind(Model = c("Logistic_Regression","LDA","QDA"), model_comparison)
model_comparison[order(model_comparison$Accuracy, decreasing = TRUE),]
## Model Accuracy Sensitivity Specificity
## 1 Logistic_Regression 62.50 20.93 91.8
## 2 LDA 62.50 20.93 91.8
## 3 QDA 58.65 0.00 100.0
suppressMessages(library(MASS))
auto = Auto
names(auto) = tolower(names(auto))
head(auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name
## 1 chevrolet chevelle malibu
## 2 buick skylark 320
## 3 plymouth satellite
## 4 amc rebel sst
## 5 ford torino
## 6 ford galaxie 500
dim(auto)
## [1] 392 9
med = median(auto$mpg)
auto$mpg01 = ifelse(auto$mpg > med, 1,0)
table(auto$mpg01)
##
## 0 1
## 196 196
(Hint: Use function plot() to make scatterplot and use function boxplot() to make boxplot.)
cor(auto[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
pairs(auto[,-9])
par(mfrow = c(1,2))
for(i in 1:(ncol(auto)-2)){
boxplot(auto[,i] ~ as.factor(auto$mpg01),
xlab = "Gas Mileage (0:Low, 1:High)",
ylab = colnames(auto)[i],
col = c("red","green"))
}
par(default_par)
set.seed(13) trainIndex = createDataPartition(Auto$mpg01, p=.67, list=FALSE, times=1)
Then Auto[trainIndex,] is your training set and Auto[-trainIndex,] is your test set.
suppressMessages(library(caret))
set.seed(13)
auto$mpg01 = as.factor(as.character(auto$mpg01))
auto = auto[,-9] # removing "name" column to avoid the error i am getting during predict.glm()
trainIndex = createDataPartition(auto$mpg01, p=.67, list=FALSE, times=1)
trainauto = auto[trainIndex,]
testauto = auto[-trainIndex,]
dim(auto)
## [1] 392 9
dim(trainauto)
## [1] 264 9
dim(testauto)
## [1] 128 9
(Hint: Calculate the test error rate using the confusion matrix on the test set.)
lda.fit = lda(mpg01 ~ .-mpg, data=trainauto)
lda.fit
## Call:
## lda(mpg01 ~ . - mpg, data = trainauto)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## cylinders displacement horsepower weight acceleration year
## 0 6.666667 266.8864 128.56061 3584.735 14.57879 74.19697
## 1 4.143939 113.5189 79.44697 2317.992 16.28939 77.52273
## origin
## 0 1.189394
## 1 2.015152
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.533671238
## displacement 0.002693468
## horsepower 0.005944805
## weight -0.001117793
## acceleration -0.016153715
## year 0.134896097
## origin 0.239014920
summary(lda.fit)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 14 -none- numeric
## scaling 7 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
lda.pred = predict(lda.fit, testauto)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.pred$class
## [1] 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0
## [36] 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1
## [71] 1 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## Levels: 0 1
lda.confusion = table(lda.pred$class, testauto$mpg01)
lda.confusion
##
## 0 1
## 0 57 3
## 1 7 61
# Regarding value "1:high" as the positive class.
lda.accuracy.percentage = round(mean(lda.pred$class == testauto$mpg01)*100,2)
lda.error.percentage = round(mean(lda.pred$class != testauto$mpg01)*100,2)
lda.error.percentage+lda.accuracy.percentage
## [1] 100
lda.sensitivity.percentage = round(length(which(lda.pred$class == "1" & testauto$mpg01 == "1"))/length(which(testauto$mpg01 == "1"))*100,2)
lda.specificity.percentage = round(length(which(lda.pred$class == "0" & testauto$mpg01 == "0"))/length(which(testauto$mpg01 == "0"))*100,2)
lda.evaluation = data.frame(Test_Error_rate = lda.error.percentage, Accuracy = lda.accuracy.percentage, Sensitivity = lda.sensitivity.percentage, Specificity = lda.specificity.percentage)
lda.evaluation
## Test_Error_rate Accuracy Sensitivity Specificity
## 1 7.81 92.19 95.31 89.06
qda.fit = qda(mpg01 ~ .-mpg, data=trainauto)
qda.fit
## Call:
## qda(mpg01 ~ . - mpg, data = trainauto)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## cylinders displacement horsepower weight acceleration year
## 0 6.666667 266.8864 128.56061 3584.735 14.57879 74.19697
## 1 4.143939 113.5189 79.44697 2317.992 16.28939 77.52273
## origin
## 0 1.189394
## 1 2.015152
summary(qda.fit)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 14 -none- numeric
## scaling 98 -none- numeric
## ldet 2 -none- numeric
## lev 2 -none- character
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
qda.pred = predict(qda.fit, testauto)
names(qda.pred)
## [1] "class" "posterior"
qda.pred$class
## [1] 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0
## [36] 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1
## [71] 1 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 0 1 1 1 1
## Levels: 0 1
qda.confusion = table(qda.pred$class, testauto$mpg01)
qda.confusion
##
## 0 1
## 0 61 5
## 1 3 59
# Regarding value "1:high" as the positive class.
qda.accuracy.percentage = round(mean(qda.pred$class == testauto$mpg01)*100,2)
qda.error.percentage = round(mean(qda.pred$class != testauto$mpg01)*100,2)
qda.error.percentage+qda.accuracy.percentage
## [1] 100
qda.sensitivity.percentage = round(length(which(qda.pred$class == "1" & testauto$mpg01 == "1"))/length(which(testauto$mpg01 == "1"))*100,2)
qda.specificity.percentage = round(length(which(qda.pred$class == "0" & testauto$mpg01 == "0"))/length(which(testauto$mpg01 == "0"))*100,2)
qda.evaluation = data.frame(Test_Error_rate = qda.error.percentage, Accuracy = qda.accuracy.percentage, Sensitivity = qda.sensitivity.percentage, Specificity = qda.specificity.percentage)
qda.evaluation
## Test_Error_rate Accuracy Sensitivity Specificity
## 1 6.25 93.75 92.19 95.31
glm.fit3 = glm(mpg01 ~ .-mpg, data=trainauto, family = binomial)
summary(glm.fit3)
##
## Call:
## glm(formula = mpg01 ~ . - mpg, family = binomial, data = trainauto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.37410 -0.10718 0.00796 0.22286 3.07994
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -22.134093 7.400266 -2.991 0.00278 **
## cylinders -0.389773 0.512952 -0.760 0.44734
## displacement 0.006032 0.015974 0.378 0.70573
## horsepower -0.021597 0.026927 -0.802 0.42252
## weight -0.004735 0.001480 -3.198 0.00138 **
## acceleration 0.056806 0.164796 0.345 0.73031
## year 0.480997 0.098479 4.884 1.04e-06 ***
## origin 0.586447 0.453602 1.293 0.19606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 365.98 on 263 degrees of freedom
## Residual deviance: 107.00 on 256 degrees of freedom
## AIC: 123
##
## Number of Fisher Scoring iterations: 8
# I had to remove "name" columns as during predict.glm() i was getting error of "Error in model.frame.default..." as levels of variable "name" were not same in both training and test sets. this might be due to the suggested method of splitting the dataset. I understand there might be a better way to split the datasets to avoid this error, for example using stratified from the splitstackshape package. but at this point to continue with the exercise i have removed the column "name" explicitly instead of during modeling and i didn't get the error after that.
# in presence of "name" column, i would have used below formula to fit the glm model to data:
# glm.fit3 = glm(mpg01 ~ .-mpg-name, data=trainauto, family = binomial)
contrasts(trainauto$mpg01)
## 1
## 0 0
## 1 1
glm.probs3 = predict(glm.fit3, testauto, type="response")
dim(testauto)
## [1] 128 9
length(glm.probs3)
## [1] 128
glm.pred3 = ifelse(glm.probs3 > 0.5, "1","0")
glm.confusion3 = table(glm.pred3, testauto$mpg01)
glm.confusion3
##
## glm.pred3 0 1
## 0 59 4
## 1 5 60
# Regarding value "1:High" as the positive class.
glm.error.percentage3 = round(mean(glm.pred3 != testauto$mpg01)*100,2)
glm.accuracy.percentage3 = round(mean(glm.pred3 == testauto$mpg01)*100,2)
glm.accuracy.percentage3 + glm.error.percentage3
## [1] 100
glm.sensitivity.percentage3 = round(length(which(glm.pred3 == "1" & testauto$mpg01 == "1"))/length(which(testauto$mpg01 == "1"))*100,2)
glm.specificity.percentage3 = round(length(which(glm.pred3 == "0" & testauto$mpg01 == "0"))/length(which(testauto$mpg01 == "0"))*100,2)
glm.evaluation3 = data.frame(Test_Error_rate = glm.error.percentage3, Accuracy = glm.accuracy.percentage3, Sensitivity = glm.sensitivity.percentage3, Specificity = glm.specificity.percentage3)
glm.evaluation3
## Test_Error_rate Accuracy Sensitivity Specificity
## 1 7.03 92.97 93.75 92.19
model_comparison = rbind(glm.evaluation3, lda.evaluation, qda.evaluation)
model_comparison = cbind(Model = c("Logistic_Regression","LDA","QDA"), model_comparison)
model_comparison[order(model_comparison$Accuracy, decreasing = TRUE),]
## Model Test_Error_rate Accuracy Sensitivity Specificity
## 3 QDA 6.25 93.75 92.19 95.31
## 1 Logistic_Regression 7.03 92.97 93.75 92.19
## 2 LDA 7.81 92.19 95.31 89.06