13.)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
View(Weekly)
Weekly=Weekly
library(corrplot)
## corrplot 0.92 loaded
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
corrplot(cor(Weekly[,-9]), method="square")
The only variables with high correlation are Volume and Year.
Weekly.fit = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial)
summary(Weekly.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The only statistically significant variable is Lag2.
logWeekly.prob = predict(Weekly.fit, type='response')
logWeekly.pred = rep("Down", length(logWeekly.prob))
logWeekly.pred[logWeekly.prob > 0.5] = "Up"
table(logWeekly.pred, Weekly$Direction)
##
## logWeekly.pred Down Up
## Down 54 48
## Up 430 557
(557+54)/(54+48+430+557)
## [1] 0.5610652
557/(557+48)
## [1] 0.9206612
54/(54+430)
## [1] 0.1115702
The model is correct roughly 56% of the time. Up weekly trends were correctly predicted 92% of the time, but down weekly trends were correctly predicted only 11% of the time.
Weekly.train = (Weekly$Year<2009)
Weekly.0910 = Weekly[!Weekly.train,]
Weekly.fit = glm(Direction~Lag2, data=Weekly,family=binomial, subset=Weekly.train)
logWeekly.prob= predict(Weekly.fit, Weekly.0910, type = "response")
logWeekly.pred = rep("Down", length(logWeekly.prob))
logWeekly.pred[logWeekly.prob > 0.5] = "Up"
Direction.0910 = Weekly$Direction[!Weekly.train]
table(logWeekly.pred, Direction.0910)
## Direction.0910
## logWeekly.pred Down Up
## Down 9 5
## Up 34 56
mean(logWeekly.pred == Direction.0910)
## [1] 0.625
This model correctly predicts trends with 62.5% accuracy, a moderate improvement over the last model. This model is also better at predicting upward trends than downward trends.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
Weeklylda.fit = lda(Direction~Lag2, data=Weekly, family=binomial, subset=Weekly.train)
Weeklylda.pred<-predict(Weeklylda.fit, Weekly.0910)
table(Weeklylda.pred$class, Direction.0910)
## Direction.0910
## Down Up
## Down 9 5
## Up 34 56
mean(Weeklylda.pred$class==Direction.0910)
## [1] 0.625
62.5% accurate.
Weeklyqda.fit = qda(Direction ~ Lag2, data = Weekly, subset = Weekly.train)
Weeklyqda.pred = predict(Weeklyqda.fit, Weekly.0910)$class
table(Weeklyqda.pred, Direction.0910)
## Direction.0910
## Weeklyqda.pred Down Up
## Down 0 0
## Up 43 61
mean(Weeklyqda.pred==Direction.0910)
## [1] 0.5865385
58.65% accurate.
library(class)
Week.train=as.matrix(Weekly$Lag2[Weekly.train])
Week.test=as.matrix(Weekly$Lag2[!Weekly.train])
train.Direction = Weekly$Direction[Weekly.train]
set.seed(99)
Weekknn.pred = knn(Week.train,Week.test,train.Direction,k=1)
table(Weekknn.pred,Direction.0910)
## Direction.0910
## Weekknn.pred Down Up
## Down 21 30
## Up 22 31
mean(Weekknn.pred == Direction.0910)
## [1] 0.5
50% accuracy. Basically equal to random chance.
library(e1071)
week.bayes = naiveBayes(Weekly$Direction ~ Lag2 ,data=Weekly ,subset=Weekly.train)
week.bayes.class = predict(week.bayes ,Weekly.0910)
table(week.bayes.class, Direction.0910)
## Direction.0910
## week.bayes.class Down Up
## Down 0 0
## Up 43 61
mean(week.bayes.class == Direction.0910)
## [1] 0.5865385
58.65% accurate.
Logistic Regression and Linear Discriminant Analysis had the best accuracy, both having rates of 62.5%.
14.)
library(knitr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
View(Auto)
auto = Auto
set.seed(14)
df = auto
df = df[,1:8]
med = median(df$mpg)
df$mpg01 = ifelse(df$mpg > med, 1, 0)
df$mpg01 = as.factor(df$mpg01)
plot(df$mpg01, df$horsepower) #Good predictor
plot(df$mpg01, df$cylinders)
plot(df$mpg01, df$acceleration)
plot(df$mpg01, df$displacement) #Good
plot(df$mpg01, df$weight) #Good
plot(df$mpg01, df$year)
rows = sample(1:nrow(df), 0.75*nrow(df))
auto.train = df[rows,]
auto.test = df[-rows,]
lda.auto = lda(mpg01 ~ horsepower + displacement + weight, data = auto.train)
lda.pred = predict(lda.auto, auto.test)
lda.class = lda.pred$class
table(lda.class , auto.test$mpg01)
##
## lda.class 0 1
## 0 38 4
## 1 9 47
86.7% accurate. Test error is 13.3%.
qda.auto = qda(mpg01 ~ horsepower + displacement + weight, data = auto.train)
qda.pred = predict(qda.auto, auto.test)
qda.class = qda.pred$class
table(qda.class, auto.test$mpg01)
##
## qda.class 0 1
## 0 39 5
## 1 8 46
86.7% accurate. Test error is 13.3%
log.auto = glm(mpg01 ~ horsepower + displacement + weight, family = 'binomial', data = auto.train)
log.pred = predict(log.auto, auto.test, type = 'response')
log.pred = as.factor(ifelse(log.pred > 0.5, 1, 0))
table(log.pred, auto.test$mpg01)
##
## log.pred 0 1
## 0 39 4
## 1 8 47
87.8% accurate. Test error is 12.2%
bayes.auto = naiveBayes(mpg01 ~ horsepower + displacement + weight, data = auto.train)
bayes.class = predict(bayes.auto, auto.test)
table(bayes.class, auto.test$mpg01)
##
## bayes.class 0 1
## 0 38 4
## 1 9 47
86.7% accurate. Test error is 13.3%
library(class)
knn_pred_y = knn(auto.train, auto.test, auto.train$mpg01, k = 1)
table(knn_pred_y, auto.test$mpg01)
##
## knn_pred_y 0 1
## 0 39 7
## 1 8 44
knn_pred_y = knn(auto.train, auto.test, auto.train$mpg01, k = 2)
table(knn_pred_y, auto.test$mpg01)
##
## knn_pred_y 0 1
## 0 39 5
## 1 8 46
knn_pred_y = knn(auto.train, auto.test, auto.train$mpg01, k = 3)
table(knn_pred_y, auto.test$mpg01)
##
## knn_pred_y 0 1
## 0 39 5
## 1 8 46
knn_pred_y = knn(auto.train, auto.test, auto.train$mpg01, k = 4)
table(knn_pred_y, auto.test$mpg01)
##
## knn_pred_y 0 1
## 0 38 6
## 1 9 45
K=1: 84.7% accurate. Test error is 15.3% K=2: 86.7% accurate. Test error is 13.3% K=3: 86.7% accurate. Test error is 13.3% K=4: 84.7% accurate. Test error is 15.3%
Model is most accurate when K=2 or K=3.
16.) Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings.
boston = Boston
boston$chas = as.factor(boston$chas)
boston$crime_factor = factor(ifelse(boston$crim > median(boston$crim),
'High', 'Low'),
levels = c('High', 'Low'))
cor_test <- boston %>%
select(-chas, -crime_factor) %>%
cor.mtest(conf.level = .95)
boston %>%
select(-chas, -crime_factor) %>%
cor %>%
corrplot(method = 'color',
order = 'hclust', addrect = 2,
tl.col = 'black', addCoef.col = 'black', number.cex = 0.65,
p.mat = cor_test$p, sig.level = .05)
These variables have a fairly high correlation with crime rate: lsat,
indus, nox, rad, tax.
set.seed(6)
rows = sample(1:nrow(boston), .75*nrow(boston))
bost.train = (boston[rows,])
bost.test = (boston[-rows,])
log.bost = glm(crime_factor ~ indus + nox + rad + tax + lstat, family = 'binomial', data = bost.train)
log.pred = predict(log.bost , bost.test, type = 'response')
log.pred = as.factor(ifelse(log.pred > 0.5, 1, 0))
table(log.pred , bost.test$crime_factor)
##
## log.pred High Low
## 0 47 6
## 1 13 61
Test error is 13.4%.
lda.bost = lda(crime_factor ~ indus + nox + rad + tax + lstat, data = bost.train)
lda.pred = predict(lda.bost, bost.test)
lda.class = lda.pred$class
table(lda.class, bost.test$crime_factor)
##
## lda.class High Low
## High 41 2
## Low 19 65
Test error is 18.9%.
qda.bost = qda(crime_factor ~ indus + nox + rad + tax + lstat, data = bost.train)
qda.pred = predict(qda.bost, bost.test)
qda.class <- qda.pred$class
table(qda.class, bost.test$crime_factor)
##
## qda.class High Low
## High 44 2
## Low 16 65
Test error is 15.0%.
trainKNN = bost.train[, c('indus','nox','rad','tax', 'lstat')]
testKNN = bost.test[, c('indus','nox','rad','tax', 'lstat')]
n = 20
testError = sapply(1:n,function(k){
knn.pred = knn(train = trainKNN, test = testKNN, cl = bost.train$crime_factor, k = k)
mean(knn.pred!=bost.test$crime_factor)
})
testError
## [1] 0.07874016 0.09448819 0.06299213 0.04724409 0.03937008 0.03937008
## [7] 0.04724409 0.03937008 0.03937008 0.04724409 0.04724409 0.04724409
## [13] 0.04724409 0.06299213 0.04724409 0.04724409 0.06299213 0.06299213
## [19] 0.06299213 0.05511811
When K=2, test error is 6.30%. This is by far the most accurate model.