## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
##
## Attaching package: 'MASS'
##
##
## The following object is masked from 'package:ISLR2':
##
## Boston
##
##
## The following object is masked from 'package:dplyr':
##
## select
## Question 13
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
pairs(Weekly)
Volume appears to grow roughly exponentially with Year. The five Lag variables all look similar to one another, and nothing else stands out in the scatterplot matrix.
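To sanity-check the exponential impression, a quick sketch (using the same Weekly data) plots log-Volume against Year; an exponential trend should look roughly linear on the log scale.
plot(Weekly$Year, log(Weekly$Volume),
     xlab = 'Year', ylab = 'log(Volume)') # exponential growth looks linear on this scale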
logreg <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, family = 'binomial', data = Weekly)
summary(logreg)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
It looks like only the intercept and Lag2 are significant at the \(\alpha = 0.05\) level.
prob <- predict(logreg, type = 'response')
confusion <- rep("Down", 1089)
confusion[prob > .5] = 'Up'
table(confusion, Weekly$Direction)
##
## confusion Down Up
## Down 54 48
## Up 430 557
This confusion matrix shows that we classify 56.1065% of the observations correctly ((54 + 557)/1089) and misclassify the remaining 478. The model predicts Up far more often than it predicts Down.
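Rather than tallying the matrix by hand, the accuracy can be read straight off the predictions; a minimal sketch using the prob and confusion objects created above:
mean(confusion == Weekly$Direction) # fraction classified correctly, about 0.561
mean(confusion != Weekly$Direction) # training error rate is the complement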
training <- Weekly[Weekly$Year < 2009, ]
test <- Weekly[Weekly$Year >= 2009, ]
nlogreg <- glm(Direction ~ Lag2, data = training, family = 'binomial')
summary(nlogreg)
##
## Call:
## glm(formula = Direction ~ Lag2, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
testprob <- predict(nlogreg, type = 'response', newdata = test)
testold <- Weekly$Direction[Weekly$Year >= 2009]
testprediction <- rep('Down', length(testprob))
testprediction[testprob > .5] <- "Up"
table(testprediction,testold)
## testold
## testprediction Down Up
## Down 9 5
## Up 34 56
We are still over-predicting Up, but by fitting on the 1990-2008 weeks and evaluating on the held-out 2009-2010 weeks, our accuracy increased to 62.5% ((9 + 56)/104).
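The same check on the held-out weeks, a sketch using the objects above:
mean(testprediction == testold) # test accuracy, 65/104 = 0.625
mean(testprediction == 'Up')    # the model still calls most weeks Up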
ldafit <- lda(Direction ~ Lag2, data = training)
ldafit
## Call:
## lda(Direction ~ Lag2, data = training)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
ldapredict <- predict(ldafit, newdata = test)
ldaclass <- ldapredict$class
table(ldaclass, test$Direction)
##
## ldaclass Down Up
## Down 9 5
## Up 34 56
This confusion matrix is identical to the one we just obtained with logistic regression.
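Since logistic regression and LDA both draw linear decision boundaries in Lag2, we can confirm they agree week-by-week rather than just in aggregate; a sketch cross-tabulating the two prediction vectors (off-diagonal counts would be disagreements):
table(logistic = testprediction, lda = as.character(ldaclass))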
qdafit <- qda(Direction ~ Lag2, data = training)
qdafit
## Call:
## qda(Direction ~ Lag2, data = training)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qdapredict <- predict(qdafit, newdata = test)
qdaclass <- qdapredict$class
table(qdaclass, test$Direction)
##
## qdaclass Down Up
## Down 0 0
## Up 43 61
QDA predicts Up for every test observation, yet it still classifies 58.65% of the data correctly.
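Because QDA calls every test week Up, its 58.65% accuracy is exactly the share of Up weeks in the test set; the trivial always-Up baseline does just as well, as a quick sketch shows:
mean(test$Direction == 'Up') # majority-class baseline, 61/104 = 0.5865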
set.seed(10)
ktrainx <- cbind(training$Lag2)
ktestx <- cbind(test$Lag2)
ktrainy <- cbind(training$Direction) # cbind() coerces the factor to its integer codes (1 = Down, 2 = Up)
knn1predict <- knn(ktrainx, ktestx, ktrainy, k = 1)
table(knn1predict, test$Direction)
##
## knn1predict Down Up
## 1 21 29
## 2 22 32
We have a correct classification rate of 50.96%.
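The 1/2 row labels above are a side effect of wrapping the response in cbind(), which converts the Direction factor to its integer codes (1 = Down, 2 = Up). Passing the factor directly keeps the labels; a sketch (knn() is from the class package):
library(class)
set.seed(10)
knn1 <- knn(ktrainx, ktestx, training$Direction, k = 1) # factor response keeps Down/Up labels
table(knn1, test$Direction)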
train <- (Weekly$Year < 2009)
weekly09 <- Weekly[!train, ]
direction09 <- Weekly$Direction[!train]
nbayes <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train) # naiveBayes() from the e1071 package
nbayes
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
nbayesclass <- predict(nbayes, weekly09)
table(nbayesclass, direction09)
## direction09
## nbayesclass Down Up
## Down 0 0
## Up 43 61
This classification rate is the same as our QDA model's, around 58.65%.
Logistic regression and LDA tie for the best test performance on this data set, each with a 62.5% correct classification rate. This is, of course, without any transformations of or interactions between the predictors.
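A small helper makes the comparison explicit; a sketch assuming the prediction objects created above (the KNN predictions are left out because the cbind() coercion labeled them 1/2 rather than Down/Up):
accuracy <- function(pred, truth) mean(as.character(pred) == as.character(truth))
c(logistic = accuracy(testprediction, testold),
  lda = accuracy(ldaclass, test$Direction),
  qda = accuracy(qdaclass, test$Direction),
  nbayes = accuracy(nbayesclass, direction09)) # logistic and lda should both come out to 0.625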
log2 <- glm(Direction ~ Lag1*Lag2, data = Weekly, family = binomial, subset = train)
summary(log2)
##
## Call:
## glm(formula = Direction ~ Lag1 * Lag2, family = binomial, data = Weekly,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.573 -1.259 1.003 1.086 1.596
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.211419 0.064589 3.273 0.00106 **
## Lag1 -0.051505 0.030727 -1.676 0.09370 .
## Lag2 0.053471 0.029193 1.832 0.06700 .
## Lag1:Lag2 0.001921 0.007460 0.257 0.79680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1346.9 on 981 degrees of freedom
## AIC: 1354.9
##
## Number of Fisher Scoring iterations: 4
log2prob <- predict(log2, weekly09, type = 'response')
log2pred <- rep("Down", length(log2prob))
log2pred[log2prob > .5] <- "Up"
table(log2pred, direction09)
## direction09
## log2pred Down Up
## Down 7 8
## Up 36 53
I added the Lag1 variable and the Lag1:Lag2 interaction to the logistic regression model. It classifies 57.69% of the test weeks correctly ((7 + 53)/104), which is worse than using Lag2 by itself.
lda2 <- lda(Direction ~ Lag1*Lag2, data = Weekly, subset = train)
lda2predict <- predict(lda2, weekly09)
table(lda2predict$class, direction09)
## direction09
## Down Up
## Down 7 8
## Up 36 53
This produced exactly the same predictions as our logistic regression model.
qda2 <- qda(Direction ~ poly(Lag2, 2), data = Weekly, subset = train)
gda2predict <- predict(qda2, weekly09)$class
table(gda2predict, direction09)
## direction09
## gda2predict Down Up
## Down 7 3
## Up 36 58
Using a squared Lag2 term in the QDA model improves our accuracy to 62.5% ((7 + 58)/104), matching our best result so far.
knn5predict <- knn(ktrainx, ktestx, ktrainy, k = 5)
knn10predict <- knn(ktrainx, ktestx, ktrainy, k = 10)
table(knn5predict, test$Direction)
##
## knn5predict Down Up
## 1 16 21
## 2 27 40
table(knn10predict, test$Direction)
##
## knn10predict Down Up
## 1 17 19
## 2 26 42
With k = 5 the accuracy is 53.85%, and with k = 10 it is 56.73%. Overall, our initial logistic regression model gave the best accuracy of 62.5%, and we were able to replicate that by squaring the Lag2 term in QDA.
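Instead of trying k values one at a time, a short sweep records test accuracy over a grid; a sketch, again passing the Direction factor directly so the labels survive:
ks <- c(1, 3, 5, 10, 15, 25, 50)
kacc <- sapply(ks, function(k) {
  set.seed(10)
  mean(knn(ktrainx, ktestx, training$Direction, k = k) == test$Direction)
})
setNames(round(kacc, 4), paste0('k=', ks)) # test accuracy for each candidate k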
## Question 14
mpg01 <- ifelse( Auto$mpg > median(Auto$mpg), yes = 1, no = 0)
Auto <- data.frame(Auto, mpg01)
head(Auto)
cor(Auto[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
pairs(Auto)
trainb <- (Auto$year %% 2 == 0) # train on observations from even model years
testb <- !trainb
Atrain <- Auto[trainb,]
Atest <- Auto[testb,]
mpg01test <- mpg01[testb]
ldampg <- lda(mpg01 ~ cylinders + weight + horsepower + displacement + year, data = Auto, subset = trainb)
ldapredictmpg <- predict(ldampg, Atest)
table(ldapredictmpg$class, mpg01test)
## mpg01test
## 0 1
## 0 87 6
## 1 13 76
mean(ldapredictmpg$class != mpg01test)
## [1] 0.1043956
qdampg <- qda(mpg01 ~ cylinders + weight + horsepower + displacement + year, data = Auto, subset = trainb)
qdapredict <- predict(qdampg, Atest)
table(qdapredict$class, mpg01test)
## mpg01test
## 0 1
## 0 89 12
## 1 11 70
mean(qdapredict$class != mpg01test)
## [1] 0.1263736
mpglog <- glm(mpg01 ~ cylinders + weight + horsepower + displacement + year, data = Atrain, family = binomial)
summary(mpglog)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + horsepower + displacement +
## year, family = binomial, data = Atrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.38487 -0.00601 0.01455 0.13620 1.76182
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.183575 7.621940 -1.861 0.0628 .
## cylinders -0.021014 0.767203 -0.027 0.9781
## weight -0.005155 0.001602 -3.217 0.0013 **
## horsepower -0.050225 0.027943 -1.797 0.0723 .
## displacement -0.018794 0.018025 -1.043 0.2971
## year 0.478279 0.122101 3.917 8.96e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.577 on 209 degrees of freedom
## Residual deviance: 58.285 on 204 degrees of freedom
## AIC: 70.285
##
## Number of Fisher Scoring iterations: 9
logpredict <- predict(mpglog, Atest, type = 'response')
predmpg <- rep(0, length(logpredict))
predmpg[logpredict > .05] <- 1 # note the .05 cutoff, rather than the usual .5; the table below reflects it
table(predmpg, mpg01test)
## mpg01test
## predmpg 0 1
## 0 79 3
## 1 21 79
mean(predmpg != mpg01test)
## [1] 0.1318681
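The cutoff used above is 0.05, which is almost certainly a typo for the conventional 0.5 and explains the large number of predicted 1s. A sketch of the usual rule (output not shown, since the table above reflects the 0.05 cutoff):
predmpg05 <- ifelse(logpredict > 0.5, 1, 0) # conventional 0.5 cutoff on the posterior probability
table(predmpg05, mpg01test)
mean(predmpg05 != mpg01test)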
n.bayes <- naiveBayes(mpg01 ~ cylinders + displacement + horsepower + weight + year, data = Atrain)
n.bayesclass <- predict(n.bayes, Atest)
table(n.bayesclass, mpg01test)
## mpg01test
## n.bayesclass 0 1
## 0 88 11
## 1 12 71
mean(n.bayesclass != mpg01test)
## [1] 0.1263736
#I was having trouble binding the data for KNN here for some reason, so I found some
#code online that took a different approach.
scaled <- scale(Auto[, -c(9, 10)]) # standardize, dropping the name and mpg01 columns
set.seed(1)
train <- sample(1:dim(Auto)[1], 392*.7, rep = FALSE)
test <- -train
training_data <- scaled[train, c("cylinders", "horsepower", "weight", "acceleration")]
testing_data <- scaled[test, c("cylinders", "horsepower", "weight", "acceleration")]
train.mpg01 <- Auto$mpg01[train]
test.mpg01 <- Auto$mpg01[test]
set.seed(1)
knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = 1)
table(knn_pred_y, test.mpg01)
## test.mpg01
## knn_pred_y 0 1
## 0 52 10
## 1 9 47
mean(knn_pred_y != test.mpg01)
## [1] 0.1610169
set.seed(1)
knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = 5)
table(knn_pred_y, test.mpg01)
## test.mpg01
## knn_pred_y 0 1
## 0 51 4
## 1 10 53
mean(knn_pred_y != test.mpg01)
## [1] 0.1186441
set.seed(1)
knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = 15)
table(knn_pred_y, test.mpg01)
## test.mpg01
## knn_pred_y 0 1
## 0 53 4
## 1 8 53
mean(knn_pred_y != test.mpg01)
## [1] 0.1016949
set.seed(1)
knn_pred_y <- knn(training_data, testing_data, train.mpg01, k = 20)
table(knn_pred_y, test.mpg01)
## test.mpg01
## knn_pred_y 0 1
## 0 53 4
## 1 8 53
mean(knn_pred_y != test.mpg01)
## [1] 0.1016949
It appears that the error rate stabilizes somewhere around k = 15; increasing k beyond 15 doesn't seem to have a meaningful impact.
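To see the stabilization directly, a sketch sweeping k over a grid, using the scaled training_data and testing_data objects from above:
ks <- seq(1, 29, by = 2)
kerr <- sapply(ks, function(k) {
  set.seed(1)
  mean(knn(training_data, testing_data, train.mpg01, k = k) != test.mpg01)
})
plot(ks, kerr, type = 'b', xlab = 'k', ylab = 'test error rate') # error flattens out as k grows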
## Question 16
crime <- rep(0, length(Boston$crim))
crime[Boston$crim > median(Boston$crim)] <- 1
Boston <- data.frame(Boston, crime)
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crime
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
set.seed(1)
traincrime <- sample(1:dim(Boston)[1], dim(Boston)[1]*.7, rep = FALSE)
testcrime <- -traincrime
Bostontrain <- Boston[traincrime,]
Bostontest <- Boston[testcrime,]
crimetest <- crime[testcrime]
pairs(Boston)
lgcrime <- glm(crime ~ indus + nox + age + rad + tax + lstat, data = Boston, family = binomial) # fit on the full Boston data, so the test rows are seen during training
lgprobs <- predict(lgcrime, Bostontest, type = 'response')
predictlg <- rep(0, length(lgprobs))
predictlg[lgprobs > .5] <- 1
table(predictlg, crimetest)
## crimetest
## predictlg 0 1
## 0 65 14
## 1 8 65
mean(predictlg != crimetest)
## [1] 0.1447368
Our logistic regression has a 14.47% test error rate, though because the model was fit on all of Boston, including the test rows, this estimate is somewhat optimistic.
ldacrime <- lda(crime ~ indus + nox + age + rad + tax + lstat, data = Boston) # also fit on the full data set
predictldacrime <- predict(ldacrime, Bostontest)
table(predictldacrime$class, crimetest)
## crimetest
## 0 1
## 0 71 19
## 1 2 60
mean(predictldacrime$class != crimetest)
## [1] 0.1381579
Our LDA model has a 13.82% error rate, which is better than our logistic regression.
crimenbayes <- naiveBayes(crime ~ indus + nox + age + rad + tax + lstat, data = Boston) # also fit on the full data set
crimenbayesclass <- predict(crimenbayes, Bostontest)
table(crimenbayesclass, crimetest)
## crimetest
## crimenbayesclass 0 1
## 0 68 24
## 1 5 55
mean(crimenbayesclass != crimetest)
## [1] 0.1907895
Our naive Bayes model has the highest error rate so far, at 19.08%.
kbostontraining <- cbind(Boston$indus, Boston$nox, Boston$age, Boston$rad, Boston$tax, Boston$lstat)[traincrime, ]
kbostontesting <- cbind(Boston$indus, Boston$nox, Boston$age, Boston$rad, Boston$tax, Boston$lstat)[testcrime, ]
kcrimetrain <- crime[traincrime]
kcrimetest <-crime[testcrime]
crimeknn1 <- knn(kbostontraining, kbostontesting, kcrimetrain, k = 1)
table(crimeknn1, kcrimetest)
## kcrimetest
## crimeknn1 0 1
## 0 64 5
## 1 9 74
mean(crimeknn1 != kcrimetest)
## [1] 0.09210526
crimeknn3 <- knn(kbostontraining, kbostontesting, kcrimetrain, k = 3)
table(crimeknn3, kcrimetest)
## kcrimetest
## crimeknn3 0 1
## 0 65 6
## 1 8 73
mean(crimeknn3 != kcrimetest)
## [1] 0.09210526
crimeknn5 <- knn(kbostontraining, kbostontesting, kcrimetrain, k = 5)
table(crimeknn5, kcrimetest)
## kcrimetest
## crimeknn5 0 1
## 0 64 6
## 1 9 73
mean(crimeknn5 != kcrimetest)
## [1] 0.09868421
It looks like the best k is 1 or 3; both give us the lowest error rate of all our models, at 9.21%. For this data set, KNN classification gives the best results.
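One caveat on the KNN fits above: the predictors were left unscaled, so variables measured in hundreds, like tax, dominate the Euclidean distance over variables like nox, which lives in tenths. A sketch of the same k = 1 fit on predictors standardized with the training-set means and standard deviations (still using knn() from the class package):
vars <- c('indus', 'nox', 'age', 'rad', 'tax', 'lstat')
mu <- colMeans(Boston[traincrime, vars])
sdv <- apply(Boston[traincrime, vars], 2, sd)
ktrains <- scale(Boston[traincrime, vars], center = mu, scale = sdv) # scale test data with training statistics
ktests <- scale(Boston[testcrime, vars], center = mu, scale = sdv)
set.seed(1)
crimeknn1s <- knn(ktrains, ktests, crime[traincrime], k = 1)
mean(crimeknn1s != crime[testcrime]) # error rate with standardized predictors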