##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
## The following objects are masked from 'package:openintro':
##
## housing, mammals
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:openintro':
##
## densityPlot
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
ggplot(Weekly, aes(x = Year, y = Volume)) +
geom_line() +
geom_point() +
labs(title = "Trading Volume Over Time", x = "Year", y = "Volume") +
theme_minimal()The correlations between the Lag variables and
Today are close to zero. The strongest correlation is
between Year and Volume. When we plot
Volume against Year, we see that it is
increasing over time.
log1 <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Weekly, family = binomial)
summary(log1)##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only predictor that is statistically
significant as its p-value is less than 0.05.
probs <- predict(log1, type = "response")
pred <- rep("Down", length(probs))
pred[probs > 0.5] <- "Up"
table(pred, Direction)## Direction
## pred Down Up
## Down 54 48
## Up 430 557
## [1] 0.5610652
Using the confusion matrix, the logistic model has an accuracy rate
of 56.11%. True positives have a good detection rate but the model seems
to fail at detecting Down values accurately.
train <- (Year < 2009)
Weekly.20092010 <- Weekly[!train, ]
Direction.20092010 <- Direction[!train]
log2 <- glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
summary(log2)##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = Weekly,
## subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
probs2 <- predict(log2, Weekly.20092010, type = "response")
pred2 <- rep("Down", length(probs2))
pred2[probs2 > 0.5] <- "Up"
table(pred2, Direction.20092010)## Direction.20092010
## pred2 Down Up
## Down 9 5
## Up 34 56
## [1] 0.625
Using the confusion matrix, this logistic model has a better accuracy rate of 62.5%. True positives still seem to have a better detection rate than true negatives, suggesting that even with an updated model and trimmed training data, bias is still present.
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
## Direction.20092010
## Down Up
## Down 9 5
## Up 34 56
## [1] 0.625
The LDA model performs extremely similar to the previous logistic model. This model also has an accuracy rate of 62.5%, with true positives having a better detection rate compared to true negatives.
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
## Direction.20092010
## Down Up
## Down 0 0
## Up 43 61
## [1] 0.5865385
The QDA model performs worse than both the updated logistic model
(log2) and the LDA model with an accuary of 58.65%. The
model detects Up correctly a majority of the time but fails
to detect Down at all.
train.X <- as.matrix(Lag2[train])
test.X <- as.matrix(Lag2[!train])
train.Direction <- Direction[train]
test.Direction = Direction[!train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.Direction, k = 1)
table(pred.knn, test.Direction)## test.Direction
## pred.knn Down Up
## Down 21 30
## Up 22 31
## [1] 0.5
This model is the worst performing so far with an accuracy of 50%.
Up still seems to have the best positive prediction
rate.
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
## Direction.20092010
## pred.nb Down Up
## Down 0 0
## Up 43 61
## [1] 0.5865385
This model has an accuracy of 58.65%, which is better than the first
logistic model (log1) and the KNN model. It performs on par
with the QDA model as they have the same rates and performs worse than
the log2 model and the LDA model. This model detects
Up correctly a majority of the time but fails to detect
Down at all.
Using the accuracy rates, the updated logistic model known as
log2 (62.5%) and LDA model (62.5%) perform the best
results. The QDA (58.65%) and Naive Bayes (58.65%) model follow. The
full logistic model (56.11%) performs marginally worse. The KNN (50%)
model performs worst of all.
# Logistic regression with Lag2:Lag1
fit.glm3 <- glm(Direction ~ Lag2:Lag1, data = Weekly, family = binomial, subset = train)
probs3 <- predict(fit.glm3, Weekly.20092010, type = "response")
pred.glm3 <- rep("Down", length(probs3))
pred.glm3[probs3 > 0.5] = "Up"
table(pred.glm3, Direction.20092010)## Direction.20092010
## pred.glm3 Down Up
## Down 1 1
## Up 42 60
## [1] 0.5865385
# LDA with Lag2:Lag1
fit.lda2 <- lda(Direction ~ Lag2:Lag1, data = Weekly, subset = train)
pred.lda2 <- predict(fit.lda2, Weekly.20092010)
table(pred.lda2$class, Direction.20092010)## Direction.20092010
## Down Up
## Down 0 1
## Up 43 60
## [1] 0.5769231
# QDA with sqrt(abs(Lag2))
fit.qda2 <- qda(Direction ~ Lag2 + sqrt(abs(Lag2)), data = Weekly, subset = train)
pred.qda2 <- predict(fit.qda2, Weekly.20092010)
table(pred.qda2$class, Direction.20092010)## Direction.20092010
## Down Up
## Down 12 13
## Up 31 48
## [1] 0.5769231
# KNN k=10
pred.knn2 <- knn(train.X, test.X, train.Direction, k = 10)
table(pred.knn2, Direction.20092010)## Direction.20092010
## pred.knn2 Down Up
## Down 17 18
## Up 26 43
## [1] 0.5769231
# KNN k=100
pred.knn3 <- knn(train.X, test.X, train.Direction, k = 100)
table(pred.knn3, Direction.20092010)## Direction.20092010
## pred.knn3 Down Up
## Down 9 12
## Up 34 49
## [1] 0.5576923
The first model employs logistic regression with a
Lag2:Lag1 interaction term. This has an accuracy rate of
58.65%.
The second model employs LDA with this same interaction term. The accuracy rate this time is 57.69%.
The third model uses QDA with an absolute square-root transformation
applied to Lag 2. This model has an accuracy rate of 57.69%
(same as second model).
The fourth model uses KNN where K=10. This model has an accuracy of 54.81%.
The fifth model uses KNN again but with K=100 this time. The model accuracy marginally increases to 55.77% in this case.
Compared to original preferred logistic model (log2) and
the original LDA model, which both had an accuracy of 62.5%, none of
these newer models overtake their performance. When comparing these new
models to each other, the logistic model with the Lag2:Lag1
interaction performs the best.
## The following object is masked from package:lubridate:
##
## origin
## The following object is masked from package:ggplot2:
##
## mpg
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
par(mfrow = c(2,2))
boxplot(cylinders ~ mpg01, data = Auto, main = "Cylinders vs mpg01")
boxplot(displacement ~ mpg01, data = Auto, main = "Displacement vs mpg01")
boxplot(horsepower ~ mpg01, data = Auto, main = "Horsepower vs mpg01")
boxplot(weight ~ mpg01, data = Auto, main = "Weight vs mpg01")mpg01 has a strong negative negative correlation with
cylinders, displacement,
horsepower, and weight. The plots indicate
that when mpg is below the median, these variables will
decrease as compared to when mpg is higher than the
median.
lda_mpg <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, subset = train)
lda_mpg## Call:
## lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4927007 0.5072993
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.777778 3611.052 271.9333 129.13333
## 1 4.187050 2342.165 116.8129 79.27338
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.3962357999
## weight -0.0008321338
## displacement -0.0047630097
## horsepower 0.0061919395
## mpg01.test
## 0 1
## 0 50 3
## 1 11 54
## [1] 0.1186441
This LDA model has a test error rate of 11.86%.
qda_mpg <- qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, subset = train)
qda_mpg## Call:
## qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4927007 0.5072993
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.777778 3611.052 271.9333 129.13333
## 1 4.187050 2342.165 116.8129 79.27338
## mpg01.test
## 0 1
## 0 52 5
## 1 9 52
## [1] 0.1186441
This QDA model also has a test error rate of 11.86% (same as LDA).
log_mpg <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, family = binomial, subset = train)
summary(log_mpg)##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = Auto, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4794 -0.1963 0.1056 0.3508 3.3756
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.725290 2.147421 5.460 4.76e-08 ***
## cylinders 0.056770 0.419131 0.135 0.8923
## weight -0.001931 0.000817 -2.364 0.0181 *
## displacement -0.014718 0.009904 -1.486 0.1373
## horsepower -0.041518 0.017821 -2.330 0.0198 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 379.79 on 273 degrees of freedom
## Residual deviance: 144.49 on 269 degrees of freedom
## AIC: 154.49
##
## Number of Fisher Scoring iterations: 7
probs <- predict(log_mpg, Auto.test, type = "response")
pred.log_mpg <- rep(0, length(probs))
pred.log_mpg[probs > 0.5] <- 1
table(pred.log_mpg, mpg01.test)## mpg01.test
## pred.log_mpg 0 1
## 0 53 3
## 1 8 54
## [1] 0.09322034
This logistic model has a test error rate of 9.32%.
nb_mpg <- naiveBayes(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, subset = train)
nb_mpg##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.4927007 0.5072993
##
## Conditional probabilities:
## cylinders
## Y [,1] [,2]
## 0 6.777778 1.4177271
## 1 4.187050 0.6655971
##
## weight
## Y [,1] [,2]
## 0 3611.052 675.2920
## 1 2342.165 392.0333
##
## displacement
## Y [,1] [,2]
## 0 271.9333 85.11828
## 1 116.8129 37.54294
##
## horsepower
## Y [,1] [,2]
## 0 129.13333 36.06147
## 1 79.27338 16.37106
## mpg01.test
## pred.nb_mpg 0 1
## 0 52 4
## 1 9 53
## [1] 0.1101695
This Naive Bayes model has a test error rate of 11.02%.
train.X <- cbind(cylinders, weight, displacement, horsepower)[train, ]
test.X <- cbind(cylinders, weight, displacement, horsepower)[-train, ]
train.mpg01 <- mpg01[train]
set.seed(1)
error_rates <- numeric(100)
min_error <- Inf
best_k <- 1
for (k in 1:100) {
pred.knn_mpg <- knn(train.X, test.X, train.mpg01, k = k)
error_rates[k] <- mean(pred.knn_mpg != mpg01.test)
if (error_rates[k] < min_error) {
min_error <- error_rates[k]
best_k <- k
}
}
error_table <- data.frame(k = 1:100, error_rate = error_rates)
print(error_table)## k error_rate
## 1 1 0.1355932
## 2 2 0.1271186
## 3 3 0.1101695
## 4 4 0.1016949
## 5 5 0.1271186
## 6 6 0.1186441
## 7 7 0.1271186
## 8 8 0.1525424
## 9 9 0.1440678
## 10 10 0.1440678
## 11 11 0.1440678
## 12 12 0.1440678
## 13 13 0.1440678
## 14 14 0.1440678
## 15 15 0.1440678
## 16 16 0.1355932
## 17 17 0.1355932
## 18 18 0.1440678
## 19 19 0.1440678
## 20 20 0.1355932
## 21 21 0.1355932
## 22 22 0.1440678
## 23 23 0.1355932
## 24 24 0.1355932
## 25 25 0.1355932
## 26 26 0.1355932
## 27 27 0.1355932
## 28 28 0.1355932
## 29 29 0.1355932
## 30 30 0.1355932
## 31 31 0.1355932
## 32 32 0.1355932
## 33 33 0.1355932
## 34 34 0.1355932
## 35 35 0.1355932
## 36 36 0.1355932
## 37 37 0.1355932
## 38 38 0.1355932
## 39 39 0.1355932
## 40 40 0.1355932
## 41 41 0.1355932
## 42 42 0.1355932
## 43 43 0.1355932
## 44 44 0.1355932
## 45 45 0.1355932
## 46 46 0.1355932
## 47 47 0.1355932
## 48 48 0.1440678
## 49 49 0.1525424
## 50 50 0.1610169
## 51 51 0.1525424
## 52 52 0.1440678
## 53 53 0.1525424
## 54 54 0.1440678
## 55 55 0.1610169
## 56 56 0.1525424
## 57 57 0.1525424
## 58 58 0.1525424
## 59 59 0.1610169
## 60 60 0.1610169
## 61 61 0.1525424
## 62 62 0.1525424
## 63 63 0.1525424
## 64 64 0.1610169
## 65 65 0.1525424
## 66 66 0.1440678
## 67 67 0.1525424
## 68 68 0.1610169
## 69 69 0.1610169
## 70 70 0.1610169
## 71 71 0.1525424
## 72 72 0.1525424
## 73 73 0.1610169
## 74 74 0.1610169
## 75 75 0.1610169
## 76 76 0.1440678
## 77 77 0.1610169
## 78 78 0.1525424
## 79 79 0.1610169
## 80 80 0.1355932
## 81 81 0.1355932
## 82 82 0.1355932
## 83 83 0.1355932
## 84 84 0.1355932
## 85 85 0.1440678
## 86 86 0.1525424
## 87 87 0.1610169
## 88 88 0.1440678
## 89 89 0.1355932
## 90 90 0.1355932
## 91 91 0.1355932
## 92 92 0.1610169
## 93 93 0.1610169
## 94 94 0.1440678
## 95 95 0.1525424
## 96 96 0.1610169
## 97 97 0.1610169
## 98 98 0.1525424
## 99 99 0.1525424
## 100 100 0.1525424
cat("Best K:", best_k, "with the lowest error rate:", min_error, "has the best performance on this data set.")## Best K: 4 with the lowest error rate: 0.1016949 has the best performance on this data set.
\(\text{Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.}\)
data("Boston")
high_crim <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
Boston <- data.frame(Boston, high_crim)
cor(Boston)## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## high_crim 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## high_crim -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128
## ptratio black lstat medv high_crim
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046 0.40939545
## zn -0.3916785 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252 0.60326017
## chas -0.1215152 0.04878848 -0.0539293 0.1752602 0.07009677
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208 0.72323480
## rm -0.3555015 0.12806864 -0.6138083 0.6953599 -0.15637178
## age 0.2615150 -0.27353398 0.6023385 -0.3769546 0.61393992
## dis -0.2324705 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262 0.61978625
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867 0.25356836
## black -0.1773833 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627 0.45326273
## medv -0.5077867 0.33346082 -0.7376627 1.0000000 -0.26301673
## high_crim 0.2535684 -0.35121093 0.4532627 -0.2630167 1.00000000
set.seed(1)
train <- sample(nrow(Boston), size = 0.7*nrow(Boston))
Boston.train <- Boston[train,]
Boston.test <- Boston[-train,]
high_crim.test<-Boston.test$high_crimindus, nox, age,
dis, rad, and tax have strong
correlations with high_crim.
log_crim <- glm(high_crim ~ . -high_crim - crim, data = Boston.train, family = binomial)
pred.log_crim <- predict(log_crim, Boston.test, type = "response")
class.log_crim <- ifelse(pred.log_crim > 0.5, 1, 0)
table(class.log_crim,high_crim.test)## high_crim.test
## class.log_crim 0 1
## 0 62 5
## 1 11 74
## [1] 0.1052632
## zn indus chas nox rm age dis rad
## 2.470528 3.155461 1.332046 5.776329 6.400261 2.521466 6.019091 2.020340
## tax ptratio black lstat medv
## 1.874896 2.057383 1.051297 2.532954 9.764531
This logistic model has a test error rate of 10.53%.
log_crim2 <- glm(high_crim ~ indus + nox + age + dis + rad + tax - high_crim - crim,
data = Boston.train, family = binomial)
pred.log_crim2 <- predict(log_crim2, Boston.test, type = "response")
class.log_crim2 <- ifelse(pred.log_crim2 > 0.5, 1, 0)
table(class.log_crim2,high_crim.test)## high_crim.test
## class.log_crim2 0 1
## 0 58 6
## 1 15 73
## [1] 0.1381579
## indus nox age dis rad tax
## 2.847822 5.480657 1.640029 2.810917 1.514278 1.521426
The logistic regression, with the predictors that have the strongest
correlation with high_crim, has a test error rate of
13.82%.
lda_crim <- lda(high_crim ~ . - high_crim - crim, data = Boston, subset = train)
pred.lda_crim <- predict(lda_crim, Boston.test)
table(pred.lda_crim$class, high_crim.test)## high_crim.test
## 0 1
## 0 70 20
## 1 3 59
## [1] 0.1513158
The full LDA model has a test error rate of 15.13%.
lda_crim2 <- lda(high_crim ~ indus + nox + age + dis + rad + tax - high_crim - crim,
data = Boston, subset = train)
pred.lda_crim2 <- predict(lda_crim2, Boston.test)
table(pred.lda_crim2$class, high_crim.test)## high_crim.test
## 0 1
## 0 71 20
## 1 2 59
## [1] 0.1447368
The LDA model with the predictors that have the strongest correlation
with high_crim has a test error rate of 14.47%.
nb_crim <- naiveBayes(high_crim ~ . - high_crim - crim, data = Boston, subset = train)
nb_crim <- predict(nb_crim, Boston.test)
table(nb_crim, high_crim.test)## high_crim.test
## nb_crim 0 1
## 0 68 20
## 1 5 59
## [1] 0.1644737
The full Naives Bayes model has a test error rate of 16.45%.
nb_crim2 <- naiveBayes(high_crim ~ indus + nox + age + dis + rad + tax
- high_crim - crim, data = Boston, subset = train)
nb_crim2 <- predict(nb_crim2, Boston.test)
table(nb_crim2, high_crim.test)## high_crim.test
## nb_crim2 0 1
## 0 65 19
## 1 8 60
## [1] 0.1776316
The Naive Bayes model with the predictors that have the strongest
correlation with high_crim has a test error rate of
17.76%.
train.X <- Boston[train, c("zn", "indus", "chas", "nox", "rm", "age",
"dis", "rad", "tax", "ptratio", "black",
"lstat", "medv")]
test.X <- Boston[-train, c("zn", "indus", "chas", "nox", "rm", "age",
"dis", "rad", "tax", "ptratio", "black",
"lstat", "medv")]
train.high_crim <- high_crim[train]
set.seed(1)
pred.knn_crim <- knn(train.X, test.X, train.high_crim, k = 1)
table(pred.knn_crim, high_crim.test)## high_crim.test
## pred.knn_crim 0 1
## 0 65 6
## 1 8 73
## [1] 0.09210526
The KNN model where K=1 has a test error rate of 9.21%.
pred.knn_crim2 <- knn(train.X, test.X, train.high_crim, k = 10)
table(pred.knn_crim2, high_crim.test)## high_crim.test
## pred.knn_crim2 0 1
## 0 59 9
## 1 14 70
## [1] 0.1513158
The KNN model where K=10 has a test error rate of 15.13%.
pred.knn_crim3 <- knn(train.X, test.X, train.high_crim, k = 100)
table(pred.knn_crim3, high_crim.test)## high_crim.test
## pred.knn_crim3 0 1
## 0 70 24
## 1 3 55
## [1] 0.1776316
The KNN model where K=100 has a test error rate of 17.76%.