library(ISLR)
library(MASS)
library(class)
# Seed the random number generator so later random steps are reproducible.
set.seed(1)
# Load the Weekly data set and print summary statistics for each column.
data("Weekly")
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# Correlation matrix of the numeric columns. Direction (the 9th column) is a
# factor, so it is excluded -- by name rather than by position.
cor(Weekly[, names(Weekly) != "Direction"])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Scatterplot matrix of every pair of columns in Weekly.
pairs(Weekly)
summary() shows that the five lag variables have nearly identical distributions. Pairwise correlations are mostly weak (absolute value below 0.1); the one exception is Year and Volume, which are strongly positively correlated (about 0.84).
# Logistic regression of Direction on the five lags and Volume, fit on the
# full Weekly data set.
lg_fit <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Weekly,
  family = binomial
)
summary(lg_fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Lag2 is the only predictor that is statistically significant at the 5% level (p = 0.0296, marked with *); none of the other lags nor Volume is significant.
# Fitted probabilities on the data the model was trained on (no newdata).
lg_prediction <- predict(lg_fit, type = "response")
# Classify as "Up" when the fitted probability exceeds 0.5, otherwise "Down".
lg_prediction_results <- ifelse(unname(lg_prediction) > 0.5, "Up", "Down")
# Confusion matrix: predicted (rows) versus observed Direction (columns).
table(lg_prediction_results, Weekly$Direction)
##
## lg_prediction_results Down Up
## Down 54 48
## Up 430 557
The confusion matrix tells us that the training error rate is approximately 43.89%, as shown below; because the model is evaluated on the same data it was fit to, this is not a test error rate and is likely optimistic. It can also be observed that when the market went “Up” the predictions were right approximately 92.07% (557 / 605) of the time; conversely, when the market went “Down” the predictions were right approximately 11.16% (54 / 484) of the time.
# Fraction of predictions that differ from the observed Direction, measured on
# the same rows the model was fit to.
mean(lg_prediction_results != Weekly$Direction)
## [1] 0.4389348
# Hold-out evaluation: train on 1990-2008, test on 2009 onwards.
weekly_training_data <- Weekly[Weekly$Year < 2009, ]
weekly_testing_data <- Weekly[Weekly$Year > 2008, ]
# Logistic regression with Lag2 as the only predictor.
lg_fit_2 <- glm(Direction ~ Lag2, data = weekly_training_data, family = "binomial")
lg_prediction_2 <- predict(lg_fit_2, weekly_testing_data, type = "response")
# Classify as "Up" when the predicted probability exceeds 0.5.
lg_prediction_results_2 <- rep("Down", length(lg_prediction_2))
lg_prediction_results_2[lg_prediction_2 > 0.5] <- "Up"
# the confusion matrix (reuses the test subset instead of re-deriving it)
table(lg_prediction_results_2, weekly_testing_data$Direction)
##
## lg_prediction_results_2 Down Up
## Down 9 5
## Up 34 56
# the overall fraction of correct predictions
mean(lg_prediction_results_2 == weekly_testing_data$Direction)
## [1] 0.625
# LDA with Lag2 as the only predictor, trained on the pre-2009 weeks.
lda_fit <- lda(Direction ~ Lag2, data = weekly_training_data)
# predict() for lda objects has no `type` argument (it was silently ignored),
# so it is omitted; the predicted labels are in $class.
lda_prediction <- predict(lda_fit, weekly_testing_data)
# Confusion matrix: predicted (rows) versus observed (columns) on the test weeks.
table(lda_prediction$class, weekly_testing_data$Direction)
##
## Down Up
## Down 9 5
## Up 34 56
# Overall fraction of correct predictions.
mean(lda_prediction$class == weekly_testing_data$Direction)
## [1] 0.625
As can be seen, the confusion matrix and the overall accuracy (62.5%) are identical to those of the logistic regression model.
# QDA with Lag2 as the only predictor, trained on the pre-2009 weeks.
qda_fit <- qda(Direction ~ Lag2, data = weekly_training_data)
# predict() for qda objects has no `type` argument (it was silently ignored),
# so it is omitted; the predicted labels are in $class.
qda_prediction <- predict(qda_fit, weekly_testing_data)
# Confusion matrix: predicted (rows) versus observed (columns) on the test weeks.
table(qda_prediction$class, weekly_testing_data$Direction)
##
## Down Up
## Down 0 0
## Up 43 61
# Overall fraction of correct predictions.
mean(qda_prediction$class == weekly_testing_data$Direction)
## [1] 0.5865385
The overall fraction of correct predictions decreased with this model, to approximately 58.65%; QDA predicted “Up” for every week in the test set.
# knn() takes the predictors as matrices, so the single Lag2 column is wrapped
# into a one-column matrix for both the training and the test weeks.
knn_weekly_training_data <- as.matrix(weekly_training_data$Lag2)
knn_weekly_testing_data <- as.matrix(weekly_testing_data$Lag2)
# Class labels are passed as the original factor, which is what knn() documents
# for `cl`, instead of being converted to a character matrix.
knn_weekly_training_data_direction <- weekly_training_data$Direction
knn_weekly_prediction <- knn(knn_weekly_training_data, knn_weekly_testing_data, knn_weekly_training_data_direction, k = 1)
# Confusion matrix: predicted (rows) versus observed (columns) on the test weeks.
table(knn_weekly_prediction, weekly_testing_data$Direction)
##
## knn_weekly_prediction Down Up
## Down 21 30
## Up 22 31
# Overall fraction of correct predictions.
mean(knn_weekly_prediction == weekly_testing_data$Direction)
## [1] 0.5
Although the model was right only 50% of the time, a decrease from the prior models, it did noticeably better when the market went “Down”: 21 of the 43 “Down” weeks (about 48.84%) were predicted correctly.
Based on the overall fraction of correct predictions, logistic regression and LDA performed best (62.5%), followed by QDA (about 58.65%), with KNN (K = 1) coming in last (50%). Again, it should be noted that KNN showed improvement over the others when the market went “Down”.
# LDA with a Lag2:Volume interaction term.
# NOTE(review): the object names say "lr", but the model fitted here is lda(),
# not a logistic regression; the names are kept so existing references work.
fit_interaction_lr_1 <- lda(Direction ~ Lag2:Volume, data = weekly_training_data)
# predict() for lda objects has no `type` argument (it was silently ignored),
# so it is omitted; the predicted labels are in $class.
prediction_interaction_lr_1 <- predict(fit_interaction_lr_1, weekly_testing_data)
# Confusion matrix: predicted (rows) versus observed (columns) on the test weeks.
table(prediction_interaction_lr_1$class, weekly_testing_data$Direction)
##
## Down Up
## Down 8 6
## Up 35 55
# Overall fraction of correct predictions.
mean(prediction_interaction_lr_1$class == weekly_testing_data$Direction)
## [1] 0.6057692
# QDA on a transformed predictor: the square root of the absolute value of Lag2.
fit_qda_transformation <- qda(
  Direction ~ sqrt(abs(Lag2)),
  data = weekly_training_data
)
prediction_qda_transformation <- predict(
  fit_qda_transformation,
  newdata = weekly_testing_data
)
# Predicted class (rows) against observed Direction (columns) for the test weeks.
table(prediction_qda_transformation[["class"]], weekly_testing_data[["Direction"]])
##
## Down Up
## Down 0 0
## Up 43 61
# Share of test weeks classified correctly.
mean(prediction_qda_transformation[["class"]] == weekly_testing_data[["Direction"]])
## [1] 0.5865385
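Of the interaction model and QDA with transformation, the interaction model had the higher overall fraction of correct predictions (approximately 60.58%, versus approximately 58.65% for QDA with transformation). Note that, although it is labelled as logistic regression, the interaction model above is fit with lda().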
# Test error of KNN on the held-out weeks for every K from 1 to 50.
errors <- array(0, dim = 50)
for (i in seq_along(errors)) {
  knn_weekly_prediction_2 <- knn(
    train = knn_weekly_training_data,
    test = knn_weekly_testing_data,
    cl = knn_weekly_training_data_direction,
    k = i
  )
  errors[i] <- mean(knn_weekly_prediction_2 != weekly_testing_data$Direction)
}
# Error as a function of K: points first, then a connecting line.
plot(errors, xlab = "K value", ylab = "Test error")
lines(errors, pch = 16)
# Smallest test error over the K values tried ...
min(errors)
## [1] 0.3846154
# ... and every K that attains it.
which(errors == min(errors))
## [1] 47
As can be seen, K = 47 gave the lowest test error rate of approximately 38.46%.
# Load the Auto data set and print summary statistics for each column.
data("Auto")
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# Binary response: 1 when a car's mpg is above the median mpg, 0 otherwise.
mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))
# Append the indicator to the original columns as a new column named mpg01.
Auto_part_a <- data.frame(Auto, mpg01 = mpg01)
summary(Auto_part_a)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
## mpg01
## Min. :0.0
## 1st Qu.:0.0
## Median :0.5
## Mean :0.5
## 3rd Qu.:1.0
## Max. :1.0
##
# Correlation matrix of the numeric columns. The `name` column (the 9th) is not
# numeric, so it is excluded -- by name rather than by position.
cor(Auto_part_a[, names(Auto_part_a) != "name"])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
# Scatterplot matrix of all columns, followed by one boxplot per candidate
# predictor, split by mpg01.
pairs(Auto_part_a)
local({
  # Predictor column -> plot title, in the order the plots are drawn.
  plot_titles <- c(
    displacement = "Displacement",
    weight = "Weight",
    cylinders = "Cylinders",
    horsepower = "Horsepower"
  )
  for (predictor in names(plot_titles)) {
    boxplot(
      reformulate("mpg01", response = predictor),
      data = Auto_part_a,
      main = plot_titles[[predictor]]
    )
  }
})
Based on the outputs above, mpg01 is most strongly associated with cylinders, weight, displacement, and horsepower (correlations with mpg01 between about -0.67 and -0.76).
# Reproducible 75% / 25% train/test split of the Auto data.
set.seed(1)
# floor() makes the sample size an explicit whole number instead of relying on
# sample() to truncate a fractional size.
training_amount <- floor(nrow(Auto_part_a) * 0.75)
training_index <- sample(nrow(Auto_part_a), size = training_amount)
training_data <- Auto_part_a[training_index, ]
testing_data <- Auto_part_a[-training_index, ]
library(MASS)
# LDA of mpg01 on cylinders, weight, displacement and horsepower. The call is
# kept exactly as written because the printed fit echoes it.
lda_fit <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
lda_fit
## Call:
## lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
##
## Prior probabilities of groups:
## 0 1
## 0.4863946 0.5136054
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.804196 3625.434 273.8881 129.60839
## 1 4.198675 2347.728 118.0265 79.66225
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.3975941148
## weight -0.0008601142
## displacement -0.0038389582
## horsepower 0.0048106758
# Classify the held-out cars with the LDA fit.
lda_prediction <- predict(lda_fit, newdata = testing_data)
# Predicted class (rows) against the observed mpg01 of the test rows (columns).
table(lda_prediction$class, testing_data$mpg01)
##
## 0 1
## 0 42 2
## 1 11 43
# Test error rate: share of held-out cars that were misclassified.
mean(lda_prediction$class != testing_data$mpg01)
## [1] 0.1326531
As shown, the test error rate is approximately 13.27%.
# QDA of mpg01 on cylinders, weight, displacement and horsepower, fit on the
# training rows; printing the fit shows the class priors and group means.
qda_fit <- qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
qda_fit
## Call:
## qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = training_data)
##
## Prior probabilities of groups:
## 0 1
## 0.4863946 0.5136054
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.804196 3625.434 273.8881 129.60839
## 1 4.198675 2347.728 118.0265 79.66225
# Classify the held-out cars with the QDA fit.
qda_prediction <- predict(qda_fit, newdata = testing_data)
# Predicted class (rows) against the observed mpg01 of the test rows (columns).
table(qda_prediction$class, testing_data$mpg01)
##
## 0 1
## 0 45 4
## 1 8 41
# Test error rate: share of held-out cars that were misclassified.
mean(qda_prediction$class != testing_data$mpg01)
## [1] 0.122449
As shown, the given test error rate is approximately 12.24%.
# Logistic regression of mpg01 on cylinders, weight, displacement and
# horsepower, fit on the training rows only.
lr_fit <- glm(
  mpg01 ~ cylinders + weight + displacement + horsepower,
  family = binomial,
  data = training_data
)
lr_fit
##
## Call: glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = training_data)
##
## Coefficients:
## (Intercept) cylinders weight displacement horsepower
## 11.114795 0.018886 -0.001956 -0.012320 -0.035758
##
## Degrees of Freedom: 293 Total (i.e. Null); 289 Residual
## Null Deviance: 407.4
## Residual Deviance: 161.4 AIC: 171.4
# Predicted probabilities for the held-out cars.
lr_prediction <- predict(lr_fit, testing_data, type = "response")
# Classify as 1 when the probability exceeds 0.5. This spells out the threshold
# that round() applied implicitly (round(0.5) is 0 in R).
lr_prediction_final <- ifelse(lr_prediction > 0.5, 1, 0)
# Confusion matrix: predicted (rows) versus observed mpg01 (columns).
table(lr_prediction_final, testing_data$mpg01)
##
## lr_prediction_final 0 1
## 0 45 2
## 1 8 43
# Test error rate.
mean(lr_prediction_final != testing_data$mpg01)
## [1] 0.1020408
It can be seen that the test error rate is approximately 10.20%.
# Predictor columns used for KNN.
# NOTE(review): this set uses acceleration where the LDA, QDA and logistic
# models in this script use displacement -- confirm that is intended.
# NOTE(review): the predictors are not standardized and knn() uses Euclidean
# distance, so large-valued columns such as weight will dominate -- consider
# scale().
knn_predictors <- c("cylinders", "horsepower", "weight", "acceleration")
knn_training_data <- training_data[, knn_predictors]
knn_testing_data <- testing_data[, knn_predictors]
# Labels come from the same data frames as the predictors.
knn_prediction <- knn(knn_training_data, knn_testing_data, training_data$mpg01, k = 1)
# Confusion matrix: predicted (rows) versus observed mpg01 (columns).
table(knn_prediction, testing_data$mpg01)
##
## knn_prediction 0 1
## 0 44 8
## 1 9 37
# Test error rate.
mean(knn_prediction != testing_data$mpg01)
## [1] 0.1734694
Here, the test error rate is approximately 17.35% for K = 1. This is higher than the test error rates of the previous models, which is not desirable. Therefore, we loop over K = 1 to 50 below to find a better value for K.
# Test error of KNN on the held-out cars for every K from 1 to 50.
errors <- array(0, dim = 50)
for (i in seq_along(errors)) {
  knn_prediction <- knn(
    train = knn_training_data,
    test = knn_testing_data,
    cl = mpg01[training_index],
    k = i
  )
  errors[i] <- mean(knn_prediction != mpg01[-training_index])
}
# Error as a function of K: points first, then a connecting line.
plot(errors, xlab = "K value", ylab = "Test error")
lines(errors, pch = 16)
# Smallest test error over the K values tried ...
min(errors)
## [1] 0.1020408
# ... and every K that attains it.
which(errors == min(errors))
## [1] 3
As shown, K = 3 gave the lowest test error rate of approximately 10.20%. As depicted in the plot above, the next lowest test error rate was given by K = 5 of approximately 11.22% (outputted below).
# Test error recorded for K = 5 by the loop above.
errors[5]
## [1] 0.1122449
# Load the Boston data set and print summary statistics for each column.
data("Boston")
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Binary response: 1 when crim is above its median, 0 otherwise.
crim01 <- as.numeric(Boston$crim > median(Boston$crim))
boston_df <- data.frame(Boston, crim01)
# Reproducible 75% / 25% train/test split.
set.seed(1)
# nrow * 0.75 is not a whole number for this data set, so floor() it explicitly
# instead of relying on sample() to truncate a fractional size.
boston_training_amount <- floor(nrow(boston_df) * 0.75)
boston_training_index <- sample(nrow(boston_df), size = boston_training_amount)
boston_training_data <- boston_df[boston_training_index, ]
boston_testing_data <- boston_df[-boston_training_index, ]
# Correlation matrix of all columns, including the derived crim01.
cor(boston_df)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv crim01
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim01 -0.35121093 0.4532627 -0.2630167 1.00000000
# LDA on every predictor except the crime variables (crim and the derived crim01).
boston_lda_fit <- lda(crim01 ~ . - crim01 - crim, data = boston_training_data)
boston_prediction_lda <- predict(boston_lda_fit, newdata = boston_testing_data)
# Predicted class (rows) against the observed crim01 of the test rows (columns).
table(boston_prediction_lda$class, boston_testing_data$crim01)
##
## 0 1
## 0 59 15
## 1 4 49
# Print the fit: class priors, group means and discriminant coefficients.
boston_lda_fit
## Call:
## lda(crim01 ~ . - crim01 - crim, data = boston_training_data)
##
## Prior probabilities of groups:
## 0 1
## 0.5013193 0.4986807
##
## Group means:
## zn indus chas nox rm age dis rad
## 0 21.757895 7.103263 0.06315789 0.4694826 6.443258 50.56105 5.060515 4.157895
## 1 1.079365 15.403016 0.08994709 0.6363175 6.174714 85.66190 2.540708 14.968254
## tax ptratio black lstat medv
## 0 307.4632 17.87579 388.0490 9.241211 25.34895
## 1 512.4550 19.10317 320.9546 15.866455 20.26032
##
## Coefficients of linear discriminants:
## LD1
## zn -0.0085230246
## indus 0.0150302363
## chas -0.1728111352
## nox 8.6054129758
## rm 0.1119278355
## age 0.0121565634
## dis 0.1284274358
## rad 0.0745769280
## tax -0.0010170190
## ptratio 0.0657333031
## black -0.0009543915
## lstat 0.0279785001
## medv 0.0460937756
# Test error rate of the LDA fit: share of held-out rows that were misclassified.
mean(boston_prediction_lda$class != boston_testing_data$crim01)
## [1] 0.1496063
For the LDA model after only removing the crime variables, we arrived at a test error rate of approximately 14.96%.
# LDA on a reduced predictor set: chas, nox, rm, ptratio and medv.
boston_lda_fit_2 <- lda(
  crim01 ~ chas + nox + rm + ptratio + medv,
  data = boston_training_data
)
boston_prediction_lda_2 <- predict(boston_lda_fit_2, newdata = boston_testing_data)
# Predicted class (rows) against the observed crim01 of the test rows (columns).
table(boston_prediction_lda_2$class, boston_testing_data$crim01)
##
## 0 1
## 0 56 16
## 1 7 48
# Test error rate.
mean(boston_prediction_lda_2$class != boston_testing_data$crim01)
## [1] 0.1811024
For the LDA model after removing some of the predictors, we arrived at a test error rate of approximately 18.11%.
# LDA on four predictors only: chas, nox, dis and tax.
boston_lda_fit_3 <- lda(
  crim01 ~ chas + nox + dis + tax,
  data = boston_training_data
)
boston_prediction_lda_3 <- predict(boston_lda_fit_3, newdata = boston_testing_data)
# Predicted class (rows) against the observed crim01 of the test rows (columns).
table(boston_prediction_lda_3$class, boston_testing_data$crim01)
##
## 0 1
## 0 59 15
## 1 4 49
# Test error rate.
mean(boston_prediction_lda_3$class != boston_testing_data$crim01)
## [1] 0.1496063
The LDA model here was able to be reduced to 4 predictors while giving the same test error rate of approximately 14.96% as when only the crime variables were removed.
# Logistic regression on every predictor except the crime variables
# (crim and the derived crim01), fit on the training rows.
boston_lr_fit_1 <- glm(
  crim01 ~ . - crim01 - crim,
  data = boston_training_data,
  family = binomial
)
boston_lr_fit_1
##
## Call: glm(formula = crim01 ~ . - crim01 - crim, family = binomial,
## data = boston_training_data)
##
## Coefficients:
## (Intercept) zn indus chas nox rm
## -44.511620 -0.115935 -0.085428 0.204896 60.075029 -0.422718
## age dis rad tax ptratio black
## 0.027567 1.262936 0.641510 -0.006109 0.309168 -0.009583
## lstat medv
## 0.111155 0.210806
##
## Degrees of Freedom: 378 Total (i.e. Null); 365 Residual
## Null Deviance: 525.4
## Residual Deviance: 147.5 AIC: 175.5
# Predicted probabilities for the held-out rows.
boston_prediction_lr_1 <- predict(boston_lr_fit_1, boston_testing_data, type = "response")
# Classify as 1 when the probability exceeds 0.5. This spells out the threshold
# that round() applied implicitly (round(0.5) is 0 in R).
lr_prediction_final_1 <- ifelse(boston_prediction_lr_1 > 0.5, 1, 0)
# Confusion matrix: predicted (rows) versus observed crim01 (columns).
table(lr_prediction_final_1, boston_testing_data$crim01)
##
## lr_prediction_final_1 0 1
## 0 51 5
## 1 12 59
# Test error rate.
mean(lr_prediction_final_1 != boston_testing_data$crim01)
## [1] 0.1338583
For the logistic regression model after only removing the crime variables, the test error rate was improved from that of the LDA model to approximately 13.39%.
# Logistic regression on a reduced predictor set: chas, nox, rm, ptratio, medv.
boston_lr_fit_2 <- glm(crim01 ~ chas + nox + rm + ptratio + medv, data = boston_training_data, family = binomial)
boston_prediction_lr_2 <- predict(boston_lr_fit_2, boston_testing_data, type = "response")
# Classify as 1 when the probability exceeds 0.5. This spells out the threshold
# that round() applied implicitly (round(0.5) is 0 in R).
lr_prediction_final_2 <- ifelse(boston_prediction_lr_2 > 0.5, 1, 0)
# Confusion matrix: predicted (rows) versus observed crim01 (columns).
table(lr_prediction_final_2, boston_testing_data$crim01)
##
## lr_prediction_final_2 0 1
## 0 57 12
## 1 6 52
# Test error rate.
mean(lr_prediction_final_2 != boston_testing_data$crim01)
## [1] 0.1417323
For the logistic regression model after removing some of the predictors, an improvement over that of the corresponding LDA model was observed giving a test error rate of approximately 14.17%.
# Logistic regression on four predictors only: chas, nox, dis and tax.
boston_lr_fit_3 <- glm(crim01 ~ chas + nox + dis + tax, data = boston_training_data, family = binomial)
boston_prediction_lr_3 <- predict(boston_lr_fit_3, boston_testing_data, type = "response")
# Classify as 1 when the probability exceeds 0.5. This spells out the threshold
# that round() applied implicitly (round(0.5) is 0 in R).
lr_prediction_final_3 <- ifelse(boston_prediction_lr_3 > 0.5, 1, 0)
# Confusion matrix: predicted (rows) versus observed crim01 (columns).
table(lr_prediction_final_3, boston_testing_data$crim01)
##
## lr_prediction_final_3 0 1
## 0 49 10
## 1 14 54
# Test error rate.
mean(lr_prediction_final_3 != boston_testing_data$crim01)
## [1] 0.1889764
Interestingly, the test error rate of this logistic regression model was higher than that of its LDA counterpart: approximately 18.90% versus 14.96%.
# KNN (K = 1) on the four predictors chas, nox, dis and tax.
# NOTE(review): the predictors are not standardized and knn() uses Euclidean
# distance, so large-valued columns such as tax will dominate -- consider
# scale().
knn_boston_training_data_1 <- boston_training_data[, c("chas", "nox", "dis", "tax")]
knn_boston_testing_data_1 <- boston_testing_data[, c("chas", "nox", "dis", "tax")]
# Labels come from the same data frames as the predictors.
knn_boston_prediction_1 <- knn(knn_boston_training_data_1, knn_boston_testing_data_1, boston_training_data$crim01, k = 1)
# Confusion matrix: predicted (rows) versus observed crim01 (columns).
table(knn_boston_prediction_1, boston_testing_data$crim01)
##
## knn_boston_prediction_1 0 1
## 0 58 2
## 1 5 62
# Test error rate.
mean(knn_boston_prediction_1 != boston_testing_data$crim01)
## [1] 0.05511811
This model with K = 1 and reducing the predictors to the 4 used above gave the best test error rate thus far of approximately 5.51%.
# KNN (K = 1) on every column except the crime variables.
dropping <- c("crim", "crim01")
knn_boston_training_data_2 <- boston_training_data[, !names(boston_training_data) %in% dropping]
# Select the test columns using the test frame's own names; the original used
# the training frame's names, which only works while both share one layout.
knn_boston_testing_data_2 <- boston_testing_data[, !names(boston_testing_data) %in% dropping]
# Labels come from the same data frames as the predictors.
knn_boston_prediction_2 <- knn(knn_boston_training_data_2, knn_boston_testing_data_2, boston_training_data$crim01, k = 1)
# Confusion matrix: predicted (rows) versus observed crim01 (columns).
table(knn_boston_prediction_2, boston_testing_data$crim01)
##
## knn_boston_prediction_2 0 1
## 0 55 5
## 1 8 59
# Test error rate.
mean(knn_boston_prediction_2 != boston_testing_data$crim01)
## [1] 0.1023622
While this model, with K = 1 and only the crim variables removed, gave a better test error rate (approximately 10.24%) than the LDA and logistic regression models, the previous KNN model gave an even lower test error rate, so we explore additional K values for the first KNN model below.
# Test error of the four-predictor KNN model for every K from 1 to 50.
errors <- array(0, dim = 50)
for (i in seq_along(errors)) {
  knn_boston_prediction <- knn(
    train = knn_boston_training_data_1,
    test = knn_boston_testing_data_1,
    cl = crim01[boston_training_index],
    k = i
  )
  errors[i] <- mean(knn_boston_prediction != crim01[-boston_training_index])
}
# Error as a function of K: points first, then a connecting line.
plot(errors, xlab = "K value", ylab = "Test error")
lines(errors, pch = 16)
# Smallest test error over the K values tried ...
min(errors)
## [1] 0.05511811
# ... and every K that attains it.
which(errors == min(errors))
## [1] 1
As shown above, K = 1 that we previously utilized gave the lowest test error rate of all the models covered of approximately 5.51%.