Problem 10 - Part A
library(ISLR)    # Weekly and Auto data sets
library(MASS)    # lda(), qda(), and the Boston data set
library(class)   # knn()
weekly <- Weekly
attach(weekly)
summary(weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
The pairs plot suggests that there may be a relationship between the Volume and Year variables. At a glance, no other pair of variables shows an easily identifiable relationship.
pairs(weekly)
plot(Volume)    # trading volume plotted in index (time) order
The correlation table provides further evidence of a relationship between the Volume and Year variables: their correlation of about 0.842 is far larger in magnitude than any other pairwise correlation in this data.
cor(weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
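As a quick programmatic check of that claim (a minimal sketch; cc is just a throwaway name), the largest off-diagonal correlation can be located directly:
cc <- cor(weekly[,-9])                           # drop the Direction factor
diag(cc) <- 0                                    # ignore the trivial self-correlations
which(abs(cc) == max(abs(cc)), arr.ind=TRUE)     # should point to the Year/Volume pair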
Problem 10 - Part B
The only predictor in the logistic regression model that appears to be statistically significant is Lag2, which has a relatively small p-value (0.0296). The remaining predictors all have large p-values and are not statistically significant.
weeklyGLM <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=weekly, family=binomial)
summary(weeklyGLM)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Problem 10 - Part C
weeklyProb.1 <- predict(weeklyGLM, type="response")
weeklyPred.1 <- rep("Down", length(weeklyProb.1))
weeklyPred.1[weeklyProb.1 > 0.5] = "Up"
The model makes 611 correct predictions out of 1089 in total, a classification accuracy of about 56.1% and an error rate of about 43.9%; since these predictions are made on the same data used to fit the model, these are training rates rather than test rates. Breaking the confusion matrix down by class shows that the predictions are mostly correct in weeks when the market goes up but mostly wrong in weeks when it goes down: accuracy is about 92% for "Up" weeks and only about 11% for "Down" weeks.
table(weeklyPred.1, Direction)
## Direction
## weeklyPred.1 Down Up
## Down 54 48
## Up 430 557
54+557
## [1] 611
54+48+430+557
## [1] 1089
(54+557)/(54+48+430+557)
## [1] 0.5610652
557/(557+48)
## [1] 0.9206612
54/(430+54)
## [1] 0.1115702
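The overall accuracy can also be computed in one line from the prediction vector, which avoids the manual arithmetic above:
mean(weeklyPred.1 == Direction)    # should match (54+557)/1089, about 0.561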
Problem 10 - Part D
weeklyTrain <- (Year < 2009)
weekly.0910 <- weekly[!weeklyTrain, ]
direction.0910 <- Direction[!weeklyTrain]
weeklyGLM.2 <- glm(Direction~Lag2, data=weekly, family=binomial, subset=weeklyTrain)
weeklyProb.2 <- predict(weeklyGLM.2, weekly.0910, type="response")
weeklyPred.2 <- rep("Down", length(weeklyProb.2))
weeklyPred.2[weeklyProb.2 > 0.5] = "Up"
summary(weeklyGLM.2)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = weekly,
## subset = weeklyTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
The logistic regression model with Lag2 as the only predictor, fit on the 1990 through 2008 training data and evaluated on the held-out 2009/2010 data, makes 65 correct predictions out of 104, a test classification accuracy of exactly 62.5%.
table(weeklyPred.2, direction.0910)
## direction.0910
## weeklyPred.2 Down Up
## Down 9 5
## Up 34 56
9+56
## [1] 65
9+56+5+34
## [1] 104
mean(weeklyPred.2 == direction.0910)
## [1] 0.625
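Breaking the test-set confusion matrix down by class, as in Part C (the column totals of 43 "Down" weeks and 61 "Up" weeks come from the table above, so these are simple arithmetic):
56/(56+5)    # accuracy in weeks the market went up, roughly 91.8%
9/(9+34)     # accuracy in weeks the market went down, roughly 20.9%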
Problem 10 - Part E
weeklyLDA <- lda(Direction~Lag2, data=weekly, subset=weeklyTrain)
weeklyPred.3 <- predict(weeklyLDA, weekly.0910)
Similar to the logistic regression above, the LDA model makes 65 correct predictions out of the 104 held-out 2009/2010 observations, a test classification accuracy of exactly 62.5%.
table(weeklyPred.3$class, direction.0910)
## direction.0910
## Down Up
## Down 9 5
## Up 34 56
mean(weeklyPred.3$class == direction.0910)
## [1] 0.625
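predict() on an lda fit also returns posterior class probabilities; a quick sketch of how they relate to the class labels above (no new model objects are introduced here):
head(weeklyPred.3$posterior)                 # P(Down) and P(Up) for the first test weeks
sum(weeklyPred.3$posterior[, "Up"] > 0.5)    # should equal the 90 "Up" predictions in the table above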
Problem 10 - Part F
weeklyQDA <- qda(Direction~Lag2, data=weekly, subset=weeklyTrain)
weeklyPred.4 <- predict(weeklyQDA, weekly.0910)
The QDA model makes 61 correct predictions out of the 104 held-out 2009/2010 observations, a test classification accuracy of approximately 58.65%. Note from the confusion matrix that QDA predicts "Up" for every week in the test period.
table(weeklyPred.4$class, direction.0910)
## direction.0910
## Down Up
## Down 0 0
## Up 43 61
mean(weeklyPred.4$class == direction.0910)
## [1] 0.5865385
Problem 10 - Part G
The KNN model with k=1 makes 52 correct predictions out of the 104 held-out 2009/2010 observations, a test classification accuracy of exactly 50%.
weeklyTrain.KNN <- as.matrix(Lag2[weeklyTrain])
weeklyTest.KNN <- as.matrix(Lag2[!weeklyTrain])
trainDirection.1 <- Direction[weeklyTrain]
set.seed(1)
weeklyPred.5 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k = 1)
table(weeklyPred.5, direction.0910)
## direction.0910
## weeklyPred.5 Down Up
## Down 21 30
## Up 22 31
mean(weeklyPred.5 == direction.0910)
## [1] 0.5
Problem 10 - Part H
The methods that provide the best results on this data are actually tied: logistic regression and LDA both achieve a test classification accuracy of 62.5% (a test error rate of 37.5%). This exceeds the QDA and KNN models, which have test accuracies of 58.65% and 50%, respectively.
Problem 10 - Part I
The classification accuracy rate for the LDA model with Lag1 and Lag3 is 59.62%.
weeklyLDA.2 <- lda(Direction~Lag1+Lag3, data=weekly, subset=weeklyTrain)
weeklyPred.6 <- predict(weeklyLDA.2, weekly.0910)
table(weeklyPred.6$class, direction.0910)
## direction.0910
## Down Up
## Down 5 4
## Up 38 57
mean(weeklyPred.6$class == direction.0910)
## [1] 0.5961538
The classification accuracy rate for the QDA model with only the Lag1:Lag3 interaction term is 58.65%. As in Part F, this QDA model predicts "Up" for every test observation.
weeklyQDA.2 <- qda(Direction~Lag1:Lag3, data=weekly, subset=weeklyTrain)
weeklyPred.7 <- predict(weeklyQDA.2, weekly.0910)
table(weeklyPred.7$class, direction.0910)
## direction.0910
## Down Up
## Down 0 0
## Up 43 61
mean(weeklyPred.7$class == direction.0910)
## [1] 0.5865385
The classification accuracy rate for the LDA model with Lag1 and the square of Lag3 is 57.69%.
weeklyLDA.3 <- lda(Direction~Lag1+I(Lag3^2), data=weekly, subset=weeklyTrain)
weeklyPred.8 <- predict(weeklyLDA.3, weekly.0910)
table(weeklyPred.8$class, direction.0910)
## direction.0910
## Down Up
## Down 4 5
## Up 39 56
mean(weeklyPred.8$class == direction.0910)
## [1] 0.5769231
The classification accuracy rate for the logistic regression model with only the Lag1:Lag3 interaction term is 58.65%.
weeklyGLM.3 <- glm(Direction~Lag1:Lag3, data=weekly, family=binomial, subset=weeklyTrain)
weeklyProb.3 <- predict(weeklyGLM.3, weekly.0910, type="response")
weeklyPred.9 <- rep("Down", length(weeklyProb.3))
weeklyPred.9[weeklyProb.3 > 0.5] = "Up"
direction.0910.2 <- Direction[!weeklyTrain]
table(weeklyPred.9, direction.0910.2)
## direction.0910.2
## weeklyPred.9 Down Up
## Down 3 3
## Up 40 58
mean(weeklyPred.9 == direction.0910.2)
## [1] 0.5865385
The classification accuracy rate for the KNN model with k=5 is 53.85%.
set.seed(1)
weeklyPred.10 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k=5)
table(weeklyPred.10, direction.0910)
## direction.0910
## weeklyPred.10 Down Up
## Down 16 21
## Up 27 40
mean(weeklyPred.10 == direction.0910)
## [1] 0.5384615
The classification accuracy rate for the KNN model with k=25 is 52.88%.
set.seed(1)
weeklyPred.11 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k=25)
table(weeklyPred.11, direction.0910)
## direction.0910
## weeklyPred.11 Down Up
## Down 19 25
## Up 24 36
mean(weeklyPred.11 == direction.0910)
## [1] 0.5288462
The classification accuracy rate for the KNN model with k=40 is 55.77%.
set.seed(1)
weeklyPred.12 <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k=40)
table(weeklyPred.12, direction.0910)
## direction.0910
## weeklyPred.12 Down Up
## Down 21 24
## Up 22 37
mean(weeklyPred.12 == direction.0910)
## [1] 0.5576923
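Rather than re-running knn() by hand for each k, a short scan can summarize test accuracy over a grid of k values. This is only a sketch: ks and knnAcc are names introduced here, and because the seed is set once before the loop, the individual accuracies will not exactly reproduce the single-k runs above.
set.seed(1)
ks <- c(1, 5, 10, 25, 40, 60, 80)
knnAcc <- sapply(ks, function(k) {
  pred <- knn(weeklyTrain.KNN, weeklyTest.KNN, trainDirection.1, k=k)
  mean(pred == direction.0910)
})
data.frame(k=ks, accuracy=knnAcc)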
In terms of classification accuracy, the best of the models fit in this part is the LDA model with Lag1 and Lag3 as predictors, which has a test classification accuracy of 59.62% and a test error rate of 40.38%. None of these variants improves on the 62.5% accuracy of the Lag2-only logistic regression and LDA models from Parts D and E.
Problem 11 - Part A
In this chunk, I create the binary variable mpg01: it equals 1 when mpg is above its median and 0 when mpg is at or below its median.
auto <- Auto
attach(auto)
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] = 1
auto <- data.frame(auto, mpg01)
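An equivalent, slightly more compact way to build the indicator (just an alternative; the version above is what the rest of this write-up uses) is ifelse():
mpg01 <- ifelse(mpg > median(mpg), 1, 0)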
Problem 11 - Part B
The scatterplot matrix and correlation table suggest that cylinders, displacement, horsepower and weight are the most useful variables for predicting mpg01. The variable mpg itself is not considered, since mpg01 is constructed directly from it. The variables origin and cylinders are not explored in the boxplots below because they are essentially categorical, taking only a few discrete values.
pairs(auto)
cor(auto[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
The boxplot suggests that horsepower tends to be considerably higher for cars with mpg01 = 0 (mpg at or below the median).
boxplot(horsepower~mpg01, auto)
The boxplot suggests that displacement tends to be higher for cars with mpg01 = 0.
boxplot(displacement~mpg01, auto)
The boxplot suggests that weight tends to be higher for cars with mpg01 = 0.
boxplot(weight~mpg01, auto)
The boxplot suggests that acceleration tends to be slightly lower for cars with mpg01 = 0.
boxplot(acceleration~mpg01, auto)
The boxplot below suggests that cars from earlier model years are more likely to have mpg01 = 0.
boxplot(year~mpg01, auto)
Problem 11 - Part C
In this chunk, I am splitting the data set into 70% train and 30% test.
set.seed(1)
split <- sample(nrow(auto), nrow(auto) * 0.7)
autoTrain <- auto[split,]
autoTest <- auto[-split,]
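A quick sanity check on the split sizes (Auto has 392 rows, and the test-set confusion matrices below all sum to 118):
nrow(autoTrain)    # 274 training observations
nrow(autoTest)     # 118 test observations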
Problem 11 - Part D
The test error rate for the LDA model with the variables cylinders, displacement, horsepower and weight is 11.86%.
autoLDA <- lda(mpg01~cylinders+displacement+horsepower+weight, data=autoTrain)
autoPred.1 <- predict(autoLDA, autoTest)
table(autoPred.1$class, autoTest$mpg01)
##
## 0 1
## 0 50 3
## 1 11 54
mean(autoPred.1$class != autoTest$mpg01)
## [1] 0.1186441
Problem 11 - Part E
The test error rate for the QDA model with the variables cylinders, displacement, horsepower and weight is also 11.86%.
autoQDA <- qda(mpg01~cylinders+displacement+horsepower+weight, data=autoTrain)
autoPred.2 <- predict(autoQDA, autoTest)
table(autoPred.2$class, autoTest$mpg01)
##
## 0 1
## 0 52 5
## 1 9 52
mean(autoPred.2$class != autoTest$mpg01)
## [1] 0.1186441
Problem 11 - Part F
The test error rate for the logistic regression model with the variables cylinders, displacement, horsepower and weight is 9.32%.
autoGLM <- glm(mpg01~cylinders+displacement+horsepower+weight, data=autoTrain, family=binomial)
autoProb <- predict(autoGLM, autoTest, type="response")
autoPred.3 <- rep(0, length(autoProb))
autoPred.3[autoProb > 0.5] = 1
table(autoPred.3, autoTest$mpg01)
##
## autoPred.3 0 1
## 0 53 3
## 1 8 54
mean(autoPred.3 != autoTest$mpg01)
## [1] 0.09322034
Problem 11 - Part G
The test error rate for the KNN model for K=1 with the variables cylinders, displacement, horsepower and weight is 13.56%.
autoTrain.KNN <- cbind(autoTrain$cylinders, autoTrain$weight, autoTrain$displacement, autoTrain$horsepower)
autoTest.KNN <- cbind(autoTest$cylinders, autoTest$weight, autoTest$displacement, autoTest$horsepower)
set.seed(1)
autoPred.4 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=1)
table(autoPred.4, autoTest$mpg01)
##
## autoPred.4 0 1
## 0 51 6
## 1 10 51
mean(autoPred.4 != autoTest$mpg01)
## [1] 0.1355932
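One caveat worth noting: KNN uses Euclidean distance, so predictors measured on large scales (weight, displacement) dominate the neighbor calculation. Below is a hedged sketch of a standardized variant for comparison; the .sc objects and autoPred.sc are new names introduced here and are not used in the results above or below.
# standardize the test set using the training means and standard deviations
trainMeans <- colMeans(autoTrain.KNN)
trainSDs <- apply(autoTrain.KNN, 2, sd)
autoTrain.KNN.sc <- scale(autoTrain.KNN, center=trainMeans, scale=trainSDs)
autoTest.KNN.sc <- scale(autoTest.KNN, center=trainMeans, scale=trainSDs)
set.seed(1)
autoPred.sc <- knn(autoTrain.KNN.sc, autoTest.KNN.sc, autoTrain$mpg01, k=1)
mean(autoPred.sc != autoTest$mpg01)    # test error rate with standardized predictors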
The test error rate for the KNN model for K=5 with the variables cylinders, displacement, horsepower and weight is 12.71%.
set.seed(1)
autoPred.5 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=5)
table(autoPred.5, autoTest$mpg01)
##
## autoPred.5 0 1
## 0 51 5
## 1 10 52
mean(autoPred.5 != autoTest$mpg01)
## [1] 0.1271186
The test error rate for the KNN model for K=10 with the variables cylinders, displacement, horsepower and weight is 14.41%.
set.seed(1)
autoPred.6 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=10)
table(autoPred.6, autoTest$mpg01)
##
## autoPred.6 0 1
## 0 49 5
## 1 12 52
mean(autoPred.6 != autoTest$mpg01)
## [1] 0.1440678
The test error rate for the KNN model for K=3 with the variables cylinders, displacement, horsepower and weight is 11.02%. Of the values of K tried here, K=3 performs best.
set.seed(1)
autoPred.7 <- knn(autoTrain.KNN, autoTest.KNN, autoTrain$mpg01, k=3)
table(autoPred.7, autoTest$mpg01)
##
## autoPred.7 0 1
## 0 52 4
## 1 9 53
mean(autoPred.7 != autoTest$mpg01)
## [1] 0.1101695
Problem 13
boston <- Boston
attach(boston)
In this chunk, I create the binary variable crim01: it equals 1 when crim is above its median and 0 when crim is at or below its median.
crim01 <- rep(0, length(crim))
crim01[crim > median(crim)] = 1
boston <- data.frame(boston, crim01)
In this chunk, I am splitting the data set into 70% train and 30% test.
set.seed(1)
split.2 <- sample(nrow(boston), nrow(boston) * 0.7)
bostonTrain <- boston[split.2,]
bostonTest <- boston[-split.2,]
summary(boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim01
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
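One way to guide the choice of predictors (a quick sketch, not a formal selection procedure) is to rank the variables by the absolute value of their correlation with crim01; all columns of boston are numeric, so cor() applies directly. The top entries crim01 and crim are trivial and can be ignored.
sort(abs(cor(boston)[, "crim01"]), decreasing=TRUE)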
The classification accuracy rate for the LDA model with the predictors nox, age, dis, ptratio, lstat and medv is 84.87%.
bostonLDA <- lda(crim01~nox+age+dis+ptratio+lstat+medv, data=bostonTrain)
bostonPred.1 <- predict(bostonLDA, bostonTest)
table(bostonPred.1$class, bostonTest$crim01)
##
## 0 1
## 0 66 16
## 1 7 63
mean(bostonPred.1$class == bostonTest$crim01)
## [1] 0.8486842
The classification accuracy rate for the logistic regression model with the predictors nox, age, tax and dis is 81.58%.
bostonGLM <- glm(crim01~nox+age+tax+dis, data=bostonTrain, family=binomial)
bostonProb.1 = predict(bostonGLM, bostonTest, type="response")
bostonPred.2 = rep(0, length(bostonProb.1))
bostonPred.2[bostonProb.1 > 0.5] = 1
table(bostonPred.2, bostonTest$crim01)
##
## bostonPred.2 0 1
## 0 56 11
## 1 17 68
mean(bostonPred.2 == bostonTest$crim01)
## [1] 0.8157895
The classification accuracy rate for the logistic regression model with the predictors nox, age, dis, ptratio, lstat and medv is 85.53%, the best result among the models fit for this problem.
bostonGLM.2 <- glm(crim01~nox+age+dis+ptratio+lstat+medv, data=bostonTrain, family=binomial)
bostonProb.2 = predict(bostonGLM.2, bostonTest, type="response")
bostonPred.3 = rep(0, length(bostonProb.2))
bostonPred.3[bostonProb.2 > 0.5] = 1
table(bostonPred.3, bostonTest$crim01)
##
## bostonPred.3 0 1
## 0 61 10
## 1 12 69
mean(bostonPred.3 == bostonTest$crim01)
## [1] 0.8552632
The classification accuracy rate for the LDA model with the predictors nox and age is 84.87%.
bostonLDA.2 <- lda(crim01~nox+age, data=bostonTrain)
bostonPred.4 <- predict(bostonLDA.2, bostonTest)
table(bostonPred.4$class, bostonTest$crim01)
##
## 0 1
## 0 62 12
## 1 11 67
mean(bostonPred.4$class == bostonTest$crim01)
## [1] 0.8486842
The classification accuracy rate for the KNN model with k=1 and the predictors age, dis and medv is 78.29%.
bostonTrain.KNN <- cbind(bostonTrain$age, bostonTrain$dis, bostonTrain$medv)
bostonTest.KNN <- cbind(bostonTest$age, bostonTest$dis, bostonTest$medv)
set.seed(1)
bostonPred.5 <- knn(bostonTrain.KNN, bostonTest.KNN, bostonTrain$crim01, k=1)
table(bostonPred.5, bostonTest$crim01)
##
## bostonPred.5 0 1
## 0 55 15
## 1 18 64
mean(bostonPred.5 == bostonTest$crim01)
## [1] 0.7828947
The classification accuracy rate for the KNN model with k=10 and the predictors age, dis and medv is 80.26%.
set.seed(1)
bostonPred.6 <- knn(bostonTrain.KNN, bostonTest.KNN, bostonTrain$crim01, k=10)
table(bostonPred.6, bostonTest$crim01)
##
## bostonPred.6 0 1
## 0 59 16
## 1 14 63
mean(bostonPred.6 == bostonTest$crim01)
## [1] 0.8026316
The classification accuracy rate for the KNN model with k=20 and the predictors age, dis and medv is 82.89%. Even at k=20, KNN with these three predictors does not match the LDA and logistic regression models above.
set.seed(1)
bostonPred.7 <- knn(bostonTrain.KNN, bostonTest.KNN, bostonTrain$crim01, k=20)
table(bostonPred.7, bostonTest$crim01)
##
## bostonPred.7 0 1
## 0 60 13
## 1 13 66
mean(bostonPred.7 == bostonTest$crim01)
## [1] 0.8289474
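Finally, as with the Weekly data, a short scan over k (a sketch only; kGrid and bostonKnnAcc are names introduced here, and the exact accuracies will vary slightly with the random tie-breaking inside knn()) gives a quick picture of how sensitive these KNN results are to the choice of k:
set.seed(1)
kGrid <- c(1, 5, 10, 20, 50, 100)
bostonKnnAcc <- sapply(kGrid, function(k) {
  pred <- knn(bostonTrain.KNN, bostonTest.KNN, bostonTrain$crim01, k=k)
  mean(pred == bostonTest$crim01)
})
data.frame(k=kGrid, accuracy=bostonKnnAcc)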