library(ISLR)
## Warning: package 'ISLR' was built under R version 3.5.3
weeklyDf<-Weekly
head(weeklyDf)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
summary(weeklyDf)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
pairs(weeklyDf)
cor(weeklyDf[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
Looking at the summary, the data span 1990 to 2010. The weekly percentage return is captured in the \(Today\) variable, which has a mean of \(0.15\%\), a minimum of \(-18.195\%\), and a maximum of \(12.026\%\). The remaining variables are the percentage returns at lags one through five, the volume of shares traded that week, and \(Direction\), which records whether the market had a positive or negative return for a given week. Looking at the scatterplot matrix, there are no clear patterns between any of the lag variables; the only clear pattern is an exponential increase in the volume traded in a week over the years. This relationship is further supported by the correlation matrix, which shows a strong correlation of \(0.8419\) between \(Volume\) and \(Year\).
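The volume trend is easy to see directly; a quick sketch plotting \(Volume\) against \(Year\):

# Volume grows roughly exponentially over time
plot(weeklyDf$Year, weeklyDf$Volume, xlab = "Year", ylab = "Volume")

### (b)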
fullLogit.fit<-glm(Direction~. - Today - Year,data = weeklyDf, family = 'binomial')
summary(fullLogit.fit)
##
## Call:
## glm(formula = Direction ~ . - Today - Year, family = "binomial",
## data = weeklyDf)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
From the coefficient table it appears that, among the predictors, only \(Lag2\) is significant, with a p-value of \(0.0296\).
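The p-values can also be pulled directly from the fitted model; a small sketch:

# Wald test p-values for each coefficient
summary(fullLogit.fit)$coefficients[, "Pr(>|z|)"]

### (c)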
preds <- predict(fullLogit.fit, type = "response")
direction.pred <- rep("Down",length(preds))
direction.pred[preds > 0.5] <- "Up"
table(direction.pred,weeklyDf$Direction)
##
## direction.pred Down Up
## Down 54 48
## Up 430 557
directionTable<-matrix(table(direction.pred,weeklyDf$Direction),ncol = 2)
(directionTable[1,1] + directionTable[2,2])/sum(directionTable)
## [1] 0.5610652
Looking at the confusion matrix, the model does not do a good job of differentiating whether the market will go up or down: its overall accuracy on the data used to fit it is about \(56.11\%\), so it incorrectly predicts the direction about \(43.89\%\) of the time, and it predicts "Up" for the vast majority of weeks.
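Treating "Up" as the positive class, the confusion matrix can be broken down further; a short sketch computing sensitivity and specificity from the table above:

# Fraction of true "Up" weeks correctly predicted (557/605) and
# fraction of true "Down" weeks correctly predicted (54/484)
sensitivity <- directionTable[2, 2] / sum(directionTable[, 2])
specificity <- directionTable[1, 1] / sum(directionTable[, 1])
c(sensitivity = sensitivity, specificity = specificity)

The model catches about \(92\%\) of the up weeks but only about \(11\%\) of the down weeks, confirming that it almost always predicts "Up".

### (d)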
weeklyDfTrain <- weeklyDf[weeklyDf$Year<2009,]
weeklyDfTest <- weeklyDf[weeklyDf$Year>=2009,]
logit08.fit<-glm(Direction~ Lag2,data = weeklyDfTrain, family = 'binomial')
preds09<- predict(logit08.fit,newdata = weeklyDfTest,type = "response")
direction09.pred <- rep("Down",length(preds09))
direction09.pred[preds09 > 0.5] <- "Up"
table(direction09.pred,weeklyDfTest$Direction)
##
## direction09.pred Down Up
## Down 9 5
## Up 34 56
directionTable09<-matrix(table(direction09.pred,weeklyDfTest$Direction),ncol = 2)
(directionTable09[1,1] + directionTable09[2,2])/sum(directionTable09)
## [1] 0.625
Checking the accuracy on the held-out data, performance improves over the previous model to \(62.5\%\), meaning the model incorrectly predicts the direction about \(37.5\%\) of the time.
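For context, a naive classifier that always predicts "Up" makes a useful baseline; a one-line sketch on the held-out data:

# Fraction of "Up" weeks in the 2009-2010 test set (61/104)
mean(weeklyDfTest$Direction == "Up")

That baseline gets about \(58.65\%\) of the weeks right (61 of 104), so the logistic model's \(62.5\%\) is only a modest improvement.

### (e)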
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.3
lda.fit <- lda(Direction ~ Lag2, data = weeklyDfTrain )
preds09lda<- predict(lda.fit,newdata = weeklyDfTest)
table(preds09lda$class,weeklyDfTest$Direction)
##
## Down Up
## Down 9 5
## Up 34 56
directionTable09lda<-matrix(table(preds09lda$class,weeklyDfTest$Direction),ncol = 2)
(directionTable09lda[1,1] + directionTable09lda[2,2])/sum(directionTable09lda)
## [1] 0.625
The LDA model predicts with the same accuracy as the logistic regression: \(62.5\%\) accuracy and a \(37.5\%\) error rate.
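Since the two confusion matrices are identical, the two models presumably make the same call on every test week; a quick sketch to verify:

# Proportion of test weeks where LDA and logistic regression agree
mean(as.character(preds09lda$class) == direction09.pred)

### (f)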
qda.fit <- qda(Direction ~ Lag2, data = weeklyDfTrain )
preds09qda<- predict(qda.fit,newdata = weeklyDfTest)
table(preds09qda$class,weeklyDfTest$Direction)
##
## Down Up
## Down 0 0
## Up 43 61
directionTable09qda<-matrix(table(preds09qda$class,weeklyDfTest$Direction),ncol = 2)
(directionTable09qda[1,1] + directionTable09qda[2,2])/sum(directionTable09qda)
## [1] 0.5865385
With an accuracy of \(58.65\%\) and an error rate of \(41.35\%\), the QDA model performs worse than the LDA and logistic models; notably, it predicts "Up" for every week in the test set.

### (g)
library(class)
set.seed(1)
preds09knn <- knn(as.matrix(weeklyDfTrain$Lag2),as.matrix(weeklyDfTest$Lag2),weeklyDfTrain$Direction, k = 1)
table(preds09knn,weeklyDfTest$Direction)
##
## preds09knn Down Up
## Down 21 30
## Up 22 31
directionTable09knn<-matrix(table(preds09knn,weeklyDfTest$Direction),ncol = 2)
(directionTable09knn[1,1] + directionTable09knn[2,2])/sum(directionTable09knn)
## [1] 0.5
KNN with \(k=1\) performs the worst out of all the models so far, with an accuracy and an error rate of exactly \(50\%\).

### (h)

Comparing all the models, LDA and logistic regression perform the best, followed by QDA, with KNN performing the worst.
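A small sketch collecting the held-out accuracies reported above makes the ranking explicit:

# Test-set accuracies from parts (d) through (g)
accuracies <- c(logistic = 0.625, lda = 0.625, qda = 0.5865, knn1 = 0.5)
sort(accuracies, decreasing = TRUE)

## 11.

### (a)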
autoDf <- Auto
autoDf$mpg01<-ifelse(autoDf$mpg > median(autoDf$mpg), 1, 0)

### (b)
pairs(autoDf[,-9])
cor(autoDf[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
boxplot(cylinders ~ mpg01, data = autoDf, main = "Cylinders vs. mpg01")
boxplot(displacement ~ mpg01, data = autoDf, main = "Displacement vs. mpg01")
boxplot(horsepower ~ mpg01, data = autoDf, main = "Horsepower vs. mpg01")
boxplot(weight ~ mpg01, data = autoDf, main = "Weight vs. mpg01")
Looking at the scatterplots, the correlations between the variables and \(mpg01\) are hard to judge since \(mpg01\) is a binary variable. The correlation matrix makes the relationships much clearer: \(cylinders\), \(displacement\), and \(weight\) all have strong negative correlations with \(mpg01\), \(horsepower\) has a fairly strong negative correlation, and \(origin\) has a moderate positive correlation. Lastly, \(year\) and \(acceleration\) have fairly weak positive correlations. The boxplots reinforce these findings, with one interesting note: there is little variety in \(cylinders\) among the high-mpg cars, most of which have only 4 cylinders.
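The visual impressions can be made precise by ranking the predictors; a one-line sketch (ignoring \(mpg\) and \(mpg01\) themselves, and dropping the \(name\) column):

# Predictors ranked by |correlation| with mpg01
sort(abs(cor(autoDf[, -9])[, "mpg01"]), decreasing = TRUE)

### (c)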
set.seed(1)
smp_size <- floor(0.8 * nrow(autoDf))
train_ind <- sample(seq_len(nrow(autoDf)), size = smp_size)
autoDfTrain <- autoDf[train_ind, ]
autoDfTest <- autoDf[-train_ind, ]

### (d)
ldampg.fit <- lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = autoDfTrain )
ldampg.fit
## Call:
## lda(mpg01 ~ cylinders + displacement + horsepower + weight, data = autoDfTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.514377 0.485623
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.677019 269.8323 128.71429 3601.379
## 1 4.210526 117.4145 78.57895 2350.526
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.4183687107
## displacement -0.0017457149
## horsepower 0.0028180950
## weight -0.0009283838
predmpg.lda <- predict(ldampg.fit,autoDfTest)
mean(predmpg.lda$class != autoDfTest$mpg01)
## [1] 0.05063291
The LDA model is trained on \(80\%\) of the data, predicting \(mpg01\) from \(cylinders\), \(displacement\), \(horsepower\), and \(weight\). This model does a good job of identifying which cars have an mpg higher than the median, with a test error rate of only about \(5.06\%\).
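The confusion matrix behind that error rate can be sketched as well:

# Rows: LDA predictions; columns: true mpg01 values
table(predicted = predmpg.lda$class, actual = autoDfTest$mpg01)

### (e)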
qdampg.fit <- qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = autoDfTrain )
qdampg.fit
## Call:
## qda(mpg01 ~ cylinders + displacement + horsepower + weight, data = autoDfTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.514377 0.485623
##
## Group means:
## cylinders displacement horsepower weight
## 0 6.677019 269.8323 128.71429 3601.379
## 1 4.210526 117.4145 78.57895 2350.526
predmpg.qda <- predict(qdampg.fit,autoDfTest)
mean(predmpg.qda$class != autoDfTest$mpg01)
## [1] 0.08860759
The QDA model that is fit has a test error rate of about \(8.86\%\), making it slightly worse than the LDA.

### (f)
logitmpg.fit<-glm(mpg01 ~ cylinders + displacement + horsepower + weight, data = autoDfTrain, family = 'binomial')
logitmpg.fit
##
## Call: glm(formula = mpg01 ~ cylinders + displacement + horsepower +
## weight, family = "binomial", data = autoDfTrain)
##
## Coefficients:
## (Intercept) cylinders displacement horsepower weight
## 11.620021 0.113614 -0.014494 -0.050572 -0.001746
##
## Degrees of Freedom: 312 Total (i.e. Null); 308 Residual
## Null Deviance: 433.7
## Residual Deviance: 173.5 AIC: 183.5
predmpg <- predict(logitmpg.fit,autoDfTest, type = "response")
predmpg.logit <- rep(0, length(predmpg))
predmpg.logit[predmpg > 0.5] <- 1
mean(predmpg.logit != autoDfTest$mpg01)
## [1] 0.08860759
The logistic model performs slightly worse than the LDA and the same as the QDA, with an error rate of approximately \(8.86\%\).

### (g)
trainMatrix<- cbind(autoDfTrain$cylinders,autoDfTrain$displacement,autoDfTrain$horsepower,autoDfTrain$weight)
testMatrix<- cbind(autoDfTest$cylinders,autoDfTest$displacement,autoDfTest$horsepower,autoDfTest$weight)
set.seed(1)
predmpg.knn <- knn(as.matrix(trainMatrix),as.matrix(testMatrix),autoDfTrain$mpg01, k = 1)
mean(predmpg.knn != autoDfTest$mpg01)
## [1] 0.1518987
set.seed(1)
predmpg.knn <- knn(as.matrix(trainMatrix),as.matrix(testMatrix),autoDfTrain$mpg01, k = 20)
mean(predmpg.knn != autoDfTest$mpg01)
## [1] 0.1012658
set.seed(1)
predmpg.knn <- knn(as.matrix(trainMatrix),as.matrix(testMatrix),autoDfTrain$mpg01, k = 50)
mean(predmpg.knn != autoDfTest$mpg01)
## [1] 0.07594937
Fitting three different KNN models with \(k = 1\), \(k = 20\), and \(k = 50\) gives error rates of about \(15.19\%\), \(10.13\%\), and \(7.59\%\) respectively. The \(k = 50\) model outperforms the logistic and QDA models, but the LDA model outperforms all of them.
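One caveat: KNN is scale-sensitive, and these predictors span very different ranges (e.g., \(cylinders\) versus \(weight\)). A sketch of how the \(k = 50\) fit could be rerun on standardized predictors (the error rates above use the raw scales):

# Standardize the test set using the training means and standard deviations
trainScaled <- scale(trainMatrix)
testScaled <- scale(testMatrix,
                    center = attr(trainScaled, "scaled:center"),
                    scale = attr(trainScaled, "scaled:scale"))
set.seed(1)
predmpg.knn.scaled <- knn(trainScaled, testScaled, autoDfTrain$mpg01, k = 50)
mean(predmpg.knn.scaled != autoDfTest$mpg01)

## 13.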
bostonDf <- Boston
bostonDf$crim01<-ifelse(bostonDf$crim > median(bostonDf$crim), 1, 0)
cor(bostonDf)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128
## ptratio black lstat medv crim01
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046 0.40939545
## zn -0.3916785 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252 0.60326017
## chas -0.1215152 0.04878848 -0.0539293 0.1752602 0.07009677
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208 0.72323480
## rm -0.3555015 0.12806864 -0.6138083 0.6953599 -0.15637178
## age 0.2615150 -0.27353398 0.6023385 -0.3769546 0.61393992
## dis -0.2324705 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262 0.61978625
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867 0.25356836
## black -0.1773833 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627 0.45326273
## medv -0.5077867 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim01 0.2535684 -0.35121093 0.4532627 -0.2630167 1.00000000
set.seed(1)
train_ind_crim <- sample(seq_len(nrow(bostonDf)), size = floor(0.8 * nrow(bostonDf)))
bostonDfTrain <- bostonDf[train_ind_crim, ]
bostonDfTest <- bostonDf[-train_ind_crim, ]
logitcrim.fit<-glm(crim01 ~ nox + rad + dis, data = bostonDfTrain, family = 'binomial')
logitcrim.fit
##
## Call: glm(formula = crim01 ~ nox + rad + dis, family = "binomial",
## data = bostonDfTrain)
##
## Coefficients:
## (Intercept) nox rad dis
## -20.8645 33.2019 0.4026 0.2267
##
## Degrees of Freedom: 403 Total (i.e. Null); 400 Residual
## Null Deviance: 559.7
## Residual Deviance: 210.5 AIC: 218.5
predcrim <- predict(logitcrim.fit,bostonDfTest, type = "response")
predcrim.logit <- rep(0, length(predcrim))
predcrim.logit[predcrim > 0.5] <- 1
mean(predcrim.logit != bostonDfTest$crim01)
## [1] 0.1372549
ldacrim.fit <- lda(crim01 ~ nox + rad + dis, data = bostonDfTrain)
ldacrim.fit
## Call:
## lda(crim01 ~ nox + rad + dis, data = bostonDfTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4851485 0.5148515
##
## Group means:
## nox rad dis
## 0 0.4719224 4.290816 5.018890
## 1 0.6433173 15.307692 2.475816
##
## Coefficients of linear discriminants:
## LD1
## nox 8.18765579
## rad 0.06402159
## dis -0.10223665
predcrim.lda <- predict(ldacrim.fit,bostonDfTest)
mean(predcrim.lda$class != bostonDfTest$crim01)
## [1] 0.1470588
qdacrim.fit <- qda(crim01 ~ nox + rad + dis, data = bostonDfTrain)
qdacrim.fit
## Call:
## qda(crim01 ~ nox + rad + dis, data = bostonDfTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4851485 0.5148515
##
## Group means:
## nox rad dis
## 0 0.4719224 4.290816 5.018890
## 1 0.6433173 15.307692 2.475816
predcrim.qda <- predict(qdacrim.fit,bostonDfTest)
mean(predcrim.qda$class != bostonDfTest$crim01)
## [1] 0.1176471
trainMatrixCrim<- cbind(bostonDfTrain$nox,bostonDfTrain$rad,bostonDfTrain$dis)
testMatrixCrim<- cbind(bostonDfTest$nox,bostonDfTest$rad,bostonDfTest$dis)
set.seed(1)
predcrim.knn <- knn(as.matrix(trainMatrixCrim),as.matrix(testMatrixCrim),bostonDfTrain$crim01, k = 1)
mean(predcrim.knn != bostonDfTest$crim01)
## [1] 0.1078431
set.seed(1)
predcrim.knn <- knn(as.matrix(trainMatrixCrim),as.matrix(testMatrixCrim),bostonDfTrain$crim01, k = 5)
mean(predcrim.knn != bostonDfTest$crim01)
## [1] 0.1176471
set.seed(1)
predcrim.knn <- knn(as.matrix(trainMatrixCrim),as.matrix(testMatrixCrim),bostonDfTrain$crim01, k = 50)
mean(predcrim.knn != bostonDfTest$crim01)
## [1] 0.1666667
To predict whether the crime rate of a given suburb is above the median crime rate, a logistic regression, an LDA, a QDA, and multiple KNN models are fit using the variables most highly correlated with the crime rate. The first set of models is fit on the top 3 most correlated variables: \(nox\), \(rad\), and \(dis\). The logistic model has an error rate of \(13.73\%\), the LDA \(14.71\%\), the QDA \(11.76\%\), and the best KNN model (\(k = 1\)) \(10.78\%\).
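The variable selection can be read straight off the correlation matrix; a one-line sketch ranking the predictors by absolute correlation with \(crim01\) (ignoring \(crim\) and \(crim01\) themselves):

# Predictors ranked by |correlation| with crim01
sort(abs(cor(bostonDf)[, "crim01"]), decreasing = TRUE)

The top three predictors are \(nox\), \(rad\), and \(dis\), followed by \(age\), \(tax\), and \(indus\).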
logitcrim2.fit<-glm(crim01 ~ nox + rad + dis + age + indus + tax, data = bostonDfTrain, family = 'binomial')
logitcrim2.fit
##
## Call: glm(formula = crim01 ~ nox + rad + dis + age + indus + tax, family = "binomial",
## data = bostonDfTrain)
##
## Coefficients:
## (Intercept) nox rad dis age
## -25.999847 44.322849 0.501172 0.311233 0.022011
## indus tax
## -0.073689 -0.007592
##
## Degrees of Freedom: 403 Total (i.e. Null); 397 Residual
## Null Deviance: 559.7
## Residual Deviance: 191.1 AIC: 205.1
predcrim2 <- predict(logitcrim2.fit,bostonDfTest, type = "response")
predcrim2.logit <- rep(0, length(predcrim2))
predcrim2.logit[predcrim2 > 0.5] <- 1
mean(predcrim2.logit != bostonDfTest$crim01)
## [1] 0.127451
ldacrim2.fit <- lda(crim01 ~ nox + rad + dis + age + indus + tax, data = bostonDfTrain)
ldacrim2.fit
## Call:
## lda(crim01 ~ nox + rad + dis + age + indus + tax, data = bostonDfTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4851485 0.5148515
##
## Group means:
## nox rad dis age indus tax
## 0 0.4719224 4.290816 5.018890 52.29031 6.929337 307.0255
## 1 0.6433173 15.307692 2.475816 86.51010 15.332019 515.7981
##
## Coefficients of linear discriminants:
## LD1
## nox 7.208597394
## rad 0.096043643
## dis -0.006191848
## age 0.013850429
## indus 0.015394705
## tax -0.002199906
predcrim2.lda <- predict(ldacrim2.fit,bostonDfTest)
mean(predcrim2.lda$class != bostonDfTest$crim01)
## [1] 0.1666667
qdacrim2.fit <- qda(crim01 ~ nox + rad + dis + age + indus + tax, data = bostonDfTrain)
qdacrim2.fit
## Call:
## qda(crim01 ~ nox + rad + dis + age + indus + tax, data = bostonDfTrain)
##
## Prior probabilities of groups:
## 0 1
## 0.4851485 0.5148515
##
## Group means:
## nox rad dis age indus tax
## 0 0.4719224 4.290816 5.018890 52.29031 6.929337 307.0255
## 1 0.6433173 15.307692 2.475816 86.51010 15.332019 515.7981
predcrim2.qda <- predict(qdacrim2.fit,bostonDfTest)
mean(predcrim2.qda$class != bostonDfTest$crim01)
## [1] 0.1372549
trainMatrixCrim2<- cbind(bostonDfTrain$nox,bostonDfTrain$rad,bostonDfTrain$dis,bostonDfTrain$age,bostonDfTrain$indus,bostonDfTrain$tax)
testMatrixCrim2<- cbind(bostonDfTest$nox,bostonDfTest$rad,bostonDfTest$dis,bostonDfTest$age,bostonDfTest$indus,bostonDfTest$tax)
set.seed(1)
predcrim2.knn <- knn(as.matrix(trainMatrixCrim2),as.matrix(testMatrixCrim2),bostonDfTrain$crim01, k = 1)
mean(predcrim2.knn != bostonDfTest$crim01)
## [1] 0.1078431
set.seed(1)
predcrim2.knn <- knn(as.matrix(trainMatrixCrim2),as.matrix(testMatrixCrim2),bostonDfTrain$crim01, k = 5)
mean(predcrim2.knn != bostonDfTest$crim01)
## [1] 0.127451
set.seed(1)
predcrim2.knn <- knn(as.matrix(trainMatrixCrim2),as.matrix(testMatrixCrim2),bostonDfTrain$crim01, k = 50)
mean(predcrim2.knn != bostonDfTest$crim01)
## [1] 0.1666667
Expanding the variable selection to the six most highly correlated variables yields only marginal changes: accuracy improves for some models and decreases for others. The logistic regression error rate improves from \(13.73\%\) to \(12.75\%\). Inversely, the LDA error rate rises from \(14.71\%\) to \(16.67\%\), and the QDA error rate increases from \(11.76\%\) to \(13.73\%\). Finally, the best KNN model's error rate remains constant at \(10.78\%\).
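Rather than checking a few values of \(k\) by hand, the search can be automated; a sketch sweeping a grid of \(k\) values on the six-variable predictors (exact numbers may vary slightly with knn's random tie-breaking):

# Test error rate for each k in the grid
set.seed(1)
kGrid <- c(1, 3, 5, 10, 25, 50, 100)
errRates <- sapply(kGrid, function(k)
  mean(knn(trainMatrixCrim2, testMatrixCrim2, bostonDfTrain$crim01, k = k) != bostonDfTest$crim01))
names(errRates) <- kGrid
errRates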