library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readxl)
library(tinytex)
library(ISLR)
library(ISLR2)
##
## Attaching package: 'ISLR2'
##
## The following objects are masked from 'package:ISLR':
##
## Auto, Credit
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:ISLR2':
##
## Boston
##
## The following object is masked from 'package:dplyr':
##
## select
library(class)
library(e1071)
This question should be answered using the Weekly data set, which is part of the ISLR package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
- Produce some numerical and graphical summaries of the Weekly data. Do there appear to be any patterns?
pairs(Weekly)
Weekly data information: a data frame with 1089 observations on the following 9 variables.
- Year: the year the observation was recorded
- Lag1: percentage return for the previous week
- Lag2: percentage return for 2 weeks previous
- Lag3: percentage return for 3 weeks previous
- Lag4: percentage return for 4 weeks previous
- Lag5: percentage return for 5 weeks previous
- Volume: volume of shares traded (average number of daily shares traded, in billions)
- Today: percentage return for this week
- Direction: a factor with levels Down and Up indicating whether the market had a positive or negative return on a given week
Weekly_NQ <- Weekly %>% select_if(is.numeric)
names(Weekly_NQ)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5" "Volume" "Today"
lm.Weekly <- lm(Volume~.,data=Weekly_NQ)
summary(lm.Weekly)
##
## Call:
## lm(formula = Volume ~ ., data = Weekly_NQ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7831 -0.6264 -0.1795 0.4426 5.7537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.648e+02 9.045e+00 -51.390 < 2e-16 ***
## Year 2.332e-01 4.522e-03 51.570 < 2e-16 ***
## Lag1 -3.143e-02 1.165e-02 -2.699 0.00707 **
## Lag2 -4.610e-02 1.166e-02 -3.954 8.18e-05 ***
## Lag3 -3.427e-02 1.164e-02 -2.944 0.00331 **
## Lag4 -2.933e-02 1.163e-02 -2.522 0.01180 *
## Lag5 -2.728e-02 1.160e-02 -2.351 0.01888 *
## Today -6.298e-03 1.162e-02 -0.542 0.58794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8969 on 1081 degrees of freedom
## Multiple R-squared: 0.7191, Adjusted R-squared: 0.7173
## F-statistic: 395.3 on 7 and 1081 DF, p-value: < 2.2e-16
cor(Weekly_NQ)
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
lm.Year_Volume <- lm(Year~Volume, data=Weekly)
summary(lm.Year_Volume)
##
## Call:
## lm(formula = Year ~ Volume, data = Weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.3999 -2.2495 0.5893 2.6037 7.4944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.995e+03 1.350e-01 14775.21 <2e-16 ***
## Volume 3.012e+00 5.854e-02 51.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.257 on 1087 degrees of freedom
## Multiple R-squared: 0.7089, Adjusted R-squared: 0.7086
## F-statistic: 2647 on 1 and 1087 DF, p-value: < 2.2e-16
Year <- Weekly$Year
Volume <- Weekly$Volume
plot(Year,Volume)
Looking at the above analysis, we can see a clear pattern between Volume and Year: average weekly trading volume increases as the years progress.
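As a quick graphical cross-check of this pattern, here is a minimal ggplot sketch (assuming the tidyverse loaded above; the plot itself is not reproduced here):
# Weekly trading volume over time; the upward trend mirrors the Year/Volume correlation
ggplot(Weekly, aes(x = Year, y = Volume)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE) +
  labs(y = "Average daily shares traded (billions)")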
- Use the full data set to perform a logistic regression with Direction as the response and the five lag variables plus Volume as predictors. Use the summary function to print the results. Do any of the predictors appear to be statistically significant? If so, which ones?
glm.direction <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly,family=binomial)
summary (glm.direction)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
Only one predictor appears to be statistically significant: Lag2, with a p-value of about 0.03.
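As a cross-check, the Wald-test p-values can be pulled straight out of the fitted model (a one-line sketch using the glm.direction object above):
# Sorted p-values for each coefficient in the logistic fit
sort(summary(glm.direction)$coefficients[, "Pr(>|z|)"])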
- Compute the confusion matrix and overall fraction of correct predictions. Explain what the confusion matrix is telling you about the types of mistakes made by logistic regression.
glm.probs <- predict(glm.direction,type="response")
glm.probs [1:10]
## 1 2 3 4 5 6 7 8
## 0.6086249 0.6010314 0.5875699 0.4816416 0.6169013 0.5684190 0.5786097 0.5151972
## 9 10
## 0.5715200 0.5554287
contrasts(Weekly$Direction)
## Up
## Down 0
## Up 1
glm.pred=rep("Down",1089)
glm.pred[glm.probs >.5]= "Up"
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
# Misclassified weeks (false negatives + false positives)
(48+430)/1089
## [1] 0.4389348
(48+430)
## [1] 478
# Correctly classified weeks (true negatives + true positives)
(54+557)/1089
## [1] 0.5610652
(54+557)
## [1] 611
mean(glm.pred==Weekly$Direction)
## [1] 0.5610652
1-((54+557)/1089)
## [1] 0.4389348
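To quantify the kinds of mistakes more directly, here is a short sketch that computes sensitivity and specificity from the confusion matrix above (the counts are copied from that table):
TP <- 557; FN <- 48   # Up weeks classified correctly / incorrectly
TN <- 54;  FP <- 430  # Down weeks classified correctly / incorrectly
c(sensitivity = TP / (TP + FN),   # ~0.92: most Up weeks are caught
  specificity = TN / (TN + FP))   # ~0.11: most Down weeks are missed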
The diagonal elements of the confusion matrix give the correct predictions: 54 weeks were correctly predicted Down and 557 were correctly predicted Up, so the model is right on about 56% of the weeks (a 43.89% training error rate). The off-diagonal elements show the kinds of mistakes it makes: the model calls "Up" far too often, misclassifying 430 of the 484 actual Down weeks while missing only 48 of the 605 actual Up weeks.
This result can also be misleading because the model was trained and evaluated on the same 1089 weekly observations. To test the model more honestly, we should split the data into two parts, one to train the model and one held out for prediction, so that performance is assessed on weeks the model has never seen.
- Now fit the logistic regression model using a training data period from 1990 to 2008, with Lag2 as the only predictor. Compute the confusion matrix and the overall fraction of correct predictions for the held out data (that is, the data from 2009 and 2010).
train <- Weekly$Year<=2008
Weekly_2008 <- Weekly[!train,]
Direction_2008 <- Weekly$Direction[!train]
dim(Weekly_2008)
## [1] 104 9
glm.lag2 <- glm(Direction~Lag2, family=binomial,data=Weekly,subset=train)
glm.probs <- predict(glm.lag2,Weekly_2008,type="response")
glm.pred <- rep("Down",104)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction_2008)
## Direction_2008
## glm.pred Down Up
## Down 9 5
## Up 34 56
mean(glm.pred==Direction_2008)
## [1] 0.625
1-mean(glm.pred==Direction_2008)
## [1] 0.375
On the held-out 2009-2010 data the model makes 62.5% correct predictions, i.e. a 37.5% test error rate.
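Since the same predict/threshold/tabulate pattern repeats for every classifier below, one way to package it is a small helper function; test_accuracy is a hypothetical name introduced here for illustration, not part of ISLR:
# Hypothetical helper: test accuracy of a fitted logistic model on held-out data
# (labels "Up"/"Down" are hard-coded to match the Weekly Direction factor)
test_accuracy <- function(fit, newdata, truth, cutoff = 0.5) {
  probs <- predict(fit, newdata, type = "response")
  pred  <- ifelse(probs > cutoff, "Up", "Down")
  list(confusion = table(pred, truth), accuracy = mean(pred == truth))
}
# e.g. test_accuracy(glm.lag2, Weekly_2008, Direction_2008)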
- Repeat (d) using LDA.
lda.fit <- lda(Direction~Lag2, data=Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
plot(lda.fit)
lda.pred <- predict(lda.fit, Weekly_2008)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class,Direction_2008)
## Direction_2008
## lda.class Down Up
## Down 9 5
## Up 34 56
mean(lda.class==Direction_2008)
## [1] 0.625
LDA produces exactly the same confusion matrix and test accuracy (62.5%) as logistic regression.
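As a sanity check that the two classifiers agree week by week (a one-liner, assuming lda.class and the test-set glm.pred from part (d) are still in the workspace):
# Fraction of test weeks where LDA and logistic regression make the same call;
# should be 1 (or very nearly), matching the identical confusion matrices
mean(as.character(lda.class) == glm.pred)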
- Repeat (d) using QDA.
qda.fit <- qda(Direction ~ Lag2, data = Weekly, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag2, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
qda.class <- predict(qda.fit, Weekly_2008)$class
table(qda.class, Direction_2008)
## Direction_2008
## qda.class Down Up
## Down 0 0
## Up 43 61
mean(qda.class==Direction_2008)
## [1] 0.5865385
QDA gives 58.65% correct predictions, but the confusion matrix shows it achieves this by predicting Up for every week in the test period.
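One way to confirm that QDA never predicts Down on the test weeks is to inspect its posterior probabilities (a sketch using the predict method for qda fits from MASS):
# Smallest posterior probability of "Up" across the test weeks;
# it should stay at or above 0.5, consistent with the all-Up confusion matrix above
min(predict(qda.fit, Weekly_2008)$posterior[, "Up"])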
- Repeat (d) using KNN with K = 1.
train.X <- Weekly[train,"Lag2",drop=F]
test.X <- Weekly[!train,"Lag2",drop=F]
train.Direction <- Weekly[train,"Direction",drop=T]
set.seed(1)
knn.pred <- knn(train.X,test.X,train.Direction ,k=1)
table(knn.pred, Direction_2008)
## Direction_2008
## knn.pred Down Up
## Down 21 30
## Up 22 31
mean(knn.pred==Direction_2008)
## [1] 0.5
KNN with K = 1 is correct on only 50% of the test weeks, no better than random guessing.
- Repeat (d) using naive Bayes.
nb_model <- naiveBayes(Direction ~ Lag2, data = Weekly, subset = train)
pred <- predict(nb_model,Weekly_2008)
table(pred,Direction_2008)
## Direction_2008
## pred Down Up
## Down 0 0
## Up 43 61
mean(pred==Direction_2008)
## [1] 0.5865385
Naive Bayes yields 58.65% correct predictions with the same confusion matrix as QDA: like QDA, it predicts Up for every test week.
- Which of these methods appears to provide the best results on this data?
Logistic regression and LDA tie for the best test accuracy (62.5%) and produce identical confusion matrices on the held-out data; of the two, logistic regression is the simpler model.
- Experiment with different combinations of predictors, including possible transformations and interactions, for each of the methods. Report the variables, method, and associated confusion matrix that appears to provide the best results on the held out data. Note that you should also experiment with values for K in the KNN classifier.
set.seed(1)
knn.pred <- knn(train.X,test.X,train.Direction ,k=4)
table(knn.pred, Direction_2008)
## Direction_2008
## knn.pred Down Up
## Down 20 17
## Up 23 44
mean(knn.pred==Direction_2008)
## [1] 0.6153846
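To experiment with K more systematically, here is a short sketch that evaluates KNN over a range of K on the held-out weeks (results not reported above):
# Test accuracy of KNN for K = 1, ..., 10 using the same train/test objects as above
set.seed(1)
sapply(1:10, function(k) {
  pred <- knn(train.X, test.X, train.Direction, k = k)
  mean(pred == Direction_2008)
})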
nb_model <- naiveBayes(Direction~Lag1+Lag2+Lag3,data=Weekly, subset = train)
pred <- predict(nb_model,Weekly_2008)
table(pred,Direction_2008)
## Direction_2008
## pred Down Up
## Down 5 10
## Up 38 51
mean(pred==Direction_2008)
## [1] 0.5384615
qda.fit <- qda(Direction ~ Lag1 + Lag2 + Lag3, data = Weekly, subset = train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2 + Lag3, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag3
## Down 0.289444444 -0.03568254 0.17080045
## Up -0.009213235 0.26036581 0.08404044
qda.class <- predict(qda.fit, Weekly_2008)$class
table(qda.class, Direction_2008)
## Direction_2008
## qda.class Down Up
## Down 6 10
## Up 37 51
mean(qda.class==Direction_2008)
## [1] 0.5480769
lda.fit <- lda(Direction~Lag1+Lag2+Lag3, data=Weekly, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2 + Lag3, data = Weekly, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag1 Lag2 Lag3
## Down 0.289444444 -0.03568254 0.17080045
## Up -0.009213235 0.26036581 0.08404044
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.29658609
## Lag2 0.29258490
## Lag3 -0.04766747
plot(lda.fit)
lda.pred <- predict(lda.fit, Weekly_2008)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class <- lda.pred$class
table(lda.class,Direction_2008)
## Direction_2008
## lda.class Down Up
## Down 8 9
## Up 35 52
mean(lda.class==Direction_2008)
## [1] 0.5769231
Experimenting with extra predictors and with different values of K in KNN does not improve on the Lag2-only models for predicting the direction of the Weekly data. Increasing K from 1 to 4 did raise KNN's test accuracy from 50% to about 61.5%, but that still does not beat logistic regression or LDA, and a larger K trades lower variance for higher bias in future predictions.
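The exercise also suggests trying interactions and transformations; as one illustration (a sketch only, results not reported above), a logistic fit with a Lag1:Lag2 interaction can be evaluated on the same held-out weeks:
# Logistic regression with an interaction term, fit on 1990-2008 and tested on 2009-2010
glm.int <- glm(Direction ~ Lag2 + Lag1:Lag2, data = Weekly,
               family = binomial, subset = train)
pred.int <- ifelse(predict(glm.int, Weekly_2008, type = "response") > 0.5, "Up", "Down")
table(pred.int, Direction_2008)
mean(pred.int == Direction_2008)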
In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
- Create a binary variable, mpg01, that contains a 1 if mpg contains a value above its median, and a 0 if mpg contains a value below its median. You can compute the median using the median() function. Note you may find it helpful to use the data.frame() function to create a single data set containing both mpg01 and the other Auto variables.
# Create binary variable mpg01: 1 if mpg is above its median, 0 otherwise
mpg01 <- ifelse(Auto$mpg > median(Auto$mpg), 1, 0)
# Combine mpg01 with the other Auto variables
Auto_V <- data.frame(Auto, mpg01)
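A quick sanity check (not shown above) that mpg01 splits the data roughly in half:
# Counts of 0s and 1s should be close to equal when cutting at the median
table(Auto_V$mpg01)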
- Explore the data graphically in order to investigate the association between mpg01 and the other features. Which of the other features seem most likely to be useful in predicting mpg01? Scatterplots and boxplots may be useful tools to answer this question. Describe your findings.
Auto_NQ <- Auto_V %>% select_if(is.numeric)
names(Auto_V)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name" "mpg01"
names(Auto_NQ)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "mpg01"
cor(Auto_NQ)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
plot(Auto_NQ)
Since mpg01 is a binary variable, it is difficult to discern much from the scatterplot matrix alone. The correlation matrix suggests cylinders, displacement, horsepower, and weight are the features most strongly (negatively) associated with mpg01; boxplots give a complementary graphical view (a sketch follows), and a logistic regression fit below helps narrow down which predictors matter.
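Here is a tidyverse boxplot sketch of the most strongly correlated features against mpg01 (assuming the packages loaded above; the plot itself is not reproduced here):
# Boxplots of each candidate feature, split by mpg01
Auto_NQ %>%
  pivot_longer(c(cylinders, displacement, horsepower, weight),
               names_to = "feature", values_to = "value") %>%
  ggplot(aes(x = factor(mpg01), y = value)) +
  geom_boxplot() +
  facet_wrap(~ feature, scales = "free_y") +
  labs(x = "mpg01")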
glm.fits <- glm(mpg01~displacement+horsepower+weight+acceleration+year, data=Auto_NQ,family = binomial)
summary(glm.fits)
##
## Call:
## glm(formula = mpg01 ~ displacement + horsepower + weight + acceleration +
## year, family = binomial, data = Auto_NQ)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1982 -0.1129 0.0114 0.2243 3.3045
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -15.844178 5.636740 -2.811 0.004941 **
## displacement -0.007024 0.006519 -1.078 0.281237
## horsepower -0.035762 0.023176 -1.543 0.122826
## weight -0.003984 0.001084 -3.676 0.000237 ***
## acceleration 0.008144 0.141248 0.058 0.954019
## year 0.414101 0.072623 5.702 1.18e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 543.43 on 391 degrees of freedom
## Residual deviance: 159.34 on 386 degrees of freedom
## AIC: 171.34
##
## Number of Fisher Scoring iterations: 8
We can see that weight and year have a statistically significant relationship with mpg01; these are the predictors carried into the models below.
- Split the data into a training set and a test set.
set.seed(1)
sample1 <- sample(c(TRUE, FALSE), nrow(Auto_NQ), replace=TRUE, prob = c(.6,.4))
train <- Auto_NQ[sample1, ]
test <- Auto_NQ[!sample1, ]
dim(train)
## [1] 246 9
dim(test)
## [1] 146 9
- Perform LDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
lda.fit.auto <- lda(mpg01 ~ weight+year, data=train)
lda.fit.auto
## Call:
## lda(mpg01 ~ weight + year, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.504065 0.495935
##
## Group means:
## weight year
## 0 3620.040 74.70968
## 1 2315.344 77.63115
##
## Coefficients of linear discriminants:
## LD1
## weight -0.00176319
## year 0.11400180
lda.class.auto <- predict(lda.fit.auto,test)$class
table(lda.class.auto,test$mpg01)
##
## lda.class.auto 0 1
## 0 62 5
## 1 10 69
mean(lda.class.auto==test$mpg01)
## [1] 0.8972603
1-mean(lda.class.auto==test$mpg01)
## [1] 0.1027397
Using LDA with weight and year as predictors, the model classifies about 89.7% of the test observations correctly, which is quite impressive. I used roughly 60% of the observations for training and the remaining 40% for testing.
Test error rate: 10.27%.
- Perform QDA on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
qda.fit.auto <- qda(mpg01 ~ weight+year, data=train)
qda.fit.auto
## Call:
## qda(mpg01 ~ weight + year, data = train)
##
## Prior probabilities of groups:
## 0 1
## 0.504065 0.495935
##
## Group means:
## weight year
## 0 3620.040 74.70968
## 1 2315.344 77.63115
qda.class.auto <- predict(qda.fit.auto,test)$class
table(qda.class.auto,test$mpg01)
##
## qda.class.auto 0 1
## 0 61 6
## 1 11 68
mean(qda.class.auto==test$mpg01)
## [1] 0.8835616
1-mean(qda.class.auto==test$mpg01)
## [1] 0.1164384
Using QDA we get slightly fewer correct predictions, 88.36% on the test data, but still a good result. Test error rate: 11.64%.
- Perform logistic regression on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
glm.auto <- glm(mpg01~weight+year, family=binomial, data=train)
summary(glm.auto)
##
## Call:
## glm(formula = mpg01 ~ weight + year, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9209 -0.1360 -0.0007 0.2159 3.1868
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.082e+01 6.708e+00 -3.104 0.00191 **
## weight -5.946e-03 8.835e-04 -6.730 1.69e-11 ***
## year 4.929e-01 1.063e-01 4.636 3.55e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 341.01 on 245 degrees of freedom
## Residual deviance: 93.95 on 243 degrees of freedom
## AIC: 99.95
##
## Number of Fisher Scoring iterations: 7
glm.probs.auto <- predict(glm.auto,test,type="response")
glm.pred.auto <- rep(0,nrow(test))
glm.pred.auto[glm.probs.auto > 0.50]=1
table(glm.pred.auto,test$mpg01)
##
## glm.pred.auto 0 1
## 0 64 9
## 1 8 65
mean(glm.pred.auto==test$mpg01)
## [1] 0.8835616
1-mean(glm.pred.auto==test$mpg01)
## [1] 0.1164384
Using logistic regression the model achieves an 88.36% success rate, not quite as good as LDA but respectable.
Test error rate: 11.64%, the same as QDA.
- Perform naive Bayes on the training data in order to predict mpg01 using the variables that seemed most associated with mpg01 in (b). What is the test error of the model obtained?
nb_model.auto <- naiveBayes(mpg01~weight+year,data=train)
summary(nb_model.auto)
## Length Class Mode
## apriori 2 table numeric
## tables 2 -none- list
## levels 2 -none- character
## isnumeric 2 -none- logical
## call 4 -none- call
prob.auto <- predict(nb_model.auto,test)
pred.auto <- rep(0,nrow(test))
pred.auto[prob.auto > 0.50]=1
table(pred.auto,test$mpg01)
##
## pred.auto 0 1
## 0 72 74
mean(pred.auto==test$mpg01)
## [1] 0.4931507
1-mean(pred.auto==test$mpg01)
## [1] 0.5068493
Naive Bayes appears to do very poorly here, with only 49.31% correct (a 50.68% test error rate). However, the confusion matrix shows that every observation was predicted as 0: predict() on a naiveBayes fit returns class labels by default, so thresholding that output at 0.50 never flips any prediction to 1, and the reported accuracy mostly reflects that step rather than the classifier itself (see the corrected sketch below).
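A corrected sketch: the class predictions can be tabulated directly, or posterior probabilities can be requested with type = "raw" (output not shown above):
# predict() on a naiveBayes fit returns a factor of predicted classes ("0"/"1") by default
nb_class <- predict(nb_model.auto, test)
table(nb_class, test$mpg01)
mean(nb_class == test$mpg01)
# Posterior probabilities, if a probability threshold is wanted instead
nb_post <- predict(nb_model.auto, test, type = "raw")
head(nb_post)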
Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings. Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
# Create binary variable crim01: 1 if crim is above its median, 0 otherwise
crim01 <- ifelse(Boston$crim > median(Boston$crim), 1, 0)
# Combine crim01 with the other Boston variables
Boston_V <- data.frame(Boston, crim01)
cor(Boston_V)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv crim01
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim01 -0.35121093 0.4532627 -0.2630167 1.00000000
Looking at the correlation matrix with respect to crim01, the following predictors stand out for further analysis:
- indus: proportion of non-retail business acres per town
- nox: nitrogen oxides concentration (parts per 10 million)
- age: proportion of owner-occupied units built prior to 1940
- rad: index of accessibility to radial highways
- tax: full-value property-tax rate per $10,000
- lstat: lower status of the population (percent)
set.seed(1)
sample_b <- sample(c(TRUE, FALSE), nrow(Boston_V), replace=TRUE, prob = c(.6,.4))
trainb <- Boston_V[sample_b, ]
testb <- Boston_V[!sample_b, ]
dim(trainb)
## [1] 314 15
dim(testb)
## [1] 192 15
glm.bos <- glm(crim01~indus+nox+age+rad+tax+lstat, family=binomial, data=trainb)
summary(glm.bos)
##
## Call:
## glm(formula = crim01 ~ indus + nox + age + rad + tax + lstat,
## family = binomial, data = trainb)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.08917 -0.19479 0.00026 0.01065 2.74919
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -22.535448 3.975270 -5.669 1.44e-08 ***
## indus -0.072520 0.056568 -1.282 0.199841
## nox 40.095414 8.392236 4.778 1.77e-06 ***
## age 0.022459 0.012367 1.816 0.069362 .
## rad 0.576305 0.151603 3.801 0.000144 ***
## tax -0.008295 0.003199 -2.593 0.009521 **
## lstat -0.011124 0.043571 -0.255 0.798480
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 435.25 on 313 degrees of freedom
## Residual deviance: 139.35 on 307 degrees of freedom
## AIC: 153.35
##
## Number of Fisher Scoring iterations: 8
glm.probs.b <- predict(glm.bos,testb,type="response")
glm.pred.b <- rep(0,nrow(testb))
glm.pred.b[glm.probs.b > 0.50]=1
table(glm.pred.b,testb$crim01)
##
## glm.pred.b 0 1
## 0 88 13
## 1 10 81
mean(glm.pred.b==testb$crim01)
## [1] 0.8802083
1-mean(glm.pred.b==testb$crim01)
## [1] 0.1197917
With a logistic regression model we correctly predict 88.02% of the test-set tracts, a test error rate of 11.98%.
lda.fit.b <- lda(crim01~indus+nox+age+rad+tax+lstat, data=trainb)
lda.fit.b
## Call:
## lda(crim01 ~ indus + nox + age + rad + tax + lstat, data = trainb)
##
## Prior probabilities of groups:
## 0 1
## 0.4936306 0.5063694
##
## Group means:
## indus nox age rad tax lstat
## 0 6.663226 0.4648245 50.03871 4.096774 303.8065 9.240387
## 1 15.310189 0.6426730 86.79308 14.610063 506.4969 15.832264
##
## Coefficients of linear discriminants:
## LD1
## indus 0.019009728
## nox 7.126273863
## age 0.018582977
## rad 0.089442271
## tax -0.001811084
## lstat -0.022915042
lda.class.b <- predict(lda.fit.b,testb)$class
table(lda.class.b,testb$crim01)
##
## lda.class.b 0 1
## 0 90 23
## 1 8 71
mean(lda.class.b==testb$crim01)
## [1] 0.8385417
1-mean(lda.class.b==testb$crim01)
## [1] 0.1614583
With LDA we get a lower success rate of 83.85%; the test error rate is 16.15%.
nb_model.b <- naiveBayes(crim01~indus+nox+age+rad+tax+lstat, data=trainb)
summary(nb_model.b)
## Length Class Mode
## apriori 2 table numeric
## tables 6 -none- list
## levels 2 -none- character
## isnumeric 6 -none- logical
## call 4 -none- call
prob.b <- predict(nb_model.b,testb)
pred.b <- rep(0,nrow(testb))
pred.b[prob.b > 0.50]=1
table(pred.b,testb$crim01)
##
## pred.b 0 1
## 0 98 94
mean(pred.b==testb$crim01)
## [1] 0.5104167
1-mean(pred.b==testb$crim01)
## [1] 0.4895833
Naive Bayes appears to perform poorly, 51.04% correct (a 48.96% error rate), but as with the Auto data the confusion matrix shows every tract was predicted as 0; the thresholding step compares default class-label output to 0.50, so the result says more about that step than about naive Bayes (see the corrected sketch below).
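As with the Auto data, a corrected sketch would threshold posterior probabilities (requested explicitly with type = "raw") rather than the default class labels (output not shown above):
# Posterior probabilities for each class; columns are named by class level
post.b <- predict(nb_model.b, testb, type = "raw")
pred.b2 <- ifelse(post.b[, "1"] > 0.5, 1, 0)   # assumes the levels are "0" and "1"
table(pred.b2, testb$crim01)
mean(pred.b2 == testb$crim01)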
set.seed(1)
train.Boston = trainb[,c("indus","nox","age","rad","tax","lstat")]
test.Boston = testb[,c("indus","nox","age","rad","tax","lstat")]
knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=1)
table(knn.pred,testb$crim01)
##
## knn.pred 0 1
## 0 91 7
## 1 7 87
mean(knn.pred==testb$crim01)
## [1] 0.9270833
1-mean(knn.pred==testb$crim01)
## [1] 0.07291667
knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=2)
table(knn.pred,testb$crim01)
##
## knn.pred 0 1
## 0 86 12
## 1 12 82
mean(knn.pred==testb$crim01)
## [1] 0.875
knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=3)
table(knn.pred,testb$crim01)
##
## knn.pred 0 1
## 0 91 10
## 1 7 84
mean(knn.pred==testb$crim01)
## [1] 0.9114583
knn.pred=knn(train.Boston,test.Boston,trainb$crim01,k=4)
table(knn.pred,testb$crim01)
##
## knn.pred 0 1
## 0 90 11
## 1 8 83
mean(knn.pred==testb$crim01)
## [1] 0.9010417
KNN with K = 1 provided the best result of the four values tried, with a success rate of 92.71% and a test error rate of 7.29%.
Overall, KNN was the best model for this data set for predicting crim01 from indus, nox, age, rad, tax, and lstat.
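One further experiment worth noting: KNN uses Euclidean distance, so predictors on large scales (tax is in the hundreds) dominate those on small scales (nox is below 1). A sketch with standardized predictors, scaling the test set by the training-set means and standard deviations (results not reported above):
set.seed(1)
train.Boston.std <- scale(train.Boston)
test.Boston.std  <- scale(test.Boston,
                          center = attr(train.Boston.std, "scaled:center"),
                          scale  = attr(train.Boston.std, "scaled:scale"))
knn.pred.std <- knn(train.Boston.std, test.Boston.std, trainb$crim01, k = 1)
mean(knn.pred.std == testb$crim01)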