Chapter 04 (page 168): 13, 14, 16
library(ISLR2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library(e1071)
library(caTools)
This question should be answered using the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1, 089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(Weekly[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
plot(Weekly$Volume)
Year and Volume show the only substantial correlation among the
variables. The plot shows how the volume of shares traded has
increased.
attach(Weekly)
logr.week<-glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Weekly,
family=binomial)
summary(logr.week)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
The only predictor that has a p-value that is statistically significant is Lag2, with a p value of: 0.0296.
Weekly.prob= predict(logr.week, type='response')
Weekly.pred =rep("Down", length(Weekly.prob))
Weekly.pred[Weekly.prob > 0.5] = "Up"
table(Weekly.pred, Direction)
## Direction
## Weekly.pred Down Up
## Down 54 48
## Up 430 557
mean(Weekly.pred == Direction)
## [1] 0.5610652
The percent of correct predictions made by the model is 56.11%. While 92% of the predictions that the returns would go Up were correct, only 11% of the predictions for the Down directions were correct. Furthermore, since the same set was used to train and test the regression, the error rate for the model is: 43.89%
train <- (Year<2009)
Weekly_2009 <-Weekly[!train,]
Weekly_fit<-glm(Direction~Lag2, data=Weekly,family=binomial, subset=train)
Weekly_prob= predict(Weekly_fit, Weekly_2009, type = "response")
Weekly_pred = rep("Down", length(Weekly_prob))
Weekly_pred[Weekly_prob > 0.5] = "Up"
Direction_2009 = Direction[!train]
table(Weekly_pred, Direction_2009)
## Direction_2009
## Weekly_pred Down Up
## Down 9 5
## Up 34 56
mean(Weekly_pred == Direction_2009)
## [1] 0.625
The percentage of correct predictions for this model is: 62.5%.
library(MASS)
WeeklyLDA<-lda(Direction~Lag2, data=Weekly,family=binomial, subset=train)
WeeklyLDA_pred<-predict(WeeklyLDA, Weekly_2009)
table(WeeklyLDA_pred$class, Direction_2009)
## Direction_2009
## Down Up
## Down 9 5
## Up 34 56
mean(WeeklyLDA_pred$class == Direction_2009)
## [1] 0.625
The percentage of correct predictions for this model is: 62.5%.
WeeklyLDA<-qda(Direction~Lag2, data=Weekly,family=binomial, subset=train)
WeeklyLDA_pred<-predict(WeeklyLDA, Weekly_2009)
table(WeeklyLDA_pred$class, Direction_2009)
## Direction_2009
## Down Up
## Down 0 0
## Up 43 61
mean(WeeklyLDA_pred$class == Direction_2009)
## [1] 0.5865385
The percentage of correct predictions for this model is: 58.65%; however, it only predicted for trends with value Up.
Weekly_train <- as.matrix(Lag2[train])
Weekly_test <-as.matrix(Lag2[!train])
train_Direction =Direction[train]
set.seed(99)
Weekly_pred <- knn(Weekly_train,Weekly_test,train_Direction,k=1)
table(Weekly_pred,Direction_2009)
## Direction_2009
## Weekly_pred Down Up
## Down 21 30
## Up 22 31
mean(Weekly_pred == Direction_2009)
## [1] 0.5
The percentage of correct predictions for this model is: 50%
WeeklyNaive <-naiveBayes(Direction~Lag2, data=Weekly,family=binomial, subset=train)
WeeklyNaive
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace, family = ..1)
##
## A-priori probabilities:
## Y
## Down Up
## 0.4477157 0.5522843
##
## Conditional probabilities:
## Lag2
## Y [,1] [,2]
## Down -0.03568254 2.199504
## Up 0.26036581 2.317485
Naive_class <- predict(WeeklyNaive , Weekly_2009)
table(Naive_class , Direction_2009)
## Direction_2009
## Naive_class Down Up
## Down 0 0
## Up 43 61
mean(Naive_class == Direction_2009)
## [1] 0.5865385
Both Logistic Regression and LDA have a succes rate of 62.5%, which is better than the other models.
WeeklyLDA <- lda(Direction~Lag1:Lag2, data=Weekly,family=binomial, subset=train)
WeeklyLDA_pred <- predict(WeeklyLDA, Weekly_2009)
table(WeeklyLDA_pred$class, Direction_2009)
## Direction_2009
## Down Up
## Down 0 1
## Up 43 60
mean(WeeklyLDA_pred$class == Direction_2009)
## [1] 0.5769231
Including Lag 1 decreases the accuracy of the LDA model.
Weekly_train <- as.matrix(Lag2[train])
Weekly_test <- as.matrix(Lag2[!train])
train_Direction =Direction[train]
set.seed(99)
Weekly_pred <- knn(Weekly_train,Weekly_test,train_Direction,k=10)
table(Weekly_pred,Direction_2009)
## Direction_2009
## Weekly_pred Down Up
## Down 20 19
## Up 23 42
mean(Weekly_pred == Direction_2009)
## [1] 0.5961538
A k value of 10 increases the accuracy rate to: 59.62%
Weekly_train <- as.matrix(Lag2[train])
Weekly_test <- as.matrix(Lag2[!train])
train_Direction <- Direction[train]
set.seed(99)
Weekly_pred <- knn(Weekly_train,Weekly_test,train_Direction,k=100)
table(Weekly_pred,Direction_2009)
## Direction_2009
## Weekly_pred Down Up
## Down 10 12
## Up 33 49
mean(Weekly_pred == Direction_2009)
## [1] 0.5673077
detach(Weekly)
A k value of 100 decreases the accuracy rate to:56.73 % in comparisson with the k value of 10.
#14 In this problem, you will develop a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
attach(Auto)
mpg01 <- ifelse(mpg>median(mpg),yes=1,no=0)
auto <- data.frame(Auto,mpg01)
head(auto, n=10)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## 7 14 8 454 220 4354 9.0 70 1
## 8 14 8 440 215 4312 8.5 70 1
## 9 14 8 455 225 4425 10.0 70 1
## 10 15 8 390 190 3850 8.5 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
## 7 chevrolet impala 0
## 8 plymouth fury iii 0
## 9 pontiac catalina 0
## 10 amc ambassador dpl 0
pairs(auto)
boxplot(mpg01,cylinders, xlab="mpg01", ylab="Cylinders")
boxplot(mpg01,acceleration, xlab="mpg01", ylab="Acceleration")
boxplot(mpg01,horsepower, xlab="mpg01", ylab="Horsepower")
boxplot(mpg01,weight, xlab="mpg01", ylab="Weight")
boxplot(mpg01,year, xlab="mpg01", ylab="Year")
It is difficult to interpret just graphically (using pairs) because
mpg01 is a categorical variable; however by examining the boxplots of
mpg01 by the respective variables there seems to be stronger
associations of mpg01 with certain variables like Cylinders or
Horsepower.
train <- (year %% 2 == 0)
test <- !train
auto_train <- Auto[train,]
auto_test <- Auto[test,]
mpg01_test <- mpg01[test]
The data was split by evaluating year and assigning the observation to the training or test set if the year is even or odd, respectively.
cor(auto[,-9])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
The 4 variables with greatest correlation to mpg01: cylinders, displacement, horsepower and weight.
autoLDA_fit <- lda(mpg01 ~ cylinders+displacement+horsepower+weight,
data = auto, subset = train)
autoLDA_pred <- predict(autoLDA_fit, auto_test)
table(autoLDA_pred$class == mpg01_test)
##
## FALSE TRUE
## 23 159
mean(autoLDA_pred$class == mpg01_test)
## [1] 0.8736264
100-87.36
## [1] 12.64
Test error: 12.64%
autoQDA_fit <- qda(mpg01 ~ cylinders+displacement+horsepower+weight,
data = auto, family=binomial, subset = train)
autoQDA_pred <- predict(autoQDA_fit, auto_test)
table(autoQDA_pred$class == mpg01_test)
##
## FALSE TRUE
## 24 158
mean(autoQDA_pred$class == mpg01_test)
## [1] 0.8681319
100-86.81
## [1] 13.19
Test error: 13.19%
log_fit <- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = auto,
family = binomial, subset = train)
log_prob <- predict(log_fit, auto_test, type = "response")
log_pred <- rep(0, length(log_prob))
log_pred[log_prob > 0.5] <- 1
mean(log_pred == mpg01_test)
## [1] 0.8791209
100-87.91
## [1] 12.09
Test error: 12.09%
auto_naive <- naiveBayes(mpg01 ~ cylinders+displacement+horsepower+weight,
data = auto, subset = train)
naive_pred <- predict(auto_naive, auto_test)
table(naive_pred == mpg01_test)
##
## FALSE TRUE
## 23 159
mean(naive_pred == mpg01_test)
## [1] 0.8736264
100-87.36
## [1] 12.64
Test error: 12.64%
train_k <- cbind(cylinders, displacement, horsepower, weight)[train,]
test_k <- cbind(cylinders, displacement, horsepower, weight)[test,]
train_mpg01 <- mpg01[train]
set.seed(99)
knn_pred <- knn(train_k, test_k, train_mpg01, k = 1)
mean(knn_pred == mpg01_test)
## [1] 0.8461538
100-84.62
## [1] 15.38
For k=1, the test error is: 15.38%
train_k <- cbind(cylinders, displacement, horsepower, weight)[train,]
test_k <- cbind(cylinders, displacement, horsepower, weight)[test,]
train_mpg01 <- mpg01[train]
set.seed(99)
knn_pred <- knn(train_k, test_k, train_mpg01, k = 10)
mean(knn_pred == mpg01_test)
## [1] 0.8351648
100-83.52
## [1] 16.48
For k=10, the test error is: 16.48%
train_k <- cbind(cylinders, displacement, horsepower, weight)[train,]
test_k <- cbind(cylinders, displacement, horsepower, weight)[test,]
train_mpg01 <- mpg01[train]
set.seed(99)
knn_pred <- knn(train_k, test_k, train_mpg01, k = 50)
mean(knn_pred == mpg01_test)
## [1] 0.8571429
100-85.71
## [1] 14.29
For k=50, the test error is: 14.29%
train_k <- cbind(cylinders, displacement, horsepower, weight)[train,]
test_k <- cbind(cylinders, displacement, horsepower, weight)[test,]
train_mpg01 <- mpg01[train]
set.seed(99)
knn_pred <- knn(train_k, test_k, train_mpg01, k = 100)
mean(knn_pred == mpg01_test)
## [1] 0.8571429
100-85.71
## [1] 14.29
For k=100, the test error is: 14.29%
For k=1, the test error is: 15.38% For k=10, the test error is: 16.48% For k=50 and k=100, the test error is: 14.29%
Both values of k=50 and 100 performed better.
detach(Auto)
#16 Using the Boston data set, fit classification models in order to predict whether a given census tract has a crime rate above or below the median. Explore logistic regression, LDA, naive Bayes, and KNN models using various subsets of the predictors. Describe your findings. Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data set.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
attach(Boston)
crime01 <- rep(0, 506)
crime01[crim > median(Boston$crim)] = 1
crime01 <- as.factor(crime01)
NewBoston <- data.frame(Boston, crime01)
summary(NewBoston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crime01
## Min. : 1.73 Min. : 5.00 0:253
## 1st Qu.: 6.95 1st Qu.:17.02 1:253
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
detach(Boston)
attach(NewBoston)
## The following object is masked _by_ .GlobalEnv:
##
## crime01
pairs(NewBoston)
boxplot(crime01,indus, xlab="Crime", ylab="proportion of non-retail business acres per town")
boxplot(crime01,nox, xlab="Crime", ylab="nitrogen oxides concentration (parts per 10 million)")
boxplot(crime01,age, xlab="Crime", ylab="proportion of owner-occupied units built prior to 1940")
boxplot(crime01,rad, xlab="Crime", ylab="index of accessibility to radial highways")
boxplot(crime01,dis, xlab="Crime", ylab="weighted mean of distances to five Boston employment centres.")
detach(NewBoston)
The pairs plot suggest a stronger correlation between our response crime01 and some of the variables, like: indus, nox, age, dis, rad and tax.
set.seed(99)
split_sets <- rbinom(506, 1, 0.7)
NewBoston <- cbind(NewBoston, split_sets)
train <- NewBoston[split_sets == 1,]
test <- NewBoston[split_sets == 0,]
Data is split into test and train sets using a random generator with a ratio of 0.7 specified.
Boston_glm <- glm(crime01~indus+age+nox+dis+medv, data = train, family = binomial)
summary(Boston_glm)
##
## Call:
## glm(formula = crime01 ~ indus + age + nox + dis + medv, family = binomial,
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.88166 -0.48450 -0.05151 0.33891 2.44829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -24.862754 3.861070 -6.439 1.20e-10 ***
## indus -0.068174 0.038953 -1.750 0.08009 .
## age 0.008525 0.008910 0.957 0.33866
## nox 40.923526 6.335155 6.460 1.05e-10 ***
## dis 0.314674 0.153802 2.046 0.04076 *
## medv 0.078213 0.027083 2.888 0.00388 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 482.43 on 347 degrees of freedom
## Residual deviance: 235.43 on 342 degrees of freedom
## AIC: 247.43
##
## Number of Fisher Scoring iterations: 7
Age and industry do not have a significant p value, therefore they are removed from the model.
Boston_glm <- glm(crime01~nox+dis+medv, data = train, family = binomial)
prob <- predict(Boston_glm, test, type = 'response')
pred <- rep(0, 146)
pred[prob > 0.5] = 1
table(pred,test$crime01)
##
## pred 0 1
## 0 75 10
## 1 4 67
(67+75)/(75+67+14)
## [1] 0.9102564
Accuracy of the model is 91.03%
lda_boston <- lda(crime01~nox+dis+medv, data = train)
preds <- predict(lda_boston, test)
table(preds$class,test$crime01)
##
## 0 1
## 0 77 16
## 1 2 63
(77+63)/(77+63+18)
## [1] 0.8860759
Accuracy of the LDA model is 88.61%
train_k <- cbind(train$nox, train$dis, train$medv)
test_k <- cbind(test$nox, test$dis, test$medv)
set.seed(99)
knn_boston <- knn(train_k, test_k, train$crime01, k=1)
table(knn_boston, test$crime01)
##
## knn_boston 0 1
## 0 67 13
## 1 12 66
mean(knn_boston == test$crime01)
## [1] 0.8417722
Accuracy of the knn model, with k=1 is 84.17%
train_k <- cbind(train$nox, train$dis, train$medv)
test_k <- cbind(test$nox, test$dis, test$medv)
set.seed(99)
knn_boston <- knn(train_k, test_k, train$crime01, k=10)
table(knn_boston, test$crime01)
##
## knn_boston 0 1
## 0 70 17
## 1 9 62
mean(knn_boston == test$crime01)
## [1] 0.835443
Accuracy of the knn model, with k=10 is 83.54%