Questions 13, 14, 16
Question 13 uses the Weekly data set, which is part of the ISLR2 package. This data is similar in nature to the Smarket data from this chapter’s lab, except that it contains 1,089 weekly returns for 21 years, from the beginning of 1990 to the end of 2010.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.3
library(corrplot) #visual Correlation Plot
## corrplot 0.92 loaded
library(GGally) #Scatterplot Matrix
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(gmodels) #Confusion Matrix calculations
## Warning: package 'gmodels' was built under R version 4.1.3
library(MASS) #LDA
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
## The following object is masked from 'package:dplyr':
##
## select
library(class) #KNN
library(e1071) #Naive Bayes
## Warning: package 'e1071' was built under R version 4.1.3
attach(Weekly)
names(Weekly)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(Weekly[, -9]) #correlations between Today and Lag1-Lag5 are all close to zero, indicating little relationship between this week's return and the previous weeks' returns
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
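To isolate the relationship flagged in the comment above, a small sketch (not run here) pulls just the Today row out of the correlation matrix:
round(cor(Weekly[, -9])["Today", ], 3) #correlations of Today with Year, the lags, and Volume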
corrplot(cor(Weekly[,-9]), method = "number", title = "Correlation of the Weekly Data Set") #correlation between Year & Volume
ggscatmat(Weekly, color = "Direction")
## Warning in ggscatmat(Weekly, color = "Direction"): Factor variables are omitted
## in plot
Weekly %>% mutate(row = row_number()) %>%
ggplot(aes(x = row, y = Volume)) +
geom_point() +
geom_smooth(se = FALSE) #Volume is the average number of daily shares traded (in billions); the plot shows it increasing over time
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Weekly.glm = glm(
Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
data = Weekly,
family = binomial)
summary(Weekly.glm)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
#Lag2 is the only predictor with a p-value below 0.05 (p = 0.0296), and its coefficient is positive
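As a sketch (not run here), the Wald-test p-values can also be pulled straight from the fitted model to confirm that only Lag2 falls below the 0.05 level among the predictors:
sort(coef(summary(Weekly.glm))[, "Pr(>|z|)"]) #p-values for the intercept and each predictor, smallest first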
Weekly.prob = predict(Weekly.glm, type = "response")
Weekly.pred = rep("Down", length(Weekly.prob))
Weekly.pred[Weekly.prob > 0.5] = "Up"
CrossTable(Weekly.pred, Direction)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1089
##
##
## | Direction
## Weekly.pred | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 54 | 48 | 102 |
## | 1.657 | 1.325 | |
## | 0.529 | 0.471 | 0.094 |
## | 0.112 | 0.079 | |
## | 0.050 | 0.044 | |
## -------------|-----------|-----------|-----------|
## Up | 430 | 557 | 987 |
## | 0.171 | 0.137 | |
## | 0.436 | 0.564 | 0.906 |
## | 0.888 | 0.921 | |
## | 0.395 | 0.511 | |
## -------------|-----------|-----------|-----------|
## Column Total | 484 | 605 | 1089 |
## | 0.444 | 0.556 | |
## -------------|-----------|-----------|-----------|
##
##
#The model correctly predicts only 11.2% of the weeks when the market moves down, but 92.1% of the weeks when it moves up.
0.511+0.050
## [1] 0.561
#The accuracy rate is 56.1%.
0.395+0.044
## [1] 0.439
#The error rate is 43.9%.
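The same training accuracy and error rate can be computed directly from the predictions rather than by summing table proportions (a quick sketch, not run here):
mean(Weekly.pred == Direction) #overall fraction classified correctly
mean(Weekly.pred != Direction) #overall fraction misclassified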
train = (Year<2009)
Weekly.train = Weekly[train,]
Weekly.test = Weekly[!train,]
Direction.train = Weekly.train$Direction
Direction.test = Weekly.test$Direction
Weekly.lrm = glm(Direction ~ Lag2,
data = Weekly,
family = binomial,
subset = train)
lrm.prob = predict(Weekly.lrm, Weekly.test, type = "response")
lrm.pred = rep("Down", length(lrm.prob))
lrm.pred[lrm.prob > 0.5] = "Up"
CrossTable(lrm.pred, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## lrm.pred | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 9 | 5 | 14 |
## | 1.782 | 1.256 | |
## | 0.643 | 0.357 | 0.135 |
## | 0.209 | 0.082 | |
## | 0.087 | 0.048 | |
## -------------|-----------|-----------|-----------|
## Up | 34 | 56 | 90 |
## | 0.277 | 0.195 | |
## | 0.378 | 0.622 | 0.865 |
## | 0.791 | 0.918 | |
## | 0.327 | 0.538 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
##
##
mean(lrm.pred == Direction.test)
## [1] 0.625
#62.5%
Weekly.lda = lda(Direction ~ Lag2,
data = Weekly,
subset = train)
lda.prob = predict(Weekly.lda, Weekly.test)$class
CrossTable(lda.prob, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## lda.prob | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 9 | 5 | 14 |
## | 1.782 | 1.256 | |
## | 0.643 | 0.357 | 0.135 |
## | 0.209 | 0.082 | |
## | 0.087 | 0.048 | |
## -------------|-----------|-----------|-----------|
## Up | 34 | 56 | 90 |
## | 0.277 | 0.195 | |
## | 0.378 | 0.622 | 0.865 |
## | 0.791 | 0.918 | |
## | 0.327 | 0.538 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
##
##
mean(lda.prob == Direction.test)
## [1] 0.625
#62.5%
Weekly.qda = qda(Direction ~ Lag2,
data = Weekly,
subset = train)
qda.prob = predict(Weekly.qda, Weekly.test)$class
CrossTable(qda.prob, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## qda.prob | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Up | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## -------------|-----------|-----------|-----------|
##
##
mean(qda.prob == Direction.test)
## [1] 0.5865385
#58.7%
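For context, the confusion matrix shows QDA predicting Up for every test week, so its 58.7% accuracy is simply the share of Up weeks in the 2009-2010 test period; a quick check (not run here):
mean(Direction.test == "Up") #proportion of Up weeks in the held-out data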
knn.train = as.matrix(Lag2[train])
knn.test = as.matrix(Lag2[!train])
set.seed(1)
knn.pred = knn(knn.train, knn.test, Direction.train, k = 1)
CrossTable(knn.pred, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## knn.pred | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 21 | 30 | 51 |
## | 0.000 | 0.000 | |
## | 0.412 | 0.588 | 0.490 |
## | 0.488 | 0.492 | |
## | 0.202 | 0.288 | |
## -------------|-----------|-----------|-----------|
## Up | 22 | 31 | 53 |
## | 0.000 | 0.000 | |
## | 0.415 | 0.585 | 0.510 |
## | 0.512 | 0.508 | |
## | 0.212 | 0.298 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred == Direction.test)
## [1] 0.5
#50%
nb.fit = naiveBayes(Direction ~ Lag2,
data = Weekly,
subset = train)
nb.pred = predict(nb.fit, Weekly.test)
mean(nb.pred == Direction.test)
## [1] 0.5865385
#58.7%
#Logistic regression and LDA perform best on the held-out data, each with 62.5% test accuracy.
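Collecting the held-out accuracies computed above into one named vector makes the comparison easier to scan (a sketch, not run here; the names are illustrative):
c(logistic = mean(lrm.pred == Direction.test),
  lda = mean(lda.prob == Direction.test),
  qda = mean(qda.prob == Direction.test),
  knn1 = mean(knn.pred == Direction.test),
  naive.bayes = mean(nb.pred == Direction.test))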
#Logistic regression with Lag5 as the sole predictor
Weekly.lrm2 = glm(
Direction ~ Lag5,
data = Weekly,
family=binomial,
subset = train)
Weekly.prob2 = predict(Weekly.lrm2, Weekly.test, type = "response")
Weekly.pred2 = rep("Down", length(Weekly.prob2))
Weekly.pred2[Weekly.prob2 > 0.5] = "Up"
CrossTable(Weekly.pred2, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## Weekly.pred2 | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 0 | 3 | 3 |
## | 1.240 | 0.874 | |
## | 0.000 | 1.000 | 0.029 |
## | 0.000 | 0.049 | |
## | 0.000 | 0.029 | |
## -------------|-----------|-----------|-----------|
## Up | 43 | 58 | 101 |
## | 0.037 | 0.026 | |
## | 0.426 | 0.574 | 0.971 |
## | 1.000 | 0.951 | |
## | 0.413 | 0.558 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
##
##
#LDA with Lag5 as the sole predictor
Weekly.lda2 = lda(Direction ~ Lag5,
data = Weekly,
subset = train)
lda.prob2 = predict(Weekly.lda2, Weekly.test)$class
CrossTable(lda.prob2, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## lda.prob2 | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 0 | 3 | 3 |
## | 1.240 | 0.874 | |
## | 0.000 | 1.000 | 0.029 |
## | 0.000 | 0.049 | |
## | 0.000 | 0.029 | |
## -------------|-----------|-----------|-----------|
## Up | 43 | 58 | 101 |
## | 0.037 | 0.026 | |
## | 0.426 | 0.574 | 0.971 |
## | 1.000 | 0.951 | |
## | 0.413 | 0.558 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
##
##
mean(lda.prob2 == Direction.test)
## [1] 0.5576923
#K=10, Lag5
knn.train2 = as.matrix(Lag5[train])
knn.test2 = as.matrix(Lag5[!train])
set.seed(1)
knn.pred2 = knn(knn.train2, knn.test2, Direction.train, k = 10)
CrossTable(knn.pred2, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## knn.pred2 | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 21 | 25 | 46 |
## | 0.206 | 0.145 | |
## | 0.457 | 0.543 | 0.442 |
## | 0.488 | 0.410 | |
## | 0.202 | 0.240 | |
## -------------|-----------|-----------|-----------|
## Up | 22 | 36 | 58 |
## | 0.164 | 0.115 | |
## | 0.379 | 0.621 | 0.558 |
## | 0.512 | 0.590 | |
## | 0.212 | 0.346 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred2 == Direction.test)
## [1] 0.5480769
#K=100, Lag5
knn.train3 = as.matrix(Lag5[train])
knn.test3 = as.matrix(Lag5[!train])
set.seed(1)
knn.pred3 = knn(knn.train3, knn.test3, Direction.train, k = 100)
CrossTable(knn.pred3, Direction.test)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 104
##
##
## | Direction.test
## knn.pred3 | Down | Up | Row Total |
## -------------|-----------|-----------|-----------|
## Down | 10 | 21 | 31 |
## | 0.619 | 0.437 | |
## | 0.323 | 0.677 | 0.298 |
## | 0.233 | 0.344 | |
## | 0.096 | 0.202 | |
## -------------|-----------|-----------|-----------|
## Up | 33 | 40 | 73 |
## | 0.263 | 0.185 | |
## | 0.452 | 0.548 | 0.702 |
## | 0.767 | 0.656 | |
## | 0.317 | 0.385 | |
## -------------|-----------|-----------|-----------|
## Column Total | 43 | 61 | 104 |
## | 0.413 | 0.587 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred3 == Direction.test)
## [1] 0.4807692
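Instead of refitting KNN by hand for each k, a short loop can sweep a grid of values; a sketch (not run above), with k.grid chosen arbitrarily for illustration:
set.seed(1)
k.grid = c(1, 5, 10, 25, 50, 100)
sapply(k.grid, function(k)
  mean(knn(knn.train2, knn.test2, Direction.train, k = k) == Direction.test)) #test accuracy for each k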
detach(Weekly)
Question 14 develops a model to predict whether a given car gets high or low gas mileage based on the Auto data set.
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
Auto$country = factor(Auto$origin, labels = c("American", "European", "Japanese")) #converting the origin numbers to the countries
with(Auto, table(country, origin)) #number of observations for each country/origin
## origin
## country 1 2 3
## American 245 0 0
## European 0 68 0
## Japanese 0 0 79
auto = Auto %>%
  mutate(mpg01 = ifelse(mpg > median(mpg), 1, 0)) #binary variable: 1 if mpg is above the median, 0 otherwise
attach(auto)
## The following object is masked from package:ggplot2:
##
## mpg
summary(auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
## country mpg01
## American:245 Min. :0.0
## European: 68 1st Qu.:0.0
## Japanese: 79 Median :0.5
## Mean :0.5
## 3rd Qu.:1.0
## Max. :1.0
##
cor(auto[, -c(9, 10)])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
corrplot(cor(auto[, -c(9, 10)]),
method = 'color',
order = 'hclust', addrect = 2,
tl.col = 'black', addCoef.col = 'black', number.cex = 0.65)
#cylinders, displacement, and weight show strong negative correlations with mpg01; horsepower also shows a sizable negative correlation
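The same pattern can be read off by sorting the mpg01 column of the correlation matrix (a sketch, not run here):
sort(cor(auto[, -c(9, 10)])[, "mpg01"]) #correlations with mpg01, most negative first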
ggscatmat(auto, color = "mpg01")
## Warning in ggscatmat(auto, color = "mpg01"): Factor variables are omitted in
## plot
auto %>%
ggplot(aes(cut_number(mpg01, 2), horsepower)) +
geom_boxplot()
auto %>%
ggplot(aes(cut_number(mpg01, 2), cylinders)) +
geom_boxplot()
auto %>%
ggplot(aes(cut_number(mpg01, 2), displacement)) +
geom_boxplot()
auto %>%
ggplot(aes(cut_number(mpg01, 2), weight)) +
geom_boxplot()
train = (year %% 2 == 0)
auto.train = auto[train,]
auto.test = auto[!train,]
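A quick sanity check on the even-year/odd-year split sizes (a sketch, not run here); the test count should match the 182 observations in the tables below:
c(train = nrow(auto.train), test = nrow(auto.test))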
auto.lda = lda(mpg01 ~ horsepower + cylinders + displacement + weight,
data = auto,
subset = train)
auto.pred = predict(auto.lda, auto.test)
CrossTable(auto.pred$class, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## auto.pred$class | 0 | 1 | Row Total |
## ----------------|-----------|-----------|-----------|
## 0 | 86 | 9 | 95 |
## | 21.890 | 26.695 | |
## | 0.905 | 0.095 | 0.522 |
## | 0.860 | 0.110 | |
## | 0.473 | 0.049 | |
## ----------------|-----------|-----------|-----------|
## 1 | 14 | 73 | 87 |
## | 23.902 | 29.149 | |
## | 0.161 | 0.839 | 0.478 |
## | 0.140 | 0.890 | |
## | 0.077 | 0.401 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## ----------------|-----------|-----------|-----------|
##
##
mean(auto.pred$class == auto.test$mpg01)
## [1] 0.8736264
1-mean(auto.pred$class == auto.test$mpg01) #test error rate
## [1] 0.1263736
auto.qda = qda(mpg01 ~ horsepower + cylinders + displacement + weight,
data = auto,
subset = train)
qda.pred = predict(auto.qda, auto.test)
CrossTable(qda.pred$class, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## qda.pred$class | 0 | 1 | Row Total |
## ---------------|-----------|-----------|-----------|
## 0 | 89 | 13 | 102 |
## | 19.379 | 23.633 | |
## | 0.873 | 0.127 | 0.560 |
## | 0.890 | 0.159 | |
## | 0.489 | 0.071 | |
## ---------------|-----------|-----------|-----------|
## 1 | 11 | 69 | 80 |
## | 24.709 | 30.133 | |
## | 0.138 | 0.863 | 0.440 |
## | 0.110 | 0.841 | |
## | 0.060 | 0.379 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## ---------------|-----------|-----------|-----------|
##
##
mean(qda.pred$class == auto.test$mpg01)
## [1] 0.8681319
1-mean(qda.pred$class == auto.test$mpg01) #test error rate
## [1] 0.1318681
auto.lrm = glm(mpg01 ~ horsepower + cylinders + displacement + weight,
data = auto,
family = binomial,
subset = train)
auto.prob = predict(auto.lrm, auto.test, type = "response")
auto.pred = rep(0, length(auto.prob))
auto.pred[auto.prob > 0.5] = 1
CrossTable(auto.pred, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## auto.pred | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 89 | 11 | 100 |
## | 21.107 | 25.741 | |
## | 0.890 | 0.110 | 0.549 |
## | 0.890 | 0.134 | |
## | 0.489 | 0.060 | |
## -------------|-----------|-----------|-----------|
## 1 | 11 | 71 | 82 |
## | 25.741 | 31.391 | |
## | 0.134 | 0.866 | 0.451 |
## | 0.110 | 0.866 | |
## | 0.060 | 0.390 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(auto.pred == auto.test$mpg01)
## [1] 0.8791209
1-mean(auto.pred == auto.test$mpg01) #test error rate
## [1] 0.1208791
nb.fit = naiveBayes(mpg01 ~ horsepower + cylinders + displacement + weight,
data = auto,
subset = train)
nb.pred = predict(nb.fit, auto.test)
CrossTable(nb.pred, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## nb.pred | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 88 | 11 | 99 |
## | 20.760 | 25.317 | |
## | 0.889 | 0.111 | 0.544 |
## | 0.880 | 0.134 | |
## | 0.484 | 0.060 | |
## -------------|-----------|-----------|-----------|
## 1 | 12 | 71 | 83 |
## | 24.762 | 30.198 | |
## | 0.145 | 0.855 | 0.456 |
## | 0.120 | 0.866 | |
## | 0.066 | 0.390 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(nb.pred == auto.test$mpg01)
## [1] 0.8736264
1 - mean(nb.pred == auto.test$mpg01) #test error rate
## [1] 0.1263736
knn.train = cbind(horsepower, cylinders, displacement, weight)[train,]
knn.test = cbind(horsepower, cylinders, displacement, weight)[!train,]
set.seed(1)
#KNN = 1
knn.pred1 = knn(knn.train, knn.test, auto.train$mpg01, k = 1)
CrossTable(knn.pred1, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## knn.pred1 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 83 | 11 | 94 |
## | 19.031 | 23.209 | |
## | 0.883 | 0.117 | 0.516 |
## | 0.830 | 0.134 | |
## | 0.456 | 0.060 | |
## -------------|-----------|-----------|-----------|
## 1 | 17 | 71 | 88 |
## | 20.329 | 24.791 | |
## | 0.193 | 0.807 | 0.484 |
## | 0.170 | 0.866 | |
## | 0.093 | 0.390 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred1 == auto.test$mpg01)
## [1] 0.8461538
1 - mean(knn.pred1 == auto.test$mpg01) #test error rate
## [1] 0.1538462
#KNN = 3
knn.pred3 = knn(knn.train, knn.test, auto.train$mpg01, k = 3)
CrossTable(knn.pred3, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## knn.pred3 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 84 | 9 | 93 |
## | 21.184 | 25.834 | |
## | 0.903 | 0.097 | 0.511 |
## | 0.840 | 0.110 | |
## | 0.462 | 0.049 | |
## -------------|-----------|-----------|-----------|
## 1 | 16 | 73 | 89 |
## | 22.136 | 26.995 | |
## | 0.180 | 0.820 | 0.489 |
## | 0.160 | 0.890 | |
## | 0.088 | 0.401 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred3 == auto.test$mpg01)
## [1] 0.8626374
1 - mean(knn.pred3 == auto.test$mpg01) #test error rate
## [1] 0.1373626
#KNN = 5
knn.pred5 = knn(knn.train, knn.test, auto.train$mpg01, k = 5)
CrossTable(knn.pred5, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## knn.pred5 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 82 | 9 | 91 |
## | 20.480 | 24.976 | |
## | 0.901 | 0.099 | 0.500 |
## | 0.820 | 0.110 | |
## | 0.451 | 0.049 | |
## -------------|-----------|-----------|-----------|
## 1 | 18 | 73 | 91 |
## | 20.480 | 24.976 | |
## | 0.198 | 0.802 | 0.500 |
## | 0.180 | 0.890 | |
## | 0.099 | 0.401 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred5 == auto.test$mpg01)
## [1] 0.8516484
1 - mean(knn.pred5 == auto.test$mpg01) #test error rate
## [1] 0.1483516
#KNN = 10
knn.pred10 = knn(knn.train, knn.test, auto.train$mpg01, k = 10)
CrossTable(knn.pred10, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## knn.pred10 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 79 | 7 | 86 |
## | 21.330 | 26.012 | |
## | 0.919 | 0.081 | 0.473 |
## | 0.790 | 0.085 | |
## | 0.434 | 0.038 | |
## -------------|-----------|-----------|-----------|
## 1 | 21 | 75 | 96 |
## | 19.108 | 23.302 | |
## | 0.219 | 0.781 | 0.527 |
## | 0.210 | 0.915 | |
## | 0.115 | 0.412 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred10 == auto.test$mpg01)
## [1] 0.8461538
1 - mean(knn.pred10 == auto.test$mpg01) #test error rate
## [1] 0.1538462
#KNN = 50
knn.pred50 = knn(knn.train, knn.test, auto.train$mpg01, k = 50)
CrossTable(knn.pred50, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## knn.pred50 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 80 | 7 | 87 |
## | 21.687 | 26.448 | |
## | 0.920 | 0.080 | 0.478 |
## | 0.800 | 0.085 | |
## | 0.440 | 0.038 | |
## -------------|-----------|-----------|-----------|
## 1 | 20 | 75 | 95 |
## | 19.861 | 24.221 | |
## | 0.211 | 0.789 | 0.522 |
## | 0.200 | 0.915 | |
## | 0.110 | 0.412 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred50 == auto.test$mpg01)
## [1] 0.8516484
1 - mean(knn.pred50 == auto.test$mpg01) #test error rate
## [1] 0.1483516
#KNN = 100
knn.pred100 = knn(knn.train, knn.test, auto.train$mpg01, k = 100)
CrossTable(knn.pred100, auto.test$mpg01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 182
##
##
## | auto.test$mpg01
## knn.pred100 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 81 | 7 | 88 |
## | 22.045 | 26.884 | |
## | 0.920 | 0.080 | 0.484 |
## | 0.810 | 0.085 | |
## | 0.445 | 0.038 | |
## -------------|-----------|-----------|-----------|
## 1 | 19 | 75 | 94 |
## | 20.638 | 25.168 | |
## | 0.202 | 0.798 | 0.516 |
## | 0.190 | 0.915 | |
## | 0.104 | 0.412 | |
## -------------|-----------|-----------|-----------|
## Column Total | 100 | 82 | 182 |
## | 0.549 | 0.451 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred100 == auto.test$mpg01)
## [1] 0.8571429
1 - mean(knn.pred100 == auto.test$mpg01) #test error rate
## [1] 0.1428571
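The six KNN fits above can be collapsed into one sweep over k (a sketch, not run above; k.grid is an illustrative name):
set.seed(1)
k.grid = c(1, 3, 5, 10, 50, 100)
sapply(k.grid, function(k)
  mean(knn(knn.train, knn.test, auto.train$mpg01, k = k) == auto.test$mpg01)) #test accuracy for each k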
detach(auto)
Question 16 fits classification models on the Boston data set to predict whether a given census tract has a crime rate above or below the median.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
boston = Boston %>%
mutate(crim01 = ifelse(crim > median(crim), 1, 0))
names(boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
## [15] "crim01"
attach(boston)
summary(boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv crim01
## Min. : 1.73 Min. : 5.00 Min. :0.0
## 1st Qu.: 6.95 1st Qu.:17.02 1st Qu.:0.0
## Median :11.36 Median :21.20 Median :0.5
## Mean :12.65 Mean :22.53 Mean :0.5
## 3rd Qu.:16.95 3rd Qu.:25.00 3rd Qu.:1.0
## Max. :37.97 Max. :50.00 Max. :1.0
#CORRELATION
cor(boston)
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## crim01 0.40939545 -0.43615103 0.60326017 0.070096774 0.72323480
## rm age dis rad tax ptratio
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431 0.2899456
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332 -0.3916785
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018 0.3832476
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652 -0.1215152
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320 0.1889327
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783 -0.3555015
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559 0.2615150
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158 -0.2324705
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819 0.4647412
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000 0.4608530
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304 1.0000000
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801 -0.1773833
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341 0.3740443
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593 -0.5077867
## crim01 -0.15637178 0.61393992 -0.61634164 0.619786249 0.60874128 0.2535684
## black lstat medv crim01
## crim -0.38506394 0.4556215 -0.3883046 0.40939545
## zn 0.17552032 -0.4129946 0.3604453 -0.43615103
## indus -0.35697654 0.6037997 -0.4837252 0.60326017
## chas 0.04878848 -0.0539293 0.1752602 0.07009677
## nox -0.38005064 0.5908789 -0.4273208 0.72323480
## rm 0.12806864 -0.6138083 0.6953599 -0.15637178
## age -0.27353398 0.6023385 -0.3769546 0.61393992
## dis 0.29151167 -0.4969958 0.2499287 -0.61634164
## rad -0.44441282 0.4886763 -0.3816262 0.61978625
## tax -0.44180801 0.5439934 -0.4685359 0.60874128
## ptratio -0.17738330 0.3740443 -0.5077867 0.25356836
## black 1.00000000 -0.3660869 0.3334608 -0.35121093
## lstat -0.36608690 1.0000000 -0.7376627 0.45326273
## medv 0.33346082 -0.7376627 1.0000000 -0.26301673
## crim01 -0.35121093 0.4532627 -0.2630167 1.00000000
corrplot(cor(boston),
method = "number")
corrplot(cor(boston),
method = "square")
corrplot(cor(boston),
method = 'color',
order = 'hclust', addrect = 2,
tl.col = 'black', addCoef.col = 'black', number.cex = 0.65)
ggscatmat(boston, color = "crim01")
## Warning in ggscatmat(boston, color = "crim01"): Factor variables are omitted in
## plot
#SPLITTING THE DATASET
train = 1:(length(boston$crim)/2)
test = (length(boston$crim)/2 + 1):length(boston$crim)
boston.train = boston[train, ]
boston.test = boston[test, ]
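The split above simply takes the first 253 rows for training; the column totals in the tables below show the resulting test set is about 64% above-median tracts, so a randomized split would be a reasonable alternative (a sketch, not used below; rand.idx is an illustrative name):
set.seed(2)
rand.idx = sample(nrow(boston), nrow(boston) / 2)
#boston[rand.idx, ] and boston[-rand.idx, ] would then serve as training and test sets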
set.seed(1)
boston.glm = glm(crim01 ~ rad + tax + age + indus + nox + dis,
data = boston,
family = binomial,
subset = train)
boston.probs = predict(boston.glm, boston.test, type = "response")
boston.pred = rep(0, length(boston.probs))
boston.pred[boston.probs > 0.5] = 1
CrossTable(boston.pred, boston.test$crim01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 253
##
##
## | boston.test$crim01
## boston.pred | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 75 | 8 | 83 |
## | 70.038 | 38.671 | |
## | 0.904 | 0.096 | 0.328 |
## | 0.833 | 0.049 | |
## | 0.296 | 0.032 | |
## -------------|-----------|-----------|-----------|
## 1 | 15 | 155 | 170 |
## | 34.195 | 18.881 | |
## | 0.088 | 0.912 | 0.672 |
## | 0.167 | 0.951 | |
## | 0.059 | 0.613 | |
## -------------|-----------|-----------|-----------|
## Column Total | 90 | 163 | 253 |
## | 0.356 | 0.644 | |
## -------------|-----------|-----------|-----------|
##
##
mean(boston.pred == boston.test$crim01)
## [1] 0.9090909
1 - mean(boston.pred == boston.test$crim01) #test error
## [1] 0.09090909
boston.lda = lda(crim01 ~ rad + tax + age + indus + nox + dis,
data = boston,
subset = train)
boston.pred = predict(boston.lda, boston.test)
CrossTable(boston.pred$class, boston.test$crim01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 253
##
##
## | boston.test$crim01
## boston.pred$class | 0 | 1 | Row Total |
## ------------------|-----------|-----------|-----------|
## 0 | 81 | 18 | 99 |
## | 59.517 | 32.862 | |
## | 0.818 | 0.182 | 0.391 |
## | 0.900 | 0.110 | |
## | 0.320 | 0.071 | |
## ------------------|-----------|-----------|-----------|
## 1 | 9 | 145 | 154 |
## | 38.261 | 21.126 | |
## | 0.058 | 0.942 | 0.609 |
## | 0.100 | 0.890 | |
## | 0.036 | 0.573 | |
## ------------------|-----------|-----------|-----------|
## Column Total | 90 | 163 | 253 |
## | 0.356 | 0.644 | |
## ------------------|-----------|-----------|-----------|
##
##
mean(boston.pred$class == boston.test$crim01)
## [1] 0.8932806
1 - mean(boston.pred$class == boston.test$crim01) #test error rate
## [1] 0.1067194
nb.fit = naiveBayes(crim01 ~ rad + tax + age + indus + nox + dis,
data = boston,
subset = train)
nb.pred = predict(nb.fit, boston.test)
CrossTable(nb.pred, boston.test$crim01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 253
##
##
## | boston.test$crim01
## nb.pred | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 77 | 13 | 90 |
## | 63.206 | 34.899 | |
## | 0.856 | 0.144 | 0.356 |
## | 0.856 | 0.080 | |
## | 0.304 | 0.051 | |
## -------------|-----------|-----------|-----------|
## 1 | 13 | 150 | 163 |
## | 34.899 | 19.269 | |
## | 0.080 | 0.920 | 0.644 |
## | 0.144 | 0.920 | |
## | 0.051 | 0.593 | |
## -------------|-----------|-----------|-----------|
## Column Total | 90 | 163 | 253 |
## | 0.356 | 0.644 | |
## -------------|-----------|-----------|-----------|
##
##
mean(nb.pred == boston.test$crim01)
## [1] 0.8972332
1 - mean(nb.pred == boston.test$crim01) #test error rate
## [1] 0.1027668
knn.train = cbind(rad, tax, age, indus, nox, dis)[train,]
knn.test = cbind(rad, tax, age, indus, nox, dis)[test,]
set.seed(1)
train.crim01 = crim01[train]
#KNN = 1
knn.pred1 = knn(knn.train, knn.test, train.crim01, k = 1)
CrossTable(knn.pred1, boston.test$crim01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 253
##
##
## | boston.test$crim01
## knn.pred1 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 82 | 151 | 233 |
## | 0.009 | 0.005 | |
## | 0.352 | 0.648 | 0.921 |
## | 0.911 | 0.926 | |
## | 0.324 | 0.597 | |
## -------------|-----------|-----------|-----------|
## 1 | 8 | 12 | 20 |
## | 0.110 | 0.061 | |
## | 0.400 | 0.600 | 0.079 |
## | 0.089 | 0.074 | |
## | 0.032 | 0.047 | |
## -------------|-----------|-----------|-----------|
## Column Total | 90 | 163 | 253 |
## | 0.356 | 0.644 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred1 == boston.test$crim01)
## [1] 0.3715415
1 - mean(knn.pred1 == boston.test$crim01) #test error rate
## [1] 0.6284585
#KNN = 3
knn.pred1 = knn(knn.train, knn.test, train.crim01, k = 3)
CrossTable(knn.pred1, boston.test$crim01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 253
##
##
## | boston.test$crim01
## knn.pred1 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 80 | 23 | 103 |
## | 51.311 | 28.331 | |
## | 0.777 | 0.223 | 0.407 |
## | 0.889 | 0.141 | |
## | 0.316 | 0.091 | |
## -------------|-----------|-----------|-----------|
## 1 | 10 | 140 | 150 |
## | 35.234 | 19.454 | |
## | 0.067 | 0.933 | 0.593 |
## | 0.111 | 0.859 | |
## | 0.040 | 0.553 | |
## -------------|-----------|-----------|-----------|
## Column Total | 90 | 163 | 253 |
## | 0.356 | 0.644 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred1 == boston.test$crim01)
## [1] 0.8695652
1 - mean(knn.pred1 == boston.test$crim01) #test error rate
## [1] 0.1304348
#KNN = 10
knn.pred1 = knn(knn.train, knn.test, train.crim01, k = 10)
CrossTable(knn.pred1, boston.test$crim01)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 253
##
##
## | boston.test$crim01
## knn.pred1 | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 83 | 23 | 106 |
## | 54.403 | 30.039 | |
## | 0.783 | 0.217 | 0.419 |
## | 0.922 | 0.141 | |
## | 0.328 | 0.091 | |
## -------------|-----------|-----------|-----------|
## 1 | 7 | 140 | 147 |
## | 39.230 | 21.660 | |
## | 0.048 | 0.952 | 0.581 |
## | 0.078 | 0.859 | |
## | 0.028 | 0.553 | |
## -------------|-----------|-----------|-----------|
## Column Total | 90 | 163 | 253 |
## | 0.356 | 0.644 | |
## -------------|-----------|-----------|-----------|
##
##
mean(knn.pred1 == boston.test$crim01)
## [1] 0.8814229
1 - mean(knn.pred1 == boston.test$crim01) #test error rate
## [1] 0.1185771
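KNN here uses raw Euclidean distances, so large-scale predictors such as tax (values in the hundreds) and age (up to 100) dominate nox (roughly 0.4 to 0.9), which is one plausible reason k = 1 performed so poorly; standardizing the predictors first is a common remedy (a sketch, not run above; X and knn.pred.sc are illustrative names):
X = scale(cbind(rad, tax, age, indus, nox, dis)) #center and scale each predictor
set.seed(1)
knn.pred.sc = knn(X[train, ], X[test, ], train.crim01, k = 1)
mean(knn.pred.sc == boston.test$crim01) #test accuracy with standardized predictors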