library(ISLR2)
# Load the Smarket data frame and take a first look at its structure.
data("Smarket")
# Column names of the data frame.
colnames(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
# Number of rows, then number of columns.
dim(Smarket)
## [1] 1250 9
# Per-column summary statistics; the printed result follows.
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
## Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
## Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
## Lag4 Lag5 Volume Today
## Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
## 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
## Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
## Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
## 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
## Direction
## Down:602
## Up :648
##
##
##
##
# Scatterplot matrix of every pair of columns in Smarket.
pairs(Smarket)

# Pairwise correlations of every column except Direction. Direction is
# excluded by name rather than by position (it is the 9th of 9 columns), so
# the call does not silently drop the wrong column if the column order
# changes. setdiff() keeps the remaining names in their original order.
cor(Smarket[, setdiff(names(Smarket), "Direction")])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## Lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## Lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## Lag5 Volume Today
## Year 0.029787995 0.53900647 0.030095229
## Lag1 -0.005674606 0.04090991 -0.026155045
## Lag2 -0.003557949 -0.04338321 -0.010250033
## Lag3 -0.018808338 -0.04182369 -0.002447647
## Lag4 -0.027083641 -0.04841425 -0.006899527
## Lag5 1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315 1.00000000 0.014591823
## Today -0.034860083 0.01459182 1.000000000
# Put the columns of Smarket on the search path. The call is kept because
# code elsewhere in this script may refer to columns such as Year or
# Direction by bare name.
attach(Smarket)
# Index plot of Volume against observation number. The y-axis label is
# passed explicitly so it still reads "Volume".
plot(Smarket$Volume, ylab = "Volume")

# Logistic regression (binomial family) of Direction on the five lagged
# returns and Volume, fitted to every row of Smarket.
glm.fits <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket,
  family = binomial
)
# Coefficient table, deviances and AIC; the printed result follows.
summary(glm.fits)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
# Estimated coefficients of the fitted logistic regression.
coef(glm.fits)
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## -0.126000257 -0.073073746 -0.042301344 0.011085108 0.009358938 0.010313068
## Volume
## 0.135440659
# Full coefficient table. The component name is spelled out as
# `coefficients`; `$coef` only resolves through partial name matching.
summary(glm.fits)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
## Lag1 -0.073073746 0.05016739 -1.4565986 0.1452272
## Lag2 -0.042301344 0.05008605 -0.8445733 0.3983491
## Lag3 0.011085108 0.04993854 0.2219750 0.8243333
## Lag4 0.009358938 0.04997413 0.1872757 0.8514445
## Lag5 0.010313068 0.04951146 0.2082966 0.8349974
## Volume 0.135440659 0.15835970 0.8552723 0.3924004
# p-values only, selected by column name instead of by position 4.
summary(glm.fits)$coefficients[, "Pr(>|z|)"]
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## 0.6006983 0.1452272 0.3983491 0.8243333 0.8514445 0.8349974
## Volume
## 0.3924004
# Predictions on the response (probability) scale. No newdata is supplied,
# so these are for the rows the model was fitted on.
glm.probs <- predict(glm.fits, type = "response")
# First ten fitted probabilities.
head(glm.probs, 10)
## 1 2 3 4 5 6 7 8
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565 0.4926509 0.5092292
## 9 10
## 0.5176135 0.4888378
# Dummy coding used for Direction. The printout below shows Up coded as 1,
# so the fitted probabilities are probabilities of Up.
contrasts(Smarket$Direction)
## Up
## Down 0
## Up 1
# Convert probabilities to class labels with a 0.5 cutoff. The label vector
# is sized from glm.probs instead of a hard-coded 1250, so it cannot get out
# of step with the predictions.
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
# Confusion matrix of predicted vs. actual direction. The column is read
# from Smarket directly (no dependence on attach()); naming the argument
# keeps the "Direction" header in the printed table.
table(glm.pred, Direction = Smarket$Direction)
## Direction
## glm.pred Down Up
## Down 145 141
## Up 457 507
# Fraction classified correctly: diagonal of the table over the row count.
(507 + 145) / 1250
## [1] 0.5216
# The same fraction computed directly from the label vectors.
mean(glm.pred == Smarket$Direction)
## [1] 0.5216
# Hold-out evaluation: rows with Year before 2005 are used for fitting, the
# remaining rows for testing. Columns are read from Smarket explicitly so
# the split does not depend on attach().
train <- Smarket$Year < 2005
Smarket.2005 <- Smarket[!train, ]
Direction.2005 <- Smarket$Direction[!train]
# Refit the six-predictor logistic regression on the training rows only.
glm.fits <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = Smarket, family = binomial, subset = train
)
# Predicted probabilities for the held-out rows.
glm.probs <- predict(glm.fits, Smarket.2005, type = "response")
# Label vector sized from the predictions rather than a hard-coded 252.
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
# Confusion matrix on the held-out rows.
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
# Test accuracy.
mean(glm.pred == Direction.2005)
## [1] 0.4801587
# Test error rate (one minus the accuracy above).
mean(glm.pred != Direction.2005)
## [1] 0.5198413
# Smaller logistic regression using only Lag1 and Lag2, fitted on the
# training rows selected by `train`.
glm.fits <- glm(
  Direction ~ Lag1 + Lag2,
  data = Smarket, family = binomial, subset = train
)
# Predicted probabilities for the held-out rows.
glm.probs <- predict(glm.fits, Smarket.2005, type = "response")
# Label vector sized from the predictions rather than a hard-coded 252.
glm.pred <- rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Up"
# Confusion matrix on the held-out rows.
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
# Test accuracy.
mean(glm.pred == Direction.2005)
## [1] 0.5595238
# Share of "Up" predictions that were actually Up (bottom row of the table).
106 / (106 + 76)
## [1] 0.5824176
# Predicted probabilities for two hand-picked (Lag1, Lag2) pairs.
predict(
  glm.fits,
  newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)),
  type = "response"
)
## 1 2
## 0.4791462 0.4960939
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
# Linear discriminant analysis of Direction on Lag1 and Lag2, fitted on the
# rows selected by `train`.
lda.fit <- lda(
  Direction ~ Lag1 + Lag2,
  data = Smarket,
  subset = train
)
# Priors, group means and discriminant coefficients; the output follows.
print(lda.fit)
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
# Plot margins, given as c(bottom, left, top, right).
# NOTE(review): nothing is drawn between these two par() calls, so the first
# setting is replaced before it can affect any plot. Confirm whether a plot
# call (e.g. of lda.fit) was meant to sit between them.
par(mar = c(5, 5, 2, 2))
par(mar = c(5, 4, 4, 2) + 0.1)
# Predict on the held-out rows and list the components of the result.
lda.pred <- predict(lda.fit, newdata = Smarket.2005)
print(names(lda.pred))
## [1] "class" "posterior" "x"
# Predicted class labels and their confusion matrix against the truth.
lda.class <- lda.pred[["class"]]
confusion_matrix <- table(lda.class, Smarket.2005$Direction)
print(confusion_matrix)
##
## lda.class Down Up
## Down 35 35
## Up 76 106
# Test accuracy.
accuracy <- mean(Smarket.2005$Direction == lda.class)
print(accuracy)
## [1] 0.5595238
# Number of held-out rows whose posterior probability of Down (the first
# posterior column) is at least 0.5.
count_ge_5 <- sum(lda.pred$posterior[, "Down"] >= 0.5)
print(count_ge_5)
## [1] 70
# Number of held-out rows whose posterior probability of Down is below 0.5.
count_lt_5 <- sum(lda.pred$posterior[, "Down"] < 0.5)
print(count_lt_5)
## [1] 182
# Posterior probability of Down for the first 20 held-out rows.
print(lda.pred$posterior[1:20, "Down"])
## 999 1000 1001 1002 1003 1004 1005 1006
## 0.4901792 0.4792185 0.4668185 0.4740011 0.4927877 0.4938562 0.4951016 0.4872861
## 1007 1008 1009 1010 1011 1012 1013 1014
## 0.4907013 0.4844026 0.4906963 0.5119988 0.4895152 0.4706761 0.4744593 0.4799583
## 1015 1016 1017 1018
## 0.4935775 0.5030894 0.4978806 0.4886331
# Predicted classes for the same 20 rows.
print(head(lda.class, 20))
## [1] Up Up Up Up Up Up Up Up Up Up Up Down Up Up Up
## [16] Up Up Down Up Up
## Levels: Down Up
# Number of held-out rows with a posterior probability of Down above 0.9.
count_gt_9 <- sum(lda.pred$posterior[, "Down"] > 0.9)
print(count_gt_9)
## [1] 0
library(MASS)
# Quadratic discriminant analysis of Direction on Lag1 and Lag2, fitted on
# the rows selected by `train`.
qda.fit <- qda(
  Direction ~ Lag1 + Lag2,
  data = Smarket,
  subset = train
)
print(qda.fit)
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
# Predicted classes for the held-out rows.
qda.class <- predict(qda.fit, newdata = Smarket.2005)[["class"]]
# Confusion matrix of predicted vs. actual direction.
confusion_matrix <- table(qda.class, Smarket.2005$Direction)
print(confusion_matrix)
##
## qda.class Down Up
## Down 30 20
## Up 81 121
# Test accuracy.
accuracy <- mean(Smarket.2005$Direction == qda.class)
print(accuracy)
## [1] 0.5992063
library(e1071)
# Naive Bayes classifier for Direction using Lag1 and Lag2, fitted on the
# rows selected by `train`.
nb.fit <- naiveBayes(
  Direction ~ Lag1 + Lag2,
  data = Smarket,
  subset = train
)
# A-priori class probabilities and per-class tables for each predictor;
# the output follows.
print(nb.fit)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## Down Up
## 0.491984 0.508016
##
## Conditional probabilities:
## Lag1
## Y [,1] [,2]
## Down 0.04279022 1.227446
## Up -0.03954635 1.231668
##
## Lag2
## Y [,1] [,2]
## Down 0.03389409 1.239191
## Up -0.03132544 1.220765
# Manual check against the naive Bayes printout: mean and standard deviation
# of Lag1 over the training rows whose Direction is "Down". The columns are
# taken from Smarket via with(), so the check does not depend on attach().
mean_Lag1_down <- with(Smarket[train, ], mean(Lag1[Direction == "Down"]))
print(mean_Lag1_down)
## [1] 0.04279022
sd_Lag1_down <- with(Smarket[train, ], sd(Lag1[Direction == "Down"]))
print(sd_Lag1_down)
## [1] 1.227446
# Predicted classes for the held-out rows and their confusion matrix.
nb.class <- predict(nb.fit, Smarket.2005)
confusion_matrix <- table(nb.class, Direction.2005)
print(confusion_matrix)
## Direction.2005
## nb.class Down Up
## Down 28 20
## Up 83 121
# Test accuracy.
accuracy <- mean(nb.class == Direction.2005)
print(accuracy)
## [1] 0.5912698
# Per-class probability estimates (type = "raw") instead of class labels;
# the first five rows are printed.
nb.preds <- predict(nb.fit, Smarket.2005, type = "raw")
print(nb.preds[1:5, ])
## Down Up
## [1,] 0.4873164 0.5126836
## [2,] 0.4762492 0.5237508
## [3,] 0.4653377 0.5346623
## [4,] 0.4748652 0.5251348
## [5,] 0.4901890 0.5098110
library(class)
# Load the Caravan data frame, which is inspected at the end of this section.
data(Caravan)
# Predictor matrices (Lag1, Lag2) for the training and held-out rows, plus
# the training labels. The columns are taken from Smarket explicitly so
# this section does not depend on attach().
train.X <- with(Smarket, cbind(Lag1, Lag2))[train, ]
test.X <- with(Smarket, cbind(Lag1, Lag2))[!train, ]
train.Direction <- Smarket$Direction[train]
# Seed the RNG before the first knn() call so the results are reproducible.
set.seed(1)
# K-nearest neighbours with k = 1.
knn.pred <- knn(train.X, test.X, train.Direction, k = 1)
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 43 58
## Up 68 83
# Test accuracy for k = 1, computed the same way as for k = 3 below.
accuracy_k1 <- mean(knn.pred == Direction.2005)
print(accuracy_k1)
## [1] 0.5
# K-nearest neighbours with k = 3 (the RNG is not re-seeded here).
knn.pred <- knn(train.X, test.X, train.Direction, k = 3)
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 48 54
## Up 63 87
# Test accuracy for k = 3.
accuracy_k3 <- mean(knn.pred == Direction.2005)
print(accuracy_k3)
## [1] 0.5357143
# Size of the Caravan data frame: rows, then columns.
dim(Caravan)
## [1] 5822 86
# Per-column summary statistics; the printed result follows.
summary(Caravan)
## MOSTYPE MAANTHUI MGEMOMV MGEMLEEF
## Min. : 1.00 Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.:10.00 1st Qu.: 1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :30.00 Median : 1.000 Median :3.000 Median :3.000
## Mean :24.25 Mean : 1.111 Mean :2.679 Mean :2.991
## 3rd Qu.:35.00 3rd Qu.: 1.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :41.00 Max. :10.000 Max. :5.000 Max. :6.000
## MOSHOOFD MGODRK MGODPR MGODOV
## Min. : 1.000 Min. :0.0000 Min. :0.000 Min. :0.00
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:4.000 1st Qu.:0.00
## Median : 7.000 Median :0.0000 Median :5.000 Median :1.00
## Mean : 5.774 Mean :0.6965 Mean :4.627 Mean :1.07
## 3rd Qu.: 8.000 3rd Qu.:1.0000 3rd Qu.:6.000 3rd Qu.:2.00
## Max. :10.000 Max. :9.0000 Max. :9.000 Max. :5.00
## MGODGE MRELGE MRELSA MRELOV
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.00
## 1st Qu.:2.000 1st Qu.:5.000 1st Qu.:0.0000 1st Qu.:1.00
## Median :3.000 Median :6.000 Median :1.0000 Median :2.00
## Mean :3.259 Mean :6.183 Mean :0.8835 Mean :2.29
## 3rd Qu.:4.000 3rd Qu.:7.000 3rd Qu.:1.0000 3rd Qu.:3.00
## Max. :9.000 Max. :9.000 Max. :7.0000 Max. :9.00
## MFALLEEN MFGEKIND MFWEKIND MOPLHOOG MOPLMIDD
## Min. :0.000 Min. :0.00 Min. :0.0 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:2.00 1st Qu.:3.0 1st Qu.:0.000 1st Qu.:2.000
## Median :2.000 Median :3.00 Median :4.0 Median :1.000 Median :3.000
## Mean :1.888 Mean :3.23 Mean :4.3 Mean :1.461 Mean :3.351
## 3rd Qu.:3.000 3rd Qu.:4.00 3rd Qu.:6.0 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :9.000 Max. :9.00 Max. :9.0 Max. :9.000 Max. :9.000
## MOPLLAAG MBERHOOG MBERZELF MBERBOER
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000
## Median :5.000 Median :2.000 Median :0.000 Median :0.0000
## Mean :4.572 Mean :1.895 Mean :0.398 Mean :0.5223
## 3rd Qu.:6.000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :9.000 Max. :9.000 Max. :5.000 Max. :9.0000
## MBERMIDD MBERARBG MBERARBO MSKA MSKB1
## Min. :0.000 Min. :0.00 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:1.000
## Median :3.000 Median :2.00 Median :2.000 Median :1.000 Median :2.000
## Mean :2.899 Mean :2.22 Mean :2.306 Mean :1.621 Mean :1.607
## 3rd Qu.:4.000 3rd Qu.:3.00 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :9.000 Max. :9.00 Max. :9.000 Max. :9.000 Max. :9.000
## MSKB2 MSKC MSKD MHHUUR
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:2.000
## Median :2.000 Median :4.000 Median :1.000 Median :4.000
## Mean :2.203 Mean :3.759 Mean :1.067 Mean :4.237
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.:2.000 3rd Qu.:7.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.000
## MHKOOP MAUT1 MAUT2 MAUT0 MZFONDS
## Min. :0.000 Min. :0.00 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:5.00 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:5.000
## Median :5.000 Median :6.00 Median :1.000 Median :2.000 Median :7.000
## Mean :4.772 Mean :6.04 Mean :1.316 Mean :1.959 Mean :6.277
## 3rd Qu.:7.000 3rd Qu.:7.00 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:8.000
## Max. :9.000 Max. :9.00 Max. :7.000 Max. :9.000 Max. :9.000
## MZPART MINKM30 MINK3045 MINK4575
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :2.000 Median :4.000 Median :3.000
## Mean :2.729 Mean :2.574 Mean :3.536 Mean :2.731
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.000
## MINK7512 MINK123M MINKGEM MKOOPKLA
## Min. :0.0000 Min. :0.0000 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:3.000
## Median :0.0000 Median :0.0000 Median :4.000 Median :4.000
## Mean :0.7961 Mean :0.2027 Mean :3.784 Mean :4.236
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:4.000 3rd Qu.:6.000
## Max. :9.0000 Max. :9.0000 Max. :9.000 Max. :8.000
## PWAPART PWABEDR PWALAND PPERSAUT
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00
## Median :0.0000 Median :0.00000 Median :0.00000 Median :5.00
## Mean :0.7712 Mean :0.04002 Mean :0.07162 Mean :2.97
## 3rd Qu.:2.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:6.00
## Max. :3.0000 Max. :6.00000 Max. :4.00000 Max. :8.00
## PBESAUT PMOTSCO PVRAAUT PAANHANG
## Min. :0.00000 Min. :0.0000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.000000 Median :0.00000
## Mean :0.04827 Mean :0.1754 Mean :0.009447 Mean :0.02096
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :7.00000 Max. :7.0000 Max. :9.000000 Max. :5.00000
## PTRACTOR PWERKT PBROM PLEVEN
## Min. :0.00000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.000 Median :0.0000
## Mean :0.09258 Mean :0.01305 Mean :0.215 Mean :0.1948
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000 3rd Qu.:0.0000
## Max. :6.00000 Max. :6.00000 Max. :6.000 Max. :9.0000
## PPERSONG PGEZONG PWAOREG PBRAND
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :2.000
## Mean :0.01374 Mean :0.01529 Mean :0.02353 Mean :1.828
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:4.000
## Max. :6.00000 Max. :3.00000 Max. :7.00000 Max. :8.000
## PZEILPL PPLEZIER PFIETS PINBOED
## Min. :0.0000000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.0008588 Mean :0.01889 Mean :0.02525 Mean :0.01563
## 3rd Qu.:0.0000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :3.0000000 Max. :6.00000 Max. :1.00000 Max. :6.00000
## PBYSTAND AWAPART AWABEDR AWALAND
## Min. :0.00000 Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.000 Median :0.00000 Median :0.00000
## Mean :0.04758 Mean :0.403 Mean :0.01477 Mean :0.02061
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :5.00000 Max. :2.000 Max. :5.00000 Max. :1.00000
## APERSAUT ABESAUT AMOTSCO AVRAAUT
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000
## Median :1.0000 Median :0.00000 Median :0.00000 Median :0.000000
## Mean :0.5622 Mean :0.01048 Mean :0.04105 Mean :0.002233
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :7.0000 Max. :4.00000 Max. :8.00000 Max. :3.000000
## AAANHANG ATRACTOR AWERKT ABROM
## Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.000000 Median :0.00000
## Mean :0.01254 Mean :0.03367 Mean :0.006183 Mean :0.07042
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :3.00000 Max. :4.00000 Max. :6.000000 Max. :2.00000
## ALEVEN APERSONG AGEZONG AWAOREG
## Min. :0.00000 Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.000000 1st Qu.:0.000000
## Median :0.00000 Median :0.000000 Median :0.000000 Median :0.000000
## Mean :0.07661 Mean :0.005325 Mean :0.006527 Mean :0.004638
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :8.00000 Max. :1.000000 Max. :1.000000 Max. :2.000000
## ABRAND AZEILPL APLEZIER AFIETS
## Min. :0.0000 Min. :0.0000000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :1.0000 Median :0.0000000 Median :0.000000 Median :0.00000
## Mean :0.5701 Mean :0.0005153 Mean :0.006012 Mean :0.03178
## 3rd Qu.:1.0000 3rd Qu.:0.0000000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :7.0000 Max. :1.0000000 Max. :2.000000 Max. :3.00000
## AINBOED ABYSTAND Purchase
## Min. :0.000000 Min. :0.00000 No :5474
## 1st Qu.:0.000000 1st Qu.:0.00000 Yes: 348
## Median :0.000000 Median :0.00000
## Mean :0.007901 Mean :0.01426
## 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :2.000000 Max. :2.00000
# Standardize every predictor column. The response, Purchase, is excluded by
# name rather than by its position (column 86), so the selection stays
# correct if the column order changes.
standardized.X <- scale(Caravan[, names(Caravan) != "Purchase"])
# The first 1000 rows form the test set; all remaining rows are used for
# training.
test <- 1:1000
train.X <- standardized.X[-test, ]
test.X <- standardized.X[test, ]
train.Y <- Caravan$Purchase[-test]
test.Y <- Caravan$Purchase[test]
# Seed the RNG before the first knn() call so the results are reproducible.
set.seed(1)
# K-nearest neighbours with k = 1 and its test error rate.
knn.pred <- knn(train.X, test.X, train.Y, k = 1)
misclassification_error <- mean(test.Y != knn.pred)
print(misclassification_error)
## [1] 0.118
# Error rate of always predicting "No", for comparison.
baseline_error <- mean(test.Y != "No")
print(baseline_error)
## [1] 0.059
table(knn.pred, test.Y)
## test.Y
## knn.pred No Yes
## No 873 50
## Yes 68 9
# K-nearest neighbours with k = 3.
knn.pred <- knn(train.X, test.X, train.Y, k = 3)
table(knn.pred, test.Y)
## test.Y
## knn.pred No Yes
## No 920 54
## Yes 21 5
# Fraction of the actual "Yes" test cases that were predicted "Yes".
correctly_classified_yes <- mean(knn.pred[test.Y == "Yes"] == "Yes")
print(correctly_classified_yes)
## [1] 0.08474576
# K-nearest neighbours with k = 5.
knn.pred <- knn(train.X, test.X, train.Y, k = 5)
table(knn.pred, test.Y)
## test.Y
## knn.pred No Yes
## No 930 55
## Yes 11 4
# Same fraction for k = 5.
correctly_classified_yes_k5 <- mean(knn.pred[test.Y == "Yes"] == "Yes")
print(correctly_classified_yes_k5)
## [1] 0.06779661
# Logistic regression of Purchase on all other columns, fitted on the rows
# not in `test`.
glm.fits <- glm(
  Purchase ~ .,
  data = Caravan, family = binomial, subset = -test
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predicted probabilities for the test rows.
glm.probs <- predict(glm.fits, Caravan[test, ], type = "response")
# Classify with a 0.5 cutoff. The label vector is sized from the predictions
# rather than a hard-coded 1000.
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.5] <- "Yes"
table(glm.pred, test.Y)
## test.Y
## glm.pred No Yes
## No 934 59
## Yes 7 0
# Classify again with a lower cutoff of 0.25, which labels more cases "Yes".
glm.pred <- rep("No", length(glm.probs))
glm.pred[glm.probs > 0.25] <- "Yes"
table(glm.pred, test.Y)
## test.Y
## glm.pred No Yes
## No 919 48
## Yes 22 11
# Fraction of the actual "Yes" test cases that were predicted "Yes" at the
# 0.25 cutoff.
correctly_classified_yes_threshold <- sum(test.Y == "Yes" & glm.pred == "Yes") / sum(test.Y == "Yes")
print(correctly_classified_yes_threshold)
## [1] 0.1864407