The synth.tr (training) and synth.te (test) data sets are in the MASS package. Sources:
Ripley, B.D. (1994) Neural networks and related methods for classification (with discussion). Journal of the Royal Statistical Society series B 56, 409–456.
Ripley, B.D. (1996) Pattern Recognition and Neural Networks. Cambridge: Cambridge University Press.
Plot the training set:
library(MASS)
library(ggplot2)
data("synth.tr")
synth.tr$yc <- as.factor(synth.tr$yc)
synth.te$yc <- as.factor(synth.te$yc)
grid <- expand.grid(
xs = seq(-1.5, 1, length = 100),
ys = seq(-0.2, 1.2, length = 100)
)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point()
fit.glm <- glm(yc~., data = synth.tr, family = "binomial")
pred <- ifelse(predict(fit.glm, newdata = synth.te, type = "response") >0.5, "1", "0")
table(pred, synth.te$yc)
##
## pred 0 1
## 0 459 73
## 1 41 427
mean(pred != synth.te$yc)
## [1] 0.114
grid$pred.glm <- ifelse(predict(fit.glm, newdata = grid, type = "response") >0.5, "1", "0")
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.glm), data = grid, alpha = .1)
Use smoothing splines:
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.14-4
fit.gam <- gam(yc~ s(xs,df = 5)+ s(ys,df = 5), data = synth.tr,
family="binomial")
pred <- ifelse(predict(fit.gam, synth.te, type = "response")>0.5, "1", "0")
table(pred, synth.te$yc)
##
## pred 0 1
## 0 457 46
## 1 43 454
mean(pred != synth.te$yc) # test error
## [1] 0.089
grid$pred.gam <- ifelse(predict(fit.gam, grid, type = "response") > 0.5, "1", "0")
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.gam), data = grid, alpha = .1)
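Compare the GAM with the linear logistic fit using an analysis-of-deviance (chi-squared) test: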
anova(fit.glm, fit.gam, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: yc ~ xs + ys
## Model 2: yc ~ s(xs, df = 5) + s(ys, df = 5)
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 247 161.42
## 2 239 120.55 7.9997 40.871 2.203e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The smooth terms give a significantly better fit than the linear logistic model. Also try an additive model that is smooth in xs but linear in ys:
fit.gam <- gam(yc ~ s(xs, df = 5) + ys, data = synth.tr,
family = "binomial")
pred <- ifelse(predict(fit.gam, synth.te, type = "response")>0.5, "1", "0")
table(pred, synth.te$yc)
##
## pred 0 1
## 0 459 49
## 1 41 451
mean(pred != synth.te$yc) # test error
## [1] 0.09
grid$pred.gam <- ifelse(predict(fit.gam, grid, type = "response") > 0.5, "1", "0")
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.gam), data = grid, alpha = .1)
Now fit a classification tree:
library(tree)
fit.tree <- tree(yc~., data = synth.tr)
plot(fit.tree)
text(fit.tree, cex=.5, pretty=0)
pred <- predict(fit.tree, newdata = synth.te, type="class")
table(pred, synth.te$yc)
##
## pred 0 1
## 0 442 97
## 1 58 403
mean(pred != synth.te$yc)
## [1] 0.155
grid$pred.tree <- predict(fit.tree, newdata = grid, type = "class")
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.tree), data = grid, alpha = .1)
Use cross-validation to prune the tree.
cvtree <- cv.tree(fit.tree, FUN=prune.misclass)
plot(cvtree$size,cvtree$dev,type="b")
w <- which.min(cvtree$dev)
cvtree$size[w]
## [1] 6
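As a sketch, we could prune to this CV-selected size, using prune.misclass so the pruning criterion matches the one used in cv.tree (output not shown):
tree.cv <- prune.misclass(fit.tree, best = cvtree$size[w])  # prune to the size with minimal CV misclassification
pred.cv <- predict(tree.cv, newdata = synth.te, type = "class")
mean(pred.cv != synth.te$yc)  # test error of the CV-sized tree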
tree1 <- prune.tree(fit.tree, best = 2)  # also try a very small tree with only 2 terminal nodes
pred <- predict(tree1, newdata = synth.te, type="class")
table(pred, synth.te$yc)
##
## pred 0 1
## 0 443 62
## 1 57 438
mean(pred != synth.te$yc)
## [1] 0.119
grid$pred.tree1 <- predict(tree1, newdata = grid, type = "class")
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.tree1), data = grid, alpha = .1)
library(randomForest)
fit.rf <- randomForest(yc~., data = synth.tr)
pred <- predict(fit.rf, synth.te)
table(pred, synth.te$yc)
##
## pred 0 1
## 0 457 60
## 1 43 440
mean(pred != synth.te$yc) # test error
## [1] 0.103
grid$pred.rf <- predict(fit.rf, newdata = grid)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.rf), data = grid, alpha = .1)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.te) +
geom_point() +
geom_point(aes(color = pred.rf), data = grid, alpha = .1)
The decision boundary in these plots is very irregular, which suggests the random forest overfits the training data.
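As a rough check (a minimal sketch using the fitted fit.rf above): the resubstitution error of a random forest is typically near zero, while its out-of-bag estimate is much closer to the test error.
mean(predict(fit.rf, newdata = synth.tr) != synth.tr$yc)  # in-sample (resubstitution) error, usually close to 0
mean(predict(fit.rf) != synth.tr$yc)                      # out-of-bag error estimate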
LDA gives a linear decision boundary and should behave much like the logistic regression above; QDA allows a quadratic boundary. Fit QDA:
fit.qda <- qda(yc~., data = synth.tr)
pred <- predict(fit.qda, synth.te)$class
table(pred, synth.te$yc)
##
## pred 0 1
## 0 454 56
## 1 46 444
mean(pred != synth.te$yc) # test error
## [1] 0.102
grid$pred.qda <- predict(fit.qda, newdata = grid)$class
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.qda), data = grid, alpha = .1)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.te) +
geom_point() +
geom_point(aes(color = pred.qda), data = grid, alpha = .1)
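For comparison, a minimal LDA sketch (lda is also in MASS; its linear boundary should be close to the logistic regression boundary, output not shown):
fit.lda <- lda(yc~., data = synth.tr)
pred.lda <- predict(fit.lda, synth.te)$class
mean(pred.lda != synth.te$yc)  # test error, expected to be close to the GLM's
grid$pred.lda <- predict(fit.lda, newdata = grid)$class
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.lda), data = grid, alpha = .1)
Next, support vector machines, starting with a radial kernel: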
library(e1071)
fit.svm <- svm(yc~., data = synth.tr, kernel = "radial")
pred <- predict(fit.svm, synth.te)
table(pred, synth.te$yc)
##
## pred 0 1
## 0 458 53
## 1 42 447
mean(pred != synth.te$yc) # test error
## [1] 0.095
grid$pred.svm <- predict(fit.svm, newdata = grid)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.svm), data = grid, alpha = .1)
We should really tune the cost and bandwidth (gamma) parameters, e.g. by cross-validation:
tune.out <- tune(svm,yc~., data = synth.tr, kernel = "radial",
ranges = list(cost = c(1,2,3), gamma = 10^seq(-4, 2, by = 1)) )
tune.out
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 2 1
##
## - best performance: 0.124
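The model refit with the selected parameters can also be taken straight from the tune object via its best.model component (a minimal sketch; it should agree with the manual refit below):
best.svm <- tune.out$best.model  # svm fitted on all of synth.tr with the selected cost and gamma
mean(predict(best.svm, synth.te) != synth.te$yc)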
fit.svm2 <- svm(yc~., data = synth.tr, kernel = "radial",
cost = 2, gamma = 1)
pred <- predict(fit.svm2, synth.te)
table(pred, synth.te$yc)
##
## pred 0 1
## 0 458 54
## 1 42 446
mean(pred != synth.te$yc) # test error
## [1] 0.096
grid$pred.svm2 <- predict(fit.svm2, newdata = grid)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.svm2), data = grid, alpha = .1)
Try out different values of the bandwidth parameter gamma:
fit.svm2 <- svm(yc~., data = synth.tr, kernel = "radial",
cost = 2, gamma = 10)
pred <- predict(fit.svm2, synth.te)
table(pred, synth.te$yc)
##
## pred 0 1
## 0 448 73
## 1 52 427
mean(pred != synth.te$yc) # test error
## [1] 0.125
grid$pred.svm2 <- predict(fit.svm2, newdata = grid)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.svm2), data = grid, alpha = .1)
fit.svm2 <- svm(yc~., data = synth.tr, kernel = "radial",
cost = 2, gamma = 0.01)
pred <- predict(fit.svm2, synth.te)
table(pred, synth.te$yc)
##
## pred 0 1
## 0 454 61
## 1 46 439
mean(pred != synth.te$yc) # test error
## [1] 0.107
grid$pred.svm2 <- predict(fit.svm2, newdata = grid)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.svm2), data = grid, alpha = .1)
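A compact sketch that sweeps gamma over a grid and records the test error for each value (same cost as above; output not shown):
gammas <- 10^seq(-3, 1, by = 1)
err <- sapply(gammas, function(g) {
  fit <- svm(yc~., data = synth.tr, kernel = "radial", cost = 2, gamma = g)
  mean(predict(fit, synth.te) != synth.te$yc)
})
data.frame(gamma = gammas, test.error = err)
Other kernels are also available; try polynomial and sigmoid kernels: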
fit.svm.poly <- svm(yc~., data = synth.tr, kernel = "polynomial")
pred <- predict(fit.svm.poly, synth.te)
table(pred, synth.te$yc)
##
## pred 0 1
## 0 421 29
## 1 79 471
mean(pred != synth.te$yc) # test error
## [1] 0.108
grid$pred.svm.poly <- predict(fit.svm.poly, newdata = grid)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.svm.poly), data = grid, alpha = .1)
fit.svm.sigmoid <- svm(yc~., data = synth.tr, kernel = "sigmoid")
pred <- predict(fit.svm.sigmoid, synth.te)
table(pred, synth.te$yc)
##
## pred 0 1
## 0 427 108
## 1 73 392
mean(pred != synth.te$yc) # test error
## [1] 0.181
grid$pred.svm.sigmoid <- predict(fit.svm.sigmoid, newdata = grid)
ggplot(aes(x = xs, y = ys, color = yc), data = synth.tr) +
geom_point() +
geom_point(aes(color = pred.svm.sigmoid), data = grid, alpha = .1)
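Overall, on this test set the additive (GAM) fits and the radial-kernel SVM give the lowest errors (roughly 0.09–0.10), QDA and the random forest are close behind, and the unpruned tree and the sigmoid-kernel SVM do worst.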