I will implement a boosting method with decision stumps as weak learners. A decision stump is a one-level decision tree, i.e. a classifier that splits the data on a single feature at a single threshold.
I write a function called AdaBoost that trains the weak learners and combines them. Its input arguments are the training data, their labels, and the number of weak learners B I would like to have in my boosting.
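For reference, in each boosting round b the AdaBoost function computes the weighted error of the current stump m_b, its voting weight, and the updated observation weights in the standard (discrete) AdaBoost form; this is just a restatement of what the code below does:

\[
\epsilon_b = \frac{\sum_{i=1}^{n} w_i \, \mathbf{1}\{m_b(x_i) \neq y_i\}}{\sum_{i=1}^{n} w_i}, \qquad
\alpha_b = \log\frac{1 - \epsilon_b}{\epsilon_b}, \qquad
w_i \leftarrow w_i \exp\!\big(\alpha_b \, \mathbf{1}\{m_b(x_i) \neq y_i\}\big),
\]

and the final classifier is the weighted vote \( \hat{f}(x) = \operatorname{sign}\big(\sum_{b=1}^{B} \alpha_b \, m_b(x)\big) \).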
I divide the data into two sets by randomly picking out 10% of the observations, which will serve as test data.
rm(list = ls())
setwd("~/Desktop/Homework_backup/machinelearning/uspsdata")
y <- read.table("uspscl.txt")    # class labels, coded +1 / -1
x <- read.table("uspsdata.txt")  # feature matrix, one row per observation
# Randomly hold out 10% (20 of the 200 observations) as a test set;
# call set.seed() beforehand if a reproducible split is needed.
index <- sample(1:200, 20)
x1 <- as.matrix(x[-index, ])  # training features
y1 <- as.matrix(y[-index, ])  # training labels
x2 <- as.matrix(x[index, ])   # test features
y2 <- as.matrix(y[index, ])   # test labels
AdaBoost <- function(x, y, B) {
p <- ncol(x)       # number of features
n <- nrow(x)       # number of training points
w <- rep(1/n, n)   # initial observation weights: uniform
alpha <- c()       # voting weight of each stump
J <- c()           # feature used by each stump
theta <- c()       # threshold used by each stump
for (b in 1:B) {
cum <- matrix(nrow = n, ncol = p)
theta1.index <- c()
theta1 <- c()
cum2 <- c()
# Search over all features for the best decision stump under the current weights w.
for (j in 1:p) {
# Sort the j-th feature and carry the weights and labels along.
xw <- cbind(x[, j], w, y)
xwa <- xw[order(xw[, 1]), ]
wy <- xwa[, 2] * xwa[, 3]
# The running sum of w*y is largest in absolute value at the candidate
# split point for this feature.
cum[, j] <- cumsum(wy)
theta1.index[j] <- which(abs(cum[, j]) == max(abs(cum[, j])))[1]
theta1[j] <- xwa[theta1.index[j], 1]  # candidate threshold for feature j
cum2[j] <- max(abs(cum[, j]))         # strength of that candidate split
}
# Keep the feature whose best split is strongest.
indexj <- which(cum2 == max(cum2))[1]
J[b] <- indexj
theta[b] <- theta1[indexj]
# Stump prediction: +1 if x[, J[b]] >= theta[b], otherwise -1.
m <- sign(sign(x[, J[b]] - theta[b]) + 0.1)
# Flip the orientation if the stump agrees with fewer than half of the labels.
if (length(which(y == m)) < n/2) {
m <- -m
}
indicator <- as.numeric((m - y) != 0)  # 1 where the stump errs, 0 otherwise
e <- (w %*% indicator)/sum(w)          # weighted training error
alpha[b] <- log((1 - e)/e)             # voting weight of this stump
w <- w * exp(alpha[b] * indicator)     # up-weight the misclassified points
}
return(list(alpha = alpha, theta = theta, J = J))
}
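To make the stump search inside the loop above concrete, here is a small toy run of the cumulative-sum idea; the numbers are made up purely for illustration and are not from the USPS data.
# Toy illustration of the cumulative-sum stump search (made-up numbers).
xf <- c(0.1, 0.2, 0.7, 0.9)   # one feature, already sorted
yf <- c(-1, -1, 1, 1)         # labels
wf <- rep(1/4, 4)             # uniform weights
cumsum(wf * yf)               # -0.25 -0.50 -0.25  0.00
# |cumsum| is largest at position 2, so this feature's candidate threshold
# is xf[2] = 0.2; the stump then predicts sign(sign(x - 0.2) + 0.1), i.e.
# +1 for x >= 0.2 and -1 otherwise, possibly flipped afterwards so that it
# agrees with the majority of the labels.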
Evaluation <- function(ada, x, y) {
n <- nrow(x)
alpha <- ada$alpha
theta <- ada$theta
J <- ada$J
B <- length(alpha)
m <- matrix(nrow = n, ncol = B)  # predictions of each stump
for (b in 1:B) {
# Recompute the b-th stump's prediction; if x[i, J[b]] == theta[b],
# the point is assigned to class +1.
m[, b] <- sign(sign(x[, J[b]] - theta[b]) + 0.1)
# Flip the stump's orientation if it agrees with fewer than half of the
# labels, mirroring the flip used during training.
if (length(which(y == m[, b])) < n/2) {
m[, b] <- -m[, b]
}
}
# Final classification: weighted vote of the B stumps.
M <- m %*% alpha
L <- sign(M)
misrate <- length(which(L != y))/n
return(list(L, misrate))
}
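As a quick check, the two functions can be run together on the 90%/10% split created at the beginning; the call below is only a usage sketch, and the choice of 30 boosting rounds is arbitrary.
# Usage sketch: train on the 90% split (x1, y1) and score both splits.
ada <- AdaBoost(x1, y1, 30)
Evaluation(ada, x1, y1)[[2]]  # training misclassification rate
Evaluation(ada, x2, y2)[[2]]  # misclassification rate on the held-out 10%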
Cross Validation
I do cross-validation to get a better sense of the performance of the boosting method. The 5-fold cross-validation is done by a function called CV.
CV <- function(x, y, b) {
n <- nrow(x)
n5 <- n/5   # fold size (assumes n is divisible by 5)
mistrain <- c()
mistest <- c()
for (i in 1:5) {
# The i-th fold is held out for testing; the remaining four folds train.
fold <- (n5 * (i - 1) + 1):(n5 * i)
xtrain <- x[-fold, ]
ytrain <- y[-fold, 1]
xtest <- x[fold, ]
ytest <- y[fold, 1]
ada <- AdaBoost(xtrain, ytrain, b)
mistrain[i] <- Evaluation(ada, xtrain, ytrain)[[2]]
mistest[i] <- Evaluation(ada, xtest, ytest)[[2]]
}
# Average the misclassification rates over the five folds.
return(c(mean(mistrain), mean(mistest)))
}
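For example, a single call like the one below (the choice of 10 boosting iterations is arbitrary, for illustration only) returns the average training and test misclassification rates over the five folds.
# Example call: c(mean training misrate, mean test misrate) over the 5 folds.
CV(x, y, 10)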
Get pairs of training and testing misclassification rates for increasing numbers of boosting iterations, and plot them on the same axes.
plotmisrate <- function(x, y, B) {
train <- c()
test <- c()
for (b in 1:B) {
# One cross-validation run per number of boosting iterations b.
res <- CV(x, y, b)
train[b] <- res[1]
test[b] <- res[2]
}
plot(train, pch = "*", col = 2, xlab = "Number of iterations",
ylab = "Misclassification rate", ylim = c(0, 0.3))
lines(train, lty = 2, col = 2)
points(test, pch = "o", col = 3)
lines(test, lty = 1, col = 3)
legend("topleft", legend = c("Training error", "Testing error"), fill = c(2, 3))
}
plotmisrate(x, y, 30)
The plot above illustrates a very good property of boosting: as the number of weak learners grows, the test error levels off at a roughly constant value instead of increasing, so the method is quite robust to overfitting.