Boosting

I will implement the boosting method with decision stumps as weak learners. A decision stump is the first node of a tree classifier: it splits the data once, by comparing a single coordinate against a threshold.
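
To make the idea concrete, the snippet below is a minimal sketch of such a stump (the function name stump_predict and its arguments are only illustrative, not part of the homework code): it classifies a point by comparing one coordinate of x against a threshold theta.

# A decision stump: compare the j-th feature against a threshold theta.
# Ties (x[, j] == theta) are sent to class +1, mirroring the
# sign(sign(.) + 0.1) trick used in the AdaBoost code below.
stump_predict <- function(x, j, theta) {
    sign(sign(x[, j] - theta) + 0.1)
}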

I write a function called AdaBoost, which combines two steps: training the weak learners and weighting them into a single classifier. Its input arguments are the training data, their labels, and the number of weak learners I would like to have in my boosting.

I divide the data into two sets by randomly picking out 10% of the observations, which will serve as test data.

rm(list = ls())
setwd("~/Desktop/Homework_backup/machinelearning/uspsdata")
y <- read.table("uspscl.txt")    # class labels (+1 / -1)
x <- read.table("uspsdata.txt")  # feature matrix, one observation per row
index <- sample(1:200, 20)       # randomly hold out 20 of the 200 observations (10%)
x1 <- as.matrix(x[-index, ])     # training data
y1 <- as.matrix(y[-index, ])     # training labels
x2 <- as.matrix(x[index, ])      # test data
y2 <- as.matrix(y[index, ])      # test labels
AdaBoost <- function(x, y, B) {
    x <- as.matrix(x)
    y <- as.numeric(as.matrix(y))
    p <- ncol(x)
    n <- nrow(x)
    w <- rep(1/n, n)     # start with uniform weights on the training points
    alpha <- c()         # voting weight of each weak learner
    J <- c()             # feature index chosen at each round
    theta <- c()         # threshold chosen at each round
    s <- c()             # orientation (+1 or -1) of each stump
    for (b in 1:B) {
        theta1 <- c()
        cum2 <- c()
        for (j in 1:p) {
            # sort the j-th feature and accumulate the signed weights w*y;
            # the position where |cumsum| is largest gives the best threshold
            xwa <- cbind(x[, j], w, y)
            xwa <- xwa[order(xwa[, 1]), ]
            cum <- cumsum(xwa[, 2] * xwa[, 3])
            best <- which.max(abs(cum))
            theta1[j] <- xwa[best, 1]
            cum2[j] <- abs(cum[best])
        }
        J[b] <- which.max(cum2)   # feature giving the best stump in this round
        theta[b] <- theta1[J[b]]
        # stump prediction: points with x[, J[b]] >= theta[b] get class +1
        m <- sign(sign(x[, J[b]] - theta[b]) + 0.1)
        indicator <- as.numeric(m != y)
        e <- sum(w * indicator)/sum(w)    # weighted training error of the stump
        s[b] <- 1
        if (e > 0.5) {            # flip the stump if it does worse than chance
            s[b] <- -1
            m <- -m
            indicator <- as.numeric(m != y)
            e <- 1 - e
        }
        alpha[b] <- log((1 - e)/e)
        # increase the weights of the points the current stump misclassifies
        w <- w * exp(alpha[b] * indicator)
    }
    return(list(alpha = alpha, theta = theta, J = J, s = s))
}
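
As a usage sketch (assuming the 90%/10% split created above), the boosting classifier can be trained on the training part as follows; the choice of 10 weak learners is arbitrary.

# Train a boosting classifier with 10 decision stumps on the training split.
# The returned list holds the voting weights (alpha), thresholds (theta),
# chosen feature indices (J) and stump orientations (s).
ada <- AdaBoost(x1, y1, 10)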


Evaluation <- function(ada, x, y) {
    x <- as.matrix(x)
    y <- as.numeric(as.matrix(y))
    n <- nrow(x)
    alpha <- ada$alpha
    theta <- ada$theta
    J <- ada$J
    s <- ada$s
    B <- length(alpha)
    m <- matrix(nrow = n, ncol = B)
    for (b in 1:B) {
        # apply the b-th stump with the orientation learned during training;
        # if x[i, J[b]] = theta[b], the point falls in class '+1'
        m[, b] <- s[b] * sign(sign(x[, J[b]] - theta[b]) + 0.1)
    }
    # combine the stumps by a weighted majority vote
    M <- m %*% alpha
    L <- sign(M)
    misrate <- length(which(L != y))/n
    return(list(L, misrate))
}
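
Continuing the sketch above, Evaluation can then report the misclassification rate of the trained classifier on both the training split and the held-out 10%:

# Misclassification rate on the training split and on the held-out test split.
Evaluation(ada, x1, y1)[[2]]
Evaluation(ada, x2, y2)[[2]]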

Cross Validation

I do cross-validation to get a better sense of the performance of the boosting method. The cross-validation is done by a function called CV, which splits the data into 5 folds and, for each fold, trains on the other 4 and evaluates on the held-out fold.

CV <- function(x, y, b) {
    n <- nrow(x)
    n5 <- n/5          # size of each of the 5 folds
    mistrain <- c()
    mistest <- c()
    for (i in 1:5) {
        fold <- (n5 * (i - 1) + 1):(n5 * i)   # indices of the i-th fold
        xtrain <- x[-fold, ]
        ytrain <- y[-fold, 1]
        xtest <- x[fold, ]
        ytest <- y[fold, 1]
        ada <- AdaBoost(xtrain, ytrain, b)
        mistrain[i] <- Evaluation(ada, xtrain, ytrain)[[2]]
        mistest[i] <- Evaluation(ada, xtest, ytest)[[2]]
    }
    Mistrain <- mean(mistrain)
    Mistest <- mean(mistest)
    return(c(Mistrain, Mistest))
}
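
For example, a single call with 5 weak learners (an arbitrary choice, shown only as a usage sketch) returns the average training and testing misclassification rates over the 5 folds:

# Average 5-fold training and testing error with 5 decision stumps.
CV(x, y, 5)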

Finally, I compute pairs of training and testing misclassification rates for an increasing number of weak learners and draw them on the same plot.

plotmisrate <- function(x, y, B) {
    train <- c()
    test <- c()
    for (b in 1:B) {
        cv <- CV(x, y, b)     # run 5-fold cross-validation with b weak learners
        train[b] <- cv[1]
        test[b] <- cv[2]
    }
    plot(train, pch = "*", col = 2, xlab = "Number of iterations", ylab = "Misclassification rate", 
        ylim = c(0, 0.3))
    lines(train, lty = 2, col = 2)
    points(test, pch = "o", col = 3)
    lines(test, lty = 1, col = 3)
    legend("topleft", legend = c("Training error", "Testing error"), fill = c(2, 
        3))
}
plotmisrate(x, y, 30)

(Figure: training and testing misclassification rates versus the number of iterations.)

We can see clearly from the plot above that the test error converges to a roughly constant level instead of increasing as more weak learners are added. This is a very good property of the boosting method: it is robust to overfitting.