Environment

rm(list=ls())
set.seed(1)
library(dplyr)
library(gridExtra)
library(RColorBrewer)
library(directlabels)
library(knitr)
library(R.matlab)
library(kableExtra)
library(ggplot2)
library(plotly)
f <- list(family = "Courier New, monospace", 
          size = 18, 
          color = "#7f7f7f")

Functions

load and show data

load <- function(path, header = FALSE){
  data <- read.csv(path, header = header)
  X <<- data[, 1:(dim(data)[2] - 1)]
  y <<- data[, dim(data)[2]]
  data
}

show dataframe

showdf <- function(df, title = ''){
  df %>% 
    kable(format = 'html', caption = title, digits = 4) %>%
    kable_styling(bootstrap_options = c('striped', 'hover'))
}

map variables to higher polynomial degree

# input matrix X must not already contain the intercept (all-ones) column
mapFeature <- function(X, degree = 1){
  m <- dim(X)[1]; n <- dim(X)[2]
  mf <- rep(1, m)
  if(n == 1){
    for (i in 1:degree) mf <- cbind(mf, X[, 1] ^ i)
  }else if(n == 2){
    for(i in 1:degree){
      for(j in 0:i) mf <- cbind(mf, X[, 1] ^ (i - j) * X[, 2] ^ j)
    }
  }
  colnames(mf) <- paste('x', 0:(dim(mf)[2] - 1), sep = '')
  return(mf)
}
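
A quick, made-up check of mapFeature: with two input features and degree = 2 the result should have the six columns x0 to x5, i.e. 1, x1, x2, x1^2, x1*x2, x2^2.

# toy input (x1 = 2, x2 = 3); expected row: 1 2 3 4 6 9
mapFeature(matrix(c(2, 3), nrow = 1), degree = 2)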

sigmoid function

\[ g(z)=\dfrac{1}{1+e^{-z}} \]

sigmoid <- function(z){
  return(1 / (1 + exp(-z)))
}

ggplot

ggplot() + 
  stat_function(fun = sigmoid) +
  xlim(-10, 10) +
  ggtitle('Sigmoid Function') +
  xlab('x') +
  ylab('f(x)')

plotly

x <- seq(-10, 10, length = 100)
plot_ly(x = x, y = sigmoid(x),
        mode = 'lines', type = 'scatter') %>%
  layout(title = 'Sigmoid Function',
         xaxis = list(title = 'x', titlefont = f),
         yaxis = list(title = 'f(x)', titlefont = f))

gradient of the sigmoid function

\[ g'(z)= \dfrac{d}{dz}g(z)= g(z)(1-g(z)) \]

sigmoidGradient <- function(z) {
    return(sigmoid(z) * (1 - sigmoid(z)))
}
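
A small sanity check (the evaluation points are arbitrary): the analytic gradient should agree with a central finite difference of sigmoid.

# compare the analytic gradient with a central finite difference of sigmoid
z <- c(-2, 0, 2)
eps <- 1e-6
cbind(analytic = sigmoidGradient(z),
      numeric  = (sigmoid(z + eps) - sigmoid(z - eps)) / (2 * eps))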

normal equation

\[ \theta= \left(X^TX+\lambda \begin{bmatrix} 0 & 0 & \cdots & 0 \\ 0 & 1 & \cdots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \cdots & 1 \\ \end{bmatrix} \right)^{-1}X^Ty \]

normalEquation <- function(X, y, lambda = 0){
  n <- dim(X)[2]
  regular <- lambda * diag(1, n)
  regular[1, 1] <- 0 
  theta <- solve(t(X) %*% X + regular) %*% t(X) %*% y
  rownames(theta) <- paste('theta', 0:(n - 1), sep = '')
  return(theta)
}
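
A minimal check on simulated data (the names xs and ys are made up): with lambda = 0 the closed-form solution should match lm().

# the closed-form solution should agree with lm() on simulated data
xs <- runif(50)
ys <- 1 + 2 * xs + rnorm(50, sd = 0.1)
cbind(normalEquation(cbind(1, xs), ys), lm = coef(lm(ys ~ xs)))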

cost function

\[ \mbox{Linear Regression}\\ h_{\theta}(x^{(i)})= (x^{(i)})^T\theta\\ J(\theta)= \dfrac{1}{2m}\sum\limits_{i=1}^m\left(h_{\theta}(x^{(i)})-y^{(i)}\right)^2+\dfrac{\lambda}{2m}\sum\limits_{j=1}^n\theta_j^2= \dfrac{1}{2m}(X\theta-y)^T(X\theta-y)+\dfrac{\lambda}{2m}\sum\limits_{j=1}^n\theta_j^2 \\ \] \[ \mbox{Logistic Regression}\\ h_{\theta}(x^{(i)}) =g(\theta^Tx^{(i)}) =\dfrac{1}{1+e^{-\theta^Tx^{(i)}}} =Pr(y=1|x^{(i)};\theta)\\ \begin{align} J(\theta) &=\dfrac{1}{m}\sum\limits_{i=1}^mCost(h_{\theta}(x^{(i)}),y^{(i)})+\frac{\lambda}{2m}\sum\limits_{j=1}^n\theta_j^2\\ &=\dfrac{1}{m}\left[\sum\limits_{i=1}^m-y^{(i)}\log h_{\theta}(x^{(i)})-(1-y^{(i)})\log (1-h_{\theta}(x^{(i)}))\right]+\frac{\lambda}{2m}\sum\limits_{j=1}^n\theta_j^2\\ &=\dfrac{1}{m}\left[-y^T\log h_{\theta}(X)-(1-y)^T\log (1-h_{\theta}(X))\right]+\frac{\lambda}{2m}\sum\limits_{j=1}^n\theta_j^2\\ &=\dfrac{1}{m}\left[-y^T\log g(X\theta)-(1-y)^T\log (1-g(X\theta))\right]+\frac{\lambda}{2m}\sum\limits_{j=1}^n\theta_j^2 \end{align} \]

costFunction <- function(X, y, theta, lambda = 0, type = c('lin', 'log')){
  type <- match.arg(type)
  m <- dim(X)[1]
  pt <- theta
  pt[1] <- 0
  A <- lambda / (2 * m) * t(pt) %*% pt
  if(type == 'lin') {
    return(t(X %*% theta - y) %*% (X %*% theta - y) / (2 * m) + A)
  }
  else if(type == 'log'){
    h <- sigmoid(X %*% theta)
    return(1 / m * (-t(y) %*% log(h) - t(1 - y) %*% log(1 - h)) + A)
  }
}
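
A quick sanity check (the toy Xt and yt below are made up): with theta = 0 every prediction is 0.5, so the unregularized logistic cost equals log(2), about 0.6931, regardless of the data.

# with theta = 0, h = 0.5 everywhere and the logistic cost is log(2)
Xt <- cbind(1, matrix(rnorm(20), 10, 2))
yt <- sample(0:1, 10, replace = TRUE)
costFunction(Xt, yt, theta = rep(0, 3), lambda = 0, type = 'log')  # ~0.6931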

gradient operator

\[ \mbox{Linear Regression}\\ \nabla_{\theta}J= \dfrac{\partial}{\partial\theta_j}J(\theta)= \dfrac{1}{m}X^T(X\theta-y)+\frac{\lambda}{m} \begin{bmatrix} 0\\ \theta_1\\ \theta_2\\ \vdots\\ \theta_n \end{bmatrix} \] \[ \mbox{Logistic Regression}\\ \nabla_{\theta}J= \dfrac{\partial}{\partial\theta_j}J(\theta)= \begin{bmatrix} \frac{1}{m}\sum_{i=1}^m\left(h_{\theta}(x^{(i)})-y^{(i)}\right)x_0^{(i)}+0\\ \frac{1}{m}\sum_{i=1}^m\left(h_{\theta}(x^{(i)})-y^{(i)}\right)x_1^{(i)}+\frac{\lambda}{m}\theta_1\\ \frac{1}{m}\sum_{i=1}^m\left(h_{\theta}(x^{(i)})-y^{(i)}\right)x_2^{(i)}+\frac{\lambda}{m}\theta_2\\ \vdots\\ \frac{1}{m}\sum_{i=1}^m\left(h_{\theta}(x^{(i)})-y^{(i)}\right)x_n^{(i)}+\frac{\lambda}{m}\theta_n \end{bmatrix} = \frac{1}{m}X^T\left(h_{\theta}(X)-y\right)+\frac{\lambda}{m} \begin{bmatrix} 0\\ \theta_1\\ \theta_2\\ \vdots\\ \theta_n \end{bmatrix} \]

gradientFunction <- function(X, y, theta, lambda = 0, type = c('lin', 'log')){
  type <- match.arg(type)
  m <- dim(X)[1]
  G <- lambda / m * theta
  G[1] <- 0
  if(type == 'lin') return(1 / m * t(X) %*% (X %*% theta - y) + G)
  else if(type == 'log'){
    h <- sigmoid(X %*% theta)
    return(1 / m * t(X) %*% (h - y) +  G)
  }
}
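
The analytic gradient can be spot-checked against a central finite difference of costFunction; the toy data below is illustrative only.

# central finite difference of costFunction vs. the analytic gradient
Xt  <- cbind(1, matrix(rnorm(40), 20, 2))
yt  <- sample(0:1, 20, replace = TRUE)
th  <- c(0.1, -0.2, 0.3)
eps <- 1e-6
num <- sapply(1:3, function(j){
  ej <- replace(rep(0, 3), j, eps)
  (costFunction(Xt, yt, th + ej, type = 'log') -
     costFunction(Xt, yt, th - ej, type = 'log')) / (2 * eps)
})
cbind(numeric = num, analytic = gradientFunction(Xt, yt, th, type = 'log'))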

hessian matrix

\[ H= \frac{1}{m}\left[\sum\limits_{i=1}^mh_{\theta}(x^{(i)})(1-h_{\theta}(x^{(i)}))x^{(i)}\left(x^{(i)}\right)^T\right] + \frac{\lambda}{m} \begin{bmatrix} 0&&&\\ &1&&\\ &&\ddots&\\ &&&1 \end{bmatrix} \]

hessian <- function (X, y, theta, lambda = 0) {
  m <- dim(X)[1]
  n <- dim(X)[2]
  L <- lambda / m * diag(n)
  L[1, 1] <- 0
  h <- sigmoid(X %*% theta)
  # weight each row of X by h * (1 - h), i.e. X' diag(h(1-h)) X
  return(1 / m * t(X) %*% (as.vector(h * (1 - h)) * X) + L)
}
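
The Hessian can be verified the same way, as the numerical Jacobian of the gradient (again on made-up toy data).

# numerical Jacobian of the gradient vs. the analytic Hessian (lambda = 0)
Xt  <- cbind(1, matrix(rnorm(40), 20, 2))
yt  <- sample(0:1, 20, replace = TRUE)
th  <- c(0.1, -0.2, 0.3)
eps <- 1e-6
numH <- sapply(1:3, function(j){
  ej <- replace(rep(0, 3), j, eps)
  (gradientFunction(Xt, yt, th + ej, type = 'log') -
     gradientFunction(Xt, yt, th - ej, type = 'log')) / (2 * eps)
})
max(abs(numH - hessian(Xt, yt, th)))  # should be close to 0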

compute theta iteratively (gradient descent for linear regression, Newton's method for logistic regression)

\[ \mbox{Linear Regression}\\ \theta:= \theta-\alpha\nabla_{\theta}J \] \[ \mbox{Logistic Regression}\\ \theta:=\theta-H^{-1}\nabla_{\theta}J \]

computeTheta <- function(X, y, theta, alpha = 0, lambda = 0, num_iters, 
                         type = c('lin', 'log')){
  n <- length(theta)
  theta_vals <- theta
  J_vals <- rep(0, num_iters)
  if(type == 'lin'){
    for (i in 1:num_iters){
      theta_vals <- rbind(theta_vals, t(theta))
      J_vals[i] <- costFunction(X, y, theta, lambda, type = type)
      theta <- theta - alpha * gradientFunction(X, y, theta, lambda, type = type)
    }
  }else if(type == 'log'){
    for (i in 1:num_iters) {
      theta_vals <- rbind(theta_vals, t(theta))
      J_vals[i] <- costFunction(X, y, theta, lambda, type = type)
      theta <- theta - 
        solve(hessian(X, y, theta, lambda)) %*% 
        gradientFunction(X, y, theta, lambda, type = type)
    }
  }
  rownames(theta) <- paste('theta', 0:(n - 1), sep = '')
  return(list(theta = theta, theta_vals = theta_vals, J_vals = J_vals)) 
}
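
A minimal usage sketch on simulated data (xs, ys and fit are illustrative names): gradient descent should approach the true coefficients (0.5, 2).

# gradient descent on simulated linear data; theta should end up near (0.5, 2)
xs  <- runif(50)
ys  <- 0.5 + 2 * xs + rnorm(50, sd = 0.05)
fit <- computeTheta(cbind(1, xs), ys, theta = c(0, 0), alpha = 0.1,
                    lambda = 0, num_iters = 2000, type = 'lin')
fit$theta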

one vs all

oneVsAll <- function(X, y, lambda = 0, num_iters, num_labels, method = c('matrix', 'optim')){
  n <- dim(X)[2]
  alpha <- 0
  type <- 'log'
  for (c in 1:num_labels){
    theta_ini <- rep(0, n)
    if(method == 'matrix')
      theta <- computeTheta(X, (y == c), theta_ini, alpha, lambda, num_iters, type)$theta
    else
      theta <- optim(theta_ini, X = X, y = (y == c), lambda = lambda, type = type,
                 fn = costFunction, gr = gradientFunction,
                 method = "BFGS", control = list(maxit = num_iters))$par
    if(c == 1) all_theta <- t(theta)
    else all_theta <- rbind(all_theta, t(theta))
  }
  return(all_theta)
}

one vs all predict

predictOneVsAll <- function(all_theta, X){
  ps <- sigmoid(X %*% t(all_theta))
  return(apply(ps, 1, which.max))
}

display image

displayData <- function(X, example_width = round(sqrt(dim(X)[2])))  {
  if (is.vector(X))
    X <- t(X)
  m <- dim(X)[1]
  n <- dim(X)[2]
  example_height <- (n / example_width) #20
  display_rows <- floor(sqrt(m)) #10
  display_cols <- ceiling(m / display_rows) #10
  pad <- 1
  display_array <-
    matrix(0, pad + display_rows * (example_height + pad),
           pad + display_cols * (example_width + pad))
  curr_ex <- 1
  for (j in 1:display_rows) {
    for (i in 1:display_cols) {
      if (curr_ex > m)
        break
      max_val <- max(abs(X[curr_ex,]))
      display_array[pad + (j - 1) * (example_height + pad) + (1:example_height),
                    pad + (i - 1) * (example_width + pad) + (1:example_width)] <-
        matrix(X[curr_ex,], example_height, example_width) / max_val
      curr_ex <- curr_ex + 1
    }
    if (curr_ex > m)
      break
  }
  op <- par(bg = "gray")
  display_array <- t(apply(display_array,2,rev))
  image(
    z = display_array,col = gray.colors(100), xaxt = 'n',yaxt = 'n'
  )
  grid(
    nx = display_cols,display_rows,col = 'black',lwd = 2,lty = 1
  )
  box()
  par(op)
}

neural network cost function

\[ h_{\Theta}(x)\in R^K\\ (h_{\Theta}(x))_i=i^{\text{th}}\text{ output}\\ \begin{align} J(\Theta)&= -\dfrac{1}{m}\left[\sum\limits_{i=1}^m\sum\limits_{k=1}^Ky_k^{(i)}\log (h_{\Theta}(x^{(i)}))_k+(1-y_k^{(i)})\log (1-(h_{\Theta}(x^{(i)}))_k)\right]\\ &+\dfrac{\lambda}{2m}\sum\limits_{l=1}^{L-1}\sum\limits_{i=1}^{s_l}\sum\limits_{j=1}^{s_{l+1}}(\Theta_{ji}^{(l)})^2 \end{align} \]

nnCostFunction  <-
  function(input_layer_size, hidden_layer_size, num_labels,X, y, lambda) {
    function(nn_params) {
      m <- dim(X)[1]
      Theta1 <-
        matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
               hidden_layer_size, (input_layer_size + 1))
      Theta2 <-
        matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
               num_labels, (hidden_layer_size + 1))
      I <- diag(num_labels)
      Y <- matrix(0, m, num_labels)
      for (i in 1:m)
        Y[i,] <- I[y[i],]
      a1 <- cbind(rep(1,m),X)
      z2 <- a1 %*% t(Theta1)
      a2 <- cbind(rep(1,dim(z2)[1]), sigmoid(z2))
      z3 <- a2 %*% t(Theta2)
      a3 <- sigmoid(z3)
      h <- a3
      p <- sum(Theta1[,-1] ^ 2) + sum(Theta2[,-1] ^ 2)
      return(sum((-Y) * log(h) - (1 - Y) * log(1 - h)) / m + lambda * p / (2 * m))
    }
  }

neural network gradient function

nnGradFunction  <-
  function(input_layer_size, hidden_layer_size, num_labels,
           X, y, lambda) {
    function(nn_params) {
      m <- dim(X)[1]
      Theta1 <-
        matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
               hidden_layer_size, (input_layer_size + 1))
      Theta2 <-
        matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
               num_labels, (hidden_layer_size + 1))
      I <- diag(num_labels)
      Y <- matrix(0, m, num_labels)
      for (i in 1:m)
        Y[i,] <- I[y[i],]
      a1 <- cbind(rep(1,m),X)
      z2 <- a1 %*% t(Theta1)
      a2 <- cbind(rep(1,dim(z2)[1]), sigmoid(z2))
      z3 <- a2 %*% t(Theta2)
      a3 <- sigmoid(z3)
      h <- a3
      sigma3 <- h - Y
      sigma2 <-
        (sigma3 %*% Theta2) * sigmoidGradient(cbind(rep(1,dim(z2)[1]),z2))
      sigma2 <- sigma2[,-1]
      delta_1 <- (t(sigma2) %*% a1)
      delta_2 <- (t(sigma3) %*% a2)
      p1 <- (lambda / m) * cbind(rep(0,dim(Theta1)[1]), Theta1[,-1])
      p2 <- (lambda / m) * cbind(rep(0,dim(Theta2)[1]), Theta2[,-1])
      Theta1_grad <- delta_1 / m + p1
      Theta2_grad <- delta_2 / m + p2
      grad <-  c(c(Theta1_grad), c(Theta2_grad))
      return(grad)
    }
  }

randomly initialize the weights of a layer

randInitializeWeights <- function(L_in, L_out) {
  epsilon_init <- 0.12
  rnd <- runif(L_out * (1 + L_in))
  rnd <- matrix(rnd,L_out,1 + L_in)
  W <- rnd * 2 * epsilon_init - epsilon_init
  W
}
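
For reference (the arguments below are arbitrary), the returned matrix is L_out x (L_in + 1) with entries in (-epsilon_init, epsilon_init).

# 5 x 4 matrix of small random weights, all within (-0.12, 0.12)
W <- randInitializeWeights(3, 5)
dim(W)
range(W)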

neural network predict

predict <- function(Theta1, Theta2, X) {
  if (is.vector(X))
    X <- t(X)
  m <- dim(X)[1]
  num_labels <- dim(Theta2)[1]
  p <- rep(0,dim(X)[1])
  h1 <- sigmoid(cbind(rep(1,m),X) %*% t(Theta1))
  h2 <- sigmoid(cbind(rep(1,m),h1) %*% t(Theta2))
  p <- apply(h2,1,which.max)
  p
}

debugInitializeWeights

debugInitializeWeights  <- function(fan_out, fan_in) {
  W <- matrix(0,fan_out,1 + fan_in)
  W <- matrix(sin(1:length(W)), dim(W)[1],dim(W)[2]) / 10
  W
}

computeNumericalGradient

computeNumericalGradient <- function(J, theta) {
  numgrad <- rep(0,length(theta))
  perturb <- rep(0,length(theta))
  e <- 1e-4
  for (p in 1:length(theta)) {
    perturb[p] <- e
    loss1 <- J(theta - perturb)
    loss2 <- J(theta + perturb)
    numgrad[p] <- (loss2 - loss1) / (2 * e)
    perturb[p] <- 0
  }
  numgrad
}
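
A spot check on a function with a known gradient (the function and point below are made up): for f(theta) = sum(theta^2) the exact gradient is 2 * theta.

# numerical gradient of f(theta) = sum(theta^2) vs. the exact gradient 2 * theta
th <- c(1, -2, 3)
cbind(numeric = computeNumericalGradient(function(t) sum(t ^ 2), th),
      exact   = 2 * th)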

checkNNGradients

checkNNGradients <- function (lambda = 0) {
  input_layer_size <- 3
  hidden_layer_size <- 5
  num_labels <- 3
  m <- 5
  Theta1 <- debugInitializeWeights(hidden_layer_size, input_layer_size)
  Theta2 <- debugInitializeWeights(num_labels, hidden_layer_size)
  X  <- debugInitializeWeights(m, input_layer_size - 1)
  y  <- 1 + t(1:m %% num_labels)
  nn_params <- c(Theta1,Theta2)
  costFunc <- nnCostFunction(input_layer_size, hidden_layer_size,
                             num_labels, X, y, lambda)
  cost <- costFunc(nn_params)
  grad <- nnGradFunction(input_layer_size, hidden_layer_size,
                         num_labels, X, y, lambda)(nn_params)
  numgrad <- computeNumericalGradient(costFunc, nn_params)
  print(cbind(numgrad, grad))
  cat(
    sprintf(
      'The above two columns you get should be very similar.
      (Left-Your Numerical Gradient, Right-Analytical Gradient)\n\n'
    )
    )
  diff <-
    norm(as.matrix(numgrad - grad)) / norm(as.matrix(numgrad + grad))
  cat(
    sprintf(
      'If your backpropagation implementation is correct, then
      the relative difference will be small (less than 1e-9).
      Relative Difference: %g', diff
    )
    )
}

lbfgsb3_ (typically faster than optim)

lbfgsb3_ <- function (prm, fn, gr = NULL, lower = -Inf, upper = Inf, control = list(), 
          ...)
{
  tasklist <- c("NEW_X", "START", "STOP", "FG", "ABNORMAL_TERMINATION_IN_LNSRCH", 
                "CONVERGENCE", "CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL", 
                "CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH", "ERROR: FTOL .LT. ZERO", 
                "ERROR: GTOL .LT. ZERO", "ERROR: INITIAL G .GE. ZERO", 
                "ERROR: INVALID NBD", "ERROR: N .LE. 0", "ERROR: NO FEASIBLE SOLUTION", 
                "ERROR: STP .GT. STPMAX", "ERROR: STP .LT. STPMIN", 
                "ERROR: STPMAX .LT. STPMIN", "ERROR: STPMIN .LT. ZERO", 
                "ERROR: XTOL .LT. ZERO", "FG_LNSRCH", "FG_START", "RESTART_FROM_LNSRCH", 
                "WARNING: ROUNDING ERRORS PREVENT PROGRESS", "WARNING: STP .eq. STPMAX", 
                "WARNING: STP .eq. STPMIN", "WARNING: XTOL TEST SATISFIED")
  ctrl <- list(maxit = 100, trace = 0, iprint = 0L)
  namc <- names(control)
  if (!all(namc %in% names(ctrl))) 
    stop("unknown names in control: ", namc[!(namc %in% 
                                                names(ctrl))])
  ctrl[namc] <- control
  iprint <- as.integer(ctrl$iprint)
  factr <- 1e+07
  pgtol <- 1e-05
  nmax <- 26260
  mmax <- 17L
  if (length(prm) > nmax) 
    stop("The number of parameters cannot exceed ", nmax)
  n <- as.integer(length(prm))
  m <- 5L
  nbd <- rep(2L, n)
  nwa <- 2 * mmax * nmax + 5 * nmax + 11 * mmax * mmax + 8 * 
    mmax
  wa <- rep(0, nwa)
  dsave <- rep(0, 29)
  lsave <- rep(TRUE, 4)
  isave <- rep(0L, 44)
  iwa <- rep(0L, 3 * nmax)
  csave <- ""
  if (length(lower) == 1) 
    lower <- rep(lower, n)
  if (length(upper) == 1) 
    upper <- rep(upper, n)
  bigval <- .Machine$double.xmax/10
  for (i in 1:n) {
    if (is.finite(lower[i])) {
      if (is.finite(upper[i])) 
        nbd[i] <- 2
      else {
        nbd[i] <- 1
        upper[i] <- bigval
      }
    }
    else {
      if (is.finite(upper[i])) {
        nbd[i] <- 3
        lower[i] <- -bigval
      }
      else {
        nbd[i] <- 0
        upper[i] <- bigval
        lower[i] <- -bigval
      }
    }
  }
  itask <- 2L
  task <- tasklist[itask]
  f <- .Machine$double.xmax/100
  g <- rep(f, n)
  icsave <- 0
  repeat {
    if (isave[34] > ctrl$maxit )
      break
    
    if (ctrl$trace >= 2) {
      cat("Before call, f=", f, "  task number ", itask, 
          " ")
      print(task)
    }
    possibleError <- tryCatch(
    result <- .Fortran("lbfgsb3", n = as.integer(n), m = as.integer(m), 
                       x = as.double(prm), l = as.double(lower), u = as.double(upper), 
                       nbd = as.integer(nbd), f = as.double(f), g = as.double(g), 
                       factr = as.double(factr), pgtol = as.double(pgtol), 
                       wa = as.double(wa), iwa = as.integer(iwa), itask = as.integer(itask), 
                       iprint = as.integer(iprint), icsave = as.integer(icsave), 
                       lsave = as.logical(lsave), isave = as.integer(isave), 
                       dsave = as.double(dsave))
    , error = function(e) e)
    
    if(inherits(possibleError, "error"))
      break
    
    
    itask <- result$itask
    icsave <- result$icsave
    prm <- result$x
    g <- result$g
    iwa <- result$iwa
    wa <- result$wa
    nbd <- result$nbd
    lsave <- result$lsave
    isave <- result$isave
    dsave <- result$dsave
    if (ctrl$trace > 2) {
      cat("returned from lbfgsb3\n")
      cat("returned itask is ", itask, "\n")
      task <- tasklist[itask]
      cat("changed task to ", task, "\n")
    }
    if (itask %in% c(4L, 20L, 21L)) {
      if (ctrl$trace >= 2) {
        cat("computing f and g at prm=")
        print(prm)
      }
      f <- fn(prm, ...)
      if (is.null(gr)) {
        g <- grad(fn, prm, ...)
      }
      else {
        g <- gr(prm, ...)
      }
      if (ctrl$trace > 0) {
        cat("At iteration ", isave[34], " f =", f)
        if (ctrl$trace > 1) {
          cat("max(abs(g))=", max(abs(g)))
        }
        cat("\n")
      }
    }
    else {
      if (itask == 1L) {
      }
      else break
    }
  }
  info <- list(task = task, itask = itask, lsave = lsave, 
               icsave = icsave, dsave = dsave, isave = isave)
  list(prm = prm, f = f, g = g, info = info)
}

Simple Linear Regression

calculate theta

path = 'data/ex1data1.txt'
data <- read.csv(path, header = FALSE)
data
X <- t(t(data[, 1:(dim(data)[2] - 1)]))
y <- data[, dim(data)[2]]
X_std <- scale(X)
y_std <- scale(y)
X <- cbind(rep(1, dim(X)[1]), X)
X_std <- cbind(rep(1, dim(X_std)[1]), X_std)
# normal equation
theta <- normalEquation(X, y)
theta_std <- normalEquation(X_std, y_std)
# gradient descent
theta_ini <- c(5, -3)
alpha <- 0.01
lambda <- 0
num_iters <- 1000
result <- computeTheta(X, y, theta_ini, alpha, lambda, num_iters, type = 'lin')
result_std <- computeTheta(X_std, y_std, theta_ini, alpha, lambda, num_iters, type = 'lin')
# lm model
theta_glm <- glm(V2~., data = data)$coefficients
data.frame(theta_normal = theta,
           theta_grad = result$theta,
           theta_glm = theta_glm,
           theta_std = theta_std,
           theta_grad_std = result_std$theta) %>% showdf()
         theta_normal  theta_grad  theta_glm  theta_std  theta_grad_std
theta0        -3.8958     -2.3775    -3.8958     0.0000          0.0002
theta1         1.1930      1.0405     1.1930     0.8379          0.8377

scatter plot and regression line

plot_x <- seq(min(X[, 2]), max(X[, 2]), length=100)
plot_y <- cbind(rep(1, length(plot_x)), plot_x) %*% c(theta[1],theta[2])
plot_x_std <- seq(min(X_std[, 2]), max(X_std[, 2]), length=100)
plot_y_std <- cbind(rep(1, length(plot_x_std)), plot_x_std) %*% c(theta_std[1],theta_std[2])
plot1 <- ggplot() +
  geom_point(aes(x = X[, 2], y = y), size = 2) +
  geom_path(aes(x = plot_x, y = plot_y), color = 'red') +
  labs(x = 'x', y = 'y')
plot2 <- ggplot() +
  geom_point(aes(x = X_std[, 2], y = y_std), size = 2) +
  geom_path(aes(x = plot_x_std, y = plot_y_std), color = 'red') +
  labs(x = 'normalized x', y = 'normalized y')
grid.arrange(plot1, plot2, ncol = 2)

J_vals: unnormalized vs. normalized data

plot1 <- ggplot() +
  geom_line(aes(x = 1:num_iters, y = result$J_vals)) +
  labs(x = 'iteration', y = 'J_val (unnormalized)')
plot2 <- ggplot() +
  geom_line(aes(x = 1:num_iters, y = result_std$J_vals)) +
  labs(x = 'iteration', y = 'J_val (normalized)')
grid.arrange(plot1, plot2, ncol = 2)

J_vals with different alpha

num_iters <- 5
for(i in c(0.01, 0.02, 0.03)){
  assign(paste('result', i, sep = ''),
         computeTheta(X, y, theta = c(0, 0),
                       alpha = i, lambda = 0, num_iters = num_iters, type = 'lin'))
}

ggplot

ggplot() +
  geom_line(aes(x = 1:num_iters, y = result0.01$J_vals, colour = '0.01')) +
  geom_line(aes(x = 1:num_iters, y = result0.02$J_vals, colour = '0.02')) +
  geom_line(aes(x = 1:num_iters, y = result0.03$J_vals, colour = '0.03')) +
  labs(colour = 'alpha', x = 'iteration', y = 'J_val')

J_vals surface plot

length <- 100
theta0_vals <- seq(-10, 10, length = length)
theta1_vals <- seq(-4, 4, length = length)
J_vals <- J_vals_std <- matrix(0, length, length)
for (i in 1:length){
  for (j in 1:length){
    t <- c(theta0_vals[i], theta1_vals[j])
    J_vals[i, j] <- costFunction(X, y, theta = t, lambda = 0, type = 'lin')
    J_vals_std[i, j] <- costFunction(X_std, y_std, theta = t, lambda = 0, type = 'lin')
  }
}

plotly

plot_ly(x = theta0_vals, y = theta1_vals, z = J_vals) %>%
  add_surface() %>%
  layout(title = "",
         scene = list(xaxis = list(title = "Theta0"),
                      yaxis = list(title = "Theta1"),
                      zaxis = list(title = "J_val")))
plot_ly(x = theta0_vals, y = theta1_vals, z = J_vals_std) %>%
  add_surface() %>%
  layout(title = "",
         scene = list(xaxis = list(title = "Theta0"),
                      yaxis = list(title = "Theta1"),
                      zaxis = list(title = "J_val")))

J_vals convergence path

temp <- data.frame(x = rep(theta0_vals, each = length),
                   y = rep(theta1_vals, times = length))
contour <- cbind(temp, z = as.numeric(log(t(J_vals))))
contour_std <- cbind(temp, z = as.numeric(log(t(J_vals_std))))

ggplot2

plot1 <- ggplot()+
  geom_raster(data = contour, aes(x = x, y = y,fill = z), show.legend = FALSE) +
  geom_contour(data = contour, aes(x = x, y = y, z = z, colour = ..level..),
               colour = 'white', size = 0.5) +
  scale_fill_gradientn(colors = brewer.pal(3, "PuBuGn")) +
  geom_path(aes(x = result$theta_vals[, 1],
                y = result$theta_vals[, 2]),
            size = 1, colour = 'orange', alpha = 0.75) +
  geom_point(aes(x = result$theta_vals[1, 1],
                 y = result$theta_vals[1, 2]),
             size = 3, colour = 'blue',  alpha = 0.75) +
  geom_point(aes(x = result$theta[1],
                 y = result$theta[2]),
             size=3, colour = 'red', alpha = 0.75, shape = 4) +
  labs(fill="J_val", title = '', x = 'Theta0', y = 'Theta1')
plot1 <- direct.label(plot1, "bottom.pieces")
plot2<-ggplot()+
  geom_raster(data = contour_std, aes(x = x, y = y,fill = z), show.legend = FALSE) +
  geom_contour(data = contour_std, aes(x = x, y = y, z = z, colour = ..level..),
               colour = 'white', size = 0.5) +
  scale_fill_gradientn(colors = brewer.pal(3, "PuBuGn")) +
  geom_path(aes(x = result_std$theta_vals[, 1],
                y = result_std$theta_vals[, 2]),
            size = 1, colour = 'orange', alpha = 0.75) +
  geom_point(aes(x = result_std$theta_vals[1, 1],
                 y = result_std$theta_vals[1, 2]),
             size = 3, colour = 'blue',  alpha = 0.75) +
  geom_point(aes(x = result_std$theta[1],
                 y = result_std$theta[2]),
             size=3, colour = 'red', alpha = 0.75, shape = 4) +
  labs(fill="J_val", title = '', x = 'Theta0', y = 'Theta1')
plot2 <- direct.label(plot2, "bottom.pieces")
grid.arrange(plot1, plot2, ncol = 2)

plotly

plot1 <-
  plot_ly(x = theta0_vals, y = theta1_vals, z = log(t(J_vals)),
        type = "contour", contours = list(showlabels = TRUE), showscale = FALSE) %>%
  add_trace(x = result$theta_vals[1, 1],
            y = result$theta_vals[1, 2],
            mode = 'markers', type = 'scatter',
            marker = list(symbol = 'o', color = 'blue', size = 10)) %>%
  add_trace(x = result$theta_vals[, 1],
            y = result$theta_vals[, 2],
            mode = "lines", type = 'scatter', color = 'white') %>%
  add_trace(x = result$theta[1],
            y = result$theta[2],
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'x', color = 'red', size = 10)) %>%
  layout(title = '',
         xaxis = list(title = "Theta0", titlefont = f),
         yaxis = list(title = "Theta1", titlefont = f),
         showlegend = FALSE)
plot2 <-
  plot_ly(x = theta0_vals, y = theta1_vals, z = log(t(J_vals_std)),
        type = "contour", contours = list(showlabels = TRUE), showscale = FALSE) %>%
  add_trace(x = result_std$theta_vals[1, 1],
            y = result_std$theta_vals[1, 2],
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'o', color = 'blue', size = 10)) %>%
  add_trace(x = result_std$theta_vals[, 1],
            y = result_std$theta_vals[, 2],
            mode = "lines", type = 'scatter') %>%
  add_trace(x = result_std$theta[1],
            y = result_std$theta[2],
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'x', color = 'red', size = 10)) %>%
  layout(title = '',
         xaxis = list(title = "Theta0", titlefont = f),
         yaxis = list(title = "Theta1", titlefont = f),
         showlegend = FALSE)
subplot(plot1, plot2)

Multiple Linear Regression

calculate theta

path = "data/ex1data2.txt"
data <- read.csv(path, header = FALSE)
data
X <- cbind(rep(1, dim(data)[1]), scale(data[, 1:(dim(data)[2] - 1)]))
y <- data[, dim(data)[2]]
# normal equation
theta <- normalEquation(X, y)
# gradient descent
theta_ini <- rep(0, dim(X)[2])
lambda <- 0
num_iters <- 50
for (i in c(0.3, 0.1, 0.03, 0.01)){
  assign(paste('result', i, sep = ''),
         computeTheta(X, y, theta_ini, alpha = i, lambda, num_iters, type = 'lin'))
}
# lm model
theta_glm <- glm(V3~., data = data)$coefficients
data.frame(theta_normal = theta,
           theta_glm = theta_glm,
           theta_0.3 = result0.3$theta,
           theta_0.1 = result0.1$theta,
           theta_0.03 = result0.03$theta,
           theta_0.01 = result0.01$theta) %>% showdf()
        theta_normal   theta_glm   theta_0.3    theta_0.1  theta_0.03  theta_0.01
theta0    340412.660  89597.9095  340412.654  338658.2492   266180.45   134460.94
theta1    110631.050    139.2107  110572.962  104127.5156    75037.93    39283.23
theta2     -6649.474  -8738.0191   -6591.386    -172.2053    18970.04    16518.64

Note that theta_glm is fitted on the raw (unscaled) predictors, while the other columns use the standardized X, so its coefficients are not directly comparable.

J_vals with different alpha

ggplot() +
  geom_line(aes(x = 1:num_iters, y = result0.3$J_vals, colour = '0.3'),
            size = 1, alpha = 0.75, show.legend = TRUE) +
  geom_line(aes(x = 1:num_iters, y = result0.1$J_vals, colour = '0.1'),
            size = 1, alpha = 0.75, show.legend = TRUE) +
  geom_line(aes(x = 1:num_iters, y = result0.03$J_vals, colour = '0.03'),
            size = 1, alpha = 0.75, show.legend = TRUE) +
  geom_line(aes(x = 1:num_iters, y = result0.01$J_vals, colour = '0.01'),
            size = 1, alpha = 0.75, show.legend = TRUE) +
  labs(colour = 'alpha', title = 'Normalized X', x = 'iteration', y = 'J_val')

Logistic Regression

calculate theta

path <- "data/ex2data1.txt"
data <- read.csv(path, header = FALSE)
data
X <- mapFeature(data[, 1:(dim(data)[2] - 1)])
y <- data[, dim(data)[2]]
data <- data.frame(X, y)
# Newton's method (computeTheta with type = 'log')
theta <- rep(0, dim(X)[2])
alpha <- 0
lambda <- 0
num_iters <- c(10, 100, 1000)
for(i in num_iters){
  assign(paste('result', i, sep = ''),
         computeTheta(X, y, theta, alpha, lambda, num_iters = i, type = 'log'))
}
# use optim (in place of Octave's fminunc) to find the optimal theta
theta_opt <- optim(par = theta, X = X, y = y, lambda = lambda, type = 'log',
                   fn = costFunction, gr = gradientFunction,
                   method = "Nelder-Mead", control = list(maxit = 1000))$par
# use glm
theta_glm <- glm(y~., data = data[, -1], family = "binomial")$coefficients
theta <- data.frame(theta_10 = result10$theta,
                    theta_100 = result100$theta,
                    theta_1000 = result1000$theta,
                    theta_optim = theta_opt,
                    theta_glm = theta_glm)
theta %>% showdf()
        theta_10  theta_100  theta_1000  theta_optim  theta_glm
theta0  -17.3257   -25.1508    -25.1613     -25.1647   -25.1613
theta1    0.1427     0.2061      0.2062       0.2063     0.2062
theta2    0.1373     0.2014      0.2015       0.2015     0.2015
acc <- rep(0, 5)
for(i in 1:5){
  acc[i] <- paste(mean((sigmoid(as.matrix(data[, 1:(dim(data)[2] - 1)]) %*% theta[, i]) >= 0.5) == data[, dim(data)[2]]) * 100, '%', sep = '')
}
acc <- data.frame(theta_10 = acc[1],
                  theta_100 = acc[2],
                  theta_1000 = acc[3],
                  theta_optim = acc[4],
                  theta_glm = acc[5])
rownames(acc) <- 'Accuracy'
acc %>% showdf()
          theta_10  theta_100  theta_1000  theta_optim  theta_glm
Accuracy       89%        89%         89%          89%        89%

ggplot

plot_gg <- ggplot() +
  geom_point(data = data, aes(x1, x2, colour = factor(y), shape = factor(y)),
             size = 3, alpha = 0.75) +
  scale_shape_manual(values = c(4, 16),
                     guide = guide_legend(reverse = TRUE),
                     name = '',
                     breaks = c("0", "1"),
                     labels = c("Not Admitted", "Admitted")) +
  scale_colour_manual(values = c('red','blue'),
                      guide = guide_legend(reverse = TRUE),
                      name = '',
                      breaks = c("0", "1"),
                      labels = c("Not Admitted", "Admitted")) +
  labs(title = '', x = 'Exam 1 score', y = 'Exam 2 score') +
  theme(legend.position="bottom")
plot_gg

plotly

trace1 <- data[data[, 'y'] == 0,]
trace2 <- data[data[, 'y'] == 1,]
plot_pl <- plot_ly() %>%
  add_trace(data = trace2, x = ~x1, y = ~x2, name = 'Admitted',
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'o', color = 'blue', size = 10),
            showlegend = TRUE) %>%
  add_trace(data = trace1, x = ~x1, y = ~x2, name = 'Not Admitted',
                 mode = "markers", type = 'scatter',
                 marker = list(symbol = 'x', color = 'red', size = 10),
            showlegend = TRUE) %>%
  layout(title = '',
         xaxis = list(title = 'exam1 score',
                      range = c(min(data$x1) - 2, max(data$x1) + 2),
                      titlefont = f),
         yaxis = list(title = 'exam2 score',
                      range = c(min(data$x2) - 2, max(data$x2) + 2),
                      titlefont = f))
plot_pl

J_vals with different numbers of iterations

ggplot

plot1 <- ggplot() + geom_line(aes(x = 1:num_iters[1], y = result10$J_vals)) +
  labs(title = '', x = 'iteration', y = 'J_vals')
plot2 <- ggplot() + geom_line(aes(x = 1:num_iters[2], y = result100$J_vals)) +
  labs(title = '', x = 'iteration', y = 'J_vals')
plot3 <- ggplot() + geom_line(aes(x = 1:num_iters[3], y = result1000$J_vals)) +
  labs(title = '', x = 'iteration', y = 'J_vals')
grid.arrange(plot1, plot2, plot3, ncol = 3)

plotly

plot1 <- plot_ly(x = 1:num_iters[1], y = result10$J_vals, mode = 'lines', type = 'scatter', name = 'iter=10')
plot2 <- plot_ly(x = 1:num_iters[2], y = result100$J_vals, mode = 'lines', type = 'scatter', name = 'iter=100')
plot3 <- plot_ly(x = 1:num_iters[3], y = result1000$J_vals, mode = 'lines', type = 'scatter', name = 'iter=1000')
subplot(plot1, plot2, plot3) %>%
  layout(yaxis = list(title = "J_val",
                      titlefont = f),
         legend = list(orientation = 'h'))

decision boundary

\[ \theta_0+\theta_1x_1+\theta_2x_2=0\\ \Rightarrow x_2 =\dfrac{-1}{\theta_2}\times[\theta_1x_1+\theta_0] \]

theta <- result1000$theta
plot_x = c(min(X[, 2]), max(X[, 2]))
plot_y = -1 / theta[3] * (theta[2] * plot_x + theta[1])
trace3 <- data.frame(x = plot_x, y = plot_y)

ggplot

plot1 <-
  plot_gg + geom_path(data = trace3, aes(x = x, y = y),
                     colour = 'orange', size = 1, alpha = 0.75) +
  labs(title = 'Decision Boundary') +
  coord_cartesian(xlim = c(min(data$x1), max(data$x1)),
                  ylim = c(min(data$x2), max(data$x2))) +
  theme(legend.position = 'none')
y_pre <- sigmoid(as.matrix(data[, 1:(dim(data)[2] - 1)]) %*% theta) >= 0.5
data_pre <- cbind(data, y_pre)
plot2 <-
  ggplot() +
  geom_point(data = data_pre, aes(x1, x2, colour = factor(y_pre), shape = factor(y_pre)),
             size = 3, alpha = 0.75) +
  scale_shape_manual(values = c(4, 16)) +
  scale_colour_manual(values = c('red','blue')) +
  geom_path(data = trace3, aes(x = x, y = y),
                     colour = 'orange', size = 1, alpha = 0.75) +
  labs(title = 'Decision Boundary (predicted)', x = 'Exam 1 score', y = 'Exam 2 score') +
  coord_cartesian(xlim = c(min(data$x1), max(data$x1)),
                  ylim = c(min(data$x2), max(data$x2))) +
  theme(legend.position = 'none')
grid.arrange(plot1, plot2, ncol = 2)

plotly

plot_ly() %>%
  add_trace(data = trace2, x = ~x1, y = ~x2, name = 'Admitted',
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'o', color = 'blue', size = 10), showlegend = TRUE) %>%
  add_trace(data = trace1, x = ~x1, y = ~x2, name = 'Not Admitted',
                 mode = "markers", type = 'scatter',
                 marker = list(symbol = 'x', color = 'red', size = 10), showlegend = TRUE) %>%
  add_trace(data = trace3, x = ~x, y = ~y, mode = 'lines', name = 'Decision Boundary') %>%
  layout(xaxis = list(title = 'exam1 score', titlefont = f,
                      range = c(min(data$x1) - 2, max(data$x1 + 2))),
         yaxis = list(title = 'exam2 score', titlefont = f,
                      range = c(min(data$x2) - 2, max(data$x2 + 2))))

Regularized Logistic Regression

calculate theta

path = "data/ex2data2.txt"
data <- read.csv(path, header = FALSE)
data
X <- mapFeature(data[, 1:(dim(data)[2] - 1)], 6)
y <- data[, dim(data)[2]]
data <- data.frame(X, y)
# Newton's method (type = 'log'), set lambda = 0, 1, 100
theta <- rep(0, dim(X)[2])
lambda <- c(0, 1, 100)
num_iters <- 100
alpha <- 0  # unused by the Newton update but required by computeTheta's signature
j <- 1
for(i in lambda){
  assign(paste('result', j, sep = ''),
         computeTheta(X, y, theta, alpha, lambda = i, num_iters, type = 'log'))
  j <- j + 1
}
# optim, set lambda = 0, 1, 100
j <- 4
for(i in lambda){
  assign(paste('result', j, sep = ''),
         optim(par = theta, X = X, y = y, lambda = i, type = 'log',
                    fn = costFunction, gr = gradientFunction,
                    method = "BFGS"))
  j <- j + 1
}
# glm
result7 <- glm(y~., data = data[, -1], family = "binomial")
theta <- data.frame(theta_0 = result1$theta,
                    theta_1 = result2$theta,
                    theta_100 = result3$theta,
                    theta_opt0 = result4$par,
                    theta_opt1 = result5$par,
                    theta_opt100 = result6$par,
                    theta_glm = result7$coefficients)
theta[1:5, ] %>% showdf()
          theta_0  theta_1  theta_100  theta_opt0  theta_opt1  theta_opt100  theta_glm
theta0     7.1125   1.2727     0.0219      3.9956      1.2727        0.0219    38.2308
theta1    10.8285   0.6253    -0.0175      0.9643      0.6256       -0.0175    55.5958
theta2    16.1217   1.1811     0.0057      5.3991      1.1809        0.0057    98.1467
theta3   -60.5132  -2.0200    -0.0552     -6.1098     -2.0192       -0.0552  -369.4302
theta4   -11.4413  -0.9174    -0.0131     -8.2302     -0.9176       -0.0131  -177.1204
acc <- rep(0, 7)
for(i in 1:7){
  acc[i] <- paste(round(mean((sigmoid(as.matrix(data[, 1:(dim(data)[2] - 1)]) %*% theta[, i]) >= 0.5) == data[, dim(data)[2]]) * 100, 2), '%', sep = '')
}
acc <- data.frame(theta_0 = acc[1],
                  theta_1 = acc[2],
                  theta_100 = acc[3],
                  theta_opt0 = acc[4],
                  theta_opt1 = acc[5],
                  theta_opt100 = acc[6],
                  theta_glm = acc[7])
rownames(acc) <- 'Accuracy'
acc %>% showdf()
          theta_0  theta_1  theta_100  theta_opt0  theta_opt1  theta_opt100  theta_glm
Accuracy   81.36%   83.05%     61.02%       83.9%      83.05%        61.02%     88.98%

ggplot

plot_gg <- ggplot() +
  geom_point(aes(x = data[data$y == 1, ]$x1, y = data[data$y == 1, ]$x2,
                 colour = 'Accepted', shape = 'Accepted')) +
  geom_point(aes(x = data[data$y == 0, ]$x1, y = data[data$y == 0, ]$x2,
                 colour = 'Rejected', shape = 'Rejected')) +
  scale_shape_manual(values = c(16, 4)) +
  labs(title = '', x = 'Microchip Test 1', y = 'Microchip Test 2', colour = '', shape = '')
plot_gg

plotly

trace1 <- data[data[, 'y'] == 1,]
trace2 <- data[data[, 'y'] == 0,]
plot_ly() %>%
  add_trace(data = trace1, x = ~x1, y = ~x2, name = 'Accepted',
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'o', color = 'blue', size = 10)) %>%
  add_trace(data = trace2, x = ~x1, y = ~x2, name = 'Rejected',
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'x', color = 'red', size = 10)) %>%
  layout(title = '',
         xaxis = list(title = 'Microchip Test 1',
                      titlefont = f),
         yaxis = list(title = 'Microchip Test 2',
                      titlefont = f))

J_vals with different lambda

ggplot

ggplot() +
  geom_line(aes(x = 1:num_iters, y = result1$J_vals, colour = '0')) +
  geom_line(aes(x = 1:num_iters, y = result2$J_vals, colour = '1')) +
  geom_line(aes(x = 1:num_iters, y = result3$J_vals, colour = '100')) +
  labs(title = '', x = 'iteration', y = 'J_vals', colour = 'lambda')

plotly

plot_ly() %>%
  add_trace(x = 1:num_iters, y = result1$J_vals, mode = 'lines', type = 'scatter', name = 'lambda=0') %>%
  add_trace(x = 1:num_iters, y = result2$J_vals, mode = 'lines', type = 'scatter', name = 'lambda=1') %>%
  add_trace(x = 1:num_iters, y = result3$J_vals, mode = 'lines', type = 'scatter', name = 'lambda=100') %>%
  layout(title = '',
         xaxis = list(title = "iteration",
                      titlefont = f),
         yaxis = list(title = "J_vals",
                      titlefont = f))

decision boundary

drawLength <- 50
u <- v <- seq(-1, 1.5, length = drawLength)
for(i in 1:7) assign(paste('z', i, sep=''), matrix(0, drawLength, drawLength))
z <- list(z1, z2, z3, z4, z5, z6, z7)
for(k in 1:7){
  for(i in 1:drawLength){
    for(j in 1:drawLength){
        z[[k]][j, i] <- mapFeature(cbind(u[i], v[j]), 6) %*% theta[, k]
    }
  }
}
coor<- data.frame(x = rep(u, each = drawLength),
                  y = rep(v, times = drawLength))
contour_list <- list(z1 = cbind(coor, z = c(z[[1]])),
                     z2 = cbind(coor, z = c(z[[2]])),
                     z3 = cbind(coor, z = c(z[[3]])),
                     z4 = cbind(coor, z = c(z[[4]])),
                     z5 = cbind(coor, z = c(z[[5]])),
                     z6 = cbind(coor, z = c(z[[6]])),
                     z7 = cbind(coor, z = c(z[[7]])))

ggplot

j <- 1
for(i in c('lambda=0', 'lambda=1', 'lambda=100',
           'optim lambda=0', 'optim lambda=1', 'optim lambda=100',
           'glm')){
  assign(paste('plot', j, sep = ''),
         plot_gg +
           geom_contour(data = contour_list[[j]], aes(x = x, y = y, z = z),
                        bins = 1, alpha = 0.75, size = 1) +
           labs(title = i) +
           coord_cartesian(xlim = c(min(data$x1), max(data$x1)),
                           ylim = c(min(data$x2), max(data$x2))) +
           theme(legend.position = "none"))
j <- j + 1
}
grid.arrange(plot1, plot2, plot3, plot4, plot5, plot6, plot7, ncol = 3)

plotly

plot_ly() %>%
  add_trace(data = trace1, x = ~x1, y = ~x2, name = 'Accepted',
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'o', color = 'blue', size = 10)) %>%
  add_trace(data = trace2, x = ~x1, y = ~x2, name = 'Rejected',
            mode = "markers", type = 'scatter',
            marker = list(symbol = 'x', color = 'red', size = 10)) %>%
  add_trace(x = u, y = v, z = z[[2]],
            type = "contour", showscale = FALSE,
            contours = list(coloring = 'lines',
                            showlabels = TRUE),
            line = list(width = 2)) %>%
  layout(title = 'Decision Boundary',
         xaxis = list(title = 'Microchip Test 1',
                      titlefont = f,
                      range = c(min(data$x1) - 0.1, max(data$x1) + 0.1)),
         yaxis = list(title = 'Microchip Test 2',
                      titlefont = f,
                      range = c(min(data$x2) - 0.1, max(data$x2) + 0.1)))

Multi-class Classification

path <- "data/ex3data1.mat"
data <- readMat(path)
X <- cbind(rep(1, dim(data$X)[1]), data$X)
y <- data$y

display image

displayData(X[sample(dim(X)[1])[1:100], -1])

calculate theta and accuracy

msg1 <- c('Use matrix\n')
for(i in c(0.1)){
  for(j in c(3:3)){
    all_theta <- oneVsAll(X, y, lambda = i, num_iters = j, num_labels = 10, method = 'matrix')
    pred <- predictOneVsAll(all_theta, X)
    accu <- round(mean(pred == y) * 100, 2)
    msg1 <- paste(msg1,
                  paste("Training Set Accuracy with num_iters=", j, "& lambda=", i, "is:", accu, "%\n"))
  }
}
msg2 <- c('Use optim\n')
for(i in c(0.1)){
  for(j in c(50)){
    all_theta <- oneVsAll(X, y, lambda = i, num_iters = j, num_labels = 10, method = 'optim')
    pred <- predictOneVsAll(all_theta, X)
    accu <- round(mean(pred == y) * 100, 2)
    msg2 <- paste(msg2,
                  paste("Training Set Accuracy with num_iters=", j, "& lambda=", i, "is:", accu, "%\n"))
  }
}
cat(paste(msg1, msg2, sep = ''))
## Use matrix
##  Training Set Accuracy with num_iters= 3 & lambda= 0.1 is: 92.42 %
## Use optim
##  Training Set Accuracy with num_iters= 50 & lambda= 0.1 is: 93.18 %

Neural Networks

path <- "data/ex3data1.mat"
data <- readMat(path)
X <- cbind(rep(1, dim(data$X)[1]), data$X)
y <- data$y
path <- "data/ex3weights.mat"
data <- readMat(path)
Theta1 <- data$Theta1
Theta2 <- data$Theta2

feedforward propagation and prediction

z1 <- X %*% t(Theta1)
h1 <- sigmoid(z1)
h1 <- cbind(rep(1, length = dim(h1)[1]), h1)
z2 <- h1 %*% t(Theta2)
h2 <- sigmoid(z2)
pred <- apply(h2, 1, which.max)
accu <- mean(pred == y) * 100
cat("Training Set Accuracy is:", accu, "%\n")
## Training Set Accuracy is: 97.52 %

J_vals with different lambda

X <- X[, -1]
nn_params <-c(c(Theta1), c(Theta2))
input_layer_size <- dim(X)[2]
hidden_layer_size <- 25
num_labels <- 10
J_vals <- rep(0, 20)
for(i in 1:length(J_vals)){
J_vals[i] <- 
  nnCostFunction(input_layer_size, hidden_layer_size, num_labels, X, y, lambda = i - 1)(nn_params)
}
ggplot() + geom_line(aes(x = 0:(length(J_vals) - 1), y = J_vals)) +
  labs(x = 'lambda', y = 'J_val')

backpropagation

lambda <- 1
initial_Theta1 <- randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 <- randInitializeWeights(hidden_layer_size, num_labels)
initial_nn_params <- c(initial_Theta1, initial_Theta2)
costFunction <- nnCostFunction(input_layer_size, hidden_layer_size,
                                   num_labels, X, y, lambda)
gradFunction <- nnGradFunction(input_layer_size, hidden_layer_size,
                               num_labels, X, y, lambda)
# use optim
nn_params <- optim(par = initial_nn_params,
                   fn = costFunction, gr = gradFunction,
                   method = "BFGS", control = list(trace = 1, maxit = 50))$par
## initial  value 6.966889 
## iter  10 value 2.025642
## iter  20 value 1.052006
## iter  30 value 0.765928
## iter  40 value 0.642570
## iter  50 value 0.564674
## final  value 0.564674 
## stopped after 50 iterations
Theta1 <- matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
                 hidden_layer_size,
                 (input_layer_size + 1))
Theta2 <- matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
                 num_labels,
                 (hidden_layer_size + 1))
mean(predict(Theta1, Theta2, X) == y) * 100
## [1] 94.14
# use lbfgsb3
library(lbfgsb3c)
nn_params <- lbfgsb3_(initial_nn_params,
                fn = costFunction, gr = gradFunction,
                control = list(trace = 1, maxit = 50))$prm
Theta1 <- matrix(nn_params[1:(hidden_layer_size * (input_layer_size + 1))],
                 hidden_layer_size,
                 (input_layer_size + 1))
Theta2 <- matrix(nn_params[(1 + (hidden_layer_size * (input_layer_size + 1))):length(nn_params)],
                 num_labels,
                 (hidden_layer_size + 1))
mean(predict(Theta1, Theta2, X) == y) * 100
## [1] 9.76

checkNNGradients

checkNNGradients()
##             numgrad          grad
##  [1,] -9.278252e-03 -9.278252e-03
##  [2,]  8.899120e-03  8.899120e-03
##  [3,] -8.360108e-03 -8.360108e-03
##  [4,]  7.628136e-03  7.628136e-03
##  [5,] -6.747984e-03 -6.747984e-03
##  [6,] -3.049787e-06 -3.049789e-06
##  [7,]  1.428694e-05  1.428694e-05
##  [8,] -2.593831e-05 -2.593831e-05
##  [9,]  3.698832e-05  3.698832e-05
## [10,] -4.687598e-05 -4.687598e-05
## [11,] -1.750601e-04 -1.750601e-04
## [12,]  2.331464e-04  2.331464e-04
## [13,] -2.874687e-04 -2.874687e-04
## [14,]  3.353203e-04  3.353203e-04
## [15,] -3.762156e-04 -3.762156e-04
## [16,] -9.626606e-05 -9.626606e-05
## [17,]  1.179827e-04  1.179827e-04
## [18,] -1.371497e-04 -1.371497e-04
## [19,]  1.532471e-04  1.532471e-04
## [20,] -1.665603e-04 -1.665603e-04
## [21,]  3.145450e-01  3.145450e-01
## [22,]  1.110566e-01  1.110566e-01
## [23,]  9.740070e-02  9.740070e-02
## [24,]  1.640908e-01  1.640908e-01
## [25,]  5.757365e-02  5.757365e-02
## [26,]  5.045759e-02  5.045759e-02
## [27,]  1.645679e-01  1.645679e-01
## [28,]  5.778674e-02  5.778674e-02
## [29,]  5.075302e-02  5.075302e-02
## [30,]  1.583393e-01  1.583393e-01
## [31,]  5.592353e-02  5.592353e-02
## [32,]  4.916208e-02  4.916208e-02
## [33,]  1.511275e-01  1.511275e-01
## [34,]  5.369670e-02  5.369670e-02
## [35,]  4.714562e-02  4.714562e-02
## [36,]  1.495683e-01  1.495683e-01
## [37,]  5.315421e-02  5.315421e-02
## [38,]  4.655972e-02  4.655972e-02
## The above two columns you get should be very similar.
##       (Left-Your Numerical Gradient, Right-Analytical Gradient)
## 
## If your backpropagation implementation is correct, then
##       the relative difference will be small (less than 1e-9).
##       Relative Difference: 2.33818e-11

predict examples

rp <- sample(5000)
for (i in 1:4){
    displayData(X[rp[i], ])
    cat('picture label:', y[rp[i], ]%%10, '\n')
    cat('NN predict:', predict(Theta1, Theta2, X[rp[i], ])%%10, '\n')
}

## picture label: 8 
## NN predict: 2

## picture label: 2 
## NN predict: 2

## picture label: 0 
## NN predict: 2

## picture label: 8 
## NN predict: 2