# Installing some useful packages
library(tidyverse)
library(kableExtra)
library(ggrepel)
library(magrittr)

Problem 1

Let the sequence P, P, P, P, N, P, P, P, N, N, P, N, P, N, N, N, N, P, N, N be the test instances sorted in decreasing order of the posterior probability of being a positive instance. Please find the AUC value for this ranking result. (10 %)

# Data
FPR <- c(0, rep(0, 4), rep(0.1, 4), 0.2, 0.3, 0.3, 0.4, 0.4, 
         seq(0.5, 0.8, 0.1), seq(0.8, 1, 0.1))
TPR <- c(0 ,seq(0.1, 0.4, 0.1), seq(0.4, 0.7, 0.1), 0.7, 0.7, 
         0.8, 0.8, rep(0.9, 5), rep(1, 3))
label <- c("", rep("P", 4), "N", rep("P", 3), "N", "N", 
           "P", "N", "P", rep("N", 4), "P", "N", "N")
# Visualization ROC
ggplot(data = NULL, aes(x = FPR, y = TPR)) + 
  geom_point() + geom_line() + 
  scale_x_continuous(breaks = FPR, 
                     minor_breaks = NULL) + 
  scale_y_continuous(breaks = TPR, 
                     minor_breaks = NULL, 
                     limits = c(0, 1)) + 
  labs(x = "False positive rate", 
       y = "True positive rate", 
       title = "ROC") + 
  geom_text_repel(aes(label = label)) + 
  theme_grey(base_size = 13)

The figure above is the ROC curve. Reading the ranked list from the top, each P moves the curve up by 0.1 and each N moves it right by 0.1 (there are 10 positives and 10 negatives), so each N contributes a rectangle of width 0.1 and height equal to the current TPR. The area under the curve is therefore AUC = 0.1(0.4 + 2(0.7) + 0.8 + 4(0.9) + 2(1.0)) = 0.82.
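As a quick cross-check (a sketch reusing the FPR and TPR vectors defined above), the trapezoidal rule gives the same value:

# Numerical AUC via the trapezoidal rule
sum(diff(FPR) * (head(TPR, -1) + tail(TPR, -1)) / 2)   # 0.82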

Problem 2

Prove that for any matrix \(B \in R^{m \times n}\), either the system (I) \[ \begin{align} Bx < 0 \end{align} \] or the system (II) \[ \begin{align} B^{T}\alpha = 0,\quad \alpha \geq 0,\ \text{and}\ \alpha \neq 0 \end{align} \] has a solution, but never both. (20 %)
Hint 1 : \(Bx < 0\) has a solution if and only if \(Bx + {\bf 1}z \leq 0,\ z > 0\) has a solution
Hint 2 : Use Farkas' Lemma with a suitable \(b \in R^{n+1}\) and \(A \in R^{m \times (n+1)}\)
< Proof >
By Hint 1, the original system (I) can be transformed as follows: \[ \begin{align} & Bx + {\bf 1}z \leq 0,\quad z > 0 \\ \Rightarrow\ &(B,\ {\bf 1}) \begin{pmatrix} x \\ z \\ \end{pmatrix} \leq 0,\quad b^{T} \begin{pmatrix} x \\ z \\ \end{pmatrix} = z > 0 \\ \Rightarrow\ &A x^{*} \leq 0,\quad b^{T}x^{*} > 0,\quad \text{where } A = (B,\ {\bf 1}) \in R^{m \times (n+1)},\ x^{*} = \begin{pmatrix} x \\ z \\ \end{pmatrix},\ b = \begin{pmatrix} 0 \\ \vdots \\ 0 \\ 1 \\ \end{pmatrix}_{(n+1) \times 1}. \end{align} \] By Farkas' Lemma applied to \(A^{T}\) and \(b\), exactly one of the following two systems has a solution: \[ \begin{align} &A x^{*} \leq 0,\quad b^{T}x^{*} > 0, \\ \text{or}\quad &A^{T}\alpha = b,\quad \alpha \geq 0,\quad \alpha \in R^{m}. \end{align} \]
The first system is system (I) by Hint 1. The second system reads \(B^{T}\alpha = 0\) (the first \(n\) rows of \(A^{T}\alpha = b\)) and \({\bf 1}^{T}\alpha = 1\) (the last row), which together with \(\alpha \geq 0\) forces \(\alpha \neq 0\); conversely, any solution of (II) can be rescaled so that \({\bf 1}^{T}\alpha = 1\). Hence exactly one of (I) and (II) has a solution, and never both. \(\blacksquare\)
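As a small numeric illustration (not part of the proof): for \(B\) with rows \((1, 0)\), \((0, 1)\), \((-1, -1)\), system (I) would require \(x_{1} < 0\), \(x_{2} < 0\), and \(x_{1} + x_{2} > 0\) simultaneously, which is impossible; accordingly, \(\alpha = (1, 1, 1)^{T}\) solves system (II):

# Numeric check of the Farkas alternative for a concrete B
B <- rbind(c(1, 0), c(0, 1), c(-1, -1))
alpha <- c(1, 1, 1)   # alpha >= 0 and alpha != 0
t(B) %*% alpha        # B^T alpha = (0, 0)^T, so system (II) holds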

Problem 3

(a). Solve

\[ \begin{align} \min \limits_{x\in R^{2}}\ \frac{1}{2}x^{T} \begin{pmatrix} 1 & 0 \\ 0 & 900 \\ \end{pmatrix} x \end{align} \] using steepest descent with exact line search. You are welcome to copy the MATLAB code from my slides. Start your code with the initial point \(x_{0}=[1000,\ 1]^{T}\). Stop when \(||x_{n+1}-x_{n}|| < 10^{-8}\).
Report your solution and the number of iterations. (15 %)

# Steepest descent
Q <- matrix(c(1, 0, 0, 900), ncol = 2) 
x0 <- matrix(c(1000, 1))   # Initial value 
Steep.descent <- function(x = x0, Q, tol = 1e-8){
  for(i in 1:10000){   # Cap at 10000 iterations
    d <- - Q %*% x     # Descent direction: negative gradient of (1/2) x'Qx
    lambda <- ( (t(d) %*% d) / (t(d) %*% Q %*% d) ) %>% 
      as.numeric()     # Exact line-search step size
    xn <- x + lambda * d
    if( all( abs(xn - x) < tol ) ) {   # componentwise |x_{n+1} - x_n| < tol
      break
    }
    x <- xn
  }
  return(list(Iteration = i, x = xn))
}
c(Steep.descent(x = x0, Q = Q) %>% unlist, 0) %>%
  round(., 3) %>% t %>% t %>% 
  `rownames<-`(., c("Iteration", "$x_{1}$", "$x_{2}$", "f(x)")) %>% 
  kable(., "html", col.names = c("Value"), 
        caption = "Steep descent") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Steepest descent

| | Value |
|---|---|
| Iteration | 8558 |
| \(x_{1}\) | 0 |
| \(x_{2}\) | 0 |
| f(x) | 0 |
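The iteration count is consistent with the classical convergence rate of steepest descent with exact line search on a quadratic: with condition number \(\kappa = \lambda_{\max}/\lambda_{\min} = 900\), the error contracts per step by at most
\[ \frac{\kappa - 1}{\kappa + 1} = \frac{899}{901} \approx 0.99778, \]
so reducing the error from about \(10^{3}\) to \(10^{-8}\) takes on the order of \(\ln(10^{11}) / \ln(901/899) \approx 1.1 \times 10^{4}\) iterations, the same order as the 8558 reported.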

(b). Implement Newton's method for minimizing a quadratic function \(f(x) = \frac{1}{2}x^{T}Qx+P^{T}x\) in MATLAB code. Apply your code to solve the minimization problem in (a). (15 %)

# Newton's method
Q <- matrix(c(1, 0, 0, 900), ncol = 2) # Quadratic term 
x0 <- matrix(c(1000, 1))   # Initial value 
Newton_fun <- function(x = x0, Q, P = 0 * x, tol = 1e-8){
  # P: linear term; defaults to zero, as in (a)
  H <- solve(Q)          # Inverse Hessian (constant for a quadratic)
  for(i in 1:10000){     # Cap at 10000 iterations
    S <- -(Q %*% x + P)  # Negative gradient of (1/2) x'Qx + P'x
    d <- H %*% S         # Newton direction
    xn <- x + d
    if( all( abs(xn - x) < tol ) ) {
      break
    }
    x <- xn
  }
  return(list(Iteration = i, x = xn))
}
c(Newton_fun(x = x0, Q) %>% unlist, 0) %>% 
  t %>% t %>% 
  `rownames<-`(., c("Iteration", "$x_{1}$", "$x_{2}$", "f(x)")) %>% 
  kable(., "html", col.names = c("Value"), 
        caption = "Newton??s method") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Newton's method

| | Value |
|---|---|
| Iteration | 2 |
| \(x_{1}\) | 0 |
| \(x_{2}\) | 0 |
| f(x) | 0 |
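This is expected: for a quadratic, a single Newton step lands exactly on the minimizer,
\[ x_{1} = x_{0} - Q^{-1}(Qx_{0} + P) = -Q^{-1}P \quad (= 0 \text{ here, since } P = 0), \]
so the first iteration reaches \(x^{*}\) and the second merely triggers the stopping criterion.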

Problem 4

Let \(S=\{ (x^{i}, y_{i})\}_{i=1}^{m} \subset R^{n} \times \{-1, 1\}\) be a non-trivial training set. The Perceptron Algorithm in dual form is given as follows:
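(The algorithm itself is not reproduced here; a standard statement of the dual-form perceptron, matching the implementation used in Problem 5, is:)
\[ \begin{align*} &\alpha \leftarrow 0,\quad b \leftarrow 0,\quad R \leftarrow \max_{1 \leq i \leq m} ||x^{i}||_{2} \\ &\text{repeat until no mistakes are made within a full pass:} \\ &\quad \text{for } i = 1, \dots, m: \\ &\qquad \text{if } y_{i}\Big( \sum_{j=1}^{m} \alpha_{j} y_{j} \langle x^{j}, x^{i} \rangle + b \Big) \leq 0,\ \text{then } \alpha_{i} \leftarrow \alpha_{i} + 1,\quad b \leftarrow b + y_{i}R^{2} \\ &\text{return } (\alpha, b) \end{align*} \]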

  1. What are the meanings of the output \(\alpha_{i}\) and the 1-norm of \(\alpha\)? (15%)
    From the algorithm above, \(\alpha_{i}\) counts how many times the \(i\)-th training example is misclassified during training, and \(||\alpha||_{1} = \sum_{i}\alpha_{i}\) is the total number of mistakes (updates) made over the whole training set.
  2. Why is the updating rule effective? (10%)
    Whenever \(x^{i}\) is misclassified, the update shifts the decision function toward classifying \(x^{i}\) correctly (see the derivation below); once \(\alpha\) stops changing, every training example is classified correctly and the algorithm terminates.
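    Concretely, after the update \(\alpha_{i} \leftarrow \alpha_{i}+1,\ b \leftarrow b + y_{i}R^{2}\), the functional margin of the misclassified point changes by
    \[ y_{i}\big( y_{i}\langle x^{i}, x^{i}\rangle + y_{i}R^{2} \big) = ||x^{i}||^{2} + R^{2} > 0, \]
    so each update strictly increases the margin on that point.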

Problem 5

Let A+ = {(0, 0), (0.5, 0), (0, 0.5), (-0.5, 0), (0, -0.5)} and A- = {(0.5, 0.5), (0.5, -0.5), (-0.5, 0.5), (-0.5, -0.5), (1, 0), (0, 1), (-1, 0), (0, -1)}. (40%)

# Data
data.p5 <- data.frame(x1 = c(0  , 0.5, 0   , -0.5, 0, 
                             0.5, 0.5, -0.5, -0.5, 1, 
                             0, -1, 0), 
                      x2 = c(0  , 0  ,  0.5,    0, -0.5, 
                             0.5, -0.5, 0.5, -0.5, 0, 
                             1, 0, -1), 
                      y = c(rep(1, 5), rep(-1, 8)))
  1. Try to find the hypothesis h(x) by implementing the Perceptron algorithm in the dual form and replacing the inner product \[ \begin{align} \langle x^{i}, x^{j} \rangle\ \text{by}\ \langle x^{i}, x^{j} \rangle^{2} \quad \text{and} \quad R = \max \limits_{1\leq i \leq l} ||x^{i}||_{2}^{2} \end{align} \]

    # Replace the inner product
    perceptron_dual <- function(x = data.p5[, c(1, 2)], 
                                y = data.p5$y){
      # x: one row per point in R^2;  y: labels in {-1, 1}
      # Define some constants 
      a <- matrix(0, nrow = dim(x)[1])  # alpha: one mistake counter per point
      b <- 0 # intercept 
      R <- x^2 %>% apply(., 1, sum) %>% max() # R = max ||x_i||_2^2
      D <- diag(y)
      A <- x %>% as.matrix() 
      AA_t <- ( A %*% t(A) )^2          # kernel matrix <x_i, x_j>^2
      # epochs (capped at 10000)
      for (i in 1:10000) { 
        updated <- FALSE
        z <- t(a) %*% D %*% AA_t        # functional margins, once per pass
        for (j in 1:length(y)) {
          if( y[j] * (z[j] + b) <= 0){  # mistake on point j
            a[j] <- a[j] + 1
            b <- b + y[j] * R^2
            updated <- TRUE
          }
        }
        if (!updated) break             # full pass with no mistakes: converged
      } 
      return(list(alpha = a,
                  b = b)) 
    }
    
    g_A <- ggplot(data = data.p5, 
                  aes(x = x1, y = x2)) + 
      geom_point(aes(colour = y %>% as.factor()), 
                 size = 4) + 
      geom_point(shape = 1, size = 4, colour = "black") + 
      labs(color = "Label")
    # Visualization
    dual_A <- perceptron_dual(x = data.p5[, c(1, 2)], 
                              y = data.p5$y)
    dual_A$alpha %>% 
      `rownames<-`(., c(paste("$\\alpha_{", 
                              seq(1, 13), "}$", 
                              sep = ""))) %>% 
      kable(., "html", col.names = c("Value")) %>%
      kable_styling(bootstrap_options = "striped", 
                    full_width = F)
    | | Value |
    |---|---|
    | \(\alpha_{1}\) | 10 |
    | \(\alpha_{2}\) | 1 |
    | \(\alpha_{3}\) | 0 |
    | \(\alpha_{4}\) | 0 |
    | \(\alpha_{5}\) | 0 |
    | \(\alpha_{6}\) | 5 |
    | \(\alpha_{7}\) | 5 |
    | \(\alpha_{8}\) | 0 |
    | \(\alpha_{9}\) | 0 |
    | \(\alpha_{10}\) | 0 |
    | \(\alpha_{11}\) | 0 |
    | \(\alpha_{12}\) | 0 |
    | \(\alpha_{13}\) | 0 |

    The table above is the \(\alpha\) vector from the dual-form perceptron.

    dual_A$b %>% matrix() %>% 
      `rownames<-`(., "b") %>% 
      kable(., "html", col.names = c("Value")) %>%
      kable_styling(bootstrap_options = "striped", 
                    full_width = F)
    | | Value |
    |---|---|
    | b | 1 |

    The value above is the intercept b from the dual-form perceptron.

    \[ \begin{align} h(x) &= \alpha^{T}D\,(Sx)^{2} + b, \\ \text{where}\quad &\alpha \in R^{13 \times 1},\quad D=\mathrm{diag}(y) \in R^{13 \times 13}, \\ &S \in R^{13 \times 2} \text{ is the training matrix},\quad x \in R^{2 \times 1},\quad b \in R^{1 \times 1}, \end{align} \] and the square \((Sx)^{2}\) is taken elementwise.
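    Substituting the learned values (only \(\alpha_{1}, \alpha_{2}, \alpha_{6}, \alpha_{7}\) are nonzero, and \(x^{1} = (0,0)\) contributes nothing) gives the explicit hypothesis
    \[ h(x) = 0.25x_{1}^{2} - 1.25(x_{1}+x_{2})^{2} - 1.25(x_{1}-x_{2})^{2} + 1 = 1 - 2.25x_{1}^{2} - 2.5x_{2}^{2}, \]
    so the decision boundary \(h(x) = 0\) is an ellipse enclosing A+.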

  2. Generate 10,000 points in the box [-1.5, 1.5] × [-1.5, 1.5] randomly as a test set. Plug these points into the hypothesis that you got in (a) and then plot the points for which h(x) > 0 with + .

    # Plot the test dataset 
    dual_A.pred_fun <- function(x){
      x %<>% as.matrix()
      # Define some constant
      a <- dual_A$alpha
      b <- dual_A$b
      D <- diag(data.p5$y)
      S <- data.p5[, c(1, 2)] %>% as.matrix()
      # Calculate the sign of the hypothesis h(x)
      value <- ( (t(a) %*% D %*% (S %*% x)^2) + b ) %>% 
        sign() %>% as.vector()
      return(value)
      }
    
    # Visualization 
    A.test <- matrix(runif(20000, -1.5, 1.5), 
                     ncol = 2) %>% as.data.frame() %>% 
      `colnames<-`(., c("x1", "x2"))
    A.test$pred <- A.test %>% apply(., 1, dual_A.pred_fun)
    
    g_A + 
      geom_point(data = A.test, 
                 aes(x = x1, y = x2, 
                     colour = pred %>% as.factor()), 
                 size = 0.3) + 
      labs(title = "Perceptron algorithm", 
           caption = "Dual form")
  3. Repeat (a) and (b) by using the training data \[ \begin{align} &B+ = \{(0.5,\ 0),\ (0,\ 0.5),\ (-0.5,\ 0),\ (0,\ -0.5)\} \quad \text{and} \\ &B- = \{(0.5,\ 0.5),\ (0.5,\ -0.5),\ (-0.5,\ 0.5),\ (-0.5,\ -0.5)\} \end{align} \]

    # Data
    df.B <- data.frame(x1 = c(0.5, 0, -0.5, 0, 
                              0.5, 0.5, -0.5, -0.5),
                       x2 = c(0, 0.5, 0, -0.5, 
                              0.5, -0.5, 0.5, -0.5),
                       y = c(rep(1, 4), rep(-1, 4))
                       )
    dual_B <-  perceptron_dual(x = df.B[, c(1, 2)], 
                              y = df.B$y)
    g_B <- ggplot(data = df.B, 
                  aes(x = x1, y = x2)) + 
      geom_point(aes(colour = y %>% as.factor()), size = 4) + 
      geom_point(shape = 1, size = 4, colour = "black") + 
      labs(color = "Label")
    dual_B.pred_fun <- function(x){
      x %<>% as.matrix()
      # Define some constant
      a <- dual_B$alpha
      b <- dual_B$b
      D <- diag(df.B$y)
      S <-  df.B[, c(1, 2)] %>% as.matrix()
      # Calculate the sign of the hypothesis h(x)
      value <- ( (t(a) %*% D %*% (S %*% x)^2) + b ) %>% 
        sign() %>% as.vector()
      return(value)
    }
    
    # Visualization 
    B.test <- matrix(runif(20000, -1.5, 1.5), 
                     ncol = 2) %>% as.data.frame() %>% 
      `colnames<-`(., c("x1", "x2"))
    B.test$pred <- B.test %>% apply(., 1, dual_B.pred_fun)  # use the B-trained predictor
    
    g_B + 
      geom_point(data = B.test, 
                 aes(x = x1, y = x2, 
                     colour = pred %>% as.factor()), 
                 size = 0.3) + 
      labs(title = "Perceptron algorithm", 
           caption = "Dual form")

  4. Let the nonlinear mapping \(\phi : R^{2} \rightarrow R^{4}\) be defined by \[ \begin{align} \phi(x)=[-x_{1}x_{2},\ x_{1}^{2},\ x_{1}x_{2},\ x_{2}^{2}] \end{align} \] Map the training data A+ and A- into the feature space using this nonlinear map. Find the hypothesis f(x) by implementing the Perceptron algorithm in the primal form in the feature space.
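    Note that this particular map reproduces the squared inner product used in (a): \[ \begin{align} \langle \phi(x), \phi(u) \rangle = x_{1}^{2}u_{1}^{2} + 2x_{1}x_{2}u_{1}u_{2} + x_{2}^{2}u_{2}^{2} = \langle x, u \rangle^{2}, \end{align} \] so the primal perceptron in this feature space searches the same hypothesis class as the dual perceptron with the squared kernel.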

    # Writing perceptron_primal form 
    perceptron_primal <- function(x, y, eta = 1, niter = 10) {
      # Initialize weight vector
      weight <- rep(0, dim(x)[2] + 1) # n+1 weights: intercept plus one per feature
      errors <- rep(0, niter)
      # loop over number of epochs niter
      for (jj in 1:niter) {
        # loop through training data set
        for (ii in 1:length(y)) {
          # Predict binary label using Heaviside activation 
          # function
          z <- sum(weight[2:length(weight)] * 
                     as.numeric(x[ii, ])) + weight[1]
          if(z < 0) { ypred <- -1 
          } else { ypred <- 1 } 
          # Change weight - the formula doesn't do anything 
          # if the predicted value is correct
          weightdiff <- eta * (y[ii] - ypred) * 
            c(1, as.numeric(x[ii, ]))
          weight <- weight + weightdiff
          # Update error function
          if ((y[ii] - ypred) != 0.0) {
            errors[jj] <- errors[jj] + 1
          }
        }
        }
      # weight to decide between the two species 
      return(list(Weight = weight, 
                  Errors = errors))
      }
    # Data: map (x1, x2) into the feature space phi(x)
    w1 <- -(data.p5$x1 * data.p5$x2)
    w2 <- data.p5$x1^2
    w3 <- (data.p5$x1 * data.p5$x2)
    w4 <- data.p5$x2^2
    x.p5 <- cbind(w1, w2, w3, w4) %>% as.data.frame()
    y.p5 <- c(rep(1, 5), rep(-1, 8))
    # Using perceptron
    perceptron_primal(x = x.p5, y = y.p5, 
                      niter = 20)$Weight %>% 
      matrix(., nrow = 1) %>% 
      `colnames<-`(., c("b", "w1", "w2", "w3", "w4")) %>% 
      `rownames<-`(., c("values")) %>% 
      kable(., "html") %>% 
      kable_styling(bootstrap_options = "striped", full_width = F)
    | | b | w1 | w2 | w3 | w4 |
    |---|---|---|---|---|---|
    | values | 2 | 0 | -2.5 | 0 | -8 |

    The table above gives the coefficients of the hypothesis in the feature space.

    \[ \begin{align} h(x) &= W^{T}X\\ &= \begin{pmatrix} 2 & 0 & -2.5 & 0 & -8 \\ \end{pmatrix}_{1 \times 5} \begin{pmatrix} 1 \\ -x_{1}x_{2} \\ x_{1}^{2} \\ x_{1}x_{2} \\ x_{2}^{2} \end{pmatrix}_{5 \times 1} \end{align} \]
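    Written out, \(h(x) = 2 - 2.5x_{1}^{2} - 8x_{2}^{2}\), so here too the decision boundary is an ellipse enclosing A+.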

    # Map the training data A+ and A- into the feature space
    W <- perceptron_primal(x = x.p5, y = y.p5, 
                      niter = 20)$Weight %>% 
      matrix(.)
    ypred <- (cbind(1, x.p5) %>% as.matrix()) %*% W
    x.p5$ypred <- ifelse(ypred >= 0, 1, -1)
    cbind(data.p5, x.p5) %>% 
      kable(., "html", 
            col.names = c("$x_{1}$", "$x_{2}$", "y", 
                          "$-x_{1}x_{2}$", "$x_{1}^{2}$", 
                          "$x_{1}x_{2}$", "$x_{2}^{2}$"
                          , "ypred")) %>% 
      kable_styling(bootstrap_options = "striped", 
                    full_width = F) %>% 
      add_header_above(c("2-dimension" = 3, 
                         "4-dimension" = 5))
    (Columns 1–3: the original 2-dimensional data; columns 4–8: the 4-dimensional feature space and the prediction.)

    | \(x_{1}\) | \(x_{2}\) | y | \(-x_{1}x_{2}\) | \(x_{1}^{2}\) | \(x_{1}x_{2}\) | \(x_{2}^{2}\) | ypred |
    |---|---|---|---|---|---|---|---|
    | 0.0 | 0.0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 1 |
    | 0.5 | 0.0 | 1 | 0.00 | 0.25 | 0.00 | 0.00 | 1 |
    | 0.0 | 0.5 | 1 | 0.00 | 0.00 | 0.00 | 0.25 | 1 |
    | -0.5 | 0.0 | 1 | 0.00 | 0.25 | 0.00 | 0.00 | 1 |
    | 0.0 | -0.5 | 1 | 0.00 | 0.00 | 0.00 | 0.25 | 1 |
    | 0.5 | 0.5 | -1 | -0.25 | 0.25 | 0.25 | 0.25 | -1 |
    | 0.5 | -0.5 | -1 | 0.25 | 0.25 | -0.25 | 0.25 | -1 |
    | -0.5 | 0.5 | -1 | 0.25 | 0.25 | -0.25 | 0.25 | -1 |
    | -0.5 | -0.5 | -1 | -0.25 | 0.25 | 0.25 | 0.25 | -1 |
    | 1.0 | 0.0 | -1 | 0.00 | 1.00 | 0.00 | 0.00 | -1 |
    | 0.0 | 1.0 | -1 | 0.00 | 0.00 | 0.00 | 1.00 | -1 |
    | -1.0 | 0.0 | -1 | 0.00 | 1.00 | 0.00 | 0.00 | -1 |
    | 0.0 | -1.0 | -1 | 0.00 | 0.00 | 0.00 | 1.00 | -1 |

    The table above shows the perceptron's classification results after mapping the data into the four-dimensional feature space.

  5. Repeat (b) by using the hypothesis that you got in (d). Note that you need to map the points randomly generated in (b) by the nonlinear mapping \(\phi\) first.

    # Generate the test points and map them through phi
    A.test <- data.frame(x1 = runif(10000, -1.5, 1.5) , 
                         x2 = runif(10000, -1.5, 1.5))
    # Leading 1 provides the intercept column
    A4.test <- data.frame(1, 
                          x1 = -(A.test$x1 * A.test$x2), 
                          x2 = A.test$x1^2, 
                          x3 = (A.test$x1 * A.test$x2),
                          x4 = A.test$x2^2)
    pred <- ifelse( (as.matrix(A4.test) %*% W) >= 0 , 1, -1 )
    
    # Visualization 
    g_A + 
      geom_point(data = A.test, 
                 aes(x = x1, y = x2, 
                     colour = pred %>% as.factor()), 
                 size = 0.3) + 
      labs(title = "Perceptron algorithm",
           caption = "Primal form")

Problem 6

Find an approximate solution using MATLAB to the following system by minimizing \(||Ax-b||_{p}\) for p = 1, 2, \(\infty\). Write down both the approximate solution, and the value of the \(||Ax-b||_{p}\). Draw the solution points in \(R^{2}\) and the four equations being solved. \[ \begin{equation*} \begin{aligned} & x_{1} &&+ 2x_{2} &&= 2 \\ & 2x_{1} &&- x_{2} &&= -2 \\ & x_{1} &&+ x_{2} &&= 3 \\ & 4x_{1} &&- x_{2} &&= -4 \end{aligned} \end{equation*} \]

# Data
A <- matrix(c(1, 2, 2, -1, 1, 1, 4, -1), ncol = 2, byrow = TRUE)
b <- matrix(c(2, -2, 3, -4), ncol = 1)

# 1-norm
library(L1pack) # L1 norm 
L1_norm <- l1fit(x = A, y = b, intercept = FALSE)$coefficients 
L1_sol <- l1fit(x = A, y = b, intercept = FALSE)$residuals %>% 
  abs %>% sum   # 1-norm of the residuals: sum of absolute values
# 2-norm
L2_norm <- lm(b ~ -1 + A)$coef
L2_sol <- lm(b ~ -1 + A)$residuals^2 %>% sum %>% sqrt
# Infinity norm 
library(quadrupen) # Infinity norm 
bounded <- bounded.reg(x = A, y = b, 
                       lambda1 = 1,lambda2 = 0, 
                       intercept = FALSE)
Infinity_norm <- bounded@coefficients
Infinity_sol <- bounded@residuals %>% abs %>% max
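# Sanity check (not part of the original solution): bounded.reg is a
# penalized estimator, so cross-check by minimizing max|Ax - b| directly
# with Nelder-Mead via base R's optim
chebyshev_obj <- function(x) max(abs(A %*% x - b))
optim(c(0, 0), chebyshev_obj)$par   # should be close to (-0.2, 1.8)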
# Summary 
data.norm <- data.frame(x = c(L1_norm[1], L2_norm[1], Infinity_norm[1]), 
                        y = c(L1_norm[2], L2_norm[2], Infinity_norm[2]),
                        sol = c(L1_sol, L2_sol, Infinity_sol),
                        label = c("L1-norm", "L2-norm", "Infinity-norm"))
# Summary table
data.norm %>% 
  kable(., "html", 
        col.names = c("$x_{1}$", "$x_{2}$", "$||Ax-b||_{p}$", "label")) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
| \(x_{1}\) | \(x_{2}\) | \(||Ax-b||_{p}\) | label |
|---|---|---|---|
| -0.6666667 | 1.333333 | 3.000000 | L1-norm |
| -0.4552000 | 1.662100 | 2.136707 | L2-norm |
| -0.2000000 | 1.800000 | 1.400000 | Infinity-norm |
# Visualization
ggplot(data = data.norm, aes(x = x, y = y )) + 
  geom_point(aes(color = label), size = 3) +
  geom_abline(intercept = 1, slope = -0.5) + 
  geom_abline(intercept = 2, slope = 2) + 
  geom_abline(intercept = 3, slope = -1) +
  geom_abline(intercept = 4, slope = 4) +
  theme(legend.position = "none") +
  geom_text_repel(aes(x = x, y = y, 
                      label = label)) + 
  labs(x = "x1", y = "x2", 
       title = "Solution of different norm") + 
  xlim(-2, 2) + ylim(0, 3)