# Installing some useful packages
library(tidyverse)
library(kableExtra)
library(ggrepel)
library(magrittr)

Problem 1

Let the sequence P, P, P, P, N, P, P, P, N, N, P, N, P, N, N, N, N, P, N, N be the test instances sorted in decreasing order of the posterior probability of being a positive instance. Please find the AUC value for this ranking result. (10 %)

# Data
FPR <- c(0, rep(0, 4), rep(0.1, 4), 0.2, 0.3, 0.3, 0.4, 0.4, 
         seq(0.5, 0.8, 0.1), seq(0.8, 1, 0.1))
TPR <- c(0 ,seq(0.1, 0.4, 0.1), seq(0.4, 0.7, 0.1), 0.7, 0.7, 
         0.8, 0.8, rep(0.9, 5), rep(1, 3))
label <- c("", rep("P", 4), "N", rep("P", 3), "N", "N", 
           "P", "N", "P", rep("N", 4), "P", "N", "N")
# Visualization ROC
ggplot(data = NULL, aes(x = FPR, y = TPR)) + 
  geom_point() + geom_line() + 
  scale_x_continuous(breaks = FPR, 
                     minor_breaks = NULL) + 
  scale_y_continuous(breaks = TPR, 
                     minor_breaks = NULL, 
                     limits = c(0, 1)) + 
  labs(x = "False positive rate", 
       y = "True positive rate", 
       title = "ROC") + 
  geom_text_repel(aes(label = label)) + 
  theme_grey(base_size = 13)

The figure above is the ROC curve. Reading the ranked list from the top, each P moves the curve up by 0.1 and each N moves it right by 0.1 (there are 10 positives and 10 negatives), so each N contributes a rectangle of width 0.1 and height equal to the current TPR. The area under the curve is therefore AUC = 0.1(0.4 + 2(0.7) + 0.8 + 4(0.9) + 2(1.0)) = 0.82.
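As a quick cross-check (a sketch reusing the FPR and TPR vectors defined above), the trapezoidal rule gives the same value:

# Numerical AUC via the trapezoidal rule
sum(diff(FPR) * (head(TPR, -1) + tail(TPR, -1)) / 2)   # 0.82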

Problem 2

Prove that for any matrix \(B \in R^{m \times n}\), either the system (I) \[ \begin{align} Bx < 0 \end{align} \] or the system (II) \[ \begin{align} B^{T}\alpha = 0,\quad \alpha \geq 0,\ \text{and}\ \alpha \neq 0 \end{align} \] has a solution, but never both. (20 %)
Hint 1 : \(Bx < 0\) has a solution if and only if \(Bx + {\bf 1}z \leq 0,\ z > 0\) has a solution
Hint 2 : Use Farkas' Lemma with a suitable \(b \in R^{n+1}\) and \(A \in R^{m \times (n+1)}\)
< Proof >
By Hint 1, the original system (I) can be transformed as follows: \[ \begin{align} & Bx + {\bf 1}z \leq 0,\quad z > 0 \\ \Rightarrow\ &(B,\ {\bf 1}) \begin{pmatrix} x \\ z \\ \end{pmatrix} \leq 0,\quad b^{T} \begin{pmatrix} x \\ z \\ \end{pmatrix} = z > 0 \\ \Rightarrow\ &A x^{*} \leq 0,\quad b^{T}x^{*} > 0,\quad \text{where } A = (B,\ {\bf 1}) \in R^{m \times (n+1)},\ x^{*} = \begin{pmatrix} x \\ z \\ \end{pmatrix},\ b = \begin{pmatrix} 0 \\ \vdots \\ 0 \\ 1 \\ \end{pmatrix}_{(n+1) \times 1}. \end{align} \] By Farkas' Lemma applied to \(A^{T}\) and \(b\), exactly one of the following two systems has a solution: \[ \begin{align} &A x^{*} \leq 0,\quad b^{T}x^{*} > 0, \\ \text{or}\quad &A^{T}\alpha = b,\quad \alpha \geq 0,\quad \alpha \in R^{m}. \end{align} \]
The first system is system (I) by Hint 1. The second system reads \(B^{T}\alpha = 0\) (the first \(n\) rows of \(A^{T}\alpha = b\)) and \({\bf 1}^{T}\alpha = 1\) (the last row), which together with \(\alpha \geq 0\) forces \(\alpha \neq 0\); conversely, any solution of (II) can be rescaled so that \({\bf 1}^{T}\alpha = 1\). Hence exactly one of (I) and (II) has a solution, and never both. \(\blacksquare\)
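As a small numeric illustration (not part of the proof): for \(B\) with rows \((1, 0)\), \((0, 1)\), \((-1, -1)\), system (I) would require \(x_{1} < 0\), \(x_{2} < 0\), and \(x_{1} + x_{2} > 0\) simultaneously, which is impossible; accordingly, \(\alpha = (1, 1, 1)^{T}\) solves system (II):

# Numeric check of the Farkas alternative for a concrete B
B <- rbind(c(1, 0), c(0, 1), c(-1, -1))
alpha <- c(1, 1, 1)   # alpha >= 0 and alpha != 0
t(B) %*% alpha        # B^T alpha = (0, 0)^T, so system (II) holds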

Problem 3

(a). Solve

\[ \begin{align} \min \limits_{x\in R^{2}}\ \frac{1}{2}x^{T} \begin{pmatrix} 1 & 0 \\ 0 & 900 \\ \end{pmatrix} x \end{align} \] using steepest descent with exact line search. You are welcome to copy the MATLAB code from my slides. Start your code with the initial point \(x_{0}=[1000,\ 1]^{T}\). Stop when \(||x_{n+1}-x_{n}|| < 10^{-8}\).
Report your solution and the number of iterations. (15 %)

# Steepest descent
Q <- matrix(c(1, 0, 0, 900), ncol = 2) 
x0 <- matrix(c(1000, 1))   # Initial value 
Steep.descent <- function(x = x0, Q, tol = 1e-8){
  for(i in 1:10000){   # Cap at 10000 iterations
    d <- - Q %*% x     # Descent direction: negative gradient of (1/2) x'Qx
    lambda <- ( (t(d) %*% d) / (t(d) %*% Q %*% d) ) %>% 
      as.numeric()     # Exact line-search step size
    xn <- x + lambda * d
    if( all( abs(xn - x) < tol ) ) {   # componentwise |x_{n+1} - x_n| < tol
      break
    }
    x <- xn
  }
  return(list(Iteration = i, x = xn))
}
c(Steep.descent(x = x0, Q = Q) %>% unlist, 0) %>%
  round(., 3) %>% t %>% t %>% 
  `rownames<-`(., c("Iteration", "$x_{1}$", "$x_{2}$", "f(x)")) %>% 
  kable(., "html", col.names = c("Value"), 
        caption = "Steep descent") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Steepest descent

| | Value |
|---|---|
| Iteration | 8558 |
| \(x_{1}\) | 0 |
| \(x_{2}\) | 0 |
| f(x) | 0 |
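The iteration count is consistent with the classical convergence rate of steepest descent with exact line search on a quadratic: with condition number \(\kappa = \lambda_{\max}/\lambda_{\min} = 900\), the error contracts per step by at most
\[ \frac{\kappa - 1}{\kappa + 1} = \frac{899}{901} \approx 0.99778, \]
so reducing the error from about \(10^{3}\) to \(10^{-8}\) takes on the order of \(\ln(10^{11}) / \ln(901/899) \approx 1.1 \times 10^{4}\) iterations, the same order as the 8558 reported.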

(b). Implement Newton's method for minimizing a quadratic function \(f(x) = \frac{1}{2}x^{T}Qx+P^{T}x\) in MATLAB code. Apply your code to solve the minimization problem in (a). (15 %)

# Newton's method
Q <- matrix(c(1, 0, 0, 900), ncol = 2) # Quadratic term 
x0 <- matrix(c(1000, 1))   # Initial value 
Newton_fun <- function(x = x0, Q, P = 0 * x, tol = 1e-8){
  # P: linear term; defaults to zero, as in (a)
  H <- solve(Q)          # Inverse Hessian (constant for a quadratic)
  for(i in 1:10000){     # Cap at 10000 iterations
    S <- -(Q %*% x + P)  # Negative gradient of (1/2) x'Qx + P'x
    d <- H %*% S         # Newton direction
    xn <- x + d
    if( all( abs(xn - x) < tol ) ) {
      break
    }
    x <- xn
  }
  return(list(Iteration = i, x = xn))
}
c(Newton_fun(x = x0, Q) %>% unlist, 0) %>% 
  t %>% t %>% 
  `rownames<-`(., c("Iteration", "$x_{1}$", "$x_{2}$", "f(x)")) %>% 
  kable(., "html", col.names = c("Value"), 
        caption = "Newton??s method") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Newton's method

| | Value |
|---|---|
| Iteration | 2 |
| \(x_{1}\) | 0 |
| \(x_{2}\) | 0 |
| f(x) | 0 |
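This is expected: for a quadratic, a single Newton step lands exactly on the minimizer,
\[ x_{1} = x_{0} - Q^{-1}(Qx_{0} + P) = -Q^{-1}P \quad (= 0 \text{ here, since } P = 0), \]
so the first iteration reaches \(x^{*}\) and the second merely triggers the stopping criterion.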

Problem 4

Let \(S=\{ (x^{i}, y_{i})\}_{i=1}^{m} \subset R^{n} \times \{-1, 1\}\) be a non-trivial training set. The Perceptron Algorithm in dual form is given as follows:
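(The algorithm itself is not reproduced here; a standard statement of the dual-form perceptron, matching the implementation used in Problem 5, is:)
\[ \begin{align*} &\alpha \leftarrow 0,\quad b \leftarrow 0,\quad R \leftarrow \max_{1 \leq i \leq m} ||x^{i}||_{2} \\ &\text{repeat until no mistakes are made within a full pass:} \\ &\quad \text{for } i = 1, \dots, m: \\ &\qquad \text{if } y_{i}\Big( \sum_{j=1}^{m} \alpha_{j} y_{j} \langle x^{j}, x^{i} \rangle + b \Big) \leq 0,\ \text{then } \alpha_{i} \leftarrow \alpha_{i} + 1,\quad b \leftarrow b + y_{i}R^{2} \\ &\text{return } (\alpha, b) \end{align*} \]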

  1. What are the meanings of the output \(\alpha_{i}\) and the 1-norm of \(\alpha\)? (15%)
    From the algorithm above, \(\alpha_{i}\) counts how many times the \(i\)-th training example is misclassified during training, and \(||\alpha||_{1} = \sum_{i}\alpha_{i}\) is the total number of mistakes (updates) made over the whole training set.
  2. Why is the updating rule effective? (10%)
    Whenever \(x^{i}\) is misclassified, the update shifts the decision function toward classifying \(x^{i}\) correctly (see the derivation below); once \(\alpha\) stops changing, every training example is classified correctly and the algorithm terminates.
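    Concretely, after the update \(\alpha_{i} \leftarrow \alpha_{i}+1,\ b \leftarrow b + y_{i}R^{2}\), the functional margin of the misclassified point changes by
    \[ y_{i}\big( y_{i}\langle x^{i}, x^{i}\rangle + y_{i}R^{2} \big) = ||x^{i}||^{2} + R^{2} > 0, \]
    so each update strictly increases the margin on that point.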

Problem 5

Let A+ = {(0, 0), (0.5, 0), (0, 0.5), (-0.5, 0), (0, -0.5)} and A- = {(0.5, 0.5), (0.5, -0.5), (-0.5, 0.5), (-0.5, -0.5), (1, 0), (0, 1), (-1, 0), (0, -1)}. (40%)

# Data
data.p5 <- data.frame(x1 = c(0  , 0.5, 0   , -0.5, 0, 
                             0.5, 0.5, -0.5, -0.5, 1, 
                             0, -1, 0), 
                      x2 = c(0  , 0  ,  0.5,    0, -0.5, 
                             0.5, -0.5, 0.5, -0.5, 0, 
                             1, 0, -1), 
                      y = c(rep(1, 5), rep(-1, 8)))
  1. Try to find the hypothesis h(x) by implementing the Perceptron algorithm in the dual form and replacing the inner product \[ \begin{align} \langle x^{i}, x^{j} \rangle\ \text{by}\ \langle x^{i}, x^{j} \rangle^{2} \quad \text{and} \quad R = \max \limits_{1\leq i \leq l} ||x^{i}||_{2}^{2} \end{align} \]

    # Replace the inner product
    perceptron_dual <- function(x = data.p5[, c(1, 2)], 
                                y = data.p5$y){
      # x: one row per point in R^2;  y: labels in {-1, 1}
      # Define some constants 
      a <- matrix(0, nrow = dim(x)[1])  # alpha: one mistake counter per point
      b <- 0 # intercept 
      R <- x^2 %>% apply(., 1, sum) %>% max() # R = max ||x_i||_2^2
      D <- diag(y)
      A <- x %>% as.matrix() 
      AA_t <- ( A %*% t(A) )^2          # kernel matrix <x_i, x_j>^2
      # epochs (capped at 10000)
      for (i in 1:10000) { 
        updated <- FALSE
        z <- t(a) %*% D %*% AA_t        # functional margins, once per pass
        for (j in 1:length(y)) {
          if( y[j] * (z[j] + b) <= 0){  # mistake on point j
            a[j] <- a[j] + 1
            b <- b + y[j] * R^2
            updated <- TRUE
          }
        }
        if (!updated) break             # full pass with no mistakes: converged
      } 
      return(list(alpha = a,
                  b = b)) 
    }
    
    g_A <- ggplot(data = data.p5, 
                  aes(x = x1, y = x2)) + 
      geom_point(aes(colour = y %>% as.factor()), 
                 size = 4) + 
      geom_point(shape = 1, size = 4, colour = "black") + 
      labs(color = "Label")
    # Visualization
    dual_A <- perceptron_dual(x = data.p5[, c(1, 2)], 
                              y = data.p5$y)
    dual_A$alpha %>% 
      `rownames<-`(., c(paste("$\\alpha_{", 
                              seq(1, 13), "}$", 
                              sep = ""))) %>% 
      kable(., "html", col.names = c("Value")) %>%
      kable_styling(bootstrap_options = "striped", 
                    full_width = F)
    | | Value |
    |---|---|
    | \(\alpha_{1}\) | 10 |
    | \(\alpha_{2}\) | 1 |
    | \(\alpha_{3}\) | 0 |
    | \(\alpha_{4}\) | 0 |
    | \(\alpha_{5}\) | 0 |
    | \(\alpha_{6}\) | 5 |
    | \(\alpha_{7}\) | 5 |
    | \(\alpha_{8}\) | 0 |
    | \(\alpha_{9}\) | 0 |
    | \(\alpha_{10}\) | 0 |
    | \(\alpha_{11}\) | 0 |
    | \(\alpha_{12}\) | 0 |
    | \(\alpha_{13}\) | 0 |

    The table above is the \(\alpha\) vector from the dual-form perceptron.

    dual_A$b %>% matrix() %>% 
      `rownames<-`(., "b") %>% 
      kable(., "html", col.names = c("Value")) %>%
      kable_styling(bootstrap_options = "striped", 
                    full_width = F)
    | | Value |
    |---|---|
    | b | 1 |

    The value above is the intercept b from the dual-form perceptron.

    \[ \begin{align} h(x) &= \alpha^{T}D\,(Sx)^{2} + b, \\ \text{where}\quad &\alpha \in R^{13 \times 1},\quad D=\mathrm{diag}(y) \in R^{13 \times 13}, \\ &S \in R^{13 \times 2} \text{ is the training matrix},\quad x \in R^{2 \times 1},\quad b \in R^{1 \times 1}, \end{align} \] and the square \((Sx)^{2}\) is taken elementwise.
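    Substituting the learned values (only \(\alpha_{1}, \alpha_{2}, \alpha_{6}, \alpha_{7}\) are nonzero, and \(x^{1} = (0,0)\) contributes nothing) gives the explicit hypothesis
    \[ h(x) = 0.25x_{1}^{2} - 1.25(x_{1}+x_{2})^{2} - 1.25(x_{1}-x_{2})^{2} + 1 = 1 - 2.25x_{1}^{2} - 2.5x_{2}^{2}, \]
    so the decision boundary \(h(x) = 0\) is an ellipse enclosing A+.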

  2. Generate 10,000 points in the box [-1.5, 1.5] × [-1.5, 1.5] randomly as a test set. Plug these points into the hypothesis that you got in (a) and then plot the points for which h(x) > 0 with + .

    # Plot the test dataset 
    dual_A.pred_fun <- function(x){
      x %<>% as.matrix()
      # Define some constant
      a <- dual_A$alpha
      b <- dual_A$b
      D <- diag(data.p5$y)
      S <- data.p5[, c(1, 2)] %>% as.matrix()
      # Calculate the sign of the hypothesis h(x)
      value <- ( (t(a) %*% D %*% (S %*% x)^2) + b ) %>% 
        sign() %>% as.vector()
      return(value)
      }
    
    # Visualization 
    A.test <- matrix(runif(20000, -1.5, 1.5), 
                     ncol = 2) %>% as.data.frame() %>% 
      `colnames<-`(., c("x1", "x2"))
    A.test$pred <- A.test %>% apply(., 1, dual_A.pred_fun)
    
    g_A + 
      geom_point(data = A.test, 
                 aes(x = x1, y = x2, 
                     colour = pred %>% as.factor()), 
                 size = 0.3) + 
      labs(title = "Perceptron algorithm", 
           caption = "Dual form")
  3. Repeat (a) and (b) by using the training data \[ \begin{align} &B+ = \{(0.5,\ 0),\ (0,\ 0.5),\ (-0.5,\ 0),\ (0,\ -0.5)\} \quad \text{and} \\ &B- = \{(0.5,\ 0.5),\ (0.5,\ -0.5),\ (-0.5,\ 0.5),\ (-0.5,\ -0.5)\} \end{align} \]

    # Data
    df.B <- data.frame(x1 = c(0.5, 0, -0.5, 0, 
                              0.5, 0.5, -0.5, -0.5),
                       x2 = c(0, 0.5, 0, -0.5, 
                              0.5, -0.5, 0.5, -0.5),
                       y = c(rep(1, 4), rep(-1, 4))
                       )
    dual_B <-  perceptron_dual(x = df.B[, c(1, 2)], 
                              y = df.B$y)
    g_B <- ggplot(data = df.B, 
                  aes(x = x1, y = x2)) + 
      geom_point(aes(colour = y %>% as.factor()), size = 4) + 
      geom_point(shape = 1, size = 4, colour = "black") + 
      labs(color = "Label")
    dual_B.pred_fun <- function(x){
      x %<>% as.matrix()
      # Define some constant
      a <- dual_B$alpha
      b <- dual_B$b
      D <- diag(df.B$y)
      S <-  df.B[, c(1, 2)] %>% as.matrix()
      # Calculate the sign of the hypothesis h(x)
      value <- ( (t(a) %*% D %*% (S %*% x)^2) + b ) %>% 
        sign() %>% as.vector()
      return(value)
    }
    
    # Visualization 
    B.test <- matrix(runif(20000, -1.5, 1.5), 
                     ncol = 2) %>% as.data.frame() %>% 
      `colnames<-`(., c("x1", "x2"))
    B.test$pred <- B.test %>% apply(., 1, dual_B.pred_fun)  # use the B-trained predictor
    
    g_B + 
      geom_point(data = B.test, 
                 aes(x = x1, y = x2, 
                     colour = pred %>% as.factor()), 
                 size = 0.3) + 
      labs(title = "Perceptron algorithm", 
           caption = "Dual form")

  4. Let the nonlinear mapping \(\phi : R^{2} \rightarrow R^{4}\) be defined by \[ \begin{align} \phi(x)=[-x_{1}x_{2},\ x_{1}^{2},\ x_{1}x_{2},\ x_{2}^{2}] \end{align} \] Map the training data A+ and A- into the feature space using this nonlinear map. Find the hypothesis f(x) by implementing the Perceptron algorithm in the primal form in the feature space.
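    Note that this particular map reproduces the squared inner product used in (a): \[ \begin{align} \langle \phi(x), \phi(u) \rangle = x_{1}^{2}u_{1}^{2} + 2x_{1}x_{2}u_{1}u_{2} + x_{2}^{2}u_{2}^{2} = \langle x, u \rangle^{2}, \end{align} \] so the primal perceptron in this feature space searches the same hypothesis class as the dual perceptron with the squared kernel.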

    # Writing perceptron_primal form 
    perceptron_primal <- function(x, y, eta = 1, niter = 10) {
      # Initialize weight vector
      weight <- rep(0, dim(x)[2] + 1) # n+1 weights: intercept plus one per feature
      errors <- rep(0, niter)
      # loop over number of epochs niter
      for (jj in 1:niter) {
        # loop through training data set
        for (ii in 1:length(y)) {
          # Predict binary label using Heaviside activation 
          # function
          z <- sum(weight[2:length(weight)] * 
                     as.numeric(x[ii, ])) + weight[1]
          if(z < 0) { ypred <- -1 
          } else { ypred <- 1 } 
          # Change weight - the formula doesn't do anything 
          # if the predicted value is correct
          weightdiff <- eta * (y[ii] - ypred) * 
            c(1, as.numeric(x[ii, ]))
          weight <- weight + weightdiff
          # Update error function
          if ((y[ii] - ypred) != 0.0) {
            errors[jj] <- errors[jj] + 1
          }
        }
        }
      # weight to decide between the two species 
      return(list(Weight = weight, 
                  Errors = errors))
      }
    # Data: map (x1, x2) into the feature space phi(x)
    w1 <- -(data.p5$x1 * data.p5$x2)
    w2 <- data.p5$x1^2
    w3 <- (data.p5$x1 * data.p5$x2)
    w4 <- data.p5$x2^2
    x.p5 <- cbind(w1, w2, w3, w4) %>% as.data.frame()
    y.p5 <- c(rep(1, 5), rep(-1, 8))
    # Using perceptron
    perceptron_primal(x = x.p5, y = y.p5, 
                      niter = 20)$Weight %>% 
      matrix(., nrow = 1) %>% 
      `colnames<-`(., c("b", "w1", "w2", "w3", "w4")) %>% 
      `rownames<-`(., c("values")) %>% 
      kable(., "html") %>% 
      kable_styling(bootstrap_options = "striped", full_width = F)
    | | b | w1 | w2 | w3 | w4 |
    |---|---|---|---|---|---|
    | values | 2 | 0 | -2.5 | 0 | -8 |

    The table above gives the coefficients of the hypothesis in the feature space.

    \[ \begin{align} h(x) &= W^{T}X\\ &= \begin{pmatrix} 2 & 0 & -2.5 & 0 & -8 \\ \end{pmatrix}_{1 \times 5} \begin{pmatrix} 1 \\ -x_{1}x_{2} \\ x_{1}^{2} \\ x_{1}x_{2} \\ x_{2}^{2} \end{pmatrix}_{5 \times 1} \end{align} \]
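    Written out, \(h(x) = 2 - 2.5x_{1}^{2} - 8x_{2}^{2}\), so here too the decision boundary is an ellipse enclosing A+.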

    # Map the training data A+ and A- into the feature space
    W <- perceptron_primal(x = x.p5, y = y.p5, 
                      niter = 20)$Weight %>% 
      matrix(.)
    ypred <- (cbind(1, x.p5) %>% as.matrix()) %*% W
    x.p5$ypred <- ifelse(ypred >= 0, 1, -1)
    cbind(data.p5, x.p5) %>% 
      kable(., "html", 
            col.names = c("$x_{1}$", "$x_{2}$", "y", 
                          "$-x_{1}x_{2}$", "$x_{1}^{2}$", 
                          "$x_{1}x_{2}$", "$x_{2}^{2}$"
                          , "ypred")) %>% 
      kable_styling(bootstrap_options = "striped", 
                    full_width = F) %>% 
      add_header_above(c("2-dimension" = 3, 
                         "4-dimension" = 5))
    (Columns 1–3: the original 2-dimensional data; columns 4–8: the 4-dimensional feature space and the prediction.)

    | \(x_{1}\) | \(x_{2}\) | y | \(-x_{1}x_{2}\) | \(x_{1}^{2}\) | \(x_{1}x_{2}\) | \(x_{2}^{2}\) | ypred |
    |---|---|---|---|---|---|---|---|
    | 0.0 | 0.0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 1 |
    | 0.5 | 0.0 | 1 | 0.00 | 0.25 | 0.00 | 0.00 | 1 |
    | 0.0 | 0.5 | 1 | 0.00 | 0.00 | 0.00 | 0.25 | 1 |
    | -0.5 | 0.0 | 1 | 0.00 | 0.25 | 0.00 | 0.00 | 1 |
    | 0.0 | -0.5 | 1 | 0.00 | 0.00 | 0.00 | 0.25 | 1 |
    | 0.5 | 0.5 | -1 | -0.25 | 0.25 | 0.25 | 0.25 | -1 |
    | 0.5 | -0.5 | -1 | 0.25 | 0.25 | -0.25 | 0.25 | -1 |
    | -0.5 | 0.5 | -1 | 0.25 | 0.25 | -0.25 | 0.25 | -1 |
    | -0.5 | -0.5 | -1 | -0.25 | 0.25 | 0.25 | 0.25 | -1 |
    | 1.0 | 0.0 | -1 | 0.00 | 1.00 | 0.00 | 0.00 | -1 |
    | 0.0 | 1.0 | -1 | 0.00 | 0.00 | 0.00 | 1.00 | -1 |
    | -1.0 | 0.0 | -1 | 0.00 | 1.00 | 0.00 | 0.00 | -1 |
    | 0.0 | -1.0 | -1 | 0.00 | 0.00 | 0.00 | 1.00 | -1 |

    The table above shows the perceptron's classification results after mapping the data into the four-dimensional feature space.

  5. Repeat (b) by using the hypothesis that you got in (d). Note that you need to map the points randomly generated in (b) by the nonlinear mapping \(\phi\) first.

    # Generate the test points and map them through phi
    A.test <- data.frame(x1 = runif(10000, -1.5, 1.5) , 
                         x2 = runif(10000, -1.5, 1.5))
    # Leading 1 provides the intercept column
    A4.test <- data.frame(1, 
                          x1 = -(A.test$x1 * A.test$x2), 
                          x2 = A.test$x1^2, 
                          x3 = (A.test$x1 * A.test$x2),
                          x4 = A.test$x2^2)
    pred <- ifelse( (as.matrix(A4.test) %*% W) >= 0 , 1, -1 )
    
    # Visualization 
    g_A + 
      geom_point(data = A.test, 
                 aes(x = x1, y = x2, 
                     colour = pred %>% as.factor()), 
                 size = 0.3) + 
      labs(title = "Perceptron algorithm",
           caption = "Primal form")

Problem 6

Find an approximate solution using MATLAB to the following system by minimizing \(||Ax-b||_{p}\) for p = 1, 2, \(\infty\). Write down both the approximate solution, and the value of the \(||Ax-b||_{p}\). Draw the solution points in \(R^{2}\) and the four equations being solved. \[ \begin{equation*} \begin{aligned} & x_{1} &&+ 2x_{2} &&= 2 \\ & 2x_{1} &&- x_{2} &&= -2 \\ & x_{1} &&+ x_{2} &&= 3 \\ & 4x_{1} &&- x_{2} &&= -4 \end{aligned} \end{equation*} \]

# Data
A <- matrix(c(1, 2, 2, -1, 1, 1, 4, -1), ncol = 2, byrow = TRUE)
b <- matrix(c(2, -2, 3, -4), ncol = 1)

# 1-norm
library(L1pack) # L1 norm 
L1_norm <- l1fit(x = A, y = b, intercept = FALSE)$coefficients 
L1_sol <- l1fit(x = A, y = b, intercept = FALSE)$residuals %>% 
  abs %>% sum   # 1-norm of the residuals: sum of absolute values
# 2-norm
L2_norm <- lm(b ~ -1 + A)$coef
L2_sol <- lm(b ~ -1 + A)$residuals^2 %>% sum %>% sqrt
# Infinity norm 
library(quadrupen) # Infinity norm 
bounded <- bounded.reg(x = A, y = b, 
                       lambda1 = 1,lambda2 = 0, 
                       intercept = FALSE)
Infinity_norm <- bounded@coefficients
Infinity_sol <- bounded@residuals %>% abs %>% max
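# Sanity check (not part of the original solution): bounded.reg is a
# penalized estimator, so cross-check by minimizing max|Ax - b| directly
# with Nelder-Mead via base R's optim
chebyshev_obj <- function(x) max(abs(A %*% x - b))
optim(c(0, 0), chebyshev_obj)$par   # should be close to (-0.2, 1.8)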
# Summary 
data.norm <- data.frame(x = c(L1_norm[1], L2_norm[1], Infinity_norm[1]), 
                        y = c(L1_norm[2], L2_norm[2], Infinity_norm[2]),
                        sol = c(L1_sol, L2_sol, Infinity_sol),
                        label = c("L1-norm", "L2-norm", "Infinity-norm"))
# Summary table
data.norm %>% 
  kable(., "html", 
        col.names = c("$x_{1}$", "$x_{2}$", "$||Ax-b||_{p}$", "label")) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
| \(x_{1}\) | \(x_{2}\) | \(||Ax-b||_{p}\) | label |
|---|---|---|---|
| -0.6666667 | 1.333333 | 3.000000 | L1-norm |
| -0.4552000 | 1.662100 | 2.136707 | L2-norm |
| -0.2000000 | 1.800000 | 1.400000 | Infinity-norm |
# Visualization
ggplot(data = data.norm, aes(x = x, y = y )) + 
  geom_point(aes(color = label), size = 3) +
  geom_abline(intercept = 1, slope = -0.5) + 
  geom_abline(intercept = 2, slope = 2) + 
  geom_abline(intercept = 3, slope = -1) +
  geom_abline(intercept = 4, slope = 4) +
  theme(legend.position = "none") +
  geom_text_repel(aes(x = x, y = y, 
                      label = label)) + 
  labs(x = "x1", y = "x2", 
       title = "Solution of different norm") + 
  xlim(-2, 2) + ylim(0, 3)