# Load some useful packages
library(tidyverse)
library(kableExtra)
library(ggrepel)
library(magrittr)
Let the sequence P, P, P, P, N, P, P, P, N, N, P, N, P, N, N, N, N, P, N, N be the test instances sorted in decreasing order of the posterior probability of being a positive instance. Find the AUC value for this ranking. (10%)
# Data
# Each of the 10 P's raises TPR by 0.1; each of the 10 N's raises FPR by 0.1.
FPR <- c(0, rep(0, 4), rep(0.1, 4), 0.2, 0.3, 0.3, 0.4, 0.4,
         seq(0.5, 0.8, 0.1), seq(0.8, 1, 0.1))
TPR <- c(0, seq(0.1, 0.4, 0.1), seq(0.4, 0.7, 0.1), 0.7, 0.7,
         0.8, 0.8, rep(0.9, 5), rep(1, 3))
label <- c("", rep("P", 4), "N", rep("P", 3), "N", "N",
           "P", "N", "P", rep("N", 4), "P", "N", "N")
# Visualization: ROC curve
ggplot(data = NULL, aes(x = FPR, y = TPR)) +
  geom_point() + geom_line() +
  scale_x_continuous(breaks = FPR, minor_breaks = NULL) +
  scale_y_continuous(breaks = TPR, minor_breaks = NULL,
                     limits = c(0, 1)) +
  labs(x = "False positive rate",
       y = "True positive rate",
       title = "ROC") +
  geom_text_repel(aes(x = FPR, y = TPR, label = label)) +
  theme_grey(base_size = 13)
The figure above is the ROC curve. Counting, for each group of positives, the fraction of the \(10 \times 10 = 100\) (P, N) pairs it orders correctly, the area under the curve is AUC = 0.4 + 0.27 + 0.07 + 0.06 + 0.02 = 0.82.
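As a quick cross-check (a sketch using the equivalent Mann-Whitney pair-counting formulation, not part of the required solution), the AUC is the fraction of (P, N) pairs in which the P is ranked above the N:
# AUC via the Mann-Whitney pair-counting formulation
rank.labels <- c("P", "P", "P", "P", "N", "P", "P", "P", "N", "N",
                 "P", "N", "P", "N", "N", "N", "N", "P", "N", "N")
pos <- which(rank.labels == "P")  # ranks of the positives
neg <- which(rank.labels == "N")  # ranks of the negatives
mean(outer(pos, neg, "<"))        # fraction of correctly ordered pairs: 0.82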
Prove that for any matrix \(B \in R^{m \times n}\), either the system (I) \[
\begin{align}
Bx < 0
\end{align}
\] or the system (II) \[
\begin{align}
B^{T}\alpha = 0,\quad \alpha \geq 0,\ \text{and}\ \alpha \neq 0
\end{align}
\] has a solution, but never both. (20%)
Hint 1: \(Bx < 0\) if and only if \(Bx + {\bf 1}z \leq 0,\ z > 0\).
Hint 2: Use Farkas' Lemma with a suitable \(b \in R^{n+1}\) and \(A \in R^{m \times (n+1)}\).
< Proof >
By Hint 1, system (I) is equivalent to
\[
\begin{align}
& Bx + {\bf 1}z \leq 0,\quad z > 0 \\
\Rightarrow\ &(B,\ {\bf 1})
\begin{pmatrix}
x \\
z \\
\end{pmatrix} \leq 0,\quad b^{T}
\begin{pmatrix}
x \\
z \\
\end{pmatrix} = z > 0,\quad \text{with } b =
\begin{pmatrix}
0 \\
\vdots \\
1 \\
\end{pmatrix}_{(n+1) \times 1} \\
\Rightarrow\ &A x^{*} \leq 0,\quad b^{T} x^{*} > 0,\quad \text{where } A = (B,\ {\bf 1}) \in R^{m \times (n+1)},\ x^{*} =
\begin{pmatrix}
x \\
z \\
\end{pmatrix} \in R^{n+1}.
\end{align}
\]
By Farkas' Lemma, exactly one of the systems
\[
\begin{align}
&A x^{*} \leq 0,\quad b^{T} x^{*} > 0 \\
\text{or}\quad &A^{T}\alpha = b,\quad \alpha \geq 0
\end{align}
\]
has a solution, never both. Reading \(A^{T}\alpha = b\) block by block gives \(B^{T}\alpha = 0\) and \({\bf 1}^{T}\alpha = 1\) with \(\alpha \geq 0\); the equation \({\bf 1}^{T}\alpha = 1\) forces \(\alpha \neq 0\), so this alternative is exactly system (II) (conversely, any solution of (II) can be rescaled so that \({\bf 1}^{T}\alpha = 1\)). Hence exactly one of (I) and (II) has a solution, and never both: if both held, then \(0 = (B^{T}\alpha)^{T}x = \alpha^{T}(Bx) < 0\) since \(\alpha \geq 0\), \(\alpha \neq 0\) and \(Bx < 0\), a contradiction. \(\blacksquare\)
(a). Solve
\[
\begin{align}
\min \limits_{x\in R^{2}}\
\frac{1}{2}x^{T}
\begin{pmatrix}
1 & 0 \\
0 & 900 \\
\end{pmatrix} x
\end{align}
\] using steepest descent with exact line search. You are welcome to copy the MATLAB code from my slides. Start your code with the initial point \(x_{0}=[1000,\ 1]^{T}\). Stop when \(||x_{n+1}-x_{n}|| < 10^{-8}\).
Report your solution and the number of iterations. (15%)
# Steepest descent
Q <- matrix(c(1, 0, 0, 900), ncol = 2)
x0 <- matrix(c(1000, 1))  # Initial value
Steep.descent <- function(x = x0, Q, tol = 1e-8){
  for(i in 1:10000){  # At most 10000 iterations
    d <- -Q %*% x  # Steepest descent direction: negative gradient of (1/2) x'Qx
    lambda <- ( (t(d) %*% d) / (t(d) %*% Q %*% d) ) %>%
      as.numeric()  # Exact line search step size
    xn <- x + lambda * d
    if( all( abs(xn - x) < tol ) ) {  # Max-norm version of ||x_{n+1} - x_n|| < tol
      break
    }
    x <- xn
  }
  return(list(Iteration = i, x = xn))
}
c(Steep.descent(x = x0, Q = Q) %>% unlist, 0) %>%  # Append f(x*) = 0
  round(., 3) %>% t %>% t %>%
  `rownames<-`(., c("Iteration", "$x_{1}$", "$x_{2}$", "f(x)")) %>%
  kable(., "html", col.names = c("Value"),
        caption = "Steepest descent") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
| | Value |
|---|---|
| Iteration | 8558 |
| \(x_{1}\) | 0 |
| \(x_{2}\) | 0 |
| f(x) | 0 |
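The large iteration count is expected for this badly conditioned problem. By the classical convergence bound for exact-line-search steepest descent on a quadratic (stated here as context, not as part of the assignment), with condition number \(\kappa = 900\),
\[
f(x_{k+1}) - f^{*} \leq \left(\frac{\kappa - 1}{\kappa + 1}\right)^{2}\left(f(x_{k}) - f^{*}\right) = \left(\frac{899}{901}\right)^{2}\left(f(x_{k}) - f^{*}\right) \approx 0.9956\,\left(f(x_{k}) - f^{*}\right),
\]
so the objective shrinks by less than half a percent per iteration and thousands of iterations are needed to meet the \(10^{-8}\) tolerance.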
(b). Implement Newton's method for minimizing a quadratic function \(f(x) = \frac{1}{2}x^{T}Qx+P^{T}x\) in MATLAB code. Apply your code to solve the minimization problem in (a). (15%)
# Newton's method
Q <- matrix(c(1, 0, 0, 900), ncol = 2)  # Quadratic term
x0 <- matrix(c(1000, 1))  # Initial value
Newton_fun <- function(x = x0, Q, P = matrix(0, nrow(Q)), tol = 1e-8){
  # P defaults to the zero vector, matching the problem in (a)
  for(i in 1:10000){  # At most 10000 iterations
    S <- -(Q %*% x + P)  # Negative gradient of f
    H <- solve(Q)  # Inverse of the Hessian matrix
    d <- H %*% S  # Newton direction
    xn <- x + d
    if( all( abs(xn - x) < tol ) ) {
      break
    }
    x <- xn
  }
  return(list(Iteration = i, x = xn))
}
c(Newton_fun(x = x0, Q) %>% unlist, 0) %>%  # Append f(x*) = 0
  t %>% t %>%
  `rownames<-`(., c("Iteration", "$x_{1}$", "$x_{2}$", "f(x)")) %>%
  kable(., "html", col.names = c("Value"),
        caption = "Newton's method") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
| | Value |
|---|---|
| Iteration | 2 |
| \(x_{1}\) | 0 |
| \(x_{2}\) | 0 |
| f(x) | 0 |
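This matches the theory: on a quadratic, Newton's method reaches the exact minimizer in a single step,
\[
x_{1} = x_{0} - Q^{-1}(Qx_{0} + P) = -Q^{-1}P = 0 \quad (\text{since } P = 0\ \text{here}),
\]
so the reported second iteration only serves to trigger the stopping criterion.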
Let \(S=\{ (x^{i}, y_{i})\}_{i=1}^{m} \subset R^{n} \times \{-1, 1\}\) be a non-trivial training set, and consider the Perceptron Algorithm in dual form.
Let A+ = {(0, 0), (0.5, 0), (0, 0.5), (-0.5, 0), (0, -0.5)} and A- = {(0.5, 0.5), (0.5, -0.5), (-0.5, 0.5), (-0.5, -0.5), (1, 0), (0, 1), (-1, 0), (0, -1)}. (40%)
# Data
data.p5 <- data.frame(x1 = c(0 , 0.5, 0 , -0.5, 0,
0.5, 0.5, -0.5, -0.5, 1,
0, -1, 0),
x2 = c(0 , 0 , 0.5, 0, -0.5,
0.5, -0.5, 0.5, -0.5, 0,
1, 0, -1),
y = c(rep(1, 5), rep(-1, 8)))
Try to find the hypothesis h(x) by implementing the Perceptron algorithm in the dual form, replacing the inner product \[ \begin{align} \langle x^{i}, x^{j} \rangle \ \text{by}\ \langle x^{i}, x^{j} \rangle^{2}, \quad \text{and}\ R = \max \limits_{1\leq i \leq l} ||x^{i}||_{2}^{2} \end{align} \]
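For reference, this is the standard dual-form update that the code below implements, with \(K(x, z) = \langle x, z \rangle^{2}\) playing the role of the kernel: whenever example \(j\) is misclassified,
\[
y_{j}\Big(\sum_{i=1}^{l} \alpha_{i}\, y_{i}\, K(x^{i}, x^{j}) + b\Big) \leq 0
\quad \Longrightarrow \quad
\alpha_{j} \leftarrow \alpha_{j} + 1,\quad b \leftarrow b + y_{j}R^{2}.
\]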
# Replace the inner product
perceptron_dual <- function(x = data.p5[, c(1, 2)],
                            y = data.p5$y){
  # x: points in R^2 (one per row); y: labels in {-1, 1}
  # Define some constants
  a <- matrix(0, nrow = dim(x)[1])  # alpha vector
  b <- 0  # intercept
  L <- x^2 %>% apply(., 1, sum) %>% max()  # R = max ||x||_2^2
  D <- diag(y)
  A <- x %>% as.matrix()
  AA_t <- ( A %*% t(A) )^2  # Kernel matrix: K(x_i, x_j) = <x_i, x_j>^2
  # Run a fixed number of epochs
  for (i in 1:10000) {
    z <- t(a) %*% D %*% AA_t
    for (j in 1:length(y)) {
      if( y[j] * (z[j] + b) <= 0){  # Misclassified: update alpha_j and b
        a[j] <- a[j] + 1
        b <- b + y[j] * L^2
      }
    }
  }
  return(list(alpha = a,
              b = b))
}
g_A <- ggplot(data = data.p5,
aes(x = x1, y = x2)) +
geom_point(aes(colour = y %>% as.factor()),
size = 4) +
geom_point(shape = 1, size = 4, colour = "black") +
labs(color = "Label")
# Visualization
dual_A <- perceptron_dual(x = data.p5[, c(1, 2)],
y = data.p5$y)
dual_A$alpha %>%
`rownames<-`(., c(paste("$\\alpha_{",
seq(1, 13), "}$",
sep = ""))) %>%
kable(., "html", col.names = c("Value")) %>%
kable_styling(bootstrap_options = "striped",
full_width = F)
| | Value |
|---|---|
| \(\alpha_{1}\) | 10 |
| \(\alpha_{2}\) | 1 |
| \(\alpha_{3}\) | 0 |
| \(\alpha_{4}\) | 0 |
| \(\alpha_{5}\) | 0 |
| \(\alpha_{6}\) | 5 |
| \(\alpha_{7}\) | 5 |
| \(\alpha_{8}\) | 0 |
| \(\alpha_{9}\) | 0 |
| \(\alpha_{10}\) | 0 |
| \(\alpha_{11}\) | 0 |
| \(\alpha_{12}\) | 0 |
| \(\alpha_{13}\) | 0 |
The table above is the \(\alpha\) vector obtained from the dual-form perceptron.
dual_A$b %>% matrix() %>%
`rownames<-`(., "b") %>%
kable(., "html", col.names = c("Value")) %>%
kable_styling(bootstrap_options = "striped",
full_width = F)
| | Value |
|---|---|
| b | 1 |
The value above is the intercept b obtained from the dual-form perceptron.
\[ \begin{align} h(x) &= \alpha^{T}D(Sx)^{2} + b \quad \text{(the square taken componentwise)} \\ \text{where}&\ \alpha \in R^{13 \times 1},\ D=diag(y) \in R^{13 \times 13}, \\ &\ S = \text{training matrix} \in R^{13 \times 2},\ x \in R^{2 \times 1},\ b \in R \end{align} \]
Generate 10,000 points in the box \([-1.5, 1.5] \times [-1.5, 1.5]\) randomly as a test set. Plug these points into the hypothesis that you got in (a) and then plot the points for which h(x) > 0 with +.
# Plot the test dataset
dual_A.pred_fun <- function(x){
  x %<>% as.matrix()
  # Define some constants
  a <- dual_A$alpha
  b <- dual_A$b
  D <- diag(data.p5$y)
  S <- data.p5[, c(1, 2)] %>% as.matrix()
  # Calculate the hypothesis and return its sign
  value <- ( (t(a) %*% D %*% (S %*% x)^2) + b ) %>%
    sign() %>% as.vector()
  return(value)
}
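A quick sanity check (a sketch, not part of the required output): the origin belongs to A+, while a far corner should fall on the negative side.
dual_A.pred_fun(c(0, 0))      # expected +1
dual_A.pred_fun(c(1.5, 1.5))  # expected -1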
# Visualization
A.test <- matrix(runif(20000, -1.5, 1.5),
ncol = 2) %>% as.data.frame() %>%
`colnames<-`(., c("x1", "x2"))
A.test$pred <- A.test %>% apply(., 1, dual_A.pred_fun)
g_A +
geom_point(data = A.test,
aes(x = x1, y = x2,
colour = pred %>% as.factor()),
size = 0.3) +
labs(title = "Perceptron algorithm",
caption = "Dual form")
Repeat (a) and (b) by using the training data \[ \begin{align} &B+ = \{(0.5,\ 0),\ (0,\ 0.5),\ (-0.5,\ 0),\ (0,\ -0.5)\} \quad \text{and} \\ &B- = \{(0.5,\ 0.5),\ (0.5,\ -0.5),\ (-0.5,\ 0.5),\ (-0.5,\ -0.5)\} \end{align} \]
# Data
df.B <- data.frame(x1 = c(0.5, 0, -0.5, 0,
0.5, 0.5, -0.5, -0.5),
x2 = c(0, 0.5, 0, -0.5,
0.5, -0.5, 0.5, -0.5),
y = c(rep(1, 4), rep(-1, 4))
)
dual_B <- perceptron_dual(x = df.B[, c(1, 2)],
y = df.B$y)
g_B <- ggplot(data = df.B,
aes(x = x1, y = x2)) +
geom_point(aes(colour = y %>% as.factor()), size = 4) +
geom_point(shape = 1, size = 4, colour = "black") +
labs(color = "Label")
dual_B.pred_fun <- function(x){
x %<>% as.matrix()
# Define some constant
a <- dual_B$alpha
b <- dual_B$b
D <- diag(df.B$y)
S <- df.B[, c(1, 2)] %>% as.matrix()
# Calculate the hypothesis and return its sign
value <- ( (t(a) %*% D %*% (S %*% x)^2) + b ) %>%
sign() %>% as.vector()
return(value)
}
# Visualization
B.test <- matrix(runif(20000, -1.5, 1.5),
ncol = 2) %>% as.data.frame() %>%
`colnames<-`(., c("x1", "x2"))
B.test$pred <- B.test %>% apply(., 1, dual_B.pred_fun)
g_B +
geom_point(data = B.test,
aes(x = x1, y = x2,
colour = pred %>% as.factor()),
size = 0.3) +
labs(title = "Perceptron algorithm",
caption = "Dual form")
Let the nonlinear mapping \(\phi : R^{2} \rightarrow R^{4}\) be defined by \[ \begin{align} \phi(x)=[-x_{1}x_{2},\ x_{1}^{2},\ x_{1}x_{2},\ x_{2}^{2}]. \end{align} \] Map the training data A+ and A- into the feature space using this nonlinear map. Find the hypothesis f(x) by implementing the Perceptron algorithm in the primal form in the feature space.
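To make the map concrete, here is a tiny helper (the name `phi` is ours, introduced only for illustration) that matches the feature columns w1-w4 constructed below:
phi <- function(x) c(-x[1] * x[2], x[1]^2, x[1] * x[2], x[2]^2)
phi(c(0.5, 0.5))  # -0.25 0.25 0.25 0.25, as in the table below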
# Perceptron algorithm in the primal form
perceptron_primal <- function(x, y, eta = 1, niter = 10) {
# Initialize weight vector
weight <- rep(0, dim(x)[2] + 1) # one weight per feature plus an intercept, hence n+1
errors <- rep(0, niter)
# loop over number of epochs niter
for (jj in 1:niter) {
# loop through training data set
for (ii in 1:length(y)) {
# Predict binary label using Heaviside activation
# function
z <- sum(weight[2:length(weight)] *
as.numeric(x[ii, ])) + weight[1]
ypred <- if(z < 0) -1 else 1
# Change weight - the formula doesn't do anything
# if the predicted value is correct
weightdiff <- eta * (y[ii] - ypred) *
c(1, as.numeric(x[ii, ]))
weight <- weight + weightdiff
# Update error function
if ((y[ii] - ypred) != 0.0) {
errors[jj] <- errors[jj] + 1
}
}
}
# Return the learned weights and the number of errors per epoch
return(list(Weight = weight,
Errors = errors))
}
# Data: map each training point into the feature space
w1 <- -(data.p5$x1 * data.p5$x2)  # -x1*x2
w2 <- data.p5$x1^2                # x1^2
w3 <- (data.p5$x1 * data.p5$x2)   # x1*x2
w4 <- data.p5$x2^2                # x2^2
x.p5 <- cbind(w1, w2, w3, w4) %>% as.data.frame()
y.p5 <- c(rep(1, 5), rep(-1, 8))
# Using perceptron
perceptron_primal(x = x.p5, y = y.p5,
niter = 20)$Weight %>%
matrix(., nrow = 1) %>%
`colnames<-`(., c("b", "w1", "w2", "w3", "w4")) %>%
`rownames<-`(., c("values")) %>%
kable(., "html") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| | b | w1 | w2 | w3 | w4 |
|---|---|---|---|---|---|
| values | 2 | 0 | -2.5 | 0 | -8 |
The table above gives the learned weights (intercept b and coefficients w1-w4) in the feature space.
\[ \begin{align} h(x) &= W^{T}X\\ &= \begin{pmatrix} 2 & 0 & -2.5 & 0 & -8 \\ \end{pmatrix}_{1 \times 5} \begin{pmatrix} 1 \\ -x_{1}x_{2} \\ x_{1}^{2} \\ x_{1}x_{2} \\ x_{2}^{2} \end{pmatrix}_{5 \times 1} \end{align} \]
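Multiplying this out (the zero weights on \(\mp x_{1}x_{2}\) drop away) gives
\[
h(x) = 2 - 2.5\,x_{1}^{2} - 8\,x_{2}^{2},
\]
so the decision boundary is the ellipse \(2.5x_{1}^{2} + 8x_{2}^{2} = 2\), which separates A+ (near the origin) from A-.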
# Map the training data A+ and A- into the feature space
W <- perceptron_primal(x = x.p5, y = y.p5,
niter = 20)$Weight %>%
matrix(.)
ypred <- (cbind(1, x.p5) %>% as.matrix()) %*% W
x.p5$ypred <- ifelse(ypred >= 0, 1, -1)
cbind(data.p5, x.p5) %>%
kable(., "html",
col.names = c("$x_{1}$", "$x_{2}$", "y",
"$-x_{1}x_{2}$", "$x_{1}^{2}$",
"$x_{1}x_{2}$", "$x_{2}^{2}$"
, "ypred")) %>%
kable_styling(bootstrap_options = "striped",
full_width = F) %>%
add_header_above(c("2-dimension" = 3,
"4-dimension" = 5))
| \(x_{1}\) | \(x_{2}\) | y | \(-x_{1}x_{2}\) | \(x_{1}^{2}\) | \(x_{1}x_{2}\) | \(x_{2}^{2}\) | ypred |
|---|---|---|---|---|---|---|---|
| 0.0 | 0.0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 1 |
| 0.5 | 0.0 | 1 | 0.00 | 0.25 | 0.00 | 0.00 | 1 |
| 0.0 | 0.5 | 1 | 0.00 | 0.00 | 0.00 | 0.25 | 1 |
| -0.5 | 0.0 | 1 | 0.00 | 0.25 | 0.00 | 0.00 | 1 |
| 0.0 | -0.5 | 1 | 0.00 | 0.00 | 0.00 | 0.25 | 1 |
| 0.5 | 0.5 | -1 | -0.25 | 0.25 | 0.25 | 0.25 | -1 |
| 0.5 | -0.5 | -1 | 0.25 | 0.25 | -0.25 | 0.25 | -1 |
| -0.5 | 0.5 | -1 | 0.25 | 0.25 | -0.25 | 0.25 | -1 |
| -0.5 | -0.5 | -1 | -0.25 | 0.25 | 0.25 | 0.25 | -1 |
| 1.0 | 0.0 | -1 | 0.00 | 1.00 | 0.00 | 0.00 | -1 |
| 0.0 | 1.0 | -1 | 0.00 | 0.00 | 0.00 | 1.00 | -1 |
| -1.0 | 0.0 | -1 | 0.00 | 1.00 | 0.00 | 0.00 | -1 |
| 0.0 | -1.0 | -1 | 0.00 | 0.00 | 0.00 | 1.00 | -1 |
The table above shows the perceptron classification results after mapping the data into the four-dimensional feature space.
Repeat (b) by using the hypothesis that you got in (d). Note that you need to map the points randomly generated in (b) by the nonlinear mapping \(\phi\) first.
# Plot the test dataset
A.test <- data.frame(x1 = runif(10000, -1.5, 1.5) ,
x2 = runif(10000, -1.5, 1.5))
A4.test <- data.frame(1,
x1 = -(A.test$x1 * A.test$x2),
x2 = A.test$x1^2,
x3 = (A.test$x1 * A.test$x2),
x4 = A.test$x2^2)
pred <- ifelse( (as.matrix(A4.test) %*% W) >= 0 , 1, -1 )
# Visualization
g_A +
geom_point(data = A.test,
aes(x = x1, y = x2,
colour = pred %>% as.factor()),
size = 0.3) +
labs(title = "Perceptron algorithm",
caption = "Primal form")
Find an approximate solution using MATLAB to the following system by minimizing \(||Ax-b||_{p}\) for p = 1, 2, \(\infty\). Write down both the approximate solution, and the value of the \(||Ax-b||_{p}\). Draw the solution points in \(R^{2}\) and the four equations being solved. \[ \begin{equation*} \begin{aligned} & x_{1} &&+ 2x_{2} &&= 2 \\ & 2x_{1} &&- x_{2} &&= -2 \\ & x_{1} &&+ x_{2} &&= 3 \\ & 4x_{1} &&- x_{2} &&= -4 \end{aligned} \end{equation*} \]
# Data
A <- matrix(c(1, 2, 2, -1, 1, 1, 4, -1), ncol = 2, byrow = TRUE)
b <- matrix(c(2, -2, 3, -4), ncol = 1)
# 1-norm
library(L1pack) # Least absolute deviations (L1) regression
L1_fit <- l1fit(x = A, y = b, intercept = FALSE)
L1_norm <- L1_fit$coefficients
L1_sol <- L1_fit$residuals %>% abs %>% sum # ||Ax - b||_1 is the sum of |residuals|
# 2-norm
L2_norm <- lm(b ~ -1 + A)$coef
L2_sol <- lm(b ~ -1 + A)$residuals^2 %>% sum %>% sqrt
# Infinity norm
library(quadrupen) # Infinity norm
bounded <- bounded.reg(x = A, y = b,
lambda1 = 1,lambda2 = 0,
intercept = FALSE)
Infinity_norm <- bounded@coefficients
Infinity_sol <- bounded@residuals %>% abs %>% max
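As an independent check of the \(\infty\)-norm fit (a sketch assuming the lpSolve package; not part of the original code), the minimax problem can be solved exactly as a small linear program:
# min ||Ax - b||_inf as the LP: min t subject to -t*1 <= Ax - b <= t*1.
# lp() keeps variables nonnegative, so write x = xp - xm.
library(lpSolve)
obj <- c(0, 0, 0, 0, 1)          # variables: xp1, xp2, xm1, xm2, t
con <- rbind(cbind( A, -A, -1),  #  (Ax - b) - t <= 0
             cbind(-A,  A, -1))  # -(Ax - b) - t <= 0
rhs <- c(b, -b)
lp_inf <- lp("min", obj, con, rep("<=", 8), rhs)
x_inf <- lp_inf$solution[1:2] - lp_inf$solution[3:4]  # recover x
x_inf; lp_inf$objval  # should reproduce (-0.2, 1.8) and 1.4 from the table below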
# Collect the solutions
data.norm <- data.frame(x = c(L1_norm[1], L2_norm[1], Infinity_norm[1]),
                        y = c(L1_norm[2], L2_norm[2], Infinity_norm[2]),
                        sol = c(L1_sol, L2_sol, Infinity_sol),
                        label = c("L1-norm", "L2-norm", "Infinity-norm"))
# Summary
data.norm %>%
  kable(., "html",
        col.names = c("$x_{1}$", "$x_{2}$", "$||Ax-b||_{p}$", "label")) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
| \(x_{1}\) | \(x_{2}\) | \(||Ax-b||_{p}\) | label |
|---|---|---|---|
| -0.6666667 | 1.333333 | 3.000000 | L1-norm |
| -0.4552000 | 1.662100 | 2.136707 | L2-norm |
| -0.2000000 | 1.800000 | 1.400000 | Infinity-norm |
The results show the classical pattern: the L1 fit satisfies two of the four equations exactly (zero residuals on the first and fourth), while the \(\infty\)-norm fit ties three residuals at the maximum magnitude 1.4.
# Visualization
ggplot(data = data.norm, aes(x = x, y = y )) +
geom_point(aes(color = label), size = 3) +
geom_abline(intercept = 1, slope = -0.5) +
geom_abline(intercept = 2, slope = 2) +
geom_abline(intercept = 3, slope = -1) +
geom_abline(intercept = 4, slope = 4) +
theme(legend.position = "none") +
geom_text_repel(aes(x = x, y = y,
label = label)) +
labs(x = "x1", y = "x2",
title = "Solution of different norm") +
xlim(-2, 2) + ylim(0, 3)