# Loading some useful packages
library(tidyverse)
library(kableExtra)
library(magrittr)

Problem 1

Let \(S = \{ ({\bf x}^{i}, y^{i}) \}_{i=1}^{s}\) be the dataset given as follows: \[ \begin{align} \{ (-2, 2.5), (-1.5, 2.5), (-1, 0.5), (-0.5, 0), (0, -0.5), (0.5, -1.5), (1, -2.5), (2, -4) \} \end{align} \] We consider two different regression models to fit this dataset: a polynomial of degree 5 and a linear model.

# Dataset 
x <- c(-2, -1.5, -1, -0.5, 0, 0.5, 1, 2)
y <- c(2.5, 2.5, 0.5, 0, -0.5, -1.5, -2.5, -4)
S <- data.frame(x, y)
S %>% kable(., "html", row.names = TRUE) %>% 
  kable_styling(bootstrap_options = "striped", 
                full_width = F)
      x     y
1  -2.0   2.5
2  -1.5   2.5
3  -1.0   0.5
4  -0.5   0.0
5   0.0  -0.5
6   0.5  -1.5
7   1.0  -2.5
8   2.0  -4.0
  1. Use four-fold cross-validation to evaluate the performance of the polynomial of degree 5. Report the mean absolute error (MAE) for the training set as well as the test set. (20%)

    # Define training and testing sets for four-fold cross-validation
    set.seed(2019)
    index <- sample(x = 1:8, size = 8) # Random permutation of the observations
    # Build the four training/testing splits from the permuted indices
    l <- 1
    for (i in seq(1, 7, by = 2)) {
      train_name <- paste("training", l, sep = "_")
      testing_name <- paste("testing", l, sep = "_")
      df_train <- S[index[-c(i, i+1)], ]   # 6 observations for training
      df_testing <- S[index[c(i, i+1)], ]  # 2 observations for testing
      assign(train_name, df_train)
      assign(testing_name, df_testing)
      l <- l + 1
    }
    # Polynomial fit of degree 5 on each fold (6 training points, so each fit interpolates them exactly)
    poly.fit_1 <- lm(y ~ poly(x, 5), data = training_1)
    fit_MAE_1 <- (testing_1$y - predict(poly.fit_1, newdata = testing_1)) %>% abs %>% mean
    
    poly.fit_2 <- lm(y ~ poly(x, 5), data = training_2)
    fit_MAE_2 <- (testing_2$y - predict(poly.fit_2, newdata = testing_2)) %>% abs %>% mean
    
    poly.fit_3 <- lm(y ~ poly(x, 5), data = training_3)
    fit_MAE_3 <- (testing_3$y - predict(poly.fit_3, newdata = testing_3)) %>% abs %>% mean
    
    poly.fit_4 <- lm(y ~ poly(x, 5), data = training_4)
    fit_MAE_4 <- (testing_4$y - predict(poly.fit_4, newdata = testing_4)) %>% abs %>% mean
    # Summary
    poly.training_MAE <- c(poly.fit_1$residuals %>% abs %>% mean,
                           poly.fit_2$residuals %>% abs %>% mean,
                           poly.fit_3$residuals %>% abs %>% mean,
                           poly.fit_4$residuals %>% abs %>% mean)
    poly.testing_MAE <- c(fit_MAE_1, fit_MAE_2, fit_MAE_3, 
                          fit_MAE_4)
    cbind(poly.training_MAE, poly.testing_MAE) %>%
      `colnames<-`(., c("Training set", "Testing set")) %>%
      `rownames<-`(., c(seq(1, 4))) %>% 
      round(., 4) %>% 
      kable(., "html", 
            caption = "Mean Absolute Error (MAE)", 
            row.names = TRUE) %>% 
      kable_styling(bootstrap_options = "striped", 
                    full_width = F) %>% 
      footnote(general = "polynomial with degree 5")
    Mean Absolute Error (MAE)

        Training set   Testing set
    1   0              3.4952
    2   0              0.8646
    3   0             16.0000
    4   0              0.4562

    Note: polynomial with degree 5
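
    The four per-fold fits above can also be written as a single loop. A minimal sketch, assuming the training_k / testing_k data frames created earlier; it reproduces the same training and testing MAE values:

    poly.cv <- sapply(1:4, function(k) {
      tr  <- get(paste("training", k, sep = "_"))
      te  <- get(paste("testing", k, sep = "_"))
      fit <- lm(y ~ poly(x, 5), data = tr)
      c(train = mean(abs(residuals(fit))),
        test  = mean(abs(te$y - predict(fit, newdata = te))))
    })
    t(poly.cv)  # one row per fold: training MAE and testing MAE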
  2. Plot the resulting model for each fold on the same figure. (10%)

    # Four fold 
    f1.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f1.plot$value <-  predict(poly.fit_1, newdata = f1.plot)
    
    f2.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f2.plot$value <-  predict(poly.fit_2, newdata = f2.plot)
    
    f3.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f3.plot$value <-  predict(poly.fit_3, newdata = f3.plot)
    
    f4.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f4.plot$value <-  predict(poly.fit_4, newdata = f4.plot)
    # Visualization 
    ggplot(data = S, aes(x = x, y = y)) + 
      geom_point(size = 2) +
      geom_line(data = f1.plot, aes(x = x, y = value, colour = "fold 1")) + 
      geom_line(data = f2.plot, aes(x = x, y = value, colour = "fold 2")) + 
      geom_line(data = f3.plot, aes(x = x, y = value, colour = "fold 3")) + 
      geom_line(data = f4.plot, aes(x = x, y = value, colour = "fold 4")) + 
      labs(x = "x", y = "y", 
           title = "polynomial with degree 5", 
           colour = "fold") + 
      theme(legend.position = c(0.8, 0.8), 
        legend.background = element_rect(color = "black",
                                         fill = "grey90"))

  3. Repeat (1) and (2) for the linear model. (25%)

    # four-fold : 1 
    lm.fit_1 <- lm(y ~ x, data = training_1)
    lm_MAE_1 <- (testing_1$y - predict(lm.fit_1, newdata = testing_1)) %>% abs %>% mean
    # four-fold : 2 
    lm.fit_2 <- lm(y ~ x, data = training_2)
    lm_MAE_2 <- (testing_2$y - predict(lm.fit_2, newdata = testing_2)) %>% abs %>% mean
    # four-fold : 3 
    lm.fit_3 <- lm(y ~ x, data = training_3)
    lm_MAE_3 <- (testing_3$y - predict(lm.fit_3, newdata = testing_3)) %>% abs %>% mean
    # four-fold : 4 
    lm.fit_4 <- lm(y ~ x, data = training_4)
    lm_MAE_4 <- (testing_4$y - predict(lm.fit_4, newdata = testing_4)) %>% abs %>% mean
    # Summary
    lm.training_MAE <- c(lm.fit_1$residuals %>% abs %>% mean,
                         lm.fit_2$residuals %>% abs %>% mean,
                         lm.fit_3$residuals %>% abs %>% mean,
                         lm.fit_4$residuals %>% abs %>% mean)
    lm.testing_MAE <- c(lm_MAE_1, lm_MAE_2, lm_MAE_3, lm_MAE_4)
    cbind(lm.training_MAE, lm.testing_MAE) %>%
      `colnames<-`(., c("Training set", "Testing set")) %>%
      `rownames<-`(., c(seq(1, 4))) %>% 
      round(., 4) %>% 
      kable(., "html", 
            caption = "Mean Absolute Error (MAE)", 
            row.names = TRUE) %>% 
      kable_styling(bootstrap_options = "striped", 
                    full_width = F) %>% 
      footnote(general = "linear model")
    Mean Absolute Error (MAE)

        Training set   Testing set
    1   0.2663         0.2321
    2   0.1399         0.5246
    3   0.2857         0.1429
    4   0.2426         0.3107

    Note: linear model
    # Four fold 
    f1.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f1.plot$value <-  predict(lm.fit_1, newdata = f1.plot)
    
    f2.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f2.plot$value <-  predict(lm.fit_2, newdata = f2.plot)
    
    f3.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f3.plot$value <-  predict(lm.fit_3, newdata = f3.plot)
    
    f4.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f4.plot$value <-  predict(lm.fit_4, newdata = f4.plot)
    # Visualization 
    ggplot(data = S, aes(x = x, y = y)) + 
      geom_point(size = 2) +
      geom_line(data = f1.plot, 
                aes(x = x, y = value, colour = "fold 1")) + 
      geom_line(data = f2.plot, 
                aes(x = x, y = value, colour = "fold 2")) + 
      geom_line(data = f3.plot, 
                aes(x = x, y = value, colour = "fold 3")) + 
      geom_line(data = f4.plot, 
                aes(x = x, y = value, colour = "fold 4")) + 
      labs(x = "x", y = "y", 
           title = "linear model", 
           colour = "fold") + 
      theme(legend.position = c(0.8, 0.8), 
        legend.background = element_rect(color = "black",
                                         fill = "grey90"))
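
    Averaging the test-set MAE over the four folds gives a quick comparison of the two models. A minimal sketch, using the poly.testing_MAE and lm.testing_MAE vectors computed above:

    # Average test-set MAE across the four folds for each model
    c(polynomial = mean(poly.testing_MAE), linear = mean(lm.testing_MAE))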

NOTE: You have to randomly permute the dataset S before performing the four-fold cross-validation.

Problem 2

We define the \(\epsilon\)-insensitive loss function as follows: \[ \begin{align} |x|_{\epsilon}=\left\{ \begin{aligned} x - \epsilon,\ &if\ x \geq \epsilon \\ 0,\ &if\ -\epsilon \leq x \leq \epsilon \\ -x - \epsilon,\ &if\ x \leq -\epsilon \end{aligned} \right. \end{align} \] Find the smooth approximation function for the \(\epsilon\)-insensitive loss function. (15%)
< Solution >
The \(\epsilon\)-insensitive loss is smoothed by applying the smooth plus-function approximation \(p(t,\beta) = t + \frac{1}{\beta}\ln\left( 1 + e^{-\beta t} \right)\) of \((t)_{+}\) to \(x-\epsilon\) on \(x \geq 0\) and to \(-(x+\epsilon)\) on \(x < 0\). The resulting smooth approximation function is \[ \begin{align} p(x,\beta) = \left\{ \begin{aligned} (x-\epsilon) + \frac{1}{\beta}\ln\left( 1 + e^{-\beta(x-\epsilon)} \right),\quad x \geq 0 \\ -(x + \epsilon) + \frac{1}{\beta}\ln\left( 1 + e^{\beta(x+\epsilon)} \right),\quad x < 0 \\ \end{aligned} \right. \end{align} \]
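
As a quick illustration, the smooth approximation can be plotted against the original loss. A minimal R sketch; the values eps = 0.5 and beta = 5 are arbitrary illustrative choices, and ggplot2 is already loaded via the tidyverse above.

# epsilon-insensitive loss vs. its smooth approximation p(x, beta)
eps  <- 0.5
beta <- 5
grid <- data.frame(x = seq(-3, 3, length.out = 200))
grid$loss   <- pmax(abs(grid$x) - eps, 0)   # |x|_eps
grid$smooth <- ifelse(grid$x >= 0,
                      (grid$x - eps) + log(1 + exp(-beta * (grid$x - eps))) / beta,
                      -(grid$x + eps) + log(1 + exp(beta * (grid$x + eps))) / beta)
ggplot(grid, aes(x = x)) +
  geom_line(aes(y = loss,   colour = "epsilon-insensitive loss")) +
  geom_line(aes(y = smooth, colour = "smooth approximation")) +
  labs(y = "loss", colour = "")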

Problem 3

Consider a quadratic programming (QP) problem: \[ \begin{align} &\min \limits_{\xi,\ w,\ b}\ \frac{C}{2} ||\xi||^{2}_{2} + \frac{1}{2} \left( ||w||^{2}_{2}+b^{2} \right) \\ &s.t.\quad D(Aw + {\bf 1}b) + \xi \geq {\bf 1} \end{align} \] Prove that the solution \((\xi^{*}, w^{*}, b^{*})\) of (QP) satisfies: \[ \begin{align} \xi^{*} = ({\bf 1} - D(Aw^{*} + {\bf 1} b^{*}))_{+} \end{align} \] where \((\cdot)_{+} = \max \{\cdot, 0\}\). (15%)
< Proof >
We know the solution of the quadratic program must satisfy the constraint \[ \begin{align} &D(Aw^{*} + {\bf 1} b^{*}) + \xi^{*} \geq {\bf 1} \\ \Rightarrow\ &\xi^{*} \geq {\bf 1} - D(Aw^{*} + {\bf 1} b^{*}) \end{align} \] In addition to this inequality, we claim that \(\xi^{*} \geq 0\).
Next, we prove \(\xi^{*} \geq 0\) by contradiction.
< proof > Suppose, for contradiction, that some component of \(\xi^{*}\) is negative. Without loss of generality (w.l.o.g.), assume \[ \begin{align} \xi^{*}_{1} < 0 \end{align} \] Then we can construct \[ \begin{align} \xi = (0,\ \xi^{*}_{2},...,\ \xi^{*}_{l}) \end{align} \] which still satisfies \(D(Aw^{*} + {\bf 1} b^{*}) + \xi \geq {\bf 1}\), but \[ \begin{align} \frac{C}{2}||\xi||^{2}_{2} < \frac{C}{2}||\xi^{*}||^{2}_{2} \end{align} \] so \((\xi, w^{*}, b^{*})\) is feasible with a strictly smaller objective value, contradicting the optimality of \((\xi^{*}, w^{*}, b^{*})\) \(\rightarrow \leftarrow\). Hence \(\xi^{*} \geq 0\).
By the Karush-Kuhn-Tucker (KKT) conditions: introducing a multiplier \(\alpha \geq 0\) for the constraint, the Lagrangian is \[ \begin{align} \mathcal{L}(\xi, w, b, \alpha) = \frac{C}{2}||\xi||^{2}_{2} + \frac{1}{2}\left( ||w||^{2}_{2}+b^{2} \right) - \alpha^{T}\left( D(Aw+{\bf 1}b)+\xi - {\bf 1} \right) \end{align} \] Stationarity in \(\xi\) gives \(C\xi - \alpha = 0\), and complementary slackness gives \(\alpha_{i}\left( D(Aw+{\bf 1}b)+\xi - {\bf 1} \right)_{i} = 0\) for each \(i\). Hence \[ \begin{align} When\ \ \alpha_{i}=0,\ &\because C\xi_{i} = \alpha_{i} = 0 \\ &\therefore \xi_{i}=0 \\ When\ \ \alpha_{i} \neq 0,\ &\because \alpha_{i}\left( D(Aw+{\bf 1}b)+\xi - {\bf 1} \right)_{i}=0 \\ &\therefore \xi_{i} = \left( {\bf 1} - D(Aw+{\bf 1}b) \right)_{i} \end{align} \] In the first case feasibility gives \(\left( {\bf 1} - D(Aw+{\bf 1}b) \right)_{i} \leq \xi_{i} = 0\); in the second, \(\xi_{i} = \alpha_{i}/C > 0\). So, at the solution of the QP, \[ \begin{align} \xi^{*} &= \max\left\{0,\ \ {\bf 1} - D(Aw^{*} + {\bf 1} b^{*}) \right\} \\ &= ({\bf 1} - D(Aw^{*} + {\bf 1} b^{*}))_{+} \end{align} \]
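
As a numerical sanity check of this identity (a sketch only, not part of the proof; it assumes the quadprog package and an arbitrary toy dataset):

# Verify xi* = (1 - D(A w* + 1 b*))_+ on a small random instance
library(quadprog)
set.seed(1)
A <- matrix(rnorm(10), ncol = 2)   # 5 points in R^2
d <- c(1, 1, -1, -1, 1)            # class labels, i.e. the diagonal of D
C <- 10; l <- nrow(A); n <- ncol(A)
# variables z = (w, b, xi); objective (1/2) z' Q z with Q = diag(I_n, 1, C I_l)
Q <- diag(c(rep(1, n), 1, rep(C, l)))
M <- cbind(diag(d) %*% A, d, diag(l))   # row i encodes [D(A w + 1 b) + xi]_i >= 1
sol <- solve.QP(Dmat = Q, dvec = rep(0, n + 1 + l), Amat = t(M), bvec = rep(1, l))
w  <- sol$solution[1:n]
b  <- sol$solution[n + 1]
xi <- sol$solution[(n + 2):(n + 1 + l)]
cbind(xi, pmax(1 - d * (A %*% w + b), 0))   # the two columns agree up to numerical tolerance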

Problem 4

Write down the dual problem for the linear programming problem: (15%) \[ \begin{align} &\min \limits_{x,\ y}\ {\bf p}^{T}x+{\bf q}^{T}y \\ &s.t.\quad \left\{ \begin{aligned} Ax+By \leq {\bf b} \\ Cx+Dy = {\bf c} \\ x \geq 0 \end{aligned} \right. \Rightarrow \left\{ \begin{aligned} Ax+By - {\bf b} \leq 0 \\ Cx+Dy - {\bf c} = 0 \\ -x \leq 0 \end{aligned} \right. \end{align} \] < Proof >
By Lagrange multipliers, with \(\alpha_{1} \geq 0\) for the inequality constraint, \(\alpha_{2}\) free for the equality constraint, and \(\alpha_{3} \geq 0\) for \(x \geq 0\), \[ \begin{align} &\mathcal{L} = \mathcal{L}(x,y,\alpha_{1}, \alpha_{2}, \alpha_{3}) = {\bf p}^{T}x+{\bf q}^{T}y + \alpha_{1}^{T} (Ax+By - {\bf b}) + \alpha_{2}^{T} (Cx+Dy - {\bf c}) - \alpha_{3}^{T} x \\ &\Rightarrow \left\{ \begin{aligned} &\frac{\partial \mathcal{L}}{\partial x} = {\bf p} + A^{T}\alpha_{1} + C^{T}\alpha_{2} - \alpha_{3} = 0 \\ &\frac{\partial \mathcal{L}}{\partial y} = {\bf q} + B^{T}\alpha_{1} + D^{T}\alpha_{2} = 0 \end{aligned} \right. \Rightarrow \left\{ \begin{aligned} & {\bf p} = -A^{T}\alpha_{1} - C^{T}\alpha_{2} + \alpha_{3} \\ & {\bf q} = -B^{T}\alpha_{1} - D^{T}\alpha_{2} \end{aligned} \right. \\ \end{align} \] Substituting these conditions back into \(\mathcal{L}(x,y,\alpha_{1}, \alpha_{2}, \alpha_{3})\) gives \[ \begin{align} \theta(\alpha_{1}, \alpha_{2}, \alpha_{3}) = -\alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c} \end{align} \] so the dual problem is \[ \begin{align} &\max \limits_{ (\alpha_{1}, \alpha_{2}, \alpha_{3}) } -\alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c} \\ & s.t.\quad \left\{ \begin{aligned} & {\bf p} = -A^{T}\alpha_{1} - C^{T}\alpha_{2} + \alpha_{3} \\ & {\bf q} = -B^{T}\alpha_{1} - D^{T}\alpha_{2} \\ &\alpha_{1} \geq 0,\ \alpha_{3} \geq 0 \end{aligned} \right. \\ \end{align} \]
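
The substitution step can be made explicit by grouping the \(x\) and \(y\) terms of the Lagrangian: \[ \begin{align} \mathcal{L} = \left( {\bf p} + A^{T}\alpha_{1} + C^{T}\alpha_{2} - \alpha_{3} \right)^{T} x + \left( {\bf q} + B^{T}\alpha_{1} + D^{T}\alpha_{2} \right)^{T} y - \alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c} \end{align} \] At a stationary point both parenthesized vectors vanish, leaving only \(\theta(\alpha_{1}, \alpha_{2}, \alpha_{3}) = -\alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c}\).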