# Loading some useful packages
library(tidyverse)
library(kableExtra)
library(magrittr)

Problem 1

Let \(S = \{ ({\bf x}^{i}, y^{i}) \}_{i=1}^{s}\) be the dataset given as follows: \[ \begin{align} \{ (-2, 2.5), (-1.5, 2.5), (-1, 0.5), (-0.5, 0), (0, -0.5), (0.5, -1.5), (1, -2.5), (2, -4) \} \end{align} \] We consider two different regression models to fit this dataset: a polynomial of degree 5 and a linear model.

# Dataset 
x <- c(-2, -1.5, -1, -0.5, 0, 0.5, 1, 2)
y <- c(2.5, 2.5, 0.5, 0, -0.5, -1.5, -2.5, -4)
S <- data.frame(x, y)
S %>% kable(., "html", row.names = TRUE) %>% 
  kable_styling(bootstrap_options = "striped", 
                full_width = F)
      x     y
1  -2.0   2.5
2  -1.5   2.5
3  -1.0   0.5
4  -0.5   0.0
5   0.0  -0.5
6   0.5  -1.5
7   1.0  -2.5
8   2.0  -4.0
  1. Use four-fold cross-validation to evaluate the performance of the polynomial of degree 5. Report the mean absolute error (MAE) for the training set as well as the test set. (20%)

    # Define training and testing sets for four-fold cross-validation
    set.seed(2019)
    index <- sample(x = 1:8, size = 8) # Random permutation of the observations
    # Build the four training/testing splits from the permuted indices
    l <- 1
    for (i in seq(1, 7, by = 2)) {
      train_name <- paste("training", l, sep = "_")
      testing_name <- paste("testing", l, sep = "_")
      df_train <- S[index[-c(i, i+1)], ]   # 6 observations for training
      df_testing <- S[index[c(i, i+1)], ]  # 2 observations for testing
      assign(train_name, df_train)
      assign(testing_name, df_testing)
      l <- l + 1
    }
    # Polynomial fit of degree 5 on each fold (6 training points, so each fit interpolates them exactly)
    poly.fit_1 <- lm(y ~ poly(x, 5), data = training_1)
    fit_MAE_1 <- (testing_1$y - predict(poly.fit_1, newdata = testing_1)) %>% abs %>% mean
    
    poly.fit_2 <- lm(y ~ poly(x, 5), data = training_2)
    fit_MAE_2 <- (testing_2$y - predict(poly.fit_2, newdata = testing_2)) %>% abs %>% mean
    
    poly.fit_3 <- lm(y ~ poly(x, 5), data = training_3)
    fit_MAE_3 <- (testing_3$y - predict(poly.fit_3, newdata = testing_3)) %>% abs %>% mean
    
    poly.fit_4 <- lm(y ~ poly(x, 5), data = training_4)
    fit_MAE_4 <- (testing_4$y - predict(poly.fit_4, newdata = testing_4)) %>% abs %>% mean
    # Summary
    poly.training_MAE <- c(poly.fit_1$residuals %>% abs %>% mean,
                           poly.fit_2$residuals %>% abs %>% mean,
                           poly.fit_3$residuals %>% abs %>% mean,
                           poly.fit_4$residuals %>% abs %>% mean)
    poly.testing_MAE <- c(fit_MAE_1, fit_MAE_2, fit_MAE_3, 
                          fit_MAE_4)
    cbind(poly.training_MAE, poly.testing_MAE) %>%
      `colnames<-`(., c("Training set", "Testing set")) %>%
      `rownames<-`(., c(seq(1, 4))) %>% 
      round(., 4) %>% 
      kable(., "html", 
            caption = "Mean Absolute Error (MAE)", 
            row.names = TRUE) %>% 
      kable_styling(bootstrap_options = "striped", 
                    full_width = F) %>% 
      footnote(general = "polynomial with degree 5")
    Mean Absolute Error (MAE)

        Training set   Testing set
    1   0              3.4952
    2   0              0.8646
    3   0             16.0000
    4   0              0.4562

    Note: polynomial with degree 5
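
    The four per-fold fits above can also be written as a single loop. A minimal sketch, assuming the training_k / testing_k data frames created earlier; it reproduces the same training and testing MAE values:

    poly.cv <- sapply(1:4, function(k) {
      tr  <- get(paste("training", k, sep = "_"))
      te  <- get(paste("testing", k, sep = "_"))
      fit <- lm(y ~ poly(x, 5), data = tr)
      c(train = mean(abs(residuals(fit))),
        test  = mean(abs(te$y - predict(fit, newdata = te))))
    })
    t(poly.cv)  # one row per fold: training MAE and testing MAE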
  2. Plot the resulting model for each fold on the same figure. (10%)

    # Four fold 
    f1.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f1.plot$value <-  predict(poly.fit_1, newdata = f1.plot)
    
    f2.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f2.plot$value <-  predict(poly.fit_2, newdata = f2.plot)
    
    f3.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f3.plot$value <-  predict(poly.fit_3, newdata = f3.plot)
    
    f4.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f4.plot$value <-  predict(poly.fit_4, newdata = f4.plot)
    # Visualization 
    ggplot(data = S, aes(x = x, y = y)) + 
      geom_point(size = 2) +
      geom_line(data = f1.plot, aes(x = x, y = value, colour = "fold 1")) + 
      geom_line(data = f2.plot, aes(x = x, y = value, colour = "fold 2")) + 
      geom_line(data = f3.plot, aes(x = x, y = value, colour = "fold 3")) + 
      geom_line(data = f4.plot, aes(x = x, y = value, colour = "fold 4")) + 
      labs(x = "x", y = "y", 
           title = "polynomial with degree 5", 
           colour = "fold") + 
      theme(legend.position = c(0.8, 0.8), 
        legend.background = element_rect(color = "black",
                                         fill = "grey90"))

  3. Repeat (1) and (2) for the linear model. (25%)

    # four-fold : 1 
    lm.fit_1 <- lm(y ~ x, data = training_1)
    lm_MAE_1 <- (testing_1$y - predict(lm.fit_1, newdata = testing_1)) %>% abs %>% mean
    # four-fold : 2 
    lm.fit_2 <- lm(y ~ x, data = training_2)
    lm_MAE_2 <- (testing_2$y - predict(lm.fit_2, newdata = testing_2)) %>% abs %>% mean
    # four-fold : 3 
    lm.fit_3 <- lm(y ~ x, data = training_3)
    lm_MAE_3 <- (testing_3$y - predict(lm.fit_3, newdata = testing_3)) %>% abs %>% mean
    # four-fold : 4 
    lm.fit_4 <- lm(y ~ x, data = training_4)
    lm_MAE_4 <- (testing_4$y - predict(lm.fit_4, newdata = testing_4)) %>% abs %>% mean
    # Summary
    lm.training_MAE <- c(lm.fit_1$residuals %>% abs %>% mean,
                         lm.fit_2$residuals %>% abs %>% mean,
                         lm.fit_3$residuals %>% abs %>% mean,
                         lm.fit_4$residuals %>% abs %>% mean)
    lm.testing_MAE <- c(lm_MAE_1, lm_MAE_2, lm_MAE_3, lm_MAE_4)
    cbind(lm.training_MAE, lm.testing_MAE) %>%
      `colnames<-`(., c("Training set", "Testing set")) %>%
      `rownames<-`(., c(seq(1, 4))) %>% 
      round(., 4) %>% 
      kable(., "html", 
            caption = "Mean Absolute Error (MAE)", 
            row.names = TRUE) %>% 
      kable_styling(bootstrap_options = "striped", 
                    full_width = F) %>% 
      footnote(general = "linear model")
    Mean Absolute Error (MAE)

        Training set   Testing set
    1   0.2663         0.2321
    2   0.1399         0.5246
    3   0.2857         0.1429
    4   0.2426         0.3107

    Note: linear model
    # Four fold 
    f1.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f1.plot$value <-  predict(lm.fit_1, newdata = f1.plot)
    
    f2.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f2.plot$value <-  predict(lm.fit_2, newdata = f2.plot)
    
    f3.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f3.plot$value <-  predict(lm.fit_3, newdata = f3.plot)
    
    f4.plot <- data.frame(x = seq(-2, 2, length.out = 100))
    f4.plot$value <-  predict(lm.fit_4, newdata = f4.plot)
    # Visualization 
    ggplot(data = S, aes(x = x, y = y)) + 
      geom_point(size = 2) +
      geom_line(data = f1.plot, 
                aes(x = x, y = value, colour = "fold 1")) + 
      geom_line(data = f2.plot, 
                aes(x = x, y = value, colour = "fold 2")) + 
      geom_line(data = f3.plot, 
                aes(x = x, y = value, colour = "fold 3")) + 
      geom_line(data = f4.plot, 
                aes(x = x, y = value, colour = "fold 4")) + 
      labs(x = "x", y = "y", 
           title = "linear model", 
           colour = "fold") + 
      theme(legend.position = c(0.8, 0.8), 
        legend.background = element_rect(color = "black",
                                         fill = "grey90"))
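
    Averaging the test-set MAE over the four folds gives a quick comparison of the two models. A minimal sketch, using the poly.testing_MAE and lm.testing_MAE vectors computed above:

    # Average test-set MAE across the four folds for each model
    c(polynomial = mean(poly.testing_MAE), linear = mean(lm.testing_MAE))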

NOTE: You have to randomly permute the dataset S before performing the four-fold cross-validation.

Problem 2

We define the \(\epsilon\)-insensitive loss function as follows: \[ \begin{align} |x|_{\epsilon}=\left\{ \begin{aligned} x - \epsilon,\ &if\ x \geq \epsilon \\ 0,\ &if\ -\epsilon \leq x \leq \epsilon \\ -x - \epsilon,\ &if\ x \leq -\epsilon \end{aligned} \right. \end{align} \] Find the smooth approximation function for the \(\epsilon\)-insensitive loss function. (15%)
< Solution >
The \(\epsilon\)-insensitive loss is smoothed by applying the smooth plus-function approximation \(p(t,\beta) = t + \frac{1}{\beta}\ln\left( 1 + e^{-\beta t} \right)\) of \((t)_{+}\) to \(x-\epsilon\) on \(x \geq 0\) and to \(-(x+\epsilon)\) on \(x < 0\). The resulting smooth approximation function is \[ \begin{align} p(x,\beta) = \left\{ \begin{aligned} (x-\epsilon) + \frac{1}{\beta}\ln\left( 1 + e^{-\beta(x-\epsilon)} \right),\quad x \geq 0 \\ -(x + \epsilon) + \frac{1}{\beta}\ln\left( 1 + e^{\beta(x+\epsilon)} \right),\quad x < 0 \\ \end{aligned} \right. \end{align} \]
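
As a quick illustration, the smooth approximation can be plotted against the original loss. A minimal R sketch; the values eps = 0.5 and beta = 5 are arbitrary illustrative choices, and ggplot2 is already loaded via the tidyverse above.

# epsilon-insensitive loss vs. its smooth approximation p(x, beta)
eps  <- 0.5
beta <- 5
grid <- data.frame(x = seq(-3, 3, length.out = 200))
grid$loss   <- pmax(abs(grid$x) - eps, 0)   # |x|_eps
grid$smooth <- ifelse(grid$x >= 0,
                      (grid$x - eps) + log(1 + exp(-beta * (grid$x - eps))) / beta,
                      -(grid$x + eps) + log(1 + exp(beta * (grid$x + eps))) / beta)
ggplot(grid, aes(x = x)) +
  geom_line(aes(y = loss,   colour = "epsilon-insensitive loss")) +
  geom_line(aes(y = smooth, colour = "smooth approximation")) +
  labs(y = "loss", colour = "")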

Problem 3

Consider a quadratic programming (QP) problem: \[ \begin{align} &\min \limits_{\xi,\ w,\ b}\ \frac{C}{2} ||\xi||^{2}_{2} + \frac{1}{2} \left( ||w||^{2}_{2}+b^{2} \right) \\ &s.t.\quad D(Aw + {\bf 1}b) + \xi \geq {\bf 1} \end{align} \] Prove that the solution \((\xi^{*}, w^{*}, b^{*})\) of (QP) satisfies: \[ \begin{align} \xi^{*} = ({\bf 1} - D(Aw^{*} + {\bf 1} b^{*}))_{+} \end{align} \] where \((\cdot)_{+} = \max \{\cdot, 0\}\). (15%)
< Proof >
We know the solution of the quadratic program must satisfy the constraint \[ \begin{align} &D(Aw^{*} + {\bf 1} b^{*}) + \xi^{*} \geq {\bf 1} \\ \Rightarrow\ &\xi^{*} \geq {\bf 1} - D(Aw^{*} + {\bf 1} b^{*}) \end{align} \] In addition to this inequality, we claim that \(\xi^{*} \geq 0\).
Next, we prove \(\xi^{*} \geq 0\) by contradiction.
< proof > Suppose, for contradiction, that some component of \(\xi^{*}\) is negative. Without loss of generality (w.l.o.g.), assume \[ \begin{align} \xi^{*}_{1} < 0 \end{align} \] Then we can construct \[ \begin{align} \xi = (0,\ \xi^{*}_{2},...,\ \xi^{*}_{l}) \end{align} \] which still satisfies \(D(Aw^{*} + {\bf 1} b^{*}) + \xi \geq {\bf 1}\), but \[ \begin{align} \frac{C}{2}||\xi||^{2}_{2} < \frac{C}{2}||\xi^{*}||^{2}_{2} \end{align} \] so \((\xi, w^{*}, b^{*})\) is feasible with a strictly smaller objective value, contradicting the optimality of \((\xi^{*}, w^{*}, b^{*})\) \(\rightarrow \leftarrow\). Hence \(\xi^{*} \geq 0\).
By the Karush-Kuhn-Tucker (KKT) conditions: introducing a multiplier \(\alpha \geq 0\) for the constraint, the Lagrangian is \[ \begin{align} \mathcal{L}(\xi, w, b, \alpha) = \frac{C}{2}||\xi||^{2}_{2} + \frac{1}{2}\left( ||w||^{2}_{2}+b^{2} \right) - \alpha^{T}\left( D(Aw+{\bf 1}b)+\xi - {\bf 1} \right) \end{align} \] Stationarity in \(\xi\) gives \(C\xi - \alpha = 0\), and complementary slackness gives \(\alpha_{i}\left( D(Aw+{\bf 1}b)+\xi - {\bf 1} \right)_{i} = 0\) for each \(i\). Hence \[ \begin{align} When\ \ \alpha_{i}=0,\ &\because C\xi_{i} = \alpha_{i} = 0 \\ &\therefore \xi_{i}=0 \\ When\ \ \alpha_{i} \neq 0,\ &\because \alpha_{i}\left( D(Aw+{\bf 1}b)+\xi - {\bf 1} \right)_{i}=0 \\ &\therefore \xi_{i} = \left( {\bf 1} - D(Aw+{\bf 1}b) \right)_{i} \end{align} \] In the first case feasibility gives \(\left( {\bf 1} - D(Aw+{\bf 1}b) \right)_{i} \leq \xi_{i} = 0\); in the second, \(\xi_{i} = \alpha_{i}/C > 0\). So, at the solution of the QP, \[ \begin{align} \xi^{*} &= \max\left\{0,\ \ {\bf 1} - D(Aw^{*} + {\bf 1} b^{*}) \right\} \\ &= ({\bf 1} - D(Aw^{*} + {\bf 1} b^{*}))_{+} \end{align} \]
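
As a numerical sanity check of this identity (a sketch only, not part of the proof; it assumes the quadprog package and an arbitrary toy dataset):

# Verify xi* = (1 - D(A w* + 1 b*))_+ on a small random instance
library(quadprog)
set.seed(1)
A <- matrix(rnorm(10), ncol = 2)   # 5 points in R^2
d <- c(1, 1, -1, -1, 1)            # class labels, i.e. the diagonal of D
C <- 10; l <- nrow(A); n <- ncol(A)
# variables z = (w, b, xi); objective (1/2) z' Q z with Q = diag(I_n, 1, C I_l)
Q <- diag(c(rep(1, n), 1, rep(C, l)))
M <- cbind(diag(d) %*% A, d, diag(l))   # row i encodes [D(A w + 1 b) + xi]_i >= 1
sol <- solve.QP(Dmat = Q, dvec = rep(0, n + 1 + l), Amat = t(M), bvec = rep(1, l))
w  <- sol$solution[1:n]
b  <- sol$solution[n + 1]
xi <- sol$solution[(n + 2):(n + 1 + l)]
cbind(xi, pmax(1 - d * (A %*% w + b), 0))   # the two columns agree up to numerical tolerance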

Problem 4

Write down the dual problem for the linear programming problem: (15%) \[ \begin{align} &\min \limits_{x,\ y}\ {\bf p}^{T}x+{\bf q}^{T}y \\ &s.t.\quad \left\{ \begin{aligned} Ax+By \leq {\bf b} \\ Cx+Dy = {\bf c} \\ x \geq 0 \end{aligned} \right. \Rightarrow \left\{ \begin{aligned} Ax+By - {\bf b} \leq 0 \\ Cx+Dy - {\bf c} = 0 \\ -x \leq 0 \end{aligned} \right. \end{align} \] < Proof >
By Lagrange multipliers, with \(\alpha_{1} \geq 0\) for the inequality constraint, \(\alpha_{2}\) free for the equality constraint, and \(\alpha_{3} \geq 0\) for \(x \geq 0\), \[ \begin{align} &\mathcal{L} = \mathcal{L}(x,y,\alpha_{1}, \alpha_{2}, \alpha_{3}) = {\bf p}^{T}x+{\bf q}^{T}y + \alpha_{1}^{T} (Ax+By - {\bf b}) + \alpha_{2}^{T} (Cx+Dy - {\bf c}) - \alpha_{3}^{T} x \\ &\Rightarrow \left\{ \begin{aligned} &\frac{\partial \mathcal{L}}{\partial x} = {\bf p} + A^{T}\alpha_{1} + C^{T}\alpha_{2} - \alpha_{3} = 0 \\ &\frac{\partial \mathcal{L}}{\partial y} = {\bf q} + B^{T}\alpha_{1} + D^{T}\alpha_{2} = 0 \end{aligned} \right. \Rightarrow \left\{ \begin{aligned} & {\bf p} = -A^{T}\alpha_{1} - C^{T}\alpha_{2} + \alpha_{3} \\ & {\bf q} = -B^{T}\alpha_{1} - D^{T}\alpha_{2} \end{aligned} \right. \\ \end{align} \] Substituting these conditions back into \(\mathcal{L}(x,y,\alpha_{1}, \alpha_{2}, \alpha_{3})\) gives \[ \begin{align} \theta(\alpha_{1}, \alpha_{2}, \alpha_{3}) = -\alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c} \end{align} \] so the dual problem is \[ \begin{align} &\max \limits_{ (\alpha_{1}, \alpha_{2}, \alpha_{3}) } -\alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c} \\ & s.t.\quad \left\{ \begin{aligned} & {\bf p} = -A^{T}\alpha_{1} - C^{T}\alpha_{2} + \alpha_{3} \\ & {\bf q} = -B^{T}\alpha_{1} - D^{T}\alpha_{2} \\ &\alpha_{1} \geq 0,\ \alpha_{3} \geq 0 \end{aligned} \right. \\ \end{align} \]
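
The substitution step can be made explicit by grouping the \(x\) and \(y\) terms of the Lagrangian: \[ \begin{align} \mathcal{L} = \left( {\bf p} + A^{T}\alpha_{1} + C^{T}\alpha_{2} - \alpha_{3} \right)^{T} x + \left( {\bf q} + B^{T}\alpha_{1} + D^{T}\alpha_{2} \right)^{T} y - \alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c} \end{align} \] At a stationary point both parenthesized vectors vanish, leaving only \(\theta(\alpha_{1}, \alpha_{2}, \alpha_{3}) = -\alpha_{1}^{T}{\bf b} - \alpha_{2}^{T}{\bf c}\).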