# Loading some useful packages
library(tidyverse)
library(kableExtra)
library(class)     # knn() for k-nearest neighbors

Problem 1

Let \[ \begin{align} A_{+} = \{(0, 0),\ (1, 1),\ (-1, 1),\ (1, -1),\ (-1, -1)\} \end{align} \] and \[ \begin{align} A_{-} =\{(1,0),\ (-1,0),\ (0,1),\ (0,-1)\} \end{align} \] represent the positive and negative training instances, respectively.

# Creating a dataset
x1 <- c(0, 1, -1, 1, -1, 1, -1, 0, 0)
x2 <- c(0, 1, 1, -1, -1, 0, 0, 1, -1)
y <- c(rep("A+", 5), rep("A-", 4))
data.problem1 <- data.frame(x1, x2, y)
data.problem1 %>% as.matrix() %>% 
  kable(., "html", col.names = c("x1", "x2", "y")) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
x1  x2  y
 0   0  A+
 1   1  A+
-1   1  A+
 1  -1  A+
-1  -1  A+
 1   0  A-
-1   0  A-
 0   1  A-
 0  -1  A-
  1. Plot the decision boundary for the 1-nearest-neighbor (1-NN) algorithm.

    # 1. Creating a testing grid over [-1, 1] x [-1, 1]
    n.sample <- 30   # number of grid points per axis
    data.testing <- matrix(0, 
                           nrow = n.sample^2, ncol = 2) %>%
      `colnames<-`(., c("x1", "x2")) %>% as.data.frame() 
    data.testing[, 2] <- rep(seq(-1, 1, length.out = n.sample),
                             n.sample)
    data.testing[, 1] <- rep(seq(-1, 1, length.out = n.sample),
                             each = n.sample)
    # 2. Using KNN 
    KNN.model <- knn(train = data.problem1[, c(1, 2)], 
                     test = data.testing, 
                     cl = data.problem1$y,
                     k = 1)
    data.testing$predict <- KNN.model 
    # Convert the factor prediction to a 0/1 indicator for contouring
    data.testing$z <- as.numeric(data.testing$predict) - 1
    # 3. Visualization 
    ggplot(data = data.testing, aes(x = x1, y = x2)) + 
      geom_point(aes(colour = predict, 
                     shape = predict), size = 2) + 
      geom_contour(aes(z = z), colour = 'black', size = 0.1) +
      geom_point(data = data.problem1, 
                 aes(x = x1, y = x2, 
                     colour = y, shape = y), 
                 size = 5) + 
      labs(title = "1-Nearest Neighbour") 

    The figure above shows the decision boundary of 1-NN.

  2. What is the training set accuracy for the 3-nearest-neighbor (3-NN) algorithm?

    # Using KNN, k = 3
    prediction.knn <- knn(train = data.problem1[, c(1, 2)], 
                          test = data.problem1[, c(1, 2)], 
                          cl = data.problem1[, 3], 
                          k = 3)
    # Confusion matrix
    confusion.table <- table(x = data.problem1$y, 
                             y = prediction.knn, 
                             dnn = c("Actual", "Predict"))
    # Accuracy
    Accuracy <- sum(diag(confusion.table)) / 
      sum(confusion.table)
    Accuracy %>% matrix(., dimnames = list(c("Accuracy"), 
                                           c("3-NN"))) %>% 
      kable(., "html") %>% 
      kable_styling(bootstrap_options = "striped", full_width = F)
                3-NN
    Accuracy       0

    The training accuracy is 0. With k = 3, each training point is its own
    nearest neighbor, but its two remaining neighbors at distance 1 always
    belong to the opposite class, so every point is outvoted.
  3. What is the confusion matrix for the 3-nearest-neighbor (3-NN) algorithm on the training set?

    # 3-NN confusion matrix 
    confusion.table %>% 
      kable(., "html", caption = "Confusion matrix", 
            booktabs = TRUE) %>% 
      kable_styling(bootstrap_options = "striped", full_width = F) %>% 
      add_header_above(c(" " = 1, "Predict" = 2))
    Confusion matrix
              Predict
    Actual   A-   A+
    A-        0    4
    A+        5    0

    The table above is the confusion matrix of 3-NN: every A+ point is predicted as A- and every A- point as A+, consistent with the zero training accuracy.
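    To see why, here is a minimal added sketch (not part of the assignment)
    listing the three nearest training neighbors of the origin under
    Euclidean distance:

    # The 3 nearest training neighbors of (0, 0), including the point itself
    d <- sqrt(rowSums(data.problem1[, 1:2]^2))  # distances to the origin
    data.problem1[order(d)[1:3], ]              # itself (A+) plus two A- points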

Problem 2

Let \(A \in R^{n*n}\) be a symmetric positive definite matrix. Show that all eigenvalues of A are positive.
< Proof >
Let \(\lambda\) be an eigenvalue of A with eigenvector \(\vec{x} \neq \vec{0}\), so that \(A\vec{x}=\lambda \vec{x}\).
\[ \begin{align} &\text{Given that } A \text{ is positive definite,} \\ &\Rightarrow\ \vec{x}^{\,T}A\vec{x} = \vec{x}^{\,T}\lambda\vec{x} = \lambda\,||\vec{x}||^{2} > 0 \\ &\because\ ||\vec{x}||^{2} > 0 \\ &\therefore\ \lambda > 0 \end{align} \]
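A quick numerical illustration (an added sketch, not part of the proof): construct a symmetric positive definite matrix and check that its eigenvalues are all positive.

# crossprod(B) = t(B) %*% B is positive semi-definite;
# adding the identity makes it strictly positive definite
set.seed(2019)
B <- matrix(rnorm(9), nrow = 3)
M <- crossprod(B) + diag(3)
eigen(M)$values   # all values are positive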

Problem 3

Let \(Z=[X_{1},\ X_{2},\ X_{3}]^{T}\) be a random vector and \(\Sigma\) be a matrix of size \(3 \times 3\) where \(\Sigma_{ij} = Cov(X_{i},X_{j})\) and \(\Sigma_{ii} = Var(X_{i})\). Let the random variable \(W=a^{T}Z=a_{1}X_{1} + a_{2}X_{2} + a_{3}X_{3}\) where \(a = [a_{1},\ a_{2},\ a_{3}]^{T}\), i.e., the random variable W is the projection of random vector Z onto the vector a. Find the variance of W.
< Solution > \[ \begin{align} Var(W) &= Var(a^{T}Z) \\ &= a^{T}Var(Z)a \\ &= \begin{pmatrix} a_{1} & a_{2} & a_{3} \\ \end{pmatrix}_{1 * 3} \Sigma_{3*3} \begin{pmatrix} a_{1} \\ a_{2} \\ a_{3} \\ \end{pmatrix}_{3 * 1}\quad where\quad \Sigma= \begin{pmatrix} \Sigma_{11} & \Sigma_{12} & \Sigma_{13}\\ \Sigma_{21} & \Sigma_{22} & \Sigma_{23}\\ \Sigma_{31} & \Sigma_{32} & \Sigma_{33}\\ \end{pmatrix}_{3 * 3} \\ &= \sum_{i=1}^{3}a_{i}^{2} \Sigma_{ii} + 2\mathop{\sum\sum}_{1 \le i<j \le 3}a_{i}a_{j}\Sigma_{ij} \end{align} \]
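A Monte-Carlo sanity check of \(Var(W) = a^{T}\Sigma a\) (an added sketch; the choices of a and \(\Sigma\) below are illustrative assumptions, not part of the problem):

library(MASS)   # mvrnorm() for sampling multivariate normals
set.seed(2019)
a     <- c(1, -2, 0.5)
Sigma <- matrix(c(2.0, 0.5, 0.3,
                  0.5, 1.0, 0.2,
                  0.3, 0.2, 1.5), nrow = 3)
Z <- mvrnorm(n = 1e5, mu = rep(0, 3), Sigma = Sigma)
W <- Z %*% a    # each row gives one realization of W = a^T Z
c(empirical = var(as.vector(W)),
  theoretical = drop(t(a) %*% Sigma %*% a))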

Problem 4

Let S be a set of 10,000 random numbers generated by the uniform distribution U[0, 1]. You have to estimate the mean of this uniform distribution.
According to the above statement, \[ \begin{align} S \sim U[0,1] \\ \end{align} \] so we know \[ \begin{align} E(S) = \frac{1}{2},\ and\ Var(S) = \frac{1}{12} \approx 0.0833 \end{align} \] Now we generate 10,000 random variables from U[0, 1] and estimate the mean and variance.

# Generating 10000 r.v from U[0, 1]
set.seed(2019)
S <- runif(n = 10000, min = 0, max = 1)
# Calculating mean and variance
c(S %>% mean, S %>% var) %>% round(., 4) %>% 
  matrix(., nrow = 1) %>% `rownames<-`(., "Value") %>% 
  kable(., "html", col.names = c("Mean", "Variance")) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
         Mean  Variance
Value  0.4944    0.0836

The table above shows the sample mean and variance of the 10,000 uniform random variables.

  1. Randomly select 10 numbers from the set S and then use the average of these 10 numbers as the estimate. Repeat this experiment 20 times. What are the sample mean and sample standard deviation of these 20 random experiments?

    set.seed(2019)
    S_10.table <- matrix(NA, ncol = 20, nrow = 2) %>%
      `rownames<-`(., c("Mean", "Variance")) %>% 
      `colnames<-`(., c(1:20))
    for(i in 1:20){
      S_10 <- sample(x = S, size = 10, replace = TRUE)
      S_10.table[1, i] <- S_10 %>% mean()
      S_10.table[2, i] <- S_10 %>% var()
    }
    # Average the 20 per-experiment means and variances
    S_10.table %>% apply(., 1, mean) %>% round(., 4) %>% 
      t() %>% `rownames<-`(., "Values") %>% 
      kable(., "html") %>%
      kable_styling(bootstrap_options = "striped", full_width = F)
              Mean  Variance
    Values  0.4631    0.0797
  2. Randomly select 1,000 numbers from the set S and then use the average of these 1,000 numbers as the estimate. Repeat this experiment 50 times. What are the sample mean and sample standard deviation of these 50 random experiments?

    set.seed(2019)
    S_50.table <- matrix(NA, ncol = 50, nrow = 2) %>%
      `rownames<-`(., c("Mean", "Variance")) %>% 
      `colnames<-`(., c(1:50))
    for(i in 1:50){
      S_50 <- sample(x = S, size = 1000, replace = TRUE)
      S_50.table[1, i] <- S_50 %>% mean()
      S_50.table[2, i] <- S_50 %>% var()
    }
    S_50.table %>% apply(., 1, mean) %>% round(., 4) %>% 
      t() %>% `rownames<-`(., "Values") %>% 
      kable(., "html") %>%
      kable_styling(bootstrap_options = "striped", full_width = F)
              Mean  Variance
    Values  0.2843    0.0439
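    Both parts also ask for the sample standard deviation of the repeated
    estimates; a minimal added follow-up on the tables built above:

    # Sample SD of the stored per-experiment means
    sd(S_10.table["Mean", ])
    sd(S_50.table["Mean", ])

    By the central limit theorem, the spread of the size-1000 averages should
    be about \(\sqrt{100} = 10\) times smaller than that of the size-10 averages.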

Problem 5

Consider a linear system of equations as follows: \[ \begin{equation*} \begin{aligned} 2x_{1} &+ &&2x_{2} & &= 5 \\ x_{1} &+ &&&x_{3} &= 2 \\ & &&2x_{2}+ &2x_{3} &= 3 \\ x_{1} &+ &&&x_{3} &= 4 \end{aligned} \end{equation*} \] Find the least squares approximation solution for it.
According to the statement, we can rewrite the linear system in matrix form.
\[ \begin{align} \begin{pmatrix} 2 & 2 & 0 \\ 1 & 0 & 1 \\ 0 & 2 & 2 \\ 1 & 0 & 1 \end{pmatrix}_{4 * 3} \begin{pmatrix} x_{1} \\ x_{2} \\ x_{3} \end{pmatrix}_{3 * 1} &= \begin{pmatrix} 5 \\ 2 \\ 3 \\ 4 \end{pmatrix}_{4 * 1} \\ \Rightarrow AX &= b \end{align} \] So the least-squares solution is \[ \begin{align} X = (A^{T}A)^{-1}A^{T}b \in R^{3*1} \end{align} \]

# LS solution
A <- matrix(c(2, 2, 0,
              1, 0, 1,
              0, 2, 2,
              1, 0, 1), nrow = 4, byrow = TRUE)
    
b <- matrix(c(5, 2, 3, 4), ncol = 1)
    
solve(t(A) %*% A) %*% t(A) %*% b %>% 
  `rownames<-`(., paste0("x", 1:3)) %>% 
  kable(., "html", caption = "LS solution") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
LS solution
x1   2.0
x2   0.5
x3   1.0

The table above shows the least-squares solution of the linear system.
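As a cross-check (an added sketch), base R's QR-based solver returns the same least-squares solution for this overdetermined system:

# qr.solve() computes the least-squares solution when A has more rows than columns
qr.solve(A, b)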

Problem 6

Generate a training dataset with size 1000 by yourself.

  1. That is, \(S = \{(x^{i},y_{i})\,|\,x^{i} = (x^{i}_{1}, x^{i}_{2}) \in R^{2}\ and\ y_{i} \in R,\ i = 1, \ldots, 1000\}\), where \(x_{1}\) and \(x_{2}\) are generated by the uniform distribution U[-1, 1]. The observation value \(y = 2x^{2}_1 - 0.6x^{2}_{2} + 1.5x_{1}x_{2} + x_{1} + 2x_{2} + \epsilon\) where \(\epsilon\) is random noise generated by N(0, 1).

    # Generating a training dataset S
    set.seed(2019)
    x1 <- runif(n = 1000, min = -1, max = 1)
    x2 <- runif(n = 1000, min = -1, max = 1)
    y <- 2*(x1^2) - 0.6 *(x2^2) + 1.5 * x1*x2 + x1 + 2*x2 +
      rnorm(1000)
    S <- data.frame(x1, x2, y)
    S %>% head %>% 
      kable(., "html", caption = "A part of data") %>%
      kable_styling(bootstrap_options = "striped", full_width = F)
    A part of data
            x1          x2           y
     0.5398031  -0.3970481   1.5427073
     0.4256795  -0.8003378  -1.2369026
    -0.3932796   0.4100049  -0.3372891
     0.2364727  -0.6506493  -1.4938536
    -0.8990325   0.1781360   1.5329034
    -0.9135624   0.1022247   1.6144876

    The table above shows the first six rows of the simulated dataset.

  2. Find a quadratic function f(x) that fits the training dataset S.
    According to the statement, we fit a quadratic function without an intercept, matching the data-generating model.

    # Fit a quadratic regression without an intercept
    # (x1*x2 expands to x1 + x2 + x1:x2)
    lm.fit <- lm(y ~ -1 + I(x1^2) + I(x2^2) + x1*x2, data = S) 
    lm.fit %>% summary() %>% coef %>% round(., 3)%>% 
      kable(., "html") %>% 
      kable_styling(bootstrap_options = "striped", full_width = F)
              Estimate  Std. Error  t value  Pr(>|t|)
    I(x1^2)      2.230       0.087   25.572         0
    I(x2^2)     -0.819       0.086   -9.559         0
    x1           0.983       0.056   17.605         0
    x2           2.009       0.056   36.058         0
    x1:x2        1.449       0.095   15.220         0
    The table above lists the fitted coefficients, so the estimated quadratic function is \[ \begin{align} f(x) = 2.23x^{2}_1 - 0.819x^{2}_{2} + 1.449x_{1}x_{2} + 0.983x_{1} + 2.009x_{2} \end{align} \]
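    To use the fitted model, evaluate it at a new point with predict(); the
    point (0.5, -0.5) below is an arbitrary illustration:

    # Evaluate the fitted quadratic at a hypothetical new point
    predict(lm.fit, newdata = data.frame(x1 = 0.5, x2 = -0.5))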
  3. Compute the MAE (mean absolute error) and plot the function you get.

    # MAE
    lm.fit$residuals %>% abs %>% mean %>% round(., 4) %>% 
      matrix(., dimnames = list(c("Value"), c("MAE"))) %>% 
      kable(., "html") %>% 
      kable_styling(bootstrap_options = "striped", full_width = F)
              MAE
    Value  0.8015

    The table above shows the MAE (mean absolute error) of the fitted quadratic function.

    # Plot the fitted quadratic function
    obj_fun <- function (x1, x2) {
      coef <- lm.fit %>% coef
      # Index the coefficients by name to avoid depending on their order
      value <- coef["I(x1^2)"]*x1^2 + coef["I(x2^2)"]*x2^2 + 
        coef["x1"]*x1 + coef["x2"]*x2 + coef["x1:x2"]*x1*x2
      return(value)
    }
    
    x1 <- seq(-1, 1, length.out = 15)
    x2 <- seq(-1, 1, length.out = 15)
    z <- outer(x1, x2, obj_fun)
    persp(x = x1, y = x2, z, 
          theta = 20, phi = 20, expand = 0.7, 
          col = "blue", 
          main = "quadratic function", 
          sub = expression(f(x) == 2.23*x[1]^{2} - 0.819*x[2]^{2} + 1.449*x[1]*x[2] + 0.983*x[1] + 2.009*x[2]))