Set Up

Clean the Environment

rm(list = ls())

Prepare Packages

library(ggplot2)
library(dplyr)
## 
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Logistic model:

sigmoid function, inverse of logit

sigmoid <- function(z){1/(1+exp(-z))}

cost function

cost <- function(theta, X, y){
  
  m <- length(y) # number of training examples
  
  h <- sigmoid(X %*% theta)
  
  J <- (t(-y)%*%log(h)-t(1-y)%*%log(1-h))/m
  J
}

gradient function

grad <- function(theta, X, y){
  
  m <- length(y) 
  
  h <- sigmoid(X%*%theta)
 
  grad <- (t(X)%*%(h - y))/m
  grad
}
logisticReg <- function(X, y){
  #remove NA rows
  X <- na.omit(X)
  
  y <- na.omit(y)
  
  #add bias term and convert to matrix
  X <- mutate(X, bias =1)
  
  #move the bias column to col1
  X <- as.matrix(X[, c(ncol(X), 1:(ncol(X)-1))])
  
  y <- as.matrix(y)
  
  #initialize theta
  theta <- matrix(rep(0, ncol(X)), nrow = ncol(X))
  
  #use the optim function to perform gradient descent
  costOpti <- optim(theta, fn = cost, gr = grad, X = X, y = y)
  
  #return coefficients
  return(costOpti$par)
}
logisticProb <- function(theta, X){
  
  X <- na.omit(X)
  
  #add bias term and convert to matrix
  X <- mutate(X, bias =1)
  
  X <- as.matrix(X[,c(ncol(X), 1:(ncol(X)-1))])
  
  return(sigmoid(X%*%theta))
}
# y prediction
logisticPred <- function(prob){
  
  return(round(prob, 0))
}

Creat Data set:

set.seed(1024)

r1 = sqrt(runif(1500, min=0, max=1))

a1 = runif(1500, min=0, max=2*pi)

x1 = r1 * cos(a1) + rnorm(1500,mean=0,sd=0.2)

x2 = r1 * sin(a1) + rnorm(1500,mean=0,sd=0.2)

dt1 = data.frame(x1,x2)

dt1$y = 1

r2 = sqrt(runif(3500, min=2.5, max=4))

a2 = runif(3500, min=0, max=2*pi)

x1 = r2 * cos(a2)

x2 = r2 * sin(a2)

dt2 = data.frame(x1,x2)

dt2$y = 0

dtf = rbind(dt1,dt2)

#head(dt1)
ggplot(dtf) + 
  geom_point(aes(x=x1, y=x2,color = as.character(y) ), size = 2) + 
  scale_colour_discrete(name  ="Label") + 
  xlim(-2.5, 2.5) +
  ylim(-2.5, 2.5) + coord_fixed(ratio = 1) +
  ggtitle('Data to be classified') +
  theme_bw(base_size = 12) 

Xd = dtf

X = Xd[,c(1,2)]

y = Xd$y

model I

\(h_\theta(x) = g(\theta+\theta_1x_1+\theta_2x_2)\)

# training
Xm1 = X

theta1 <- logisticReg(Xm1, y)

# generate a grid for decision boundary, this is the test set
grid <- expand.grid(seq(-2.5, 2.5, length.out = 100), seq(-2.5, 2.5, length.out = 100))

colnames(grid) = c('x1','x2')

# predict the probability
probZ <- logisticProb(theta1, grid)

# predict the label
Z <- logisticPred(probZ)

gridPred = cbind(grid, Z)

Visualize the Boundary

# decision boundary visualization
M1_plot <- ggplot() +   
  geom_point(data = X, aes(x=x1, y=x2, color = as.character(y)), size = 2, show.legend = F) + 
  geom_tile(data = gridPred, aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)), 
            alpha = 0.3, show.legend = F) + ylim(-3, 3)+ xlim(-3, 3) +
  ggtitle('Decision Boundary for Model I') +
  coord_fixed(ratio = 1) +
  theme_bw(base_size = 12) 

M1_plot

model II

\(h_\theta(x) = g(\theta+\theta_1x_1+\theta_2x_2 + \theta_3x_1^2 + \theta_4x_2^2)\)

Xm2 = X

Xm2$x3 = Xm2$x1^2

Xm2$x4 = Xm2$x2^2

# training
theta2 <- logisticReg(Xm2, y)

# generate a grid for decision boundary, this is the test set
grid <- expand.grid(seq(-2.5, 2.5, length.out = 100), seq(-2.5, 2.5, length.out = 100))
colnames(grid) = c('x1','x2')
grid$x3 = grid$x1^2
grid$x4 = grid$x2^2

# predict the probability
probZ <- logisticProb(theta2, grid)

# predict the label
Z <- logisticPred(probZ)
gridPred = cbind(grid, Z)
# decision boundary visualization
M2_plot <- ggplot() +   
  geom_point(data = X, aes(x=x1, y=x2, color = as.character(y)), size = 2, show.legend = F) + 
  geom_tile(data = gridPred, aes(x = grid[, 1],y = grid[, 2], fill=as.character(Z)), 
            alpha = 0.3, show.legend = F) + ylim(-3, 3)+ xlim(-3, 3) +
  ggtitle('Decision Boundary for Model II') +
  coord_fixed(ratio = 1) +
  theme_bw(base_size = 12) 

M2_plot

Comparison of accuracy

Model I

probZ1 <- logisticProb(theta1, Xm1)

Z1 <- logisticPred(probZ1)

mean(Z1==y)
## [1] 0.7

Model II:

probZ2 <- logisticProb(theta2, Xm2)

Z2 <- logisticPred(probZ2)

mean(Z2==y)
## [1] 0.9994

We can see that with the quadratic term, the boundary is better.