Logistic Regression

In this part of the exercise, you will build a logistic regression model to predict whether a student gets admitted into a university.

Part 1: Plotting

ex2data1 <- read_csv("~/R_projects/171001_linear reg_exo_ng_2/ex2data1.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_integer()
## )
colnames(ex2data1) <- c("exam_1_score", "exam_2_score", "Admission")

X <- ex2data1[ , -3]
y <- ex2data1[ , 3]

plot(X$exam_1_score, X$exam_2_score, 
     col= as.factor(y$Admission), 
     pch= 20,
     main= "Training data: in red admitted student, in black non admitted")

X <- as.matrix(X)
y <- as.matrix(y)

Part 2: Compute Cost and Gradient

CostFunction <- function(X, y, Theta, Lambda){
  # note: m (the number of training examples) is read from the global environment
  Hypothesis <- 1 / (1 + exp(- X %*% Theta)) # sigmoid: probability that y = 1 given X
  J <- (1/m) * sum(- y * log(Hypothesis) - (1 - y) * log(1 - Hypothesis)) # unregularized cost J
  reg <- (Lambda / (2 * m)) * sum(Theta[2:length(Theta)] ^ 2) # regularization term; the intercept Theta[1] is not regularized
  J <- J + reg
  return(J)
}


GradFunction <- function(X, y, Theta, Lambda){
  # note: m is read from the global environment
  Hypothesis <- 1 / (1 + exp(- X %*% Theta))
  # gradient of the regularized cost; the leading 0 excludes the intercept Theta[1] from regularization
  Grad <- ((t(X) %*% (Hypothesis - y)) / m) + ((Lambda / m) * c(0, Theta[2:length(Theta)]))
  return(Grad)
}


# check the implementation against the expected values given in the exercise
m <- nrow(X)
CostFunction(cbind(1,X),y,c(0, 0, 0),0)
## [1] 0.6931472
# fprintf('Expected cost (approx): 0.693\n')

CostFunction(cbind(1,X),y,c(-24, 0.2, 0.2),0)
## [1] 0.2183302
# 'Expected cost (approx): 0.218\n');

GradFunction(cbind(1,X),y,c(-24, 0.2, 0.2),0)
##               Admission
##              0.04290299
## exam_1_score 2.56623412
## exam_2_score 2.64679737
# 'Expected gradients (approx):\n 0.043\n 2.566\n 2.647\n');
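
As an extra sanity check that is not part of the original exercise, the analytic gradient can be compared with a finite-difference approximation of the cost. check_gradient is a helper name of my own; it assumes m has already been set in the global environment, as above.

check_gradient <- function(X, y, Theta, Lambda, eps = 1e-4){
  # central finite differences of the cost, one coordinate of Theta at a time
  numeric_grad <- sapply(seq_along(Theta), function(k){
    Theta_plus  <- Theta; Theta_plus[k]  <- Theta_plus[k]  + eps
    Theta_minus <- Theta; Theta_minus[k] <- Theta_minus[k] - eps
    (CostFunction(X, y, Theta_plus, Lambda) - CostFunction(X, y, Theta_minus, Lambda)) / (2 * eps)
  })
  # the two columns should agree to several decimal places
  cbind(analytic = as.vector(GradFunction(X, y, Theta, Lambda)), numeric = numeric_grad)
}

check_gradient(cbind(1, X), y, c(-24, 0.2, 0.2), 0)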

Part 3: Optimizing using fminunc (optim in R)

X <- cbind(1, X) # add a column of ones to X for the intercept term (theta 0)

TrainLogisticReg <- function(X, y, Lambda){
  # initialize theta at zero: one parameter per column of X (two if X is a single vector)
  if (is.vector(X)) initial_theta <- c(0, 0) else initial_theta <- rep(0, ncol(X))
  
  # CostFunction and GradFunction read m (number of training examples) from the global environment
  if (is.vector(X)) m <- length(X) else m <- nrow(X)
  assign("m", m, .GlobalEnv)
  
  LogisticRec <- optim(par = initial_theta,
                       fn = CostFunction,
                       gr = GradFunction,
                       #method = "BFGS",
                       X = X,
                       y = y,
                       Lambda = Lambda)
  
  print(paste("J", LogisticRec$value))
  return(LogisticRec$par)
}

# note: I get results closer to the exercise when I leave out method = "BFGS"
theta_result <- TrainLogisticReg(X, y, 0)
## [1] "J 0.203497704571603"
print(theta_result)
## [1] -25.1647048   0.2062524   0.2015046
#fprintf('Expected cost (approx): 0.203\n');
# fprintf('Expected theta (approx): -25.161\n 0.206\n 0.201\n')

Plot the decision boundary (plotDecisionBoundary(theta, X, y) in the original exercise). The boundary is the set of points where the predicted probability is 0.5, i.e. where theta' * x = 0; with two features this is a straight line in the (exam 1, exam 2) plane.

# from stack exchange : https://stats.stackexchange.com/questions/6206/how-to-plot-decision-boundary-in-r-for-logistic-regression-model
# This is an equation in the form of y = mx + b
# x2 >= (-theta[1] / theta[3]) + ((-theta[2] / theta[3]) * x1)

ggplot(as.data.frame(X), 
       aes(x= X[,2], y= X[,3], colour= as.vector(as.factor(y))))+
  geom_point()+
  geom_abline(slope= (-theta_result[2] / theta_result[3]), intercept = (-theta_result[1] / theta_result[3]), colour= "red", size=1)+
  ggtitle("Boundary line")+
  xlab("Score exam 1")+
  ylab("Score exam 2")+
  theme(legend.position = "none")
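
As a quick check of my own (not in the exercise), any point on the fitted line should get a predicted probability of about 0.5; the two exam 1 scores below are arbitrary.

x1 <- c(30, 100) # two arbitrary exam 1 scores
x2 <- (-theta_result[1] / theta_result[3]) + (-theta_result[2] / theta_result[3]) * x1 # exam 2 scores on the boundary line
1 / (1 + exp(- cbind(1, x1, x2) %*% theta_result)) # both values should be close to 0.5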

Part 4: Predict and Accuracies

After learning the parameters, you'll want to use the model to predict the outcomes on unseen data. In this part, you will use the logistic regression model to predict the probability that a student with a score of 45 on exam 1 and a score of 85 on exam 2 will be admitted.

print(1 / (1 + exp(- c(1, 45, 85) %*% theta_result)))
##           [,1]
## [1,] 0.7763541
# fprintf('Expected value: 0.775 +/- 0.002\n\n')

predict <- 1 / (1 + exp(- X %*% theta_result)) # predicted probability of admission for every training example

mean(round(predict, 0) == round(y, 0)) * 100 # training accuracy: rounded probabilities compared with the labels
## [1] 89
#fprintf('Train Accuracy: %f\n', mean(double(p == y)) * 100);
#fprintf('Expected accuracy (approx): 89.0\n');
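
For comparison, the exercise's predict.m classifies with an explicit 0.5 threshold rather than rounding; here is a small sketch of mine doing the same.

p <- as.numeric(1 / (1 + exp(- X %*% theta_result)) >= 0.5) # 1 if predicted probability >= 0.5, else 0
mean(p == y) * 100 # training accuracy with the explicit threshold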

Regularized logistic regression

In this part of the exercise, you will implement regularized logistic regression to predict whether microchips from a fabrication plant pass quality assurance (QA).

ex2data2 <- read_csv("~/R_projects/171001_linear reg_exo_ng_2/ex2data2.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_integer()
## )
colnames(ex2data2) <- c("Microchip_Test_1", "Microchip_Test_2", "Test")

X <- ex2data2[ , -3]
y <- ex2data2[ , 3]


plot(X$Microchip_Test_1, X$Microchip_Test_2, 
     col= as.factor(y$Test), 
     pch= 20,
     main= "Training data: in red success test, in black failure")

X <- as.matrix(X)
y <- as.matrix(y)

Part 1: Regularized Logistic Regression

In this part, you are given a dataset with data points that are not linearly separable. However, you would still like to use logistic regression to classify the data points.

To do so, you introduce more features: in particular, you add polynomial features to the data matrix (similar to polynomial regression).

NB: the mapFeature function also adds a column of ones to handle the intercept term.

# in the exercise they add a one to the matrix in the mapFeature function

#function out = mapFeature(X1, X2)
#MAPFEATURE Feature mapping function to polynomial features

#MAPFEATURE(X1, X2) maps the two input features to quadratic features used in the regularization exercise.

#Returns a new feature array with more features, comprising of X1, X2, X1.^2, X2.^2, X1*X2, X1*X2.^2, etc..
#Inputs X1, X2 must be the same size

mapFeature <- function(X1, X2, degree){
  
  out <- as.data.frame(rep(1, length(X1))) # first column of ones for the intercept
  
  for (i in 1:degree){
    for (j in 0:i){
      out[, ncol(out)+1] <- (X1 ^ (i-j)) * (X2 ^ j) # all polynomial terms X1^(i-j) * X2^j up to the given degree
    }
  }
  return(out)
}
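
As a quick check I added: a degree-6 mapping of two features should produce 1 + 2 + 3 + ... + 7 = 28 columns, the 28-dimensional vector mentioned in the exercise.

ncol(mapFeature(X[ ,1], X[ ,2], 6)) # expect 28: one intercept column plus i + 1 terms for each degree i = 1..6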

X_poly <- mapFeature(X[ ,1], X[ ,2], 6)

m <- nrow(X_poly)

# testing
CostFunction(as.matrix(X_poly),y, rep(0, ncol(X_poly)), 1)
## [1] 0.6931472
#fprintf('Expected cost (approx): 0.693\n');

head(GradFunction(as.matrix(X_poly), y, rep(0, ncol(X_poly)), 1), 5)
##                            Test
## rep(1, length(X1)) 8.474576e-03
## V2                 1.878809e-02
## V3                 7.777119e-05
## V4                 5.034464e-02
## V5                 1.150133e-02
#fprintf('Expected gradients (approx) - first five values only:\n');
#fprintf(' 0.0085\n 0.0188\n 0.0001\n 0.0503\n 0.0115\n');

CostFunction(as.matrix(X_poly), y, rep(1, ncol(X_poly)),10)
## [1] 3.164509
#fprintf('\nCost at test theta (with lambda = 10): %f\n', cost);
#fprintf('Expected cost (approx): 3.16\n');

head(GradFunction(as.matrix(X_poly), y, rep(1, ncol(X_poly)), 10), 5)
##                          Test
## rep(1, length(X1)) 0.34604507
## V2                 0.16135192
## V3                 0.19479576
## V4                 0.22686278
## V5                 0.09218568
#fprintf('Expected gradients (approx) - first five values only:\n');
#fprintf(' 0.3460\n 0.1614\n 0.1948\n 0.2269\n 0.0922\n');

Part 2: Regularization and Accuracies

In this part, you will get to try different values of lambda and see how regularization affects the decision boundary.

Try the following values of lambda (0, 1, 10, 100).

How does the decision boundary change when you vary lambda? How does the training set accuracy vary?

Note: in the exercise, mapFeature is used with degree 6, which gives a 28-dimensional feature vector. However, the results I get for this last part of the exercise are different. The author of the Machinegurning website, who also redid the exercise (http://www.machinegurning.com/rstats/regularised-logistic-regression/), goes up to degree-12 features instead. It is not clear to me where the problem is, since all previous parts of the exercise match. I did notice that I get better results (closer to the exercise) when I set method = "BFGS" in TrainLogisticReg instead of leaving optim at its default method (Nelder-Mead, which does not use the supplied gradient).

As an example, for lambda = 1: without the BFGS method I get a train accuracy of 78%, and with the BFGS method I get 83.05% (it should be approximately 83.1% according to the exercise).

TrainLogisticReg <- function(X, y, Lambda){ # same as before, but using the BFGS method
  # initialize theta at zero: one parameter per column of X (two if X is a single vector)
  if (is.vector(X)) initial_theta <- c(0, 0) else initial_theta <- rep(0, ncol(X))
  
  # CostFunction and GradFunction read m (number of training examples) from the global environment
  if (is.vector(X)) m <- length(X) else m <- nrow(X)
  assign("m", m, .GlobalEnv)
  
  LogisticRec <- optim(par = initial_theta,
                       fn = CostFunction,
                       gr = GradFunction,
                       method = "BFGS",
                       X = X,
                       y = y,
                       Lambda = Lambda)
  
  print(paste("J", LogisticRec$value))
  return(LogisticRec$par)
}



#Lambda 0
lambda0 <- TrainLogisticReg(as.matrix(X_poly), y, 0)
## [1] "J 0.312649713409482"
boundary_fct <- function(theta){ 
  u <- seq(-1, 1.5, length = 50) 
  v <- seq(-1, 1.5, length = 50)
  z <- matrix(0, nrow = length(u), ncol = length(v))
  
  # evaluate theta' * mapFeature(u, v) over the grid; the decision boundary is where z = 0
  for (i in 1 : length(u)){
    for (j in 1 : length(v)){
      z[i, j] <- sum(as.matrix(mapFeature(u[i], v[j], 6)) * theta)
    }
  }
  
  # unroll the grid so ggplot can use it; t(z) is unrolled with u varying slowest and v fastest,
  # so u and v must be repeated the same way to keep each (u, v, z) triple aligned
  boundary_grid <- data.frame(u = rep(u, each = length(v)),
                              v = rep(v, times = length(u)),
                              z = as.vector(t(z)))
  return(boundary_grid)
}

lambda0_boundary <- boundary_fct(lambda0)

plot <- ggplot(ex2data2, aes(x= Microchip_Test_1, y= Microchip_Test_2))+
  geom_point(aes(colour= as.factor(Test)))+
  xlab("Microchip test 1")+
  ylab("Microchip test 2")+
  theme_bw()

plot + geom_contour(data = lambda0_boundary, aes(x = u, y = v, z = z), breaks = 0) + # contour at z = 0, i.e. the decision boundary
  ggtitle("Lambda 0")

# Lambda 1
lambda1 <- TrainLogisticReg(as.matrix(X_poly), y, 1)
## [1] "J 0.529002742805693"
lambda1_boundary <- boundary_fct(lambda1)
plot + geom_contour(data = lambda1_boundary, aes(x = u, y = v, z = z), breaks = 0) + 
  ggtitle("Lambda 1")

# Lambda 100
lambda100 <- TrainLogisticReg(as.matrix(X_poly), y, 100)
## [1] "J 0.686483833880188"
lambda100_boundary <- boundary_fct(lambda100)
plot + geom_contour(data = lambda100_boundary, aes(x = u, y = v, z = z), breaks = 0) + 
  ggtitle("Lambda 100")

As we can see, as lambda increases the decision boundary becomes simpler and fits the training data less closely (with lambda = 100 it clearly underfits).

Moreover, although the lambda = 1 and lambda = 100 plots are quite close to the ones in the exercise, my lambda = 0 boundary does not overfit as much as the one shown there.
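
One possible explanation, which I have not verified: optim's BFGS method stops after 100 iterations by default, and that may be too few for the unregularized model to reach a strongly overfit solution. Increasing maxit through the control argument would be one way to test this (m is still set to nrow(X_poly) from the calls above); longer_fit is a name I made up.

longer_fit <- optim(par = rep(0, ncol(X_poly)),
                    fn = CostFunction,
                    gr = GradFunction,
                    method = "BFGS",
                    control = list(maxit = 500), # default maxit is 100 for BFGS
                    X = as.matrix(X_poly),
                    y = y,
                    Lambda = 0)
longer_fit$value # compare with the cost of about 0.31 obtained above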