In this part of the exercise, you will build a logistic regression model to predict whether a student gets admitted into a university.
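The snippets below assume that the readr and ggplot2 packages are already loaded (presumably in an earlier setup chunk); for completeness, a minimal setup would be:
# readr provides read_csv(); ggplot2 is used for the decision boundary plots
library(readr)
library(ggplot2)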
ex2data1 <- read_csv("~/R_projects/171001_linear reg_exo_ng_2/ex2data1.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_double(),
## X3 = col_integer()
## )
colnames(ex2data1) <- c("exam_1_score", "exam_2_score", "Admission")
X <- ex2data1[ , -3]
y <- ex2data1[ , 3]
plot(X$exam_1_score, X$exam_2_score,
     col = as.factor(y$Admission),
     pch = 20,
     main = "Training data: admitted students in red, non-admitted in black")
X <- as.matrix(X)
y <- as.matrix(y)
CostFunction <- function(X, y, Theta, Lambda){
  # note: m (the number of training examples) is read from the global environment
  Hypothesis <- 1 / (1 + exp(- X %*% Theta)) # probability that y = 1 on input X
  J <- (1/m) * sum(- y * log(Hypothesis) - (1 - y) * log(1 - Hypothesis)) # unregularized cost
  reg <- (Lambda / (2 * m)) * sum(Theta[2:length(Theta)] ^ 2) # regularization term, skipping the intercept Theta[1]
  J <- J + reg
  return(J)
}
GradFunction <- function(X, y, Theta, Lambda){
  # gradient of the regularized cost; the intercept term Theta[1] is not regularized
  Hypothesis <- 1 / (1 + exp(- X %*% Theta))
  Grad <- ((t(X) %*% (Hypothesis - y)) / m) + ((Lambda / m) * c(0, Theta[2:length(Theta)]))
  return(Grad)
}
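As an optional sanity check (not part of the original exercise), the analytical gradient can be compared against a centred finite-difference approximation of the cost on a small made-up dataset; the data and theta below are purely illustrative.
# Finite-difference check of GradFunction against CostFunction on synthetic data
set.seed(1)
X_check <- cbind(1, matrix(rnorm(20), ncol = 2)) # 10 examples: intercept + 2 features
y_check <- matrix(rbinom(10, 1, 0.5), ncol = 1)
theta_check <- c(0.1, -0.2, 0.3)
m <- nrow(X_check) # both functions read m from the global environment
eps <- 1e-6
numeric_grad <- sapply(seq_along(theta_check), function(k){
  theta_plus <- theta_check; theta_plus[k] <- theta_plus[k] + eps
  theta_minus <- theta_check; theta_minus[k] <- theta_minus[k] - eps
  (CostFunction(X_check, y_check, theta_plus, 1) -
     CostFunction(X_check, y_check, theta_minus, 1)) / (2 * eps)
})
max(abs(numeric_grad - GradFunction(X_check, y_check, theta_check, 1))) # should be very small (around 1e-9 or less)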
#
m <- nrow(X)
CostFunction(cbind(1,X),y,c(0, 0, 0),0)
## [1] 0.6931472
# fprintf('Expected cost (approx): 0.693\n')
CostFunction(cbind(1,X),y,c(-24, 0.2, 0.2),0)
## [1] 0.2183302
# 'Expected cost (approx): 0.218\n');
GradFunction(cbind(1,X),y,c(-24, 0.2, 0.2),0)
## Admission
## 0.04290299
## exam_1_score 2.56623412
## exam_2_score 2.64679737
# 'Expected gradients (approx):\n 0.043\n 2.566\n 2.647\n');
X <- cbind(1, X) # add a column of ones to X for the intercept term theta_0
TrainLogisticReg <- function(X, y, Lambda){
  ifelse(is.vector(X), initial_theta <- c(0, 0), initial_theta <- rep(0, ncol(X))) # initialize theta at zero
  ifelse(is.vector(X), m <- length(X), m <- nrow(X))
  assign("m", m, .GlobalEnv) # CostFunction and GradFunction read m from the global environment
  LogisticRec <- optim(par = initial_theta,
                       fn = CostFunction,
                       gr = GradFunction,
                       #method = "BFGS",
                       X = X,
                       y = y,
                       Lambda = Lambda)
  print(paste("J", LogisticRec$value))
  return(LogisticRec$par)
}
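Because CostFunction and GradFunction read m from the global environment, TrainLogisticReg has to push it there with assign(). As an aside (not used in the rest of this write-up), the global could be avoided by wrapping the cost and gradient in a closure that captures m; a sketch of that idea:
# Hypothetical alternative: build cost/gradient closures that carry their own m
make_logistic_objectives <- function(X, y, Lambda){
  m <- nrow(X)
  sigmoid <- function(z) 1 / (1 + exp(-z))
  cost <- function(Theta){
    h <- sigmoid(X %*% Theta)
    (1 / m) * sum(- y * log(h) - (1 - y) * log(1 - h)) +
      (Lambda / (2 * m)) * sum(Theta[-1] ^ 2)
  }
  grad <- function(Theta){
    h <- sigmoid(X %*% Theta)
    (t(X) %*% (h - y)) / m + (Lambda / m) * c(0, Theta[-1])
  }
  list(cost = cost, grad = grad)
}
# obj <- make_logistic_objectives(X, y, 0)
# optim(par = rep(0, ncol(X)), fn = obj$cost, gr = obj$grad, method = "BFGS")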
# note: I get results closer to the exercise when method = "BFGS" is removed here (optim then falls back to its default, the derivative-free Nelder-Mead method)
theta_result <- TrainLogisticReg(X, y, 0)
## [1] "J 0.203497704571603"
print(theta_result)
## [1] -25.1647048 0.2062524 0.2015046
#fprintf('Expected cost (approx): 0.203\n');
# fprintf('Expected theta (approx): -25.161\n 0.206\n 0.201\n')
Plot the decision boundary (the equivalent of the exercise's plotDecisionBoundary(theta, X, y)).
# from stack exchange : https://stats.stackexchange.com/questions/6206/how-to-plot-decision-boundary-in-r-for-logistic-regression-model
# The decision boundary is where theta[1] + theta[2] * x1 + theta[3] * x2 = 0,
# which rearranges to a line of the form y = mx + b:
# x2 = (-theta[1] / theta[3]) + ((-theta[2] / theta[3]) * x1)
ggplot(as.data.frame(X),
aes(x= X[,2], y= X[,3], colour= as.vector(as.factor(y))))+
geom_point()+
geom_abline(slope= (-theta_result[2] / theta_result[3]), intercept = (-theta_result[1] / theta_result[3]), colour= "red", size=1)+
ggtitle("Boundary line")+
xlab("Score exam 1")+
ylab("Score exam 2")+
theme(legend.position = "none")
After learning the parameters, you will want to use the model to predict outcomes on unseen data. In this part, you will use the logistic regression model to predict the probability that a student with a score of 45 on exam 1 and a score of 85 on exam 2 will be admitted.
print(1 / (1 + exp(- c(1, 45, 85) %*% theta_result)))
## [,1]
## [1,] 0.7763541
# fprintf('Expected value: 0.775 +/- 0.002\n\n')
predict <- 1 / (1 + exp(- X %*% theta_result)) # predicted probabilities on the training set
mean(round(predict, 0) == round(y, 0)) * 100 # training accuracy: rounding applies a 0.5 threshold
## [1] 89
#fprintf('Train Accuracy: %f\n', mean(double(p == y)) * 100);
#fprintf('Expected accuracy (approx): 89.0\n');
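As an optional extra (not in the original exercise), the same 0.5 threshold can also be inspected with a confusion table, reusing the predict and y objects from above:
# Confusion table at the 0.5 threshold: rows = predicted class, columns = actual class
table(predicted = as.numeric(predict >= 0.5), actual = as.vector(y))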
In this part of the exercise, you will implement regularized logistic regression to predict whether microchips from a fabrication plant pass quality assurance (QA).
ex2data2 <- read_csv("~/R_projects/171001_linear reg_exo_ng_2/ex2data2.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_double(),
## X3 = col_integer()
## )
colnames(ex2data2) <- c("Microchip_Test_1", "Microchip_Test_2", "Test")
X <- ex2data2[ , -3]
y <- ex2data2[ , 3]
plot(X$Microchip_Test_1, X$Microchip_Test_2,
     col = as.factor(y$Test),
     pch = 20,
     main = "Training data: passed tests in red, failed tests in black")
X <- as.matrix(X)
y <- as.matrix(y)
In this part, you are given a dataset with data points that are not linearly separable. However, you would still like to use logistic regression to classify the data points.
To do so, you introduce more features – in particular, you add polynomial features to the data matrix (similar to polynomial regression).
NB: the mapFeature function adds a column of ones to deal with the intercept.
# in the exercise they add a one to the matrix in the mapFeature function
#function out = mapFeature(X1, X2)
#MAPFEATURE Feature mapping function to polynomial features
#MAPFEATURE(X1, X2) maps the two input features to quadratic features used in the regularization exercise.
#Returns a new feature array with more features, comprising of X1, X2, X1.^2, X2.^2, X1*X2, X1*X2.^2, etc..
#Inputs X1, X2 must be the same size
mapFeature <- function(X1, X2, degree){
  out <- as.data.frame(rep(1, length(X1))) # first column: intercept term
  for (i in 1:degree){
    for (j in 0:i){
      out[, ncol(out) + 1] <- (X1 ^ (i - j)) * (X2 ^ j) # all monomials X1^(i-j) * X2^j of total degree i
    }
  }
  return(out)
}
X_poly <- mapFeature(X[ ,1], X[ ,2], 6)
m <- nrow(X_poly)
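As a quick check on the mapping, degree 6 over two features should produce 28 columns: the constant plus all monomials of total degree 1 to 6, i.e. 1 + 2 + 3 + ... + 7 = 28.
ncol(X_poly) # expected: 28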
# testing
CostFunction(as.matrix(X_poly),y, rep(0, ncol(X_poly)), 1)
## [1] 0.6931472
#fprintf('Expected cost (approx): 0.693\n');
head(GradFunction(as.matrix(X_poly), y, rep(0, ncol(X_poly)), 1), 5)
## Test
## rep(1, length(X1)) 8.474576e-03
## V2 1.878809e-02
## V3 7.777119e-05
## V4 5.034464e-02
## V5 1.150133e-02
#fprintf('Expected gradients (approx) - first five values only:\n');
#fprintf(' 0.0085\n 0.0188\n 0.0001\n 0.0503\n 0.0115\n');
CostFunction(as.matrix(X_poly), y, rep(1, ncol(X_poly)),10)
## [1] 3.164509
#fprintf('\nCost at test theta (with lambda = 10): %f\n', cost);
#fprintf('Expected cost (approx): 3.16\n');
head(GradFunction(as.matrix(X_poly), y, rep(1, ncol(X_poly)), 10), 5)
## Test
## rep(1, length(X1)) 0.34604507
## V2 0.16135192
## V3 0.19479576
## V4 0.22686278
## V5 0.09218568
#fprintf('Expected gradients (approx) - first five values only:\n');
#fprintf(' 0.3460\n 0.1614\n 0.1948\n 0.2269\n 0.0922\n');
In this part, you will get to try different values of lambda and see how regularization affects the decision boundary.
Try the following values of lambda (0, 1, 10, 100).
How does the decision boundary change when you vary lambda? How does the training set accuracy vary?
Note: in the exercise, we use mapFeature with degree 6, which gives a 28-dimensional feature vector. However, the results I get for this last part of the exercise are different. On the Machinegurning website (http://www.machinegurning.com/rstats/regularised-logistic-regression/), the author also redoes the exercise and goes up to degree-12 features instead. It is not clear to me what the problem is, as all the previous parts of the exercise match. I noticed that I get better results (closer to the exercise) using method = "BFGS" in the TrainLogisticReg function instead of leaving method unspecified, in which case optim defaults to the derivative-free Nelder-Mead method and the supplied gradient is not used.
As an example, for lambda = 1: without the BFGS method I get a train accuracy of 78%, and with the BFGS method I get 83.05% (it should be 83.1% according to the exercise).
TrainLogisticReg <- function(X, y, Lambda){ # same as before, but using the BFGS method
  ifelse(is.vector(X), initial_theta <- c(0, 0), initial_theta <- rep(0, ncol(X))) # initialize theta at zero
  ifelse(is.vector(X), m <- length(X), m <- nrow(X))
  assign("m", m, .GlobalEnv) # CostFunction and GradFunction read m from the global environment
  LogisticRec <- optim(par = initial_theta,
                       fn = CostFunction,
                       gr = GradFunction,
                       method = "BFGS",
                       X = X,
                       y = y,
                       Lambda = Lambda)
  print(paste("J", LogisticRec$value))
  return(LogisticRec$par)
}
#Lambda 0
lambda0 <- TrainLogisticReg(as.matrix(X_poly), y, 0)
## [1] "J 0.312649713409482"
boundary_fct <- function(theta){
  # evaluate the linear score theta' * mapFeature(u, v) over a grid;
  # the decision boundary is where this score equals zero
  u <- seq(-1, 1.5, length = 50)
  v <- seq(-1, 1.5, length = 50)
  z <- matrix(0, nrow = length(u), ncol = length(v))
  for (i in 1 : length(u)){
    for (j in 1 : length(v)){
      z[i, j] <- sum(as.matrix(mapFeature(u[i], v[j], 6)) * theta)
    }
  }
  # unroll the grid so each row of the data frame is (u, v, z);
  # u varies fastest and v slowest, matching the column-wise unrolling of z
  u <- rep(u, times = length(u))
  v <- rep(v, each = length(v))
  z <- as.vector(z) # note: as.vector(t(z)) would pair each point with the score of its mirror image across the diagonal
  boundary_grid <- as.data.frame(cbind(u, v, z))
  return(boundary_grid)
}
lambda0_boundary <- boundary_fct(lambda0)
plot <- ggplot(ex2data2, aes(x= Microchip_Test_1, y= Microchip_Test_2))+
geom_point(aes(colour= as.factor(Test)))+
xlab("Microchip test 1")+
ylab("Microchip test 2")+
theme_bw()
plot + geom_contour(data = lambda0_boundary, aes(x = u, y = v, z = z), bins= 1) +
ggtitle("Lambda 0")
# Lambda 1
lambda1 <- TrainLogisticReg(as.matrix(X_poly), y, 1)
## [1] "J 0.529002742805693"
lambda1_boundary <- boundary_fct(lambda1)
plot + geom_contour(data = lambda1_boundary, aes(x = u, y = v, z = z), bins= 1) +
ggtitle("Lambda 1")
# Lambda 100
lambda100 <- TrainLogisticReg(as.matrix(X_poly), y, 100)
## [1] "J 0.686483833880188"
lambda100_boundary <- boundary_fct(lambda100)
plot + geom_contour(data = lambda100_boundary, aes(x = u, y = v, z = z), bins= 1) +
ggtitle("Lambda 100")
As we can see, as lambda increases the decision boundary becomes simpler and follows the training data less closely.
Moreover, although the plots for lambda = 1 and lambda = 100 are quite close to the ones in the exercise, my lambda = 0 boundary does not overfit as much as the one shown in the exercise.