Training a logistic regression model

The Data

We want to train a logistic regression model on a dataset with 2 feature variables. The data is in the file ex2data2.txt in the usual Teams file share folder for the class. Below is a plot of the binary target variable y against the 2 features (crosses are 1s and circles are 0s).

#Read the data
library(data.table)
library(ggplot2)

dataset<-as.matrix(fread("ex2data2.txt"))
colnames(dataset)<-c("x1","x2","y")

d<-as.data.frame(dataset)

d$y<-as.factor(d$y)

ggplot(d, aes(x = x1, y = x2, colour = y)) +
  geom_point(aes(shape = y, stroke = 2), size = 3) +
  scale_shape_manual(values = c(1, 3))

The Training Functions

Function for Predicted Probabilities

# Sigmoid hypothesis: returns P(y = 1 | x) for each row of X
predicted_prob <- function(theta, X){
  return(1 / (1 + exp(-(X %*% theta))))
}
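
For reference, this is the standard logistic (sigmoid) hypothesis, applied row-wise to X:

$$h_\theta(x) = \frac{1}{1 + e^{-\theta^{T}x}}$$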

The Cost Function for the Regression

costfunctionreg <- function(theta, X, y, lambda){
  
  # Compute the cost for logistic regression with regularization:
  # J = costfunctionreg(theta, X, y, lambda) is the cost of using theta
  # as the parameter vector for regularized logistic regression.
  # The intercept theta[1] is not regularized.
  
  # Initialize some useful values
  m <- length(y)  # number of training examples
  
  h_theta <- predicted_prob(theta, X)
  J <- (1/m) * sum(-y * log(h_theta) - (1 - y) * log(1 - h_theta)) +
    (lambda/(2*m)) * t(theta[2:length(theta)]) %*% theta[2:length(theta)]
  
  return(J)
  
}
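
The value returned is the regularized negative log-likelihood; with R's 1-based indexing, the penalty sum starts at j = 2 so the intercept is not penalized:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log h_\theta(x^{(i)}) - \big(1-y^{(i)}\big)\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=2}^{n}\theta_j^{2}$$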

The Training Function

train <- function(theta0, X, y, lambda){
  
  # Minimize the regularized cost with BFGS, supplying the analytic gradient.
  # X, y and lambda are passed by name so optim forwards them to both
  # costfunctionreg and gradient through its ... argument.
  theta_opt <- optim(theta0, costfunctionreg, gradient,
                     X = X, y = y, lambda = lambda,
                     method = "BFGS",
                     control = list(reltol = 1e-12, maxit = 1e7),
                     hessian = FALSE)
  
  return(theta_opt)
  
}

The Gradient Function

gradient <- function(theta, X, y, lambda){
  
  # Initialize some useful values
  m <- length(y)  # number of training examples
  
  # Copy theta with the intercept zeroed out, so it is not regularized
  thetaZero <- c(0, theta[2:length(theta)])
  
  h_theta <- predicted_prob(theta, X)
  
  # Gradient of the regularized cost; the (lambda/m) factor matches the
  # (lambda/(2m)) penalty used in costfunctionreg
  grad <- t((1/m) * t(h_theta - y) %*% X) + (lambda/m) * thetaZero
  
  return(grad)
  
}
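
This implements the gradient of the cost above; differentiating the penalty gives the (lambda/m) * theta_j term, which thetaZero suppresses for the intercept:

$$\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\,x_j^{(i)} + \frac{\lambda}{m}\theta_j$$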

Function for Computing the Classification Metrics

#Accuracy = (TP + TN)/(TP + FP + TN + FN)
#Recall = TP/(TP + FN)
#TNR = TN/(TN + FP)
#Precision = TP/(TP + FP)
#NPV = TN/(TN + FN)

classification_metrics <- function(theta, X, y, classification_threshold){
  
  assigned_class <- as.numeric(predicted_prob(theta, X) > classification_threshold)
  
  # Convert to factors so the confusion matrix always has both levels
  assigned_class <- factor(as.character(assigned_class), levels = c("0", "1"))
  y <- as.factor(y)
  
  # Rows are the actual classes, columns the assigned classes
  cm <- as.matrix(table(y, assigned_class))
  
  accuracy <- sum(diag(cm)) / sum(cm)
  
  recall <- cm[2,2] / (cm[2,1] + cm[2,2])
  
  TNR <- cm[1,1] / (cm[1,1] + cm[1,2])
  
  precision <- cm[2,2] / (cm[1,2] + cm[2,2])
  
  NPV <- cm[1,1] / (cm[1,1] + cm[2,1])
  
  return(list(accuracy = accuracy, recall = recall, TNR = TNR, precision = precision, NPV = NPV))
}
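
As a quick sanity check, here is a toy call with made-up numbers, chosen so that the confusion matrix contains exactly one TP, TN, FP and FN; every metric should then come out to 0.5:

toy_theta <- c(0, 1)                   # hypothetical parameters
toy_X <- cbind(1, c(-2, -1, 1, 2))     # intercept column plus one feature
toy_y <- c(0, 1, 0, 1)

# sigmoid(-2) and sigmoid(-1) are below 0.5, sigmoid(1) and sigmoid(2) above,
# so the assigned classes at threshold 0.5 are 0, 0, 1, 1
classification_metrics(toy_theta, toy_X, toy_y, 0.5)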

Function for Polynomial Features of a Certain Degree

# Generate all monomials x1^(i-j) * x2^j for total degree i = 1..degree,
# preceded by a column of 1s for the intercept
add.poly.features <- function(x, degree){
  new.mat <- matrix(1, nrow = nrow(x))
  for (i in 1:degree){
    for (j in 0:i){
      new.mat <- cbind(new.mat, x[,1]^(i-j) * x[,2]^j)
    }
  }
  return(new.mat)
}
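
To see the column layout, here is a quick check on a single made-up point (x1 = 2, x2 = 3) with degree 2; the generated columns are 1, x1, x2, x1^2, x1*x2, x2^2:

add.poly.features(matrix(c(2, 3), nrow = 1), 2)
# yields 1, 2, 3, 4, 6, 9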

Split the Dataset into Training and Test Sets

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked _by_ '.GlobalEnv':
## 
##     train
set.seed(100)

train_indexes<-createDataPartition(dataset[,"y"],p=0.7,list=F)

X_train<-dataset[train_indexes,colnames(dataset) != "y"]
y_train<-dataset[train_indexes,"y"]

X_test<-dataset[-train_indexes,colnames(dataset) != "y"]
y_test<-dataset[-train_indexes,"y"]

Features for Polynomial Degree = 1

degree_to_fit <- 1
lambda<-0

X_train_features<-add.poly.features(X_train,degree_to_fit)
X_test_features<-add.poly.features(X_test,degree_to_fit)

Parameter Estimates

theta0<-rep(0,dim(X_train_features)[2])

res<-train(theta0,X_train_features,y_train,lambda = lambda)

print(res$par)
## [1] -0.1120450 -0.1282820  0.2213682
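
With degree 1 the fitted model implies a straight decision boundary, theta[1] + theta[2]*x1 + theta[3]*x2 = 0. As a visual check, here is a minimal sketch (reusing d and res from above, plotted over the full dataset); a straight line clearly cannot separate these classes, consistent with the roughly 50% accuracy reported below:

b <- res$par
ggplot(d, aes(x = x1, y = x2, colour = y)) +
  geom_point(aes(shape = y, stroke = 2), size = 3) +
  scale_shape_manual(values = c(1, 3)) +
  # boundary: x2 = -(b[1] + b[2]*x1) / b[3]
  geom_abline(intercept = -b[1]/b[3], slope = -b[2]/b[3])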

Classification Metrics for the Training Set

classification_metrics(res$par,X_train_features,y_train,0.5)
## $accuracy
## [1] 0.4939759
## 
## $recall
## [1] 0.3
## 
## $TNR
## [1] 0.6744186
## 
## $precision
## [1] 0.4615385
## 
## $NPV
## [1] 0.6744186

Classification Metrics for the Test Set

classification_metrics(res$par,X_test_features,y_test,0.5)
## $accuracy
## [1] 0.5142857
## 
## $recall
## [1] 0.3888889
## 
## $TNR
## [1] 0.6470588
## 
## $precision
## [1] 0.5384615
## 
## $NPV
## [1] 0.6470588

Improving the Model Performance

Polynomial Degree = 2

degree_to_fit <- 2
lambda<-0

X_train_features<-add.poly.features(X_train,degree_to_fit)
X_test_features<-add.poly.features(X_test,degree_to_fit)

theta0<-rep(0,dim(X_train_features)[2])

res<-train(theta0,X_train_features,y_train,lambda = lambda)

Classification Metrics for the Training Set (Polynomial Degree = 2)

classification_metrics(res$par,X_train_features,y_train,0.5)
## $accuracy
## [1] 0.7951807
## 
## $recall
## [1] 0.775
## 
## $TNR
## [1] 0.8139535
## 
## $precision
## [1] 0.7948718
## 
## $NPV
## [1] 0.8139535

Classification Metrics for the Test Set (Polynomial Degree = 2)

classification_metrics(res$par,X_test_features,y_test,0.5)
## $accuracy
## [1] 0.9428571
## 
## $recall
## [1] 1
## 
## $TNR
## [1] 0.8823529
## 
## $precision
## [1] 0.9
## 
## $NPV
## [1] 0.8823529

Polynomial Degree = 3

degree_to_fit <- 3
lambda<-0

X_train_features<-add.poly.features(X_train,degree_to_fit)
X_test_features<-add.poly.features(X_test,degree_to_fit)

theta0<-rep(0,dim(X_train_features)[2])

res<-train(theta0,X_train_features,y_train,lambda = lambda)

Classification Metrics for the Training Set (Polynomial Degree = 3)

classification_metrics(res$par,X_train_features,y_train,0.5)
## $accuracy
## [1] 0.8313253
## 
## $recall
## [1] 0.85
## 
## $TNR
## [1] 0.8139535
## 
## $precision
## [1] 0.8095238
## 
## $NPV
## [1] 0.8139535

Classification Metrics for the Test Set (Polynomial Degree = 3)

classification_metrics(res$par,X_test_features,y_test,0.5)
## $accuracy
## [1] 0.9142857
## 
## $recall
## [1] 1
## 
## $TNR
## [1] 0.8235294
## 
## $precision
## [1] 0.8571429
## 
## $NPV
## [1] 0.8235294

Polynomial Degree = 4

degree_to_fit <- 4
lambda<-0

X_train_features<-add.poly.features(X_train,degree_to_fit)
X_test_features<-add.poly.features(X_test,degree_to_fit)

theta0<-rep(0,dim(X_train_features)[2])

res<-train(theta0,X_train_features,y_train,lambda = lambda)

Classification Metrics for the Training Set (Polynomial Degree = 4)

classification_metrics(res$par,X_train_features,y_train,0.5)
## $accuracy
## [1] 0.8554217
## 
## $recall
## [1] 0.9
## 
## $TNR
## [1] 0.8139535
## 
## $precision
## [1] 0.8181818
## 
## $NPV
## [1] 0.8139535

Classification Metrics for the Test Set (Polynomial Degree = 4)

classification_metrics(res$par,X_test_features,y_test,0.5)
## $accuracy
## [1] 0.9142857
## 
## $recall
## [1] 1
## 
## $TNR
## [1] 0.8235294
## 
## $precision
## [1] 0.8571429
## 
## $NPV
## [1] 0.8235294

Accuracy Chart by Polynomial Degree for the Training Set

# Training-set accuracies (in %) by polynomial degree, from the results above
A <- c(49,80,83,86)
D <- c("1","2","3","4")

# Plot the bar chart
barplot(A, names.arg = D, xlab = "Polynomial Degree", ylab = "Accuracy", col = "blue",
        main = "Accuracy Chart", border = "red")

Accuracy Chart by Polynomial Degree for the Test Set

# Test-set accuracies (in %) by polynomial degree, from the results above
A <- c(51,94,91,91)
D <- c("1","2","3","4")

# Plot the bar chart
barplot(A, names.arg = D, xlab = "Polynomial Degree", ylab = "Accuracy", col = "blue",
        main = "Accuracy Chart", border = "red")

Comments on the Results

The accuracy results for the training and test sets are presented in the two bar charts above.

Model selection is based on test accuracy, because training accuracy can be high simply due to overfitting.

We choose the polynomial of degree 2 because it gives the highest test accuracy, 0.94 (see the bar chart for the test set).

The training accuracy keeps rising as the polynomial degree increases, but this is overfitting: a higher-degree polynomial fits the training points ever more closely while generalizing poorly to points not used in training. This is precisely why we split the data into training and test sets; classification metrics should be measured on data that was not used for training.
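
The degree-by-degree comparison above can also be automated. Here is a minimal sketch, assuming the functions and splits defined earlier (add.poly.features, train, classification_metrics, X_train, y_train, X_test, y_test) are in scope:

# Test accuracy for each polynomial degree from 1 to 4
test_accuracy <- sapply(1:4, function(deg) {
  Xtr <- add.poly.features(X_train, deg)
  Xte <- add.poly.features(X_test, deg)
  fit <- train(rep(0, ncol(Xtr)), Xtr, y_train, lambda = 0)
  classification_metrics(fit$par, Xte, y_test, 0.5)$accuracy
})

# Select the degree with the highest test accuracy
which.max(test_accuracy)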