Logistic Regression with Ridge Penalty



2. (6+4+2=12 pts) Diabetes classification using logistic regression

Diabetes is a major health concern among Pima Indian women. In this problem, we analyze a dataset of 768 individuals and try to predict whether or not an individual suffers from diabetes. The complete dataset description can be found at http://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes. Below are the summary statistics of this dataset.

• 768 individuals

• 8 features

• 1 nominal class variable {0,1}, where 0 and 1 denote absence and presence of diabetes respectively.


(a) Fit a ridge logistic regression (i.e., logistic regression with a ridge penalty) model using glmnet for each of the following two types of data preprocessing. We are going to use the first 400 samples to train the model, and the rest will be used as the test set. We have already preprocessed the train and test sets for your convenience (train-std.csv, test-std.csv, train-log.csv, test-log.csv). We are going to do 10-fold cross-validation to choose the strength of the regularizer in each case. Report the mean error rate (fraction of incorrect labels) on both the training and test sets, in a table.


dir()
##  [1] "APM_logistic.Rproj"     "diabetes_test-log.csv" 
##  [3] "diabetes_test-std.csv"  "diabetes_train-log.csv"
##  [5] "diabetes_train-std.csv" "figure"                
##  [7] "ridge_logit.html"       "ridge_logit.md"        
##  [9] "ridge_logit.Rmd"        "SVM.py"
library(dplyr)
library(glmnet)

set.seed(2)

#standardized training data
diabetes_train_std <- read.csv("diabetes_train-std.csv", header=TRUE, sep=",")
str(diabetes_train_std)
## 'data.frame':    400 obs. of  9 variables:
##  $ numpreg         : num  0.64 -0.844 1.233 -0.844 -1.141 ...
##  $ plasmacon       : num  0.838 -1.127 1.93 -1.002 0.495 ...
##  $ bloodpress      : num  0.126 -0.202 -0.311 -0.202 -1.624 ...
##  $ skinfold        : num  0.894 0.517 -1.306 0.14 0.894 ...
##  $ seruminsulin    : num  -0.699 -0.699 -0.699 0.114 0.753 ...
##  $ BMI             : num  0.165 -0.846 -1.322 -0.629 1.537 ...
##  $ pedigreefunction: num  0.469 -0.369 0.606 -0.927 5.51 ...
##  $ age             : num  1.4292 -0.195 -0.1095 -1.0499 -0.0241 ...
##  $ classvariable   : int  1 0 1 0 1 0 1 0 1 0 ...
#separating response and predictor variables
y_train_std <- as.factor(diabetes_train_std[,9])
x_train_std <- as.matrix(diabetes_train_std[,1:8])

#10-fold cross-validation to choose lambda (alpha=0 so the CV matches the ridge fit below)
cv_train_std <- cv.glmnet(x_train_std, y_train_std, type.measure="class", nfolds=10, family="binomial", alpha=0)
lambda <- cv_train_std$lambda.min
lambda
## [1] 0.006769725
plot(cv_train_std)

[Figure: 10-fold CV misclassification error versus log(lambda) for the standardized data]

#fit the model
#alpha=0 for the ridge penalty, alpha=1 for the lasso penalty
std_ridge_logit <- glmnet(x_train_std, y_train_std, family="binomial", alpha=0)


#predicting with the training set
SRL_pred_train <- predict(std_ridge_logit, x_train_std, type="class", s=lambda)

#report mean error rate (fraction of incorrect labels)
confusion_matrix_train <- table(y_train_std, SRL_pred_train)
confusion_matrix_train
##            SRL_pred_train
## y_train_std   0   1
##           0 214  32
##           1  62  92
error_rate_train <- mean(SRL_pred_train != y_train_std)  #fraction of incorrect labels
error_rate_train
## [1] 0.235
#standardized test data
diabetes_test_std <- read.csv("diabetes_test-std.csv", header=TRUE, sep=",")
#dim = (357, 9)

#separating response and predictor variables
y_test_std <- as.factor(diabetes_test_std[,9])
x_test_std <- as.matrix(diabetes_test_std[,1:8])

#predicting with the test set
SRL_pred_test <- predict(std_ridge_logit, x_test_std, type="class", s=lambda)
confusion_matrix_test <- table(y_test_std, SRL_pred_test)
confusion_matrix_test
##           SRL_pred_test
## y_test_std   0   1
##          0 222  23
##          1  47  65
error_rate_test <- mean(SRL_pred_test != y_test_std)  #fraction of incorrect labels
error_rate_test
## [1] 0.1960784
##############################################################################

#Log-transformed data
diabetes_train_log <- read.csv("diabetes_train-log.csv", header=TRUE, sep=",")
#dim = (400, 9)

#separating response and predictor variables
y_train_log <- as.factor(diabetes_train_log[,9])
x_train_log <- as.matrix(diabetes_train_log[,1:8])

#10-fold cross-validation to choose lambda (alpha=0 so the CV matches the ridge fit below)
cv_train_log <- cv.glmnet(x_train_log, y_train_log, type.measure="class", nfolds=10, family="binomial", alpha=0)
lambda_log <- cv_train_log$lambda.min
lambda_log
## [1] 0.003015737
#fit the model
#alpha=0 for the ridge penalty, alpha=1 for the lasso penalty
log_ridge_logit <- glmnet(x_train_log, y_train_log, family="binomial", alpha=0)

#predicting with the log training set
LogRL_pred_train <- predict(log_ridge_logit, x_train_log, type="class", s=lambda_log)
log_confusion_matrix_train <- table(y_train_log, LogRL_pred_train)
log_confusion_matrix_train
##            LogRL_pred_train
## y_train_log   0   1
##           0 206  40
##           1  77  77
#error rate
log_error_train <- mean(LogRL_pred_train != y_train_log)  #fraction of incorrect labels
log_error_train
## [1] 0.2925
diabetes_test_log <- read.csv("diabetes_test-log.csv", header=TRUE, sep=",")
#dim = (357, 9)

y_test_log <- as.factor(diabetes_test_log[,9])
x_test_log <- as.matrix(diabetes_test_log[,1:8])

#predicting with LOG test set
LogRL_pred_test <- predict(log_ridge_logit, x_test_log, type="class", s=lambda_log)
log_confusion_matrix_test <- table(y_test_log, LogRL_pred_test)
log_confusion_matrix_test
##           LogRL_pred_test
## y_test_log   0   1
##          0 211  34
##          1  50  62
log_error_test <- mean(LogRL_pred_test != y_test_log)  #fraction of incorrect labels
log_error_test
## [1] 0.2352941
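
Summarizing the mean error rates (fractions of incorrect labels) from the runs above, as the problem requests:

| Preprocessing   | Training error | Test error |
|-----------------|----------------|------------|
| Standardized    | 0.2350         | 0.1961     |
| Log-transformed | 0.2925         | 0.2353     |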




(b) Plot the receiver operating characteristic (ROC) curves on the test data for each of the logistic regression models on the same plot. Use ROCR to get the ROC curve and ggplot2 to plot the ROC curves. Report the area under the ROC curve (AUC) for the two models in a table.

install.packages("ROCR")
## Error in contrib.url(repos, "source"): trying to use CRAN without setting a mirror
library(ROCR)
library(ggplot2)

#ROC curve for standardized data
prob_std <- predict(std_ridge_logit, x_test_std, type="response", s=lambda)
pred_std <- prediction(prob_std, y_test_std)
perf_std <- performance(pred_std, measure = "tpr", x.measure = "fpr")

#true positive rate
tpr.points1 <- attr(perf_std, "y.values")[[1]]
#tpr.points

#false positive rate
fpr.points1 <- attr(perf_std,"x.values")[[1]]
#fpr.points

#area under the curve
auc1 <- attr(performance(pred_std, "auc"), "y.values")[[1]]
formatted_auc1 <- signif(auc1, digits=3)


roc.data1 <- data.frame(fpr=fpr.points1, tpr=tpr.points1, model="Standardized")


ggplot(roc.data1, aes(x=fpr, ymin=0, ymax=tpr)) +
    geom_ribbon(alpha=0.2) +
    geom_line(aes(y=tpr)) +
    ggtitle(paste0("ROC Curve for Standardized Data w/ AUC=", formatted_auc1))

[Figure: ROC curve for the standardized data, AUC = 0.862]

######################################################################
#ROC curve for log data
prob <- predict(log_ridge_logit, x_test_log, type="response", s=lambda_log)
pred <- prediction(prob, y_test_log)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")

#true positive rate
tpr.points <- attr(perf, "y.values")[[1]]
#tpr.points

#false positive rate
fpr.points <- attr(perf,"x.values")[[1]]
#fpr.points

#area under the curve
auc <- attr(performance(pred, "auc"), "y.values")[[1]]
formatted_auc <- signif(auc, digits=3)


roc.data <- data.frame(fpr=fpr.points, tpr=tpr.points, model="Log-transformed")


ggplot(roc.data, aes(x=fpr, ymin=0, ymax=tpr)) +
    geom_ribbon(alpha=0.2) +
    geom_line(aes(y=tpr)) +
    ggtitle(paste0("ROC Curve for Log-Transformed Data w/ AUC=", formatted_auc))

[Figure: ROC curve for the log-transformed data, AUC = 0.824]
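
The prompt asks for both ROC curves on the same plot; a minimal sketch, assuming the roc.data1 and roc.data frames built above (their model column distinguishes the two preprocessing schemes):

#both ROC curves on one axis, distinguished by color
roc.both <- rbind(roc.data1, roc.data)
ggplot(roc.both, aes(x=fpr, y=tpr, colour=model)) +
    geom_line() +
    geom_abline(intercept=0, slope=1, linetype="dashed") +  #chance diagonal
    ggtitle("Test-Set ROC Curves: Standardized vs. Log-Transformed")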

How do the models compare based on AUC?

The area under an ROC curve quantifies a classifier's overall ability to discriminate between individuals with the disease and those without it. Here, the ridge logistic model fit to the standardized data performs marginally better than the same model fit to the log-transformed data (standardized AUC: 0.862 vs. log-transformed AUC: 0.824). This finding is consistent with the error rates reported in part (a).
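
The AUCs in a table, as requested:

| Preprocessing   | Test AUC |
|-----------------|----------|
| Standardized    | 0.862    |
| Log-transformed | 0.824    |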


(c) Generate and plot the lift curves for the two logistic regression models on the same plot. Interpret the performance of the models (based on lift) at a positive prediction rate of 0.25.

#lift curve for the standardized data (colored by cutoff)
lift.perf1 <- performance(pred_std, "lift", "rpp")
plot(lift.perf1, colorize=TRUE, main="Regularized Logit Lift Curves", sub="Standardized (colored) vs. Log-Transformed (black)")

#lift curve for the log-transformed data, added to the same plot
lift.perf <- performance(pred, "lift", "rpp")
plot(lift.perf, colorize=FALSE, add=TRUE)

[Figure: lift curves for the standardized (colored) and log-transformed (black) models]


Interpret the performance of the models (based on lift) at a positive prediction rate of 0.25.

At a positive prediction rate of 0.25, the model using the standardized data (colored line) has the higher lift, approximately 2.4: among the 25% of individuals it ranks as most likely to have diabetes, the model captures positive cases at roughly 2.4 times the rate that random selection would. Thus the ridge-penalized logistic regression fit to the standardized data outperforms the same model fit to the log-transformed data, consistent with the mean error rates and AUCs reported earlier in the problem.
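
To read these lift values off numerically rather than from the plot, one can interpolate the coordinates stored in the ROCR performance objects. A minimal sketch; lift_at is a hypothetical helper, not a ROCR function:

#interpolate a lift curve at a given rate of positive predictions
lift_at <- function(perf, rpp=0.25) {
    approx(perf@x.values[[1]], perf@y.values[[1]], xout=rpp)$y
}
lift_at(lift.perf1)  #standardized data
lift_at(lift.perf)   #log-transformed data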