"NFHS Data – Logistic Regression

LOADING DATA INTO R ENVIRONMENT
- Loading the Data
TRAINING THE LOGISTIC REGRESSION MODEl
- Running the Training Mode without Interaction
- Variable Importance in Logistic Model
TESTING THE LOGISTIC REGRESSION MODEL

LOADING DATA INTO R ENVIRONMENT

Loading the Data

# loading full data into R
load("NFHSW.rda")
# loading train data into R
load("TrainData.rda")
# loading test data into R
load("TestData.rda")
colnames(NFHSW.df2)

## [1] "Anaemic"                 "bmi_3cat"               
## [3] "place_resi"              "wealth_index"           
## [5] "religion.1"              "education"              
## [7] "caste"                   "dietary.diversity.score"

TRAINING THE LOGISTIC REGRESSION MODEl

Running the Training Mode without Interaction

# building logistic regression model
LRModel <- glm(Anaemic ~ bmi_3cat
                        + place_resi
                        + wealth_index
                        + religion.1
                        + education
                        + caste
                        + dietary.diversity.score,
                        data = trainData.df, family = binomial())
# summary of the model
summary(LRModel)

## 
## Call:
## glm(formula = Anaemic ~ bmi_3cat + place_resi + wealth_index + 
##     religion.1 + education + caste + dietary.diversity.score, 
##     family = binomial(), data = trainData.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5722  -1.2281   0.9443   1.0991   1.6239  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              0.822911   0.016225  50.720  < 2e-16 ***
## bmi_3cat2               -0.229282   0.007263 -31.567  < 2e-16 ***
## bmi_3cat3               -0.448299   0.009521 -47.084  < 2e-16 ***
## place_resi2             -0.013655   0.007329  -1.863   0.0624 .  
## wealth_index2           -0.140589   0.009297 -15.122  < 2e-16 ***
## wealth_index3           -0.162667   0.009705 -16.761  < 2e-16 ***
## wealth_index4           -0.155729   0.010552 -14.758  < 2e-16 ***
## wealth_index5           -0.134086   0.011931 -11.238  < 2e-16 ***
## religion.12             -0.064859   0.009571  -6.777 1.23e-11 ***
## religion.13             -0.691551   0.012891 -53.646  < 2e-16 ***
## religion.14             -0.089456   0.013528  -6.613 3.78e-11 ***
## education1              -0.040890   0.009809  -4.168 3.07e-05 ***
## education2              -0.093482   0.007417 -12.603  < 2e-16 ***
## education3              -0.189506   0.011413 -16.605  < 2e-16 ***
## caste2                   0.081779   0.010446   7.829 4.92e-15 ***
## caste3                  -0.097321   0.008203 -11.865  < 2e-16 ***
## caste4                  -0.172684   0.009461 -18.252  < 2e-16 ***
## dietary.diversity.score -0.012015   0.001428  -8.413  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 688812  on 499477  degrees of freedom
## Residual deviance: 679438  on 499460  degrees of freedom
## AIC: 679474
## 
## Number of Fisher Scoring iterations: 4

Variable Importance in Logistic Model

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# variable importance
varImp(LRModel)

##                           Overall
## bmi_3cat2               31.566901
## bmi_3cat3               47.084346
## place_resi2              1.863131
## wealth_index2           15.121519
## wealth_index3           16.761059
## wealth_index4           14.758122
## wealth_index5           11.238456
## religion.12              6.776502
## religion.13             53.646265
## religion.14              6.612600
## education1               4.168481
## education2              12.603044
## education3              16.605087
## caste2                   7.829066
## caste3                  11.864518
## caste4                  18.251522
## dietary.diversity.score  8.413411

TESTING THE LOGISTIC REGRESSION MODEL

Predicting Model on Test Data Set

# predicting the model on test data set
PredLRModel <- predict(LRModel, 
                    newdata = testData.df,type = "response")

Plotting the Predicted Probabilities

# plot of probabilities
plot(PredLRModel, 
     main = "Scatterplot of Probabilities of anaemic person", 
     xlab = "Customer ID", 
     ylab = "Predicted Probability of Anaemia")

Confusion Matrix at 50% Cut-Off Probability

# taking the cut-off probability 50%
pred.LR <- ifelse(PredLRModel > 0.50, "1", "0")
# saving predicted vector as factor 
Pred <- as.factor(pred.LR)
# ordering the vectors
Predicted <- ordered(Pred, levels = c("1", "0"))
Actual <- ordered(testData.df$Anaemic,levels = c("1", "0"))
# making confusion matrix
cm <-confusionMatrix(table(Predicted,Actual))
cm

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted     1     0
##         1 54419 40786
##         0 13321 16342
##                                           
##                Accuracy : 0.5667          
##                  95% CI : (0.5639, 0.5694)
##     No Information Rate : 0.5425          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.0929          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8034          
##             Specificity : 0.2861          
##          Pos Pred Value : 0.5716          
##          Neg Pred Value : 0.5509          
##              Prevalence : 0.5425          
##          Detection Rate : 0.4358          
##    Detection Prevalence : 0.7624          
##       Balanced Accuracy : 0.5447          
##                                           
##        'Positive' Class : 1               
##

Performance Metrics at different Cut-Off Probabilities

library(dplyr)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
      
       # predicting the test set results
         PredLRModel <- predict(LRModel, testData.df, type = "response")
         C1 <- ifelse(PredLRModel > cutoff, "1", "0")
         C2 <- testData.df$Anaemic
         predY   <- as.factor(C1)
           actualY <- as.factor(C2)
           Predicted <- ordered(predY, levels = c("1", "0"))
           Actual <-  ordered(actualY, levels = c("1", "0"))
        # use the confusionMatrix from the caret package
        cm1 <-confusionMatrix(data = Predicted,reference = Actual, positive = "1")
        # extracting accuracy
        Accuracy <- cm1$overall[1]
        # extracting sensitivity
          Sensitivity <- cm1$byClass[1]
        # extracting specificity
          Specificity <- cm1$byClass[2]
      # extracting value of kappa
          Kappa <- cm1$overall[2]
          
        # combined table
          tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
        return(tab)}
     # sequence of cut-off points        
        cutoff1 <- seq( 0, 1, by = .05 )
     # loop using "lapply"
        tab2    <- lapply(cutoff1, CmFn)
     # creating matrix of different metrics
        numrows = length(cutoff1)
        pm <- matrix(1:numrows*4, nrow = numrows, ncol=4)
     #  applying for loop
        for (i in 1:numrows){
          pm[i,] = tab2[[i]]}
        pm <- as.data.frame(pm)
        pm <- cbind(cutoff1, pm)
        pm <- rename(pm, cutoff =  cutoff1, Accuracy = V1, 
                       Senstivity = V2 ,Specificity = V3, kappa = V4)
    # printing the table
        print(pm)

##    cutoff  Accuracy   Senstivity Specificity        kappa
## 1    0.00 0.5424929 1.0000000000 0.000000000 0.0000000000
## 2    0.05 0.5424929 1.0000000000 0.000000000 0.0000000000
## 3    0.10 0.5424929 1.0000000000 0.000000000 0.0000000000
## 4    0.15 0.5424929 1.0000000000 0.000000000 0.0000000000
## 5    0.20 0.5424929 1.0000000000 0.000000000 0.0000000000
## 6    0.25 0.5424929 1.0000000000 0.000000000 0.0000000000
## 7    0.30 0.5429253 0.9992471213 0.001837978 0.0011770440
## 8    0.35 0.5449355 0.9941688810 0.012253186 0.0069565191
## 9    0.40 0.5499407 0.9810304104 0.038772581 0.0213746711
## 10   0.45 0.5608082 0.9350605255 0.117035429 0.0556063933
## 11   0.50 0.5666864 0.8033510481 0.286059375 0.0929091595
## 12   0.55 0.5514303 0.5222615884 0.586017365 0.1070008142
## 13   0.60 0.5104510 0.2279450841 0.845434813 0.0692523935
## 14   0.65 0.4734920 0.0540005905 0.970907436 0.0229443390
## 15   0.70 0.4576032 0.0002804842 0.999877468 0.0001445334
## 16   0.75 0.4575071 0.0000000000 1.000000000 0.0000000000
## 17   0.80 0.4575071 0.0000000000 1.000000000 0.0000000000
## 18   0.85 0.4575071 0.0000000000 1.000000000 0.0000000000
## 19   0.90 0.4575071 0.0000000000 1.000000000 0.0000000000
## 20   0.95 0.4575071 0.0000000000 1.000000000 0.0000000000
## 21   1.00 0.4575071 0.0000000000 1.000000000 0.0000000000

Plotting Performance Metrics

plot(pm$cutoff,pm$Senstivity,pch = 16, xlab = "Cut-off Probabilities", ylab = "ML Metrics",ylim = c(0,1),xlim = c(0,1),type = "l",lwd = 2,col= "red")
lines(pm$cutoff, pm$Specificity,col= "blue",lwd = 2)
lines(pm$cutoff, pm$Accuracy,col= "black")
legend(0.8, 0.3, legend = c("Senstivity", "Specificity","Accuracy"), col=c("red","blue","black"), cex=0.8,lty=1)

ROC Curve

# False Positive Rate
FPR <- 1-pm$Specificity 
# True positive Rate
TPR <- pm$Senstivity 
# plotting ROC curve
plot(FPR,TPR,main = "ROC Curve",col = 2,lwd = 2,type = "l",xlab = "False Positive Rate", ylab = "True positive Rate")
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

AUC (Area Under the Curve)

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

# area under curve
LRPrediction <- prediction(PredLRModel,testData.df$Anaemic)
aucLR <- performance(LRPrediction, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR

## [1] 0.5787662

"NFHS Data – Logistic Regression

Aryansh Gupta (IIM Lucknow)

LOADING DATA INTO R ENVIRONMENT

Loading the Data

TRAINING THE LOGISTIC REGRESSION MODEl

Running the Training Mode without Interaction

Variable Importance in Logistic Model

TESTING THE LOGISTIC REGRESSION MODEL

Predicting Model on Test Data Set

Plotting the Predicted Probabilities

Confusion Matrix at 50% Cut-Off Probability

Performance Metrics at different Cut-Off Probabilities

Plotting Performance Metrics

ROC Curve

AUC (Area Under the Curve)