# loading full data into R
load("NFHSW.rda")
# loading train data into R
load("TrainData.rda")
# loading test data into R
load("TestData.rda")
colnames(NFHSW.df2)## [1] "Anaemic" "bmi_3cat"
## [3] "place_resi" "wealth_index"
## [5] "religion.1" "education"
## [7] "caste" "dietary.diversity.score"
# building logistic regression model
LRModel <- glm(Anaemic ~ bmi_3cat
+ place_resi
+ wealth_index
+ religion.1
+ education
+ caste
+ dietary.diversity.score,
data = trainData.df, family = binomial())
# summary of the model
summary(LRModel)##
## Call:
## glm(formula = Anaemic ~ bmi_3cat + place_resi + wealth_index +
## religion.1 + education + caste + dietary.diversity.score,
## family = binomial(), data = trainData.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5722 -1.2281 0.9443 1.0991 1.6239
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.822911 0.016225 50.720 < 2e-16 ***
## bmi_3cat2 -0.229282 0.007263 -31.567 < 2e-16 ***
## bmi_3cat3 -0.448299 0.009521 -47.084 < 2e-16 ***
## place_resi2 -0.013655 0.007329 -1.863 0.0624 .
## wealth_index2 -0.140589 0.009297 -15.122 < 2e-16 ***
## wealth_index3 -0.162667 0.009705 -16.761 < 2e-16 ***
## wealth_index4 -0.155729 0.010552 -14.758 < 2e-16 ***
## wealth_index5 -0.134086 0.011931 -11.238 < 2e-16 ***
## religion.12 -0.064859 0.009571 -6.777 1.23e-11 ***
## religion.13 -0.691551 0.012891 -53.646 < 2e-16 ***
## religion.14 -0.089456 0.013528 -6.613 3.78e-11 ***
## education1 -0.040890 0.009809 -4.168 3.07e-05 ***
## education2 -0.093482 0.007417 -12.603 < 2e-16 ***
## education3 -0.189506 0.011413 -16.605 < 2e-16 ***
## caste2 0.081779 0.010446 7.829 4.92e-15 ***
## caste3 -0.097321 0.008203 -11.865 < 2e-16 ***
## caste4 -0.172684 0.009461 -18.252 < 2e-16 ***
## dietary.diversity.score -0.012015 0.001428 -8.413 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 688812 on 499477 degrees of freedom
## Residual deviance: 679438 on 499460 degrees of freedom
## AIC: 679474
##
## Number of Fisher Scoring iterations: 4
## Loading required package: lattice
## Loading required package: ggplot2
## Overall
## bmi_3cat2 31.566901
## bmi_3cat3 47.084346
## place_resi2 1.863131
## wealth_index2 15.121519
## wealth_index3 16.761059
## wealth_index4 14.758122
## wealth_index5 11.238456
## religion.12 6.776502
## religion.13 53.646265
## religion.14 6.612600
## education1 4.168481
## education2 12.603044
## education3 16.605087
## caste2 7.829066
## caste3 11.864518
## caste4 18.251522
## dietary.diversity.score 8.413411
# plot of probabilities
plot(PredLRModel,
main = "Scatterplot of Probabilities of anaemic person",
xlab = "Customer ID",
ylab = "Predicted Probability of Anaemia")# taking the cut-off probability 50%
pred.LR <- ifelse(PredLRModel > 0.50, "1", "0")
# saving predicted vector as factor
Pred <- as.factor(pred.LR)
# ordering the vectors
Predicted <- ordered(Pred, levels = c("1", "0"))
Actual <- ordered(testData.df$Anaemic,levels = c("1", "0"))
# making confusion matrix
cm <-confusionMatrix(table(Predicted,Actual))
cm## Confusion Matrix and Statistics
##
## Actual
## Predicted 1 0
## 1 54419 40786
## 0 13321 16342
##
## Accuracy : 0.5667
## 95% CI : (0.5639, 0.5694)
## No Information Rate : 0.5425
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.0929
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8034
## Specificity : 0.2861
## Pos Pred Value : 0.5716
## Neg Pred Value : 0.5509
## Prevalence : 0.5425
## Detection Rate : 0.4358
## Detection Prevalence : 0.7624
## Balanced Accuracy : 0.5447
##
## 'Positive' Class : 1
##
library(dplyr)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
# predicting the test set results
PredLRModel <- predict(LRModel, testData.df, type = "response")
C1 <- ifelse(PredLRModel > cutoff, "1", "0")
C2 <- testData.df$Anaemic
predY <- as.factor(C1)
actualY <- as.factor(C2)
Predicted <- ordered(predY, levels = c("1", "0"))
Actual <- ordered(actualY, levels = c("1", "0"))
# use the confusionMatrix from the caret package
cm1 <-confusionMatrix(data = Predicted,reference = Actual, positive = "1")
# extracting accuracy
Accuracy <- cm1$overall[1]
# extracting sensitivity
Sensitivity <- cm1$byClass[1]
# extracting specificity
Specificity <- cm1$byClass[2]
# extracting value of kappa
Kappa <- cm1$overall[2]
# combined table
tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
return(tab)}
# sequence of cut-off points
cutoff1 <- seq( 0, 1, by = .05 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# creating matrix of different metrics
numrows = length(cutoff1)
pm <- matrix(1:numrows*4, nrow = numrows, ncol=4)
# applying for loop
for (i in 1:numrows){
pm[i,] = tab2[[i]]}
pm <- as.data.frame(pm)
pm <- cbind(cutoff1, pm)
pm <- rename(pm, cutoff = cutoff1, Accuracy = V1,
Senstivity = V2 ,Specificity = V3, kappa = V4)
# printing the table
print(pm)## cutoff Accuracy Senstivity Specificity kappa
## 1 0.00 0.5424929 1.0000000000 0.000000000 0.0000000000
## 2 0.05 0.5424929 1.0000000000 0.000000000 0.0000000000
## 3 0.10 0.5424929 1.0000000000 0.000000000 0.0000000000
## 4 0.15 0.5424929 1.0000000000 0.000000000 0.0000000000
## 5 0.20 0.5424929 1.0000000000 0.000000000 0.0000000000
## 6 0.25 0.5424929 1.0000000000 0.000000000 0.0000000000
## 7 0.30 0.5429253 0.9992471213 0.001837978 0.0011770440
## 8 0.35 0.5449355 0.9941688810 0.012253186 0.0069565191
## 9 0.40 0.5499407 0.9810304104 0.038772581 0.0213746711
## 10 0.45 0.5608082 0.9350605255 0.117035429 0.0556063933
## 11 0.50 0.5666864 0.8033510481 0.286059375 0.0929091595
## 12 0.55 0.5514303 0.5222615884 0.586017365 0.1070008142
## 13 0.60 0.5104510 0.2279450841 0.845434813 0.0692523935
## 14 0.65 0.4734920 0.0540005905 0.970907436 0.0229443390
## 15 0.70 0.4576032 0.0002804842 0.999877468 0.0001445334
## 16 0.75 0.4575071 0.0000000000 1.000000000 0.0000000000
## 17 0.80 0.4575071 0.0000000000 1.000000000 0.0000000000
## 18 0.85 0.4575071 0.0000000000 1.000000000 0.0000000000
## 19 0.90 0.4575071 0.0000000000 1.000000000 0.0000000000
## 20 0.95 0.4575071 0.0000000000 1.000000000 0.0000000000
## 21 1.00 0.4575071 0.0000000000 1.000000000 0.0000000000
plot(pm$cutoff,pm$Senstivity,pch = 16, xlab = "Cut-off Probabilities", ylab = "ML Metrics",ylim = c(0,1),xlim = c(0,1),type = "l",lwd = 2,col= "red")
lines(pm$cutoff, pm$Specificity,col= "blue",lwd = 2)
lines(pm$cutoff, pm$Accuracy,col= "black")
legend(0.8, 0.3, legend = c("Senstivity", "Specificity","Accuracy"), col=c("red","blue","black"), cex=0.8,lty=1)# False Positive Rate
FPR <- 1-pm$Specificity
# True positive Rate
TPR <- pm$Senstivity
# plotting ROC curve
plot(FPR,TPR,main = "ROC Curve",col = 2,lwd = 2,type = "l",xlab = "False Positive Rate", ylab = "True positive Rate")
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# area under curve
LRPrediction <- prediction(PredLRModel,testData.df$Anaemic)
aucLR <- performance(LRPrediction, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR## [1] 0.5787662