Yahye Project
# 5 Supervised Machine Learning Models
# Measles Vaccination Coverage in Somaliland
library(haven)
final<-read_dta("~/YahyeM.dta")
str(final)
## tibble [577 × 20] (S3: tbl_df/tbl/data.frame)
## $ Age_Group : dbl+lbl [1:577] 3, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 4, 3, 5, 2, 3, 3, 5, ...
## ..@ label : chr "Age in 5-year groups"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
## $ Geographic_Region : dbl+lbl [1:577] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
## .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
## $ Place_of_Residence : dbl+lbl [1:577] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 2, ...
## ..@ label : chr "Residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "Urban" "Rural" "Nomadic"
## $ Maternal_Education : dbl+lbl [1:577] 0, 1, 0, 0, 3, 2, 2, 2, 3, 1, 3, 1, 2, 0, 0, 0, 0, 0, ...
## ..@ label : chr "Education"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" " Secondary" "Higher"
## $ Radio_Listening : dbl+lbl [1:577] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## ..@ label : chr "Frequency of listening to radio"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Television_Watching : dbl+lbl [1:577] 1, 1, 3, 1, 1, 2, 3, 3, 2, 1, 1, 3, 1, 1, 3, 3, 3, 3, ...
## ..@ label : chr "Frequency of watching television"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Internet_Usage : dbl+lbl [1:577] 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, ...
## ..@ label : chr "Ever used internet"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Wealth_Quantile : dbl+lbl [1:577] 4, 5, 3, 5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 5, 1, 5, 5, 5, ...
## ..@ label : chr "Wealth quintile"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ Health_Access : dbl+lbl [1:577] 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, ...
## ..@ label : chr "Getting medical help for self: getting permission to go"
## ..@ format.stata: chr "%17.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Big problem" "Not a big problem"
## $ Distance_Healthcare : dbl+lbl [1:577] 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
## ..@ label : chr "Getting medical help for self: distance to health facility"
## ..@ format.stata: chr "%17.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Big problem" "Not a big problem"
## $ Birth_order : num [1:577] 1 1 1 4 1 3 1 2 1 1 ...
## ..- attr(*, "label")= chr "Birth Order"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ Child_twin : dbl+lbl [1:577] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## ..@ label : chr "Twins"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Sing" "Mult"
## $ Child_sex : dbl+lbl [1:577] 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, ...
## ..@ label : chr "Sex of child"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Male" "Female"
## $ Child_Age : num [1:577] 11 3 7 3 6 13 28 29 6 20 ...
## ..- attr(*, "label")= chr "Age in Months"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ ANC_Numbers : num [1:577] 6 3 0 3 2 9 7 4 7 0 ...
## ..- attr(*, "label")= chr "Number of antenatal visits during pregnancy"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ Postnatal_care : dbl+lbl [1:577] 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## ..@ label : chr "Respondent's health checked after discharge/delivery at home"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ DV : dbl+lbl [1:577] 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, ...
## ..@ label : chr "Measless Vaccination Status"
## ..@ format.stata: chr "%3.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "No" "Yes"
## $ Husband_Education : dbl+lbl [1:577] 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, ...
## ..@ label : chr "Husband/partner's ever attended school"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Husband_Employment_status : dbl+lbl [1:577] 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, ...
## ..@ label : chr "Husband/partner worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Maternal_Employment_status: dbl+lbl [1:577] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## ..@ label : chr "Respondent worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
# Round-trip through CSV to drop the Stata value labels and keep plain numeric codes
write.csv(final,"~/Yahye.csv")
final<-read.csv("~/Yahye.csv")
final<-final[,-1] # drop the row-index column added by write.csv
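# Aside: haven can strip the value labels in memory as well; this sketch is
# equivalent to the CSV round-trip above (a no-op at this point, shown for reference):
# final <- zap_labels(zap_label(final))   # plain numeric codes, no label attributes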
# Logistic Regression
logistic<-glm(DV~.,data=final,family = binomial())
summary(logistic)
##
## Call:
## glm(formula = DV ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.885033 2.872604 0.656 0.5117
## Age_Group -0.084942 0.107212 -0.792 0.4282
## Geographic_Region -0.046430 0.020497 -2.265 0.0235 *
## Place_of_Residence -0.343763 0.202148 -1.701 0.0890 .
## Maternal_Education -0.069276 0.154013 -0.450 0.6529
## Radio_Listening -0.528832 0.203056 -2.604 0.0092 **
## Television_Watching 0.125964 0.154782 0.814 0.4158
## Internet_Usage 0.028090 0.288116 0.097 0.9223
## Wealth_Quantile -0.015474 0.113408 -0.136 0.8915
## Health_Access 0.190633 0.251472 0.758 0.4484
## Distance_Healthcare 0.146684 0.244886 0.599 0.5492
## Birth_order 0.070152 0.053205 1.319 0.1873
## Child_twin -0.290760 1.229882 -0.236 0.8131
## Child_sex -0.200391 0.198525 -1.009 0.3128
## Child_Age 0.133941 0.013621 9.834 <2e-16 ***
## ANC_Numbers -0.036047 0.050346 -0.716 0.4740
## Postnatal_care -0.421795 0.269327 -1.566 0.1173
## Husband_Education 0.175600 0.228266 0.769 0.4417
## Husband_Employment_status -0.140806 0.219249 -0.642 0.5207
## Maternal_Employment_status -0.006725 1.109219 -0.006 0.9952
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 787.91 on 576 degrees of freedom
## Residual deviance: 622.08 on 557 degrees of freedom
## AIC: 662.08
##
## Number of Fisher Scoring iterations: 4
round(coef(summary(logistic)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8850 2.8726 0.6562 0.5117
## Age_Group -0.0849 0.1072 -0.7923 0.4282
## Geographic_Region -0.0464 0.0205 -2.2652 0.0235
## Place_of_Residence -0.3438 0.2021 -1.7005 0.0890
## Maternal_Education -0.0693 0.1540 -0.4498 0.6529
## Radio_Listening -0.5288 0.2031 -2.6044 0.0092
## Television_Watching 0.1260 0.1548 0.8138 0.4158
## Internet_Usage 0.0281 0.2881 0.0975 0.9223
## Wealth_Quantile -0.0155 0.1134 -0.1364 0.8915
## Health_Access 0.1906 0.2515 0.7581 0.4484
## Distance_Healthcare 0.1467 0.2449 0.5990 0.5492
## Birth_order 0.0702 0.0532 1.3185 0.1873
## Child_twin -0.2908 1.2299 -0.2364 0.8131
## Child_sex -0.2004 0.1985 -1.0094 0.3128
## Child_Age 0.1339 0.0136 9.8335 0.0000
## ANC_Numbers -0.0360 0.0503 -0.7160 0.4740
## Postnatal_care -0.4218 0.2693 -1.5661 0.1173
## Husband_Education 0.1756 0.2283 0.7693 0.4417
## Husband_Employment_status -0.1408 0.2192 -0.6422 0.5207
## Maternal_Employment_status -0.0067 1.1092 -0.0061 0.9952
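# The estimates above are log-odds; exponentiating them gives odds ratios,
# which are easier to read. A minimal sketch with Wald 95% intervals:
OR <- exp(cbind(OR = coef(logistic), confint.default(logistic)))
round(OR, 3) # e.g. each extra month of Child_Age multiplies the odds of vaccination by exp(0.134) ~ 1.14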
# Prediction Models
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Prep Training and Test dataset
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$DV, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
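# createDataPartition() samples within the levels of DV, so the vaccination
# rate should be almost identical in the two subsets -- a quick check (sketch):
round(rbind(train = prop.table(table(trainData$DV)),
            test  = prop.table(table(testData$DV))), 3)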
###################################################
# Logistic regression refit on the training data (all predictors)
###################################################
logitmod2<-glm(DV~ .,
family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = DV ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.668196 2.986042 0.559 0.57639
## Age_Group -0.067843 0.118926 -0.570 0.56836
## Geographic_Region -0.064887 0.023679 -2.740 0.00614 **
## Place_of_Residence -0.220630 0.229010 -0.963 0.33534
## Maternal_Education -0.140843 0.172797 -0.815 0.41503
## Radio_Listening -0.492412 0.223192 -2.206 0.02737 *
## Television_Watching -0.002837 0.173962 -0.016 0.98699
## Internet_Usage 0.044454 0.327084 0.136 0.89189
## Wealth_Quantile -0.012463 0.131245 -0.095 0.92435
## Health_Access 0.396323 0.283477 1.398 0.16209
## Distance_Healthcare 0.172527 0.272641 0.633 0.52687
## Birth_order 0.069386 0.059748 1.161 0.24551
## Child_twin -0.048461 1.221355 -0.040 0.96835
## Child_sex -0.091635 0.225825 -0.406 0.68491
## Child_Age 0.139645 0.015634 8.932 < 2e-16 ***
## ANC_Numbers -0.083213 0.056063 -1.484 0.13774
## Postnatal_care -0.361494 0.306368 -1.180 0.23803
## Husband_Education 0.337657 0.260281 1.297 0.19454
## Husband_Employment_status -0.299764 0.247795 -1.210 0.22638
## Maternal_Employment_status -0.183268 1.121369 -0.163 0.87018
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 632.65 on 461 degrees of freedom
## Residual deviance: 491.66 on 442 degrees of freedom
## AIC: 531.66
##
## Number of Fisher Scoring iterations: 4
# Apply the model to the test data
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03969 0.32499 0.54409 0.55382 0.81588 0.98343
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.5,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.7043478
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_table <- table(testData$DV, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 33 13
## 1 21 48
##
## Accuracy : 0.7043
## 95% CI : (0.6121, 0.7858)
## No Information Rate : 0.5304
## P-Value [Acc > NIR] : 0.0001067
##
## Kappa : 0.4014
##
## Mcnemar's Test P-Value : 0.2299491
##
## Sensitivity : 0.6111
## Specificity : 0.7869
## Pos Pred Value : 0.7174
## Neg Pred Value : 0.6957
## Prevalence : 0.4696
## Detection Rate : 0.2870
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.6990
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.6111 |
| Specificity          |  0.7869 |
| Pos Pred Value       |  0.7174 |
| Neg Pred Value       |  0.6957 |
| Precision            |  0.7174 |
| Recall               |  0.6111 |
| F1                   |  0.6600 |
| Prevalence           |  0.4696 |
| Detection Rate       |  0.2870 |
| Detection Prevalence |  0.4000 |
| Balanced Accuracy    |  0.6990 |
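# Sanity check: the F1 in the table is the harmonic mean of precision and
# recall, recomputed here by hand:
precision <- 0.7174; recall <- 0.6111
round(2 * precision * recall / (precision + recall), 4) # 0.66, matching the table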
#############################################
# Probit Regression
#############################################
probitmod<-glm(DV~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = DV ~ ., family = binomial(link = "probit"), data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.124850 1.740920 0.646 0.51820
## Age_Group -0.054694 0.069861 -0.783 0.43369
## Geographic_Region -0.037476 0.013885 -2.699 0.00695 **
## Place_of_Residence -0.127339 0.134869 -0.944 0.34508
## Maternal_Education -0.085141 0.102205 -0.833 0.40482
## Radio_Listening -0.299373 0.130480 -2.294 0.02177 *
## Television_Watching -0.005267 0.101960 -0.052 0.95880
## Internet_Usage 0.017937 0.192132 0.093 0.92562
## Wealth_Quantile -0.006721 0.077389 -0.087 0.93079
## Health_Access 0.220601 0.165266 1.335 0.18194
## Distance_Healthcare 0.109603 0.159441 0.687 0.49182
## Birth_order 0.044818 0.035222 1.272 0.20322
## Child_twin -0.088809 0.719598 -0.123 0.90178
## Child_sex -0.064269 0.133181 -0.483 0.62940
## Child_Age 0.081067 0.008649 9.373 < 2e-16 ***
## ANC_Numbers -0.045730 0.032546 -1.405 0.16000
## Postnatal_care -0.229310 0.181379 -1.264 0.20614
## Husband_Education 0.209239 0.153836 1.360 0.17378
## Husband_Employment_status -0.196697 0.145662 -1.350 0.17690
## Maternal_Employment_status -0.083692 0.649477 -0.129 0.89747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 632.65 on 461 degrees of freedom
## Residual deviance: 493.11 on 442 degrees of freedom
## AIC: 533.11
##
## Number of Fisher Scoring iterations: 5
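# Probit coefficients sit on the standard-normal (z) scale, so they are
# smaller than the logit estimates by roughly a factor of 1.6 (a common rule
# of thumb), while the fitted probabilities barely differ -- a quick check (sketch):
round(coef(logitmod2) / coef(probitmod), 2) # mostly ~1.6-1.8
cor(fitted(logitmod2), fitted(probitmod))   # expected to be close to 1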
# Apply the model to the test data
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02841 0.33874 0.54033 0.55394 0.79885 0.99066
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.5,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.7130435
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_table <- table(testData$DV, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 33 13
## 1 20 49
##
## Accuracy : 0.713
## 95% CI : (0.6212, 0.7935)
## No Information Rate : 0.5391
## P-Value [Acc > NIR] : 0.0001017
##
## Kappa : 0.417
##
## Mcnemar's Test P-Value : 0.2962699
##
## Sensitivity : 0.6226
## Specificity : 0.7903
## Pos Pred Value : 0.7174
## Neg Pred Value : 0.7101
## Prevalence : 0.4609
## Detection Rate : 0.2870
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.7065
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.6226 |
| Specificity          |  0.7903 |
| Pos Pred Value       |  0.7174 |
| Neg Pred Value       |  0.7101 |
| Precision            |  0.7174 |
| Recall               |  0.6226 |
| F1                   |  0.6667 |
| Prevalence           |  0.4609 |
| Detection Rate       |  0.2870 |
| Detection Prevalence |  0.4000 |
| Balanced Accuracy    |  0.7065 |
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

################################################
# Random Forest
################################################
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
#Random Forest with the default mtry: there are 19 predictors, so floor(sqrt(19)) = 4 variables are tried at each split (see output below).
fit_rf <- randomForest(factor(DV) ~., data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(DV) ~ ., data = trainData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 25.11%
## Confusion matrix:
## 0 1 class.error
## 0 128 73 0.3631841
## 1 43 218 0.1647510
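# The fit above uses the default mtry = floor(sqrt(19)) = 4. To tune it
# rather than accept the default, tuneRF() searches over mtry using the OOB
# error -- a sketch under the same data assumptions:
set.seed(123)
x_train <- trainData[, setdiff(names(trainData), "DV")]
tuned <- tuneRF(x_train, factor(trainData$DV), ntreeTry = 500,
                stepFactor = 1.5, improve = 0.01, trace = FALSE)
tuned # OOB error for each mtry tried; refit with the best value if it differs from 4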
#windows()
varImpPlot(fit_rf)

#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 46 69
summary(predrf)
## 0 1
## 46 69
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$DV
#Result : Prediction Accuracy
mean(y_predrf==y_actrf)
## [1] 0.7913043
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_tablerf<- table(testData$DV, y_pred_numrf)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 34 12
## 1 12 57
##
## Accuracy : 0.7913
## 95% CI : (0.7056, 0.8615)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 1.026e-05
##
## Kappa : 0.5652
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7391
## Specificity : 0.8261
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.8261
## Prevalence : 0.4000
## Detection Rate : 0.2957
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.7826
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|----------------------|-----------|
| Sensitivity          |    0.7391 |
| Specificity          |    0.8261 |
| Pos Pred Value       |    0.7391 |
| Neg Pred Value       |    0.8261 |
| Precision            |    0.7391 |
| Recall               |    0.7391 |
| F1                   |    0.7391 |
| Prevalence           |    0.4000 |
| Detection Rate       |    0.2957 |
| Detection Prevalence |    0.4000 |
| Balanced Accuracy    |    0.7826 |
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkgreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$DV, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$DV~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$DV~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
# DV is numeric here, so svm() fits eps-regression; its continuous predictions are thresholded at 0.5 below
svm<-svm(DV~., data = trainData, kernel='linear', cost=10, scale=FALSE)
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3003 0.3281 0.5260 0.5887 0.8549 1.4222
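# An alternative is a classification SVM with class probabilities, which
# keeps predictions inside [0, 1] -- a sketch, same columns assumed:
set.seed(123)
svm_cls <- svm(factor(DV) ~ ., data = trainData, kernel = "linear",
               cost = 10, scale = FALSE, probability = TRUE)
p_cls <- predict(svm_cls, newdata = testData, probability = TRUE)
head(attr(p_cls, "probabilities")) # per-class probabilities, usable for ROC as well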
# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM<0.5,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$DV
#Result : Prediction Accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.6695652
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_tableSVM<- table(testData$DV, y_predSVM)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 32 14
## 1 24 45
##
## Accuracy : 0.6696
## 95% CI : (0.5757, 0.7544)
## No Information Rate : 0.513
## P-Value [Acc > NIR] : 0.0004889
##
## Kappa : 0.3357
##
## Mcnemar's Test P-Value : 0.1442921
##
## Sensitivity : 0.5714
## Specificity : 0.7627
## Pos Pred Value : 0.6957
## Neg Pred Value : 0.6522
## Prevalence : 0.4870
## Detection Rate : 0.2783
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.6671
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|----------------------|------------|
| Sensitivity          |     0.5714 |
| Specificity          |     0.7627 |
| Pos Pred Value       |     0.6957 |
| Neg Pred Value       |     0.6522 |
| Precision            |     0.6957 |
| Recall               |     0.5714 |
| F1                   |     0.6275 |
| Prevalence           |     0.4870 |
| Detection Rate       |     0.2783 |
| Detection Prevalence |     0.4000 |
| Balanced Accuracy    |     0.6671 |
#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$DV, test_prob4, plot = TRUE, main="Support Vector Machine", col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

coef(svm)[2:20]
## Age_Group Geographic_Region
## -2.334968e-02 -1.942142e-02
## Place_of_Residence Maternal_Education
## -1.045718e-01 -4.396286e-02
## Radio_Listening Television_Watching
## -5.504526e-02 2.529389e-02
## Internet_Usage Wealth_Quantile
## -1.117033e-02 7.106203e-03
## Health_Access Distance_Healthcare
## 7.222660e-02 6.451840e-02
## Birth_order Child_twin
## 1.909147e-02 -1.563194e-13
## Child_sex Child_Age
## -2.918590e-02 3.551292e-02
## ANC_Numbers Postnatal_care
## -2.375915e-02 -1.090159e-01
## Husband_Education Husband_Employment_status
## 4.344231e-03 -2.775793e-02
## Maternal_Employment_status
## -5.099233e-02
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:20])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###############################################
# KNN Model
###############################################
# knnreg() below comes from caret, loaded earlier; there is no separate 'knn' package
# Fitting model
set.seed(123)
fit <-knnreg(DV ~ ., data = trainData, k=1) # DV on the left-hand side only, so it is not also used as a predictor
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
#Predict Output
predicted= predict(fit,testData)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.529 1.000 1.000
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$DV
#Result : Prediction Accuracy
mean(y_predknn==y_actknn)
## [1] 0.6956522
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_tableknn<- table(testData$DV, y_predknn)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_predknn
## 0 1
## 0 34 12
## 1 23 46
##
## Accuracy : 0.6957
## 95% CI : (0.6029, 0.778)
## No Information Rate : 0.5043
## P-Value [Acc > NIR] : 2.448e-05
##
## Kappa : 0.3902
##
## Mcnemar's Test P-Value : 0.09097
##
## Sensitivity : 0.5965
## Specificity : 0.7931
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.6667
## Prevalence : 0.4957
## Detection Rate : 0.2957
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.6948
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|----------------------|------------|
| Sensitivity          |     0.5965 |
| Specificity          |     0.7931 |
| Pos Pred Value       |     0.7391 |
| Neg Pred Value       |     0.6667 |
| Precision            |     0.7391 |
| Recall               |     0.5965 |
| F1                   |     0.6602 |
| Prevalence           |     0.4957 |
| Detection Rate       |     0.2957 |
| Detection Prevalence |     0.4000 |
| Balanced Accuracy    |     0.6948 |
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
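# The warning arises because DV is numeric, so train() runs KNN regression.
# A classification fit with a 2-level factor outcome and cross-validated k
# would look like this (sketch; centering/scaling matters since KNN is distance-based):
trainData_cls <- trainData
trainData_cls$DV <- factor(trainData_cls$DV, levels = c(0, 1), labels = c("No", "Yes"))
knn_cls <- train(DV ~ ., data = trainData_cls, method = "knn",
                 preProcess = c("center", "scale"),
                 trControl = trainControl(method = "cv", number = 5),
                 tuneGrid = data.frame(k = seq(3, 21, 2)))
knn_cls$bestTune # cross-validated choice of k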
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Child_Age 100.00000
## Geographic_Region 9.92992
## Radio_Listening 8.71100
## Wealth_Quantile 4.81462
## Television_Watching 4.53952
## Postnatal_care 3.81569
## Place_of_Residence 3.77625
## Husband_Education 2.94649
## Distance_Healthcare 1.98242
## Child_twin 1.64725
## Husband_Employment_status 1.55126
## Age_Group 1.24735
## Health_Access 0.74023
## Maternal_Education 0.55528
## Internet_Usage 0.29148
## Child_sex 0.19910
## Birth_order 0.09896
## Maternal_Employment_status 0.02434
## ANC_Numbers 0.00000
# Plot the feature importance (caret supplies a ggplot() method for varImp objects, which uses its own mapping)
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkmagenta") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
#windows()
print(plot)

#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData) # predict.knnreg() returns numeric predictions; no type argument is needed
test_roc5= roc(testData$DV, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area Under the Curve (AUC)
####################################################
#windows()
plot(test_roc1, print.auc = TRUE, col='darkblue',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='gold', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='brown', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='darkgreen', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='darkmagenta', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
"Logistic-Regression","Support Vector Machine","KNN Model") ,
pch = 15, bty = 'n', col = c("darkblue","gold", "brown", "darkgreen","darkmagenta"))
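# Collect the AUCs shown on the plot into one comparison table (sketch):
aucs <- sapply(list(RandomForest = test_roc1, Probit = test_roc2,
                    Logistic = test_roc3, SVM = test_roc4, KNN = test_roc5),
               pROC::auc)
round(sort(aucs, decreasing = TRUE), 1) # in percent, since percent=TRUE was used above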

###################################################
# Metrics
###############################################
# Accuracy, recall (sensitivity), F1 score, and precision values for the five models, typed in from the outputs above
Accuracy <- c(0.704, 0.791, 0.713,0.670,0.696)
Recall <- c(0.611, 0.739, 0.623,0.571,0.597)
F1_score <- c(0.660, 0.739, 0.667,0.628,0.660)
Precision<-c(0.717,0.739,0.717,0.696,0.739)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression",
"Support Vector Machine","KNN Model")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)
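# These values can also be pulled straight from the stored confusionMatrix
# objects. A sketch (note that `result` was reused for the logistic and probit
# fits above, so only the probit version survives under that name):
stored <- list(Probit = result, RandomForest = resultrf,
               SVM = resultSVM, KNN = resultknn)
sapply(stored, function(r) round(c(r$overall["Accuracy"],
                                   r$byClass[c("Recall", "F1", "Precision")]), 3))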
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame to long format for plotting (requires the reshape2 package)
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "magenta3",
"Recall" = "darkcyan",
"F1_Score" = "pink4",
"Precision" = "purple1")) +
theme_minimal()

#############################################
# Feature Selection
#############################################
#windows()
par(mfrow = c(3, 2)) # note: par() affects base graphics only; the ggplot figures below still print one at a time
# Logistic Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkmagenta") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkcyan") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkgreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Child_Age 100.00000
## Geographic_Region 9.92992
## Radio_Listening 8.71100
## Wealth_Quantile 4.81462
## Television_Watching 4.53952
## Postnatal_care 3.81569
## Place_of_Residence 3.77625
## Husband_Education 2.94649
## Distance_Healthcare 1.98242
## Child_twin 1.64725
## Husband_Employment_status 1.55126
## Age_Group 1.24735
## Health_Access 0.74023
## Maternal_Education 0.55528
## Internet_Usage 0.29148
## Child_sex 0.19910
## Birth_order 0.09896
## Maternal_Employment_status 0.02434
## ANC_Numbers 0.00000
#windows()
# Plot the feature importance (caret supplies a ggplot() method for varImp objects, which uses its own mapping)
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkred") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

########################################
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:20])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "pink") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
