Yahye Project
# 5 Supervised Machine Learning Models
# Measles Vaccination Coverage in Somaliland
library(haven)
final<-read_dta("~/YahyeM.dta")
str(final)
## tibble [577 × 20] (S3: tbl_df/tbl/data.frame)
## $ Age_Group : dbl+lbl [1:577] 3, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 4, 3, 5, 2, 3, 3, 5, ...
## ..@ label : chr "Age in 5-year groups"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
## $ Geographic_Region : dbl+lbl [1:577] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
## .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
## $ Place_of_Residence : dbl+lbl [1:577] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 2, ...
## ..@ label : chr "Residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "Urban" "Rural" "Nomadic"
## $ Maternal_Education : dbl+lbl [1:577] 0, 1, 0, 0, 3, 2, 2, 2, 3, 1, 3, 1, 2, 0, 0, 0, 0, 0, ...
## ..@ label : chr "Education"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" " Secondary" "Higher"
## $ Radio_Listening : dbl+lbl [1:577] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## ..@ label : chr "Frequency of listening to radio"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Television_Watching : dbl+lbl [1:577] 1, 1, 3, 1, 1, 2, 3, 3, 2, 1, 1, 3, 1, 1, 3, 3, 3, 3, ...
## ..@ label : chr "Frequency of watching television"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Internet_Usage : dbl+lbl [1:577] 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, ...
## ..@ label : chr "Ever used internet"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Wealth_Quantile : dbl+lbl [1:577] 4, 5, 3, 5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 5, 1, 5, 5, 5, ...
## ..@ label : chr "Wealth quintile"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ Health_Access : dbl+lbl [1:577] 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, ...
## ..@ label : chr "Getting medical help for self: getting permission to go"
## ..@ format.stata: chr "%17.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Big problem" "Not a big problem"
## $ Distance_Healthcare : dbl+lbl [1:577] 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...
## ..@ label : chr "Getting medical help for self: distance to health facility"
## ..@ format.stata: chr "%17.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Big problem" "Not a big problem"
## $ Birth_order : num [1:577] 1 1 1 4 1 3 1 2 1 1 ...
## ..- attr(*, "label")= chr "Birth Order"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ Child_twin : dbl+lbl [1:577] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## ..@ label : chr "Twins"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Sing" "Mult"
## $ Child_sex : dbl+lbl [1:577] 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, ...
## ..@ label : chr "Sex of child"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Male" "Female"
## $ Child_Age : num [1:577] 11 3 7 3 6 13 28 29 6 20 ...
## ..- attr(*, "label")= chr "Age in Months"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ ANC_Numbers : num [1:577] 6 3 0 3 2 9 7 4 7 0 ...
## ..- attr(*, "label")= chr "Number of antenatal visits during pregnancy"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ Postnatal_care : dbl+lbl [1:577] 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## ..@ label : chr "Respondent's health checked after discharge/delivery at home"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ DV : dbl+lbl [1:577] 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, ...
## ..@ label : chr "Measless Vaccination Status"
## ..@ format.stata: chr "%3.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "No" "Yes"
## $ Husband_Education : dbl+lbl [1:577] 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, ...
## ..@ label : chr "Husband/partner's ever attended school"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Husband_Employment_status : dbl+lbl [1:577] 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, ...
## ..@ label : chr "Husband/partner worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Maternal_Employment_status: dbl+lbl [1:577] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## ..@ label : chr "Respondent worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
# Round-trip through CSV to drop the Stata value labels and keep plain numeric codes
write.csv(final,"~/Yahye.csv")
final<-read.csv("~/Yahye.csv")
final<-final[,-1] # drop the row-index column added by write.csv
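# Aside: haven can strip the value labels in memory as well; this sketch is
# equivalent to the CSV round-trip above (a no-op at this point, shown for reference):
# final <- zap_labels(zap_label(final))   # plain numeric codes, no label attributes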
# Logistic Regression
logistic<-glm(DV~.,data=final,family = binomial())
summary(logistic)
##
## Call:
## glm(formula = DV ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.885033 2.872604 0.656 0.5117
## Age_Group -0.084942 0.107212 -0.792 0.4282
## Geographic_Region -0.046430 0.020497 -2.265 0.0235 *
## Place_of_Residence -0.343763 0.202148 -1.701 0.0890 .
## Maternal_Education -0.069276 0.154013 -0.450 0.6529
## Radio_Listening -0.528832 0.203056 -2.604 0.0092 **
## Television_Watching 0.125964 0.154782 0.814 0.4158
## Internet_Usage 0.028090 0.288116 0.097 0.9223
## Wealth_Quantile -0.015474 0.113408 -0.136 0.8915
## Health_Access 0.190633 0.251472 0.758 0.4484
## Distance_Healthcare 0.146684 0.244886 0.599 0.5492
## Birth_order 0.070152 0.053205 1.319 0.1873
## Child_twin -0.290760 1.229882 -0.236 0.8131
## Child_sex -0.200391 0.198525 -1.009 0.3128
## Child_Age 0.133941 0.013621 9.834 <2e-16 ***
## ANC_Numbers -0.036047 0.050346 -0.716 0.4740
## Postnatal_care -0.421795 0.269327 -1.566 0.1173
## Husband_Education 0.175600 0.228266 0.769 0.4417
## Husband_Employment_status -0.140806 0.219249 -0.642 0.5207
## Maternal_Employment_status -0.006725 1.109219 -0.006 0.9952
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 787.91 on 576 degrees of freedom
## Residual deviance: 622.08 on 557 degrees of freedom
## AIC: 662.08
##
## Number of Fisher Scoring iterations: 4
round(coef(summary(logistic)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.8850 2.8726 0.6562 0.5117
## Age_Group -0.0849 0.1072 -0.7923 0.4282
## Geographic_Region -0.0464 0.0205 -2.2652 0.0235
## Place_of_Residence -0.3438 0.2021 -1.7005 0.0890
## Maternal_Education -0.0693 0.1540 -0.4498 0.6529
## Radio_Listening -0.5288 0.2031 -2.6044 0.0092
## Television_Watching 0.1260 0.1548 0.8138 0.4158
## Internet_Usage 0.0281 0.2881 0.0975 0.9223
## Wealth_Quantile -0.0155 0.1134 -0.1364 0.8915
## Health_Access 0.1906 0.2515 0.7581 0.4484
## Distance_Healthcare 0.1467 0.2449 0.5990 0.5492
## Birth_order 0.0702 0.0532 1.3185 0.1873
## Child_twin -0.2908 1.2299 -0.2364 0.8131
## Child_sex -0.2004 0.1985 -1.0094 0.3128
## Child_Age 0.1339 0.0136 9.8335 0.0000
## ANC_Numbers -0.0360 0.0503 -0.7160 0.4740
## Postnatal_care -0.4218 0.2693 -1.5661 0.1173
## Husband_Education 0.1756 0.2283 0.7693 0.4417
## Husband_Employment_status -0.1408 0.2192 -0.6422 0.5207
## Maternal_Employment_status -0.0067 1.1092 -0.0061 0.9952
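# The estimates above are log-odds; exponentiating them gives odds ratios,
# which are easier to read. A minimal sketch with Wald 95% intervals:
OR <- exp(cbind(OR = coef(logistic), confint.default(logistic)))
round(OR, 3) # e.g. each extra month of Child_Age multiplies the odds of vaccination by exp(0.134) ~ 1.14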
# Prediction Models
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Prep Training and Test dataset
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$DV, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
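# createDataPartition() samples within the levels of DV, so the vaccination
# rate should be almost identical in the two subsets -- a quick check (sketch):
round(rbind(train = prop.table(table(trainData$DV)),
            test  = prop.table(table(testData$DV))), 3)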
###################################################
# Logistic regression refit on the training data (all predictors)
###################################################
logitmod2<-glm(DV~ .,
family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = DV ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.668196 2.986042 0.559 0.57639
## Age_Group -0.067843 0.118926 -0.570 0.56836
## Geographic_Region -0.064887 0.023679 -2.740 0.00614 **
## Place_of_Residence -0.220630 0.229010 -0.963 0.33534
## Maternal_Education -0.140843 0.172797 -0.815 0.41503
## Radio_Listening -0.492412 0.223192 -2.206 0.02737 *
## Television_Watching -0.002837 0.173962 -0.016 0.98699
## Internet_Usage 0.044454 0.327084 0.136 0.89189
## Wealth_Quantile -0.012463 0.131245 -0.095 0.92435
## Health_Access 0.396323 0.283477 1.398 0.16209
## Distance_Healthcare 0.172527 0.272641 0.633 0.52687
## Birth_order 0.069386 0.059748 1.161 0.24551
## Child_twin -0.048461 1.221355 -0.040 0.96835
## Child_sex -0.091635 0.225825 -0.406 0.68491
## Child_Age 0.139645 0.015634 8.932 < 2e-16 ***
## ANC_Numbers -0.083213 0.056063 -1.484 0.13774
## Postnatal_care -0.361494 0.306368 -1.180 0.23803
## Husband_Education 0.337657 0.260281 1.297 0.19454
## Husband_Employment_status -0.299764 0.247795 -1.210 0.22638
## Maternal_Employment_status -0.183268 1.121369 -0.163 0.87018
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 632.65 on 461 degrees of freedom
## Residual deviance: 491.66 on 442 degrees of freedom
## AIC: 531.66
##
## Number of Fisher Scoring iterations: 4
# Apply the model to the test data
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03969 0.32499 0.54409 0.55382 0.81588 0.98343
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.5,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.7043478
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_table <- table(testData$DV, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 33 13
## 1 21 48
##
## Accuracy : 0.7043
## 95% CI : (0.6121, 0.7858)
## No Information Rate : 0.5304
## P-Value [Acc > NIR] : 0.0001067
##
## Kappa : 0.4014
##
## Mcnemar's Test P-Value : 0.2299491
##
## Sensitivity : 0.6111
## Specificity : 0.7869
## Pos Pred Value : 0.7174
## Neg Pred Value : 0.6957
## Prevalence : 0.4696
## Detection Rate : 0.2870
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.6990
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.6111 |
| Specificity          |  0.7869 |
| Pos Pred Value       |  0.7174 |
| Neg Pred Value       |  0.6957 |
| Precision            |  0.7174 |
| Recall               |  0.6111 |
| F1                   |  0.6600 |
| Prevalence           |  0.4696 |
| Detection Rate       |  0.2870 |
| Detection Prevalence |  0.4000 |
| Balanced Accuracy    |  0.6990 |
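# Sanity check: the F1 in the table is the harmonic mean of precision and
# recall, recomputed here by hand:
precision <- 0.7174; recall <- 0.6111
round(2 * precision * recall / (precision + recall), 4) # 0.66, matching the table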
#############################################
# Probit Regression
#############################################
probitmod<-glm(DV~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = DV ~ ., family = binomial(link = "probit"), data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.124850 1.740920 0.646 0.51820
## Age_Group -0.054694 0.069861 -0.783 0.43369
## Geographic_Region -0.037476 0.013885 -2.699 0.00695 **
## Place_of_Residence -0.127339 0.134869 -0.944 0.34508
## Maternal_Education -0.085141 0.102205 -0.833 0.40482
## Radio_Listening -0.299373 0.130480 -2.294 0.02177 *
## Television_Watching -0.005267 0.101960 -0.052 0.95880
## Internet_Usage 0.017937 0.192132 0.093 0.92562
## Wealth_Quantile -0.006721 0.077389 -0.087 0.93079
## Health_Access 0.220601 0.165266 1.335 0.18194
## Distance_Healthcare 0.109603 0.159441 0.687 0.49182
## Birth_order 0.044818 0.035222 1.272 0.20322
## Child_twin -0.088809 0.719598 -0.123 0.90178
## Child_sex -0.064269 0.133181 -0.483 0.62940
## Child_Age 0.081067 0.008649 9.373 < 2e-16 ***
## ANC_Numbers -0.045730 0.032546 -1.405 0.16000
## Postnatal_care -0.229310 0.181379 -1.264 0.20614
## Husband_Education 0.209239 0.153836 1.360 0.17378
## Husband_Employment_status -0.196697 0.145662 -1.350 0.17690
## Maternal_Employment_status -0.083692 0.649477 -0.129 0.89747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 632.65 on 461 degrees of freedom
## Residual deviance: 493.11 on 442 degrees of freedom
## AIC: 533.11
##
## Number of Fisher Scoring iterations: 5
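# Probit coefficients sit on the standard-normal (z) scale, so they are
# smaller than the logit estimates by roughly a factor of 1.6 (a common rule
# of thumb), while the fitted probabilities barely differ -- a quick check (sketch):
round(coef(logitmod2) / coef(probitmod), 2) # mostly ~1.6-1.8
cor(fitted(logitmod2), fitted(probitmod))   # expected to be close to 1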
# Apply the model to the test data
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02841 0.33874 0.54033 0.55394 0.79885 0.99066
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.5,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.7130435
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_table <- table(testData$DV, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 33 13
## 1 20 49
##
## Accuracy : 0.713
## 95% CI : (0.6212, 0.7935)
## No Information Rate : 0.5391
## P-Value [Acc > NIR] : 0.0001017
##
## Kappa : 0.417
##
## Mcnemar's Test P-Value : 0.2962699
##
## Sensitivity : 0.6226
## Specificity : 0.7903
## Pos Pred Value : 0.7174
## Neg Pred Value : 0.7101
## Prevalence : 0.4609
## Detection Rate : 0.2870
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.7065
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.6226 |
| Specificity          |  0.7903 |
| Pos Pred Value       |  0.7174 |
| Neg Pred Value       |  0.7101 |
| Precision            |  0.7174 |
| Recall               |  0.6226 |
| F1                   |  0.6667 |
| Prevalence           |  0.4609 |
| Detection Rate       |  0.2870 |
| Detection Prevalence |  0.4000 |
| Balanced Accuracy    |  0.7065 |
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

################################################
# Random Forest
################################################
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
#Random Forest with the default mtry: there are 19 predictors, so floor(sqrt(19)) = 4 variables are tried at each split (see output below).
fit_rf <- randomForest(factor(DV) ~., data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(DV) ~ ., data = trainData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 25.11%
## Confusion matrix:
## 0 1 class.error
## 0 128 73 0.3631841
## 1 43 218 0.1647510
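# The fit above uses the default mtry = floor(sqrt(19)) = 4. To tune it
# rather than accept the default, tuneRF() searches over mtry using the OOB
# error -- a sketch under the same data assumptions:
set.seed(123)
x_train <- trainData[, setdiff(names(trainData), "DV")]
tuned <- tuneRF(x_train, factor(trainData$DV), ntreeTry = 500,
                stepFactor = 1.5, improve = 0.01, trace = FALSE)
tuned # OOB error for each mtry tried; refit with the best value if it differs from 4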
#windows()
varImpPlot(fit_rf)

#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 46 69
summary(predrf)
## 0 1
## 46 69
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$DV
#Result : Prediction Accuracy
mean(y_predrf==y_actrf)
## [1] 0.7913043
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_tablerf<- table(testData$DV, y_pred_numrf)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 34 12
## 1 12 57
##
## Accuracy : 0.7913
## 95% CI : (0.7056, 0.8615)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 1.026e-05
##
## Kappa : 0.5652
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7391
## Specificity : 0.8261
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.8261
## Prevalence : 0.4000
## Detection Rate : 0.2957
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.7826
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|----------------------|-----------|
| Sensitivity          |    0.7391 |
| Specificity          |    0.8261 |
| Pos Pred Value       |    0.7391 |
| Neg Pred Value       |    0.8261 |
| Precision            |    0.7391 |
| Recall               |    0.7391 |
| F1                   |    0.7391 |
| Prevalence           |    0.4000 |
| Detection Rate       |    0.2957 |
| Detection Prevalence |    0.4000 |
| Balanced Accuracy    |    0.7826 |
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkgreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$DV, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$DV~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$DV~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
# DV is numeric here, so svm() fits eps-regression; its continuous predictions are thresholded at 0.5 below
svm<-svm(DV~., data = trainData, kernel='linear', cost=10, scale=FALSE)
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3003 0.3281 0.5260 0.5887 0.8549 1.4222
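# An alternative is a classification SVM with class probabilities, which
# keeps predictions inside [0, 1] -- a sketch, same columns assumed:
set.seed(123)
svm_cls <- svm(factor(DV) ~ ., data = trainData, kernel = "linear",
               cost = 10, scale = FALSE, probability = TRUE)
p_cls <- predict(svm_cls, newdata = testData, probability = TRUE)
head(attr(p_cls, "probabilities")) # per-class probabilities, usable for ROC as well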
# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM<0.5,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$DV
#Result : Prediction Accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.6695652
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_tableSVM<- table(testData$DV, y_predSVM)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 32 14
## 1 24 45
##
## Accuracy : 0.6696
## 95% CI : (0.5757, 0.7544)
## No Information Rate : 0.513
## P-Value [Acc > NIR] : 0.0004889
##
## Kappa : 0.3357
##
## Mcnemar's Test P-Value : 0.1442921
##
## Sensitivity : 0.5714
## Specificity : 0.7627
## Pos Pred Value : 0.6957
## Neg Pred Value : 0.6522
## Prevalence : 0.4870
## Detection Rate : 0.2783
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.6671
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|----------------------|------------|
| Sensitivity          |     0.5714 |
| Specificity          |     0.7627 |
| Pos Pred Value       |     0.6957 |
| Neg Pred Value       |     0.6522 |
| Precision            |     0.6957 |
| Recall               |     0.5714 |
| F1                   |     0.6275 |
| Prevalence           |     0.4870 |
| Detection Rate       |     0.2783 |
| Detection Prevalence |     0.4000 |
| Balanced Accuracy    |     0.6671 |
#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$DV, test_prob4, plot = TRUE, main="Support Vector Machine", col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

coef(svm)[2:20]
## Age_Group Geographic_Region
## -2.334968e-02 -1.942142e-02
## Place_of_Residence Maternal_Education
## -1.045718e-01 -4.396286e-02
## Radio_Listening Television_Watching
## -5.504526e-02 2.529389e-02
## Internet_Usage Wealth_Quantile
## -1.117033e-02 7.106203e-03
## Health_Access Distance_Healthcare
## 7.222660e-02 6.451840e-02
## Birth_order Child_twin
## 1.909147e-02 -1.563194e-13
## Child_sex Child_Age
## -2.918590e-02 3.551292e-02
## ANC_Numbers Postnatal_care
## -2.375915e-02 -1.090159e-01
## Husband_Education Husband_Employment_status
## 4.344231e-03 -2.775793e-02
## Maternal_Employment_status
## -5.099233e-02
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:20])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###############################################
# KNN Model
###############################################
# knnreg() below comes from caret, loaded earlier; there is no separate 'knn' package
# Fitting model
set.seed(123)
fit <-knnreg(DV ~ ., data = trainData, k=1) # DV on the left-hand side only, so it is not also used as a predictor
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
#Predict Output
predicted= predict(fit,testData)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.529 1.000 1.000
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$DV
#Result : Prediction Accuracy
mean(y_predknn==y_actknn)
## [1] 0.6956522
#Creates confusion table showing where each child was placed and whether they were classified correctly
confusion_tableknn<- table(testData$DV, y_predknn)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_predknn
## 0 1
## 0 34 12
## 1 23 46
##
## Accuracy : 0.6957
## 95% CI : (0.6029, 0.778)
## No Information Rate : 0.5043
## P-Value [Acc > NIR] : 2.448e-05
##
## Kappa : 0.3902
##
## Mcnemar's Test P-Value : 0.09097
##
## Sensitivity : 0.5965
## Specificity : 0.7931
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.6667
## Prevalence : 0.4957
## Detection Rate : 0.2957
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.6948
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|----------------------|------------|
| Sensitivity          |     0.5965 |
| Specificity          |     0.7931 |
| Pos Pred Value       |     0.7391 |
| Neg Pred Value       |     0.6667 |
| Precision            |     0.7391 |
| Recall               |     0.5965 |
| F1                   |     0.6602 |
| Prevalence           |     0.4957 |
| Detection Rate       |     0.2957 |
| Detection Prevalence |     0.4000 |
| Balanced Accuracy    |     0.6948 |
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
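# The warning arises because DV is numeric, so train() runs KNN regression.
# A classification fit with a 2-level factor outcome and cross-validated k
# would look like this (sketch; centering/scaling matters since KNN is distance-based):
trainData_cls <- trainData
trainData_cls$DV <- factor(trainData_cls$DV, levels = c(0, 1), labels = c("No", "Yes"))
knn_cls <- train(DV ~ ., data = trainData_cls, method = "knn",
                 preProcess = c("center", "scale"),
                 trControl = trainControl(method = "cv", number = 5),
                 tuneGrid = data.frame(k = seq(3, 21, 2)))
knn_cls$bestTune # cross-validated choice of k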
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Child_Age 100.00000
## Geographic_Region 9.92992
## Radio_Listening 8.71100
## Wealth_Quantile 4.81462
## Television_Watching 4.53952
## Postnatal_care 3.81569
## Place_of_Residence 3.77625
## Husband_Education 2.94649
## Distance_Healthcare 1.98242
## Child_twin 1.64725
## Husband_Employment_status 1.55126
## Age_Group 1.24735
## Health_Access 0.74023
## Maternal_Education 0.55528
## Internet_Usage 0.29148
## Child_sex 0.19910
## Birth_order 0.09896
## Maternal_Employment_status 0.02434
## ANC_Numbers 0.00000
# Plot the feature importance (caret supplies a ggplot() method for varImp objects, which uses its own mapping)
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkmagenta") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
#windows()
print(plot)

#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData) # predict.knnreg() returns numeric predictions; no type argument is needed
test_roc5= roc(testData$DV, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area Under the Curve (AUC)
####################################################
#windows()
plot(test_roc1, print.auc = TRUE, col='darkblue',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='gold', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='brown', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='darkgreen', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='darkmagenta', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
"Logistic-Regression","Support Vector Machine","KNN Model") ,
pch = 15, bty = 'n', col = c("darkblue","gold", "brown", "darkgreen","darkmagenta"))
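# Collect the AUCs shown on the plot into one comparison table (sketch):
aucs <- sapply(list(RandomForest = test_roc1, Probit = test_roc2,
                    Logistic = test_roc3, SVM = test_roc4, KNN = test_roc5),
               pROC::auc)
round(sort(aucs, decreasing = TRUE), 1) # in percent, since percent=TRUE was used above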

###################################################
# Metrics
###############################################
# Accuracy, recall (sensitivity), F1 score, and precision values for the five models, typed in from the outputs above
Accuracy <- c(0.704, 0.791, 0.713,0.670,0.696)
Recall <- c(0.611, 0.739, 0.623,0.571,0.597)
F1_score <- c(0.660, 0.739, 0.667,0.628,0.660)
Precision<-c(0.717,0.739,0.717,0.696,0.739)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression",
"Support Vector Machine","KNN Model")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)
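# These values can also be pulled straight from the stored confusionMatrix
# objects. A sketch (note that `result` was reused for the logistic and probit
# fits above, so only the probit version survives under that name):
stored <- list(Probit = result, RandomForest = resultrf,
               SVM = resultSVM, KNN = resultknn)
sapply(stored, function(r) round(c(r$overall["Accuracy"],
                                   r$byClass[c("Recall", "F1", "Precision")]), 3))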
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame to long format for plotting (requires the reshape2 package)
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "magenta3",
"Recall" = "darkcyan",
"F1_Score" = "pink4",
"Precision" = "purple1")) +
theme_minimal()

#############################################
# Feature Selection
#############################################
#windows()
par(mfrow = c(3, 2)) # note: par() affects base graphics only; the ggplot figures below still print one at a time
# Logistic Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkmagenta") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkcyan") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkgreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Child_Age 100.00000
## Geographic_Region 9.92992
## Radio_Listening 8.71100
## Wealth_Quantile 4.81462
## Television_Watching 4.53952
## Postnatal_care 3.81569
## Place_of_Residence 3.77625
## Husband_Education 2.94649
## Distance_Healthcare 1.98242
## Child_twin 1.64725
## Husband_Employment_status 1.55126
## Age_Group 1.24735
## Health_Access 0.74023
## Maternal_Education 0.55528
## Internet_Usage 0.29148
## Child_sex 0.19910
## Birth_order 0.09896
## Maternal_Employment_status 0.02434
## ANC_Numbers 0.00000
#windows()
# Plot the feature importance (caret supplies a ggplot() method for varImp objects, which uses its own mapping)
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkred") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

########################################
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:20])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "pink") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
