Bilaawe
# 5 Supervised Machine Learning Models
library(haven)
final<-read_dta("~/BilaaweFinal.dta")
str(final)
## tibble [13,699 × 17] (S3: tbl_df/tbl/data.frame)
## $ Age_group : dbl+lbl [1:13699] 2, 3, 3, 3, 1, 5, 5, 2, 2, 2, 3, 3, 4, 4, 4, 1, 1, 3...
## ..@ label : chr "Age in 5-year groups"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
## $ Region : dbl+lbl [1:13699] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
## .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
## $ Residence : dbl+lbl [1:13699] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:6] 1 2 3 4 5 6
## .. ..- attr(*, "names")= chr [1:6] "Rural" "Urban" "Nomadic" "Rural IDP" ...
## $ Women_Education : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ Wealth_Quantile : dbl+lbl [1:13699] 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 3...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ Age_1st_birth : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...
## ..@ label : chr "Age of respondent at 1st birth"
## ..@ format.stata: chr "%18.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 20 Years" "20 and More"
## $ Current_Breastfeeding : dbl+lbl [1:13699] 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Currently breastfeeding"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Health_Access : dbl+lbl [1:13699] 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Health Access"
## ..@ format.stata: chr "%3.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Age_1st_Marriage : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Age at first Marriage"
## ..@ format.stata: chr "%18.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 20 Years" "20 and More"
## $ Husband_Employment_Status: dbl+lbl [1:13699] 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Husband/partner worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Birth_Order : num [1:13699] 2 3 4 3 1 6 5 4 3 2 ...
## ..- attr(*, "label")= chr "Birth Order"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ Child_Twin : dbl+lbl [1:13699] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Twins"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Sing" "Mult"
## $ Child_Sex : dbl+lbl [1:13699] 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1...
## ..@ label : chr "Sex of child"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Male" "Female"
## $ Survival_Status : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Child alive or dead at time of interview"
## ..@ format.stata: chr "%5.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Alive" "Dead"
## $ Place_of_Birth : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Place of delivery"
## ..@ format.stata: chr "%15.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Home" "Health Facility"
## $ Child_size : dbl+lbl [1:13699] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 4...
## ..@ label : chr "Size of child at birth"
## ..@ format.stata: chr "%6.0f"
## ..@ labels : Named num [1:3] 2 3 4
## .. ..- attr(*, "names")= chr [1:3] "Large" "Normal" "Small"
## $ Number_of_Children : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0...
## ..@ label : chr "Number of children ever born"
## ..@ format.stata: chr "%20.0g"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "0-5 Children" "More than 5 Children"
# Round-trip through CSV to strip the Stata value labels (all columns become plain numerics)
write.csv(final,"~/BilaaweFinal.csv")
final<-read.csv("~/BilaaweFinal.csv")
final<-final[,-1] # drop the row-name column that write.csv added
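# The same result without a temporary CSV (a sketch): re-read the .dta and
# drop the value labels in memory with haven::zap_labels().
final_plain<-as.data.frame(haven::zap_labels(read_dta("~/BilaaweFinal.dta")))
str(final_plain$Survival_Status) # plain numeric 0/1, labels gone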
# Logistic Regression
logistic<-glm(Survival_Status~.,data=final,family = binomial())
summary(logistic)
##
## Call:
## glm(formula = Survival_Status ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.039494 0.547803 -7.374 1.66e-13 ***
## Age_group -0.289915 0.059205 -4.897 9.74e-07 ***
## Region -0.006680 0.009168 -0.729 0.466207
## Residence 0.017392 0.061488 0.283 0.777288
## Women_Education -0.062057 0.102370 -0.606 0.544377
## Wealth_Quantile -0.045697 0.039006 -1.172 0.241381
## Age_1st_birth 0.118418 0.124090 0.954 0.339934
## Current_Breastfeeding 0.094027 0.091848 1.024 0.305968
## Health_Access -0.177867 0.100023 -1.778 0.075362 .
## Age_1st_Marriage -0.203578 0.137516 -1.480 0.138769
## Husband_Employment_Status -0.173572 0.097771 -1.775 0.075848 .
## Birth_Order 0.268633 0.031092 8.640 < 2e-16 ***
## Child_Twin 1.351508 0.227953 5.929 3.05e-09 ***
## Child_Sex -0.306914 0.091827 -3.342 0.000831 ***
## Place_of_Birth -0.120658 0.129297 -0.933 0.350727
## Child_size 0.136764 0.097881 1.397 0.162340
## Number_of_Children -0.530133 0.155735 -3.404 0.000664 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4435.0 on 13698 degrees of freedom
## Residual deviance: 4284.4 on 13682 degrees of freedom
## AIC: 4318.4
##
## Number of Fisher Scoring iterations: 6
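# From the deviances above, McFadden's pseudo-R^2 gives a quick fit summary
# (a sketch; about 0.034 here, i.e. a modest improvement over the null model):
1-logistic$deviance/logistic$null.deviance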
round(coef(summary(logistic)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0395 0.5478 -7.3740 0.0000
## Age_group -0.2899 0.0592 -4.8968 0.0000
## Region -0.0067 0.0092 -0.7287 0.4662
## Residence 0.0174 0.0615 0.2829 0.7773
## Women_Education -0.0621 0.1024 -0.6062 0.5444
## Wealth_Quantile -0.0457 0.0390 -1.1715 0.2414
## Age_1st_birth 0.1184 0.1241 0.9543 0.3399
## Current_Breastfeeding 0.0940 0.0918 1.0237 0.3060
## Health_Access -0.1779 0.1000 -1.7783 0.0754
## Age_1st_Marriage -0.2036 0.1375 -1.4804 0.1388
## Husband_Employment_Status -0.1736 0.0978 -1.7753 0.0758
## Birth_Order 0.2686 0.0311 8.6399 0.0000
## Child_Twin 1.3515 0.2280 5.9289 0.0000
## Child_Sex -0.3069 0.0918 -3.3423 0.0008
## Place_of_Birth -0.1207 0.1293 -0.9332 0.3507
## Child_size 0.1368 0.0979 1.3972 0.1623
## Number_of_Children -0.5301 0.1557 -3.4041 0.0007
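# The estimates are on the log-odds scale; exponentiating gives odds ratios.
# A sketch using Wald confidence intervals (confint.default):
round(exp(cbind(OR=coef(logistic), confint.default(logistic))),3)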
# Visualization of Parameter Effects
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
#windows()
plot(allEffects(logistic),cex.main=.75, cex.lab=.1, cex.axis=0.1)

# Prediction Models
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Prep Training and Test dataset
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$Survival_Status, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
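# createDataPartition() stratifies on the outcome, so both sets should keep
# the same low event rate (roughly 4% deaths). A quick check (sketch):
prop.table(table(trainData$Survival_Status))
prop.table(table(testData$Survival_Status))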
###################################################
# Logistic regression refit on the training data (all predictors)
###################################################
logitmod2<-glm(Survival_Status~ .,
family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = Survival_Status ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.283697 0.613656 -6.981 2.94e-12 ***
## Age_group -0.287426 0.066324 -4.334 1.47e-05 ***
## Region 0.001086 0.010315 0.105 0.916160
## Residence 0.102506 0.069961 1.465 0.142868
## Women_Education -0.016257 0.112872 -0.144 0.885475
## Wealth_Quantile -0.052661 0.044278 -1.189 0.234308
## Age_1st_birth 0.052528 0.141620 0.371 0.710705
## Current_Breastfeeding 0.056552 0.103668 0.546 0.585404
## Health_Access -0.209171 0.113318 -1.846 0.064910 .
## Age_1st_Marriage -0.119592 0.155688 -0.768 0.442397
## Husband_Employment_Status -0.162683 0.110103 -1.478 0.139527
## Birth_Order 0.273201 0.034700 7.873 3.45e-15 ***
## Child_Twin 1.404853 0.244603 5.743 9.28e-09 ***
## Child_Sex -0.372027 0.104259 -3.568 0.000359 ***
## Place_of_Birth -0.101659 0.146294 -0.695 0.487120
## Child_size 0.146484 0.110097 1.330 0.183354
## Number_of_Children -0.635916 0.176183 -3.609 0.000307 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3505.4 on 10959 degrees of freedom
## Residual deviance: 3376.8 on 10943 degrees of freedom
## AIC: 3410.8
##
## Number of Fisher Scoring iterations: 6
# Apply the model to predict on the test data
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005838 0.023321 0.032442 0.036825 0.044333 0.449391
# Measure the accuracy of prediction in the test data
# (the 0.036 cutoff is roughly the mean predicted probability; with only
# ~4% positives, the usual 0.5 cutoff would predict no deaths at all)
y_pred_num<-ifelse(pred<0.036,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$Survival_Status
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.5951077
# Plot ROC Curve
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.6012519
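# The 0.036 cutoff was hand-picked; InformationValue can search for one
# instead (a sketch; optimiseFor="Both" maximises Youden's J statistic):
InformationValue::optimalCutoff(y_act, pred, optimiseFor="Both")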
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_table <- table(testData$Survival_Status, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 1563 1065
## 1 44 67
##
## Accuracy : 0.5951
## 95% CI : (0.5764, 0.6136)
## No Information Rate : 0.5867
## P-Value [Acc > NIR] : 0.1914
##
## Kappa : 0.0367
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.97262
## Specificity : 0.05919
## Pos Pred Value : 0.59475
## Neg Pred Value : 0.60360
## Prevalence : 0.58671
## Detection Rate : 0.57065
## Detection Prevalence : 0.95947
## Balanced Accuracy : 0.51590
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|--------:|
| Sensitivity          |  0.9726 |
| Specificity          |  0.0592 |
| Pos Pred Value       |  0.5947 |
| Neg Pred Value       |  0.6036 |
| Precision            |  0.5947 |
| Recall               |  0.9726 |
| F1                   |  0.7381 |
| Prevalence           |  0.5867 |
| Detection Rate       |  0.5706 |
| Detection Prevalence |  0.9595 |
| Balanced Accuracy    |  0.5159 |
#############################################
# Probit Regression
#############################################
# Probit Regression
probitmod<-glm(Survival_Status~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = Survival_Status ~ ., family = binomial(link = "probit"),
## data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.2189107 0.2834884 -7.827 4.99e-15 ***
## Age_group -0.1361225 0.0298419 -4.561 5.08e-06 ***
## Region -0.0009604 0.0046288 -0.207 0.835637
## Residence 0.0401587 0.0310505 1.293 0.195895
## Women_Education -0.0066852 0.0493454 -0.135 0.892235
## Wealth_Quantile -0.0255685 0.0198320 -1.289 0.197310
## Age_1st_birth 0.0278509 0.0634860 0.439 0.660884
## Current_Breastfeeding 0.0226100 0.0464648 0.487 0.626539
## Health_Access -0.0968806 0.0503279 -1.925 0.054231 .
## Age_1st_Marriage -0.0420892 0.0682940 -0.616 0.537700
## Husband_Employment_Status -0.0822458 0.0494838 -1.662 0.096498 .
## Birth_Order 0.1274120 0.0161787 7.875 3.40e-15 ***
## Child_Twin 0.6806168 0.1292031 5.268 1.38e-07 ***
## Child_Sex -0.1679732 0.0462551 -3.631 0.000282 ***
## Place_of_Birth -0.0420166 0.0647343 -0.649 0.516297
## Child_size 0.0644779 0.0497044 1.297 0.194553
## Number_of_Children -0.2894850 0.0794152 -3.645 0.000267 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3505.4 on 10959 degrees of freedom
## Residual deviance: 3375.5 on 10943 degrees of freedom
## AIC: 3409.5
##
## Number of Fisher Scoring iterations: 6
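# Probit coefficients are on a different scale from logit ones; as a rule of
# thumb, logit estimates run about 1.6-1.8 times the probit estimates. A
# quick sketch check on the two fitted models:
round(coef(logitmod2)/coef(probitmod), 2)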
# Apply the model to predict on the test data
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.004173 0.022911 0.032552 0.036881 0.045116 0.373586
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.036,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$Survival_Status
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.5852501
# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.6090577
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_table <- table(testData$Survival_Status, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 1534 1094
## 1 42 69
##
## Accuracy : 0.5853
## 95% CI : (0.5665, 0.6038)
## No Information Rate : 0.5754
## P-Value [Acc > NIR] : 0.1528
##
## Kappa : 0.0371
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.97335
## Specificity : 0.05933
## Pos Pred Value : 0.58371
## Neg Pred Value : 0.62162
## Prevalence : 0.57539
## Detection Rate : 0.56006
## Detection Prevalence : 0.95947
## Balanced Accuracy : 0.51634
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|--------:|
| Sensitivity          |  0.9734 |
| Specificity          |  0.0593 |
| Pos Pred Value       |  0.5837 |
| Neg Pred Value       |  0.6216 |
| Precision            |  0.5837 |
| Recall               |  0.9734 |
| F1                   |  0.7298 |
| Prevalence           |  0.5754 |
| Detection Rate       |  0.5601 |
| Detection Prevalence |  0.9595 |
| Balanced Accuracy    |  0.5163 |
################################################3
# Random Forest
################################################
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
# Random forest with default settings; for classification mtry defaults to floor(sqrt(p)), which is 4 for the 16 predictors here.
fit_rf <- randomForest(factor(Survival_Status) ~., data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(Survival_Status) ~ ., data = trainData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 3.74%
## Confusion matrix:
## 0 1 class.error
## 0 10544 5 0.0004739786
## 1 405 6 0.9854014599
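# The OOB matrix shows the forest almost never predicts a death (class-1
# error ~98.5%). A common remedy is stratified down-sampling so each tree
# sees balanced classes (a sketch; the sampsize values are illustrative):
n_dead<-sum(trainData$Survival_Status==1)
fit_rf_bal<-randomForest(factor(Survival_Status)~., data=trainData,
strata=factor(trainData$Survival_Status), sampsize=c(n_dead,n_dead))
fit_rf_bal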
varImpPlot(fit_rf)

#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 2737 2
summary(predrf)
## 0 1
## 2737 2
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$Survival_Status
#Result : Prediction Accuracy
mean(y_predrf==y_actrf)
## [1] 0.9594743
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_tablerf<- table(testData$Survival_Status, y_pred_numrf)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 2627 1
## 1 110 1
##
## Accuracy : 0.9595
## 95% CI : (0.9514, 0.9665)
## No Information Rate : 0.9993
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0163
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.959810
## Specificity : 0.500000
## Pos Pred Value : 0.999619
## Neg Pred Value : 0.009009
## Prevalence : 0.999270
## Detection Rate : 0.959109
## Detection Prevalence : 0.959474
## Balanced Accuracy : 0.729905
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|----------------------|----------:|
| Sensitivity          |    0.9598 |
| Specificity          |    0.5000 |
| Pos Pred Value       |    0.9996 |
| Neg Pred Value       |    0.0090 |
| Precision            |    0.9996 |
| Recall               |    0.9598 |
| F1                   |    0.9793 |
| Prevalence           |    0.9993 |
| Detection Rate       |    0.9591 |
| Detection Prevalence |    0.9595 |
| Balanced Accuracy    |    0.7299 |
#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$Survival_Status, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$Survival_Status~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$Survival_Status~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
svm<-svm(Survival_Status~., data = trainData, kernel='linear', cost=10, scale=FALSE)
# Plot the SVC obtained
#windows()
#plot(svm, final)
summary(svm)
##
## Call:
## svm(formula = Survival_Status ~ ., data = trainData, kernel = "linear",
## cost = 10, scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 10
## gamma: 0.0625
## epsilon: 0.1
##
##
## Number of Support Vectors: 4531
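# Note the SVM-Type above: because Survival_Status is numeric, svm() fitted
# eps-regression rather than a classifier. For a classification SVM the
# outcome must be a factor (a sketch; e1071:: qualifies the call because the
# fitted object above reuses the name svm):
svm_cls<-e1071::svm(factor(Survival_Status)~., data=trainData,
kernel='linear', cost=10, scale=FALSE)
summary(svm_cls)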
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09963 0.09993 0.10000 0.10000 0.10006 0.10026
# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM<0.1,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$Survival_Status
#Result : Prediction Accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.5060241
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_tableSVM<- table(testData$Survival_Status, y_predSVM)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 1338 1290
## 1 63 48
##
## Accuracy : 0.506
## 95% CI : (0.4871, 0.5249)
## No Information Rate : 0.5115
## P-Value [Acc > NIR] : 0.7233
##
## Kappa : -0.0093
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.95503
## Specificity : 0.03587
## Pos Pred Value : 0.50913
## Neg Pred Value : 0.43243
## Prevalence : 0.51150
## Detection Rate : 0.48850
## Detection Prevalence : 0.95947
## Balanced Accuracy : 0.49545
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|----------------------|-----------:|
| Sensitivity          |     0.9550 |
| Specificity          |     0.0359 |
| Pos Pred Value       |     0.5091 |
| Neg Pred Value       |     0.4324 |
| Precision            |     0.5091 |
| Recall               |     0.9550 |
| F1                   |     0.6642 |
| Prevalence           |     0.5115 |
| Detection Rate       |     0.4885 |
| Detection Prevalence |     0.9595 |
| Balanced Accuracy    |     0.4955 |
#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$Survival_Status, main="Support Vector Machine", test_prob4,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases

#################################################
############################################
# KNN Model
# knnreg() is provided by caret (already attached above); there is no CRAN package named 'knn'
# Fitting model
fit<-knnreg(Survival_Status ~ ., data = trainData, k = 2)
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
#Predict Output
predicted= predict(fit,testData)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03492 0.00000 1.00000
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$Survival_Status
#Result : Prediction Accuracy
mean(y_predknn==y_actknn)
## [1] 0.9591092
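# knnreg() averages the 0/1 outcome of the k neighbours (regression). A
# voting k-NN classifier is available in the class package (a sketch; k=5
# is illustrative, not tuned):
library(class)
knn_cls<-knn(train=trainData[, names(trainData)!="Survival_Status"],
test=testData[, names(testData)!="Survival_Status"],
cl=factor(trainData$Survival_Status), k=5)
table(knn_cls, testData$Survival_Status)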
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_tableknn<- table(testData$Survival_Status, y_predknn)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_predknn
## 0 1
## 0 2626 2
## 1 110 1
##
## Accuracy : 0.9591
## 95% CI : (0.951, 0.9662)
## No Information Rate : 0.9989
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0154
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.959795
## Specificity : 0.333333
## Pos Pred Value : 0.999239
## Neg Pred Value : 0.009009
## Prevalence : 0.998905
## Detection Rate : 0.958744
## Detection Prevalence : 0.959474
## Balanced Accuracy : 0.646564
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|----------------------|-----------:|
| Sensitivity          |     0.9598 |
| Specificity          |     0.3333 |
| Pos Pred Value       |     0.9992 |
| Neg Pred Value       |     0.0090 |
| Precision            |     0.9992 |
| Recall               |     0.9598 |
| F1                   |     0.9791 |
| Prevalence           |     0.9989 |
| Detection Rate       |     0.9587 |
| Detection Prevalence |     0.9595 |
| Balanced Accuracy    |     0.6466 |
#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData, type = "response")
test_roc5= roc(testData$Survival_Status, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area_Under Curve
############################################
#Area-under Curve
#windows()
plot(test_roc1, print.auc = TRUE, col='brown',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='purple', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
"Logistic-Regression","Support Vector Machine","KNN Model") ,
pch = 15, bty = 'n', col = c("brown","red", "darkgoldenrod4", "green","purple"))

###################################################
# Metrics
###############################################
# Accuracy, recall, F1 score, and precision values for the five models above
Accuracy <- c(0.5951, 0.9595, 0.5853,0.9591,0.5060)
Recall <- c(0.9726, 0.9598, 0.9734,0.9598,0.9550)
F1_score <- c(0.7381, 0.9793, 0.7298,0.9791,0.6642)
Precision<-c(0.5947,0.9996,0.5837,0.9992,0.5091)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression","KNN",
"Support Vector Machine")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
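# reshape2 is retired; tidyr produces the same long format (a sketch):
data_long2 <- tidyr::pivot_longer(data, cols = -Model,
names_to = "Metric", values_to = "Value")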
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "darkblue",
"Recall" = "darkgreen",
"F1_Score" = "magenta",
"Precision" = "darkred")) +
theme_minimal()

#############################################
# Feature Selection
#############################################
#windows()
# Note: par(mfrow) does not affect ggplot2 output; the importance plots below print one per page (gridExtra::grid.arrange() would combine them).
# Logistic Regression
# Get feature importance (absolute coefficients; note raw magnitudes are scale-dependent across predictors)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################3
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "gold") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
knnmodel <- train(Survival_Status ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Child_Twin 100.000
## Birth_Order 78.946
## Child_Sex 37.696
## Age_1st_birth 32.576
## Age_1st_Marriage 31.520
## Residence 13.754
## Health_Access 13.305
## Child_size 8.786
## Number_of_Children 6.657
## Place_of_Birth 6.343
## Husband_Employment_Status 5.405
## Women_Education 3.093
## Age_group 2.598
## Wealth_Quantile 2.485
## Current_Breastfeeding 0.481
## Region 0.000
#windows()
# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "purple") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

########################################
coef(svm)[2:17]
## Age_group Region Residence
## -4.257924e-05 9.735127e-06 -5.251352e-06
## Women_Education Wealth_Quantile Age_1st_birth
## -5.471056e-05 -9.461495e-06 4.167470e-05
## Current_Breastfeeding Health_Access Age_1st_Marriage
## 3.784603e-05 2.520351e-05 -5.520323e-05
## Husband_Employment_Status Birth_Order Child_Twin
## -4.970253e-05 -1.244043e-05 1.414533e-04
## Child_Sex Place_of_Birth Child_size
## -5.897005e-06 -1.529280e-05 -1.098612e-05
## Number_of_Children
## 8.802279e-05
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:17])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
