Bilaawe
# 5 Supervised Machine Learning Models
library(haven)
final<-read_dta("~/BilaaweFinal.dta")
str(final)
## tibble [13,699 × 17] (S3: tbl_df/tbl/data.frame)
## $ Age_group : dbl+lbl [1:13699] 2, 3, 3, 3, 1, 5, 5, 2, 2, 2, 3, 3, 4, 4, 4, 1, 1, 3...
## ..@ label : chr "Age in 5-year groups"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
## $ Region : dbl+lbl [1:13699] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
## .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
## $ Residence : dbl+lbl [1:13699] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:6] 1 2 3 4 5 6
## .. ..- attr(*, "names")= chr [1:6] "Rural" "Urban" "Nomadic" "Rural IDP" ...
## $ Women_Education : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ Wealth_Quantile : dbl+lbl [1:13699] 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 3...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ Age_1st_birth : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1...
## ..@ label : chr "Age of respondent at 1st birth"
## ..@ format.stata: chr "%18.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 20 Years" "20 and More"
## $ Current_Breastfeeding : dbl+lbl [1:13699] 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Currently breastfeeding"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Health_Access : dbl+lbl [1:13699] 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Health Access"
## ..@ format.stata: chr "%3.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Age_1st_Marriage : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Age at first Marriage"
## ..@ format.stata: chr "%18.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 20 Years" "20 and More"
## $ Husband_Employment_Status: dbl+lbl [1:13699] 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Husband/partner worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Birth_Order : num [1:13699] 2 3 4 3 1 6 5 4 3 2 ...
## ..- attr(*, "label")= chr "Birth Order"
## ..- attr(*, "format.stata")= chr "%2.0f"
## $ Child_Twin : dbl+lbl [1:13699] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Twins"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Sing" "Mult"
## $ Child_Sex : dbl+lbl [1:13699] 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1...
## ..@ label : chr "Sex of child"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Male" "Female"
## $ Survival_Status : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Child alive or dead at time of interview"
## ..@ format.stata: chr "%5.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Alive" "Dead"
## $ Place_of_Birth : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Place of delivery"
## ..@ format.stata: chr "%15.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Home" "Health Facility"
## $ Child_size : dbl+lbl [1:13699] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 4...
## ..@ label : chr "Size of child at birth"
## ..@ format.stata: chr "%6.0f"
## ..@ labels : Named num [1:3] 2 3 4
## .. ..- attr(*, "names")= chr [1:3] "Large" "Normal" "Small"
## $ Number_of_Children : dbl+lbl [1:13699] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0...
## ..@ label : chr "Number of children ever born"
## ..@ format.stata: chr "%20.0g"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "0-5 Children" "More than 5 Children"
# Round-trip through CSV to strip the Stata value labels (all columns become plain numerics)
write.csv(final,"~/BilaaweFinal.csv")
final<-read.csv("~/BilaaweFinal.csv")
final<-final[,-1] # drop the row-name column that write.csv added
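# The same result without a temporary CSV (a sketch): re-read the .dta and
# drop the value labels in memory with haven::zap_labels().
final_plain<-as.data.frame(haven::zap_labels(read_dta("~/BilaaweFinal.dta")))
str(final_plain$Survival_Status) # plain numeric 0/1, labels gone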
# Logistic Regression
logistic<-glm(Survival_Status~.,data=final,family = binomial())
summary(logistic)
##
## Call:
## glm(formula = Survival_Status ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.039494 0.547803 -7.374 1.66e-13 ***
## Age_group -0.289915 0.059205 -4.897 9.74e-07 ***
## Region -0.006680 0.009168 -0.729 0.466207
## Residence 0.017392 0.061488 0.283 0.777288
## Women_Education -0.062057 0.102370 -0.606 0.544377
## Wealth_Quantile -0.045697 0.039006 -1.172 0.241381
## Age_1st_birth 0.118418 0.124090 0.954 0.339934
## Current_Breastfeeding 0.094027 0.091848 1.024 0.305968
## Health_Access -0.177867 0.100023 -1.778 0.075362 .
## Age_1st_Marriage -0.203578 0.137516 -1.480 0.138769
## Husband_Employment_Status -0.173572 0.097771 -1.775 0.075848 .
## Birth_Order 0.268633 0.031092 8.640 < 2e-16 ***
## Child_Twin 1.351508 0.227953 5.929 3.05e-09 ***
## Child_Sex -0.306914 0.091827 -3.342 0.000831 ***
## Place_of_Birth -0.120658 0.129297 -0.933 0.350727
## Child_size 0.136764 0.097881 1.397 0.162340
## Number_of_Children -0.530133 0.155735 -3.404 0.000664 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4435.0 on 13698 degrees of freedom
## Residual deviance: 4284.4 on 13682 degrees of freedom
## AIC: 4318.4
##
## Number of Fisher Scoring iterations: 6
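# From the deviances above, McFadden's pseudo-R^2 gives a quick fit summary
# (a sketch; about 0.034 here, i.e. a modest improvement over the null model):
1-logistic$deviance/logistic$null.deviance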
round(coef(summary(logistic)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0395 0.5478 -7.3740 0.0000
## Age_group -0.2899 0.0592 -4.8968 0.0000
## Region -0.0067 0.0092 -0.7287 0.4662
## Residence 0.0174 0.0615 0.2829 0.7773
## Women_Education -0.0621 0.1024 -0.6062 0.5444
## Wealth_Quantile -0.0457 0.0390 -1.1715 0.2414
## Age_1st_birth 0.1184 0.1241 0.9543 0.3399
## Current_Breastfeeding 0.0940 0.0918 1.0237 0.3060
## Health_Access -0.1779 0.1000 -1.7783 0.0754
## Age_1st_Marriage -0.2036 0.1375 -1.4804 0.1388
## Husband_Employment_Status -0.1736 0.0978 -1.7753 0.0758
## Birth_Order 0.2686 0.0311 8.6399 0.0000
## Child_Twin 1.3515 0.2280 5.9289 0.0000
## Child_Sex -0.3069 0.0918 -3.3423 0.0008
## Place_of_Birth -0.1207 0.1293 -0.9332 0.3507
## Child_size 0.1368 0.0979 1.3972 0.1623
## Number_of_Children -0.5301 0.1557 -3.4041 0.0007
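# The estimates are on the log-odds scale; exponentiating gives odds ratios.
# A sketch using Wald confidence intervals (confint.default):
round(exp(cbind(OR=coef(logistic), confint.default(logistic))),3)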
# Visualization of Parameter Effects
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
#windows()
plot(allEffects(logistic),cex.main=.75, cex.lab=.1, cex.axis=0.1)

# Prediction Models
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Prep Training and Test dataset
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$Survival_Status, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
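# createDataPartition() stratifies on the outcome, so both sets should keep
# the same low event rate (roughly 4% deaths). A quick check (sketch):
prop.table(table(trainData$Survival_Status))
prop.table(table(testData$Survival_Status))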
###################################################
# Logistic regression refit on the training data (all predictors)
###################################################
logitmod2<-glm(Survival_Status~ .,
family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = Survival_Status ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.283697 0.613656 -6.981 2.94e-12 ***
## Age_group -0.287426 0.066324 -4.334 1.47e-05 ***
## Region 0.001086 0.010315 0.105 0.916160
## Residence 0.102506 0.069961 1.465 0.142868
## Women_Education -0.016257 0.112872 -0.144 0.885475
## Wealth_Quantile -0.052661 0.044278 -1.189 0.234308
## Age_1st_birth 0.052528 0.141620 0.371 0.710705
## Current_Breastfeeding 0.056552 0.103668 0.546 0.585404
## Health_Access -0.209171 0.113318 -1.846 0.064910 .
## Age_1st_Marriage -0.119592 0.155688 -0.768 0.442397
## Husband_Employment_Status -0.162683 0.110103 -1.478 0.139527
## Birth_Order 0.273201 0.034700 7.873 3.45e-15 ***
## Child_Twin 1.404853 0.244603 5.743 9.28e-09 ***
## Child_Sex -0.372027 0.104259 -3.568 0.000359 ***
## Place_of_Birth -0.101659 0.146294 -0.695 0.487120
## Child_size 0.146484 0.110097 1.330 0.183354
## Number_of_Children -0.635916 0.176183 -3.609 0.000307 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3505.4 on 10959 degrees of freedom
## Residual deviance: 3376.8 on 10943 degrees of freedom
## AIC: 3410.8
##
## Number of Fisher Scoring iterations: 6
# Apply the model to predict on the test data
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005838 0.023321 0.032442 0.036825 0.044333 0.449391
# Measure the accuracy of prediction in the test data
# (the 0.036 cutoff is roughly the mean predicted probability; with only
# ~4% positives, the usual 0.5 cutoff would predict no deaths at all)
y_pred_num<-ifelse(pred<0.036,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$Survival_Status
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.5951077
# Plot ROC Curve
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.6012519
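# The 0.036 cutoff was hand-picked; InformationValue can search for one
# instead (a sketch; optimiseFor="Both" maximises Youden's J statistic):
InformationValue::optimalCutoff(y_act, pred, optimiseFor="Both")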
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_table <- table(testData$Survival_Status, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 1563 1065
## 1 44 67
##
## Accuracy : 0.5951
## 95% CI : (0.5764, 0.6136)
## No Information Rate : 0.5867
## P-Value [Acc > NIR] : 0.1914
##
## Kappa : 0.0367
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.97262
## Specificity : 0.05919
## Pos Pred Value : 0.59475
## Neg Pred Value : 0.60360
## Prevalence : 0.58671
## Detection Rate : 0.57065
## Detection Prevalence : 0.95947
## Balanced Accuracy : 0.51590
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|--------:|
| Sensitivity          |  0.9726 |
| Specificity          |  0.0592 |
| Pos Pred Value       |  0.5947 |
| Neg Pred Value       |  0.6036 |
| Precision            |  0.5947 |
| Recall               |  0.9726 |
| F1                   |  0.7381 |
| Prevalence           |  0.5867 |
| Detection Rate       |  0.5706 |
| Detection Prevalence |  0.9595 |
| Balanced Accuracy    |  0.5159 |
#############################################
# Probit Regression
#############################################
# Probit Regression
probitmod<-glm(Survival_Status~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = Survival_Status ~ ., family = binomial(link = "probit"),
## data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.2189107 0.2834884 -7.827 4.99e-15 ***
## Age_group -0.1361225 0.0298419 -4.561 5.08e-06 ***
## Region -0.0009604 0.0046288 -0.207 0.835637
## Residence 0.0401587 0.0310505 1.293 0.195895
## Women_Education -0.0066852 0.0493454 -0.135 0.892235
## Wealth_Quantile -0.0255685 0.0198320 -1.289 0.197310
## Age_1st_birth 0.0278509 0.0634860 0.439 0.660884
## Current_Breastfeeding 0.0226100 0.0464648 0.487 0.626539
## Health_Access -0.0968806 0.0503279 -1.925 0.054231 .
## Age_1st_Marriage -0.0420892 0.0682940 -0.616 0.537700
## Husband_Employment_Status -0.0822458 0.0494838 -1.662 0.096498 .
## Birth_Order 0.1274120 0.0161787 7.875 3.40e-15 ***
## Child_Twin 0.6806168 0.1292031 5.268 1.38e-07 ***
## Child_Sex -0.1679732 0.0462551 -3.631 0.000282 ***
## Place_of_Birth -0.0420166 0.0647343 -0.649 0.516297
## Child_size 0.0644779 0.0497044 1.297 0.194553
## Number_of_Children -0.2894850 0.0794152 -3.645 0.000267 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3505.4 on 10959 degrees of freedom
## Residual deviance: 3375.5 on 10943 degrees of freedom
## AIC: 3409.5
##
## Number of Fisher Scoring iterations: 6
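# Probit coefficients are on a different scale from logit ones; as a rule of
# thumb, logit estimates run about 1.6-1.8 times the probit estimates. A
# quick sketch check on the two fitted models:
round(coef(logitmod2)/coef(probitmod), 2)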
# Apply the model to predict on the test data
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.004173 0.022911 0.032552 0.036881 0.045116 0.373586
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.036,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$Survival_Status
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.5852501
# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.6090577
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_table <- table(testData$Survival_Status, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 1534 1094
## 1 42 69
##
## Accuracy : 0.5853
## 95% CI : (0.5665, 0.6038)
## No Information Rate : 0.5754
## P-Value [Acc > NIR] : 0.1528
##
## Kappa : 0.0371
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.97335
## Specificity : 0.05933
## Pos Pred Value : 0.58371
## Neg Pred Value : 0.62162
## Prevalence : 0.57539
## Detection Rate : 0.56006
## Detection Prevalence : 0.95947
## Balanced Accuracy : 0.51634
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|--------:|
| Sensitivity          |  0.9734 |
| Specificity          |  0.0593 |
| Pos Pred Value       |  0.5837 |
| Neg Pred Value       |  0.6216 |
| Precision            |  0.5837 |
| Recall               |  0.9734 |
| F1                   |  0.7298 |
| Prevalence           |  0.5754 |
| Detection Rate       |  0.5601 |
| Detection Prevalence |  0.9595 |
| Balanced Accuracy    |  0.5163 |
################################################3
# Random Forest
################################################
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
# Random forest with default settings; for classification mtry defaults to floor(sqrt(p)), which is 4 for the 16 predictors here.
fit_rf <- randomForest(factor(Survival_Status) ~., data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(Survival_Status) ~ ., data = trainData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 3.74%
## Confusion matrix:
## 0 1 class.error
## 0 10544 5 0.0004739786
## 1 405 6 0.9854014599
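# The OOB matrix shows the forest almost never predicts a death (class-1
# error ~98.5%). A common remedy is stratified down-sampling so each tree
# sees balanced classes (a sketch; the sampsize values are illustrative):
n_dead<-sum(trainData$Survival_Status==1)
fit_rf_bal<-randomForest(factor(Survival_Status)~., data=trainData,
strata=factor(trainData$Survival_Status), sampsize=c(n_dead,n_dead))
fit_rf_bal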
varImpPlot(fit_rf)

#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 2737 2
summary(predrf)
## 0 1
## 2737 2
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$Survival_Status
#Result : Prediction Accuracy
mean(y_predrf==y_actrf)
## [1] 0.9594743
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_tablerf<- table(testData$Survival_Status, y_pred_numrf)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 2627 1
## 1 110 1
##
## Accuracy : 0.9595
## 95% CI : (0.9514, 0.9665)
## No Information Rate : 0.9993
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0163
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.959810
## Specificity : 0.500000
## Pos Pred Value : 0.999619
## Neg Pred Value : 0.009009
## Prevalence : 0.999270
## Detection Rate : 0.959109
## Detection Prevalence : 0.959474
## Balanced Accuracy : 0.729905
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|----------------------|----------:|
| Sensitivity          |    0.9598 |
| Specificity          |    0.5000 |
| Pos Pred Value       |    0.9996 |
| Neg Pred Value       |    0.0090 |
| Precision            |    0.9996 |
| Recall               |    0.9598 |
| F1                   |    0.9793 |
| Prevalence           |    0.9993 |
| Detection Rate       |    0.9591 |
| Detection Prevalence |    0.9595 |
| Balanced Accuracy    |    0.7299 |
#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$Survival_Status, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$Survival_Status~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$Survival_Status~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
svm<-svm(Survival_Status~., data = trainData, kernel='linear', cost=10, scale=FALSE)
# Plot the SVC obtained
#windows()
#plot(svm, final)
summary(svm)
##
## Call:
## svm(formula = Survival_Status ~ ., data = trainData, kernel = "linear",
## cost = 10, scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 10
## gamma: 0.0625
## epsilon: 0.1
##
##
## Number of Support Vectors: 4531
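# Note the SVM-Type above: because Survival_Status is numeric, svm() fitted
# eps-regression rather than a classifier. For a classification SVM the
# outcome must be a factor (a sketch; e1071:: qualifies the call because the
# fitted object above reuses the name svm):
svm_cls<-e1071::svm(factor(Survival_Status)~., data=trainData,
kernel='linear', cost=10, scale=FALSE)
summary(svm_cls)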
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09963 0.09993 0.10000 0.10000 0.10006 0.10026
# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM<0.1,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$Survival_Status
#Result : Prediction Accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.5060241
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_tableSVM<- table(testData$Survival_Status, y_predSVM)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 1338 1290
## 1 63 48
##
## Accuracy : 0.506
## 95% CI : (0.4871, 0.5249)
## No Information Rate : 0.5115
## P-Value [Acc > NIR] : 0.7233
##
## Kappa : -0.0093
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.95503
## Specificity : 0.03587
## Pos Pred Value : 0.50913
## Neg Pred Value : 0.43243
## Prevalence : 0.51150
## Detection Rate : 0.48850
## Detection Prevalence : 0.95947
## Balanced Accuracy : 0.49545
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|----------------------|-----------:|
| Sensitivity          |     0.9550 |
| Specificity          |     0.0359 |
| Pos Pred Value       |     0.5091 |
| Neg Pred Value       |     0.4324 |
| Precision            |     0.5091 |
| Recall               |     0.9550 |
| F1                   |     0.6642 |
| Prevalence           |     0.5115 |
| Detection Rate       |     0.4885 |
| Detection Prevalence |     0.9595 |
| Balanced Accuracy    |     0.4955 |
#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$Survival_Status, main="Support Vector Machine", test_prob4,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases

#################################################
############################################
# KNN Model
# knnreg() is provided by caret (already attached above); there is no CRAN package named 'knn'
# Fitting model
fit<-knnreg(Survival_Status ~ ., data = trainData, k = 2)
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
#Predict Output
predicted= predict(fit,testData)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03492 0.00000 1.00000
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$Survival_Status
#Result : Prediction Accuracy
mean(y_predknn==y_actknn)
## [1] 0.9591092
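# knnreg() averages the 0/1 outcome of the k neighbours (regression). A
# voting k-NN classifier is available in the class package (a sketch; k=5
# is illustrative, not tuned):
library(class)
knn_cls<-knn(train=trainData[, names(trainData)!="Survival_Status"],
test=testData[, names(testData)!="Survival_Status"],
cl=factor(trainData$Survival_Status), k=5)
table(knn_cls, testData$Survival_Status)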
# Create a confusion table showing where each observation was placed and whether it was classified correctly
confusion_tableknn<- table(testData$Survival_Status, y_predknn)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_predknn
## 0 1
## 0 2626 2
## 1 110 1
##
## Accuracy : 0.9591
## 95% CI : (0.951, 0.9662)
## No Information Rate : 0.9989
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0154
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.959795
## Specificity : 0.333333
## Pos Pred Value : 0.999239
## Neg Pred Value : 0.009009
## Prevalence : 0.998905
## Detection Rate : 0.958744
## Detection Prevalence : 0.959474
## Balanced Accuracy : 0.646564
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|----------------------|-----------:|
| Sensitivity          |     0.9598 |
| Specificity          |     0.3333 |
| Pos Pred Value       |     0.9992 |
| Neg Pred Value       |     0.0090 |
| Precision            |     0.9992 |
| Recall               |     0.9598 |
| F1                   |     0.9791 |
| Prevalence           |     0.9989 |
| Detection Rate       |     0.9587 |
| Detection Prevalence |     0.9595 |
| Balanced Accuracy    |     0.6466 |
#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData, type = "response")
test_roc5= roc(testData$Survival_Status, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area_Under Curve
############################################
#Area-under Curve
#windows()
plot(test_roc1, print.auc = TRUE, col='brown',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='purple', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
"Logistic-Regression","Support Vector Machine","KNN Model") ,
pch = 15, bty = 'n', col = c("brown","red", "darkgoldenrod4", "green","purple"))

###################################################
# Metrics
###############################################
# Accuracy, recall, F1 score, and precision values for the five models above
Accuracy <- c(0.5951, 0.9595, 0.5853,0.9591,0.5060)
Recall <- c(0.9726, 0.9598, 0.9734,0.9598,0.9550)
F1_score <- c(0.7381, 0.9793, 0.7298,0.9791,0.6642)
Precision<-c(0.5947,0.9996,0.5837,0.9992,0.5091)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression","KNN",
"Support Vector Machine")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
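# reshape2 is retired; tidyr produces the same long format (a sketch):
data_long2 <- tidyr::pivot_longer(data, cols = -Model,
names_to = "Metric", values_to = "Value")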
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "darkblue",
"Recall" = "darkgreen",
"F1_Score" = "magenta",
"Precision" = "darkred")) +
theme_minimal()

#############################################
# Feature Selection
#############################################
#windows()
# Note: par(mfrow) does not affect ggplot2 output; the importance plots below print one per page (gridExtra::grid.arrange() would combine them).
# Logistic Regression
# Get feature importance (absolute coefficients; note raw magnitudes are scale-dependent across predictors)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################3
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "gold") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
knnmodel <- train(Survival_Status ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Child_Twin 100.000
## Birth_Order 78.946
## Child_Sex 37.696
## Age_1st_birth 32.576
## Age_1st_Marriage 31.520
## Residence 13.754
## Health_Access 13.305
## Child_size 8.786
## Number_of_Children 6.657
## Place_of_Birth 6.343
## Husband_Employment_Status 5.405
## Women_Education 3.093
## Age_group 2.598
## Wealth_Quantile 2.485
## Current_Breastfeeding 0.481
## Region 0.000
#windows()
# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "purple") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

########################################
coef(svm)[2:17]
## Age_group Region Residence
## -4.257924e-05 9.735127e-06 -5.251352e-06
## Women_Education Wealth_Quantile Age_1st_birth
## -5.471056e-05 -9.461495e-06 4.167470e-05
## Current_Breastfeeding Health_Access Age_1st_Marriage
## 3.784603e-05 2.520351e-05 -5.520323e-05
## Husband_Employment_Status Birth_Order Child_Twin
## -4.970253e-05 -1.244043e-05 1.414533e-04
## Child_Sex Place_of_Birth Child_size
## -5.897005e-06 -1.529280e-05 -1.098612e-05
## Number_of_Children
## 8.802279e-05
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:17])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
