Leila Project
# 5 Supervised Machine Learning Models
# Unintended Childbirth in Somaliland
library(haven)
final<-read_dta("~/Leyla.dta")
str(final)
## tibble [3,737 × 15] (S3: tbl_df/tbl/data.frame)
## $ Age_Group : dbl+lbl [1:3737] 2, 3, 3, 3, 5, 5, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 1, 5,...
## ..@ label : chr "Age in 5-year groups"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
## $ Region : dbl+lbl [1:3737] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
## .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
## $ Residence : dbl+lbl [1:3737] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:6] 1 2 3 4 5 6
## .. ..- attr(*, "names")= chr [1:6] "Rural" "Urban" "Nomadic" "Rural IDP" ...
## $ Women_Educationlevel : dbl+lbl [1:3737] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0,...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ Listening_Radio : dbl+lbl [1:3737] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3,...
## ..@ label : chr "Frequency of listening to radio"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Watching_Television : dbl+lbl [1:3737] 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3,...
## ..@ label : chr "Frequency of watching television"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Wealth_Quantile : dbl+lbl [1:3737] 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, 3, 3, 3, 3, 2,...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ Contraceptive_Method : dbl+lbl [1:3737] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## ..@ label : chr "Current contraceptive method"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:14] 0 1 2 3 4 5 6 7 8 9 ...
## .. ..- attr(*, "names")= chr [1:14] "Not currently using(pregnant)" "IUD" "Injectables" "Implants" ...
## $ Conctraceptive_Use : dbl+lbl [1:3737] 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,...
## ..@ label : chr "Contraceptive use and intention"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Using mordern method" "Using traditional method" "Non-User, intends to use later" "Does not intend to use" ...
## $ Health_Access : dbl+lbl [1:3737] 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,...
## ..@ label : chr "Getting medical help for self: distance to health facility"
## ..@ format.stata: chr "%10.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "No problem" "Problem"
## $ Husband_Education_Level : dbl+lbl [1:3737] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1,...
## ..@ label : chr "Husband/partner's ever attended school"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Husband_Employment_Status: dbl+lbl [1:3737] 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,...
## ..@ label : chr "Husband/partner worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Women_Employment_Status : dbl+lbl [1:3737] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
## ..@ label : chr "Respondent worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ DV : dbl+lbl [1:3737] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## ..@ label : chr "Wanted pregnancy when became pregnant"
## ..@ format.stata: chr "%10.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Wanted" "Unwanted"
## $ Number_of_Children : dbl+lbl [1:3737] 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,...
## ..@ label : chr "Total Children Ever Born"
## ..@ format.stata: chr "%20.0g"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 5 Children" "5 Children+"
write.csv(final,"~/Leyla.csv")
final<-read.csv("~/Leyla.csv")
final<-final[,-1]
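# Note: the CSV round-trip above strips the Stata value labels, so every
# column is read back as plain numeric and the labelled codes enter the
# models below as numeric scores.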
# Logistic Regression
logistic<-glm(DV~.,data=final,family = binomial())
summary(logistic)
##
## Call:
## glm(formula = DV ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.15319 2.09284 -1.984 0.047203 *
## Age_Group 0.10117 0.07391 1.369 0.171046
## Region -0.10721 0.06149 -1.744 0.081216 .
## Residence 0.17885 0.11141 1.605 0.108436
## Women_Educationlevel 0.03472 0.17782 0.195 0.845183
## Listening_Radio 0.20495 0.16820 1.218 0.223051
## Watching_Television 0.12066 0.14380 0.839 0.401448
## Wealth_Quantile 0.15088 0.07334 2.057 0.039646 *
## Contraceptive_Method -0.12628 0.17266 -0.731 0.464562
## Conctraceptive_Use -0.24896 0.24527 -1.015 0.310078
## Health_Access 0.57360 0.17474 3.283 0.001029 **
## Husband_Education_Level -0.39456 0.24500 -1.610 0.107301
## Husband_Employment_Status 0.46855 0.18699 2.506 0.012222 *
## Women_Employment_Status -0.09938 0.73760 -0.135 0.892825
## Number_of_Children 0.70398 0.21334 3.300 0.000968 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1290.1 on 3736 degrees of freedom
## Residual deviance: 1236.4 on 3722 degrees of freedom
## AIC: 1266.4
##
## Number of Fisher Scoring iterations: 6
round(coef(summary(logistic)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.1532 2.0928 -1.9845 0.0472
## Age_Group 0.1012 0.0739 1.3689 0.1710
## Region -0.1072 0.0615 -1.7437 0.0812
## Residence 0.1788 0.1114 1.6053 0.1084
## Women_Educationlevel 0.0347 0.1778 0.1953 0.8452
## Listening_Radio 0.2049 0.1682 1.2185 0.2231
## Watching_Television 0.1207 0.1438 0.8390 0.4014
## Wealth_Quantile 0.1509 0.0733 2.0574 0.0396
## Contraceptive_Method -0.1263 0.1727 -0.7314 0.4646
## Conctraceptive_Use -0.2490 0.2453 -1.0151 0.3101
## Health_Access 0.5736 0.1747 3.2825 0.0010
## Husband_Education_Level -0.3946 0.2450 -1.6104 0.1073
## Husband_Employment_Status 0.4685 0.1870 2.5057 0.0122
## Women_Employment_Status -0.0994 0.7376 -0.1347 0.8928
## Number_of_Children 0.7040 0.2133 3.2998 0.0010
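# Since these estimates are log-odds, exponentiating them gives odds ratios.
# A quick interpretive sketch (Wald-type intervals via confint.default;
# output omitted):
round(exp(cbind(OR = coef(logistic), confint.default(logistic))), 3)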
# Note: refitting the logistic regression "without factors" is redundant here.
# The CSV round-trip above already stripped the Stata labels, so every
# predictor enters the model numerically and the fit is identical to the one
# just shown.
# Prediction Models
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Prep Training and Test dataset
# First, create a train-test split with 80% of the data in the training set (p = 0.8)
set.seed(123)
trainDataIndex<-createDataPartition(final$DV, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
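# Optional sanity check (output omitted): DV == 1 is rare (~4% of cases), so
# confirm the partition left a similar share of positives in both splits
prop.table(table(trainData$DV))
prop.table(table(testData$DV))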
###################################################
# Logistic regression on the training data (all predictors)
###################################################
logitmod2 <- glm(DV ~ ., family = "binomial", data = trainData)
summary(logitmod2)
##
## Call:
## glm(formula = DV ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.77706 2.24337 -2.129 0.03322 *
## Age_Group 0.06012 0.08343 0.721 0.47118
## Region -0.07494 0.06828 -1.098 0.27240
## Residence 0.15535 0.12236 1.270 0.20424
## Women_Educationlevel 0.12964 0.18947 0.684 0.49385
## Listening_Radio 0.31853 0.20631 1.544 0.12262
## Watching_Television 0.17045 0.16012 1.064 0.28712
## Wealth_Quantile 0.15235 0.08082 1.885 0.05943 .
## Contraceptive_Method -0.06165 0.17495 -0.352 0.72456
## Conctraceptive_Use -0.16790 0.27070 -0.620 0.53510
## Health_Access 0.58578 0.19447 3.012 0.00259 **
## Husband_Education_Level -0.44541 0.27109 -1.643 0.10037
## Husband_Employment_Status 0.40141 0.20919 1.919 0.05500 .
## Women_Employment_Status -0.21423 0.74201 -0.289 0.77280
## Number_of_Children 0.70175 0.23753 2.954 0.00313 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1044.6 on 2989 degrees of freedom
## Residual deviance: 1004.7 on 2975 degrees of freedom
## AIC: 1034.7
##
## Number of Fisher Scoring iterations: 6
# Apply the model to predict on the test data
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.005977 0.023367 0.037275 0.041888 0.053084 0.162106
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.041,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
# Result: prediction accuracy (this step is deterministic, so no seed is needed)
mean(y_pred==y_act)
## [1] 0.5783133
# Plot ROC Curve
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.6785371
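# The 0.041 cutoff used above is roughly the mean predicted probability. As an
# alternative sketch, InformationValue can derive a cutoff that maximises
# Youden's index (output omitted):
InformationValue::optimalCutoff(y_act, pred, optimiseFor = "Both")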
# Create a confusion table showing where each respondent was placed and whether the placement was correct
confusion_table <- table(testData$DV, y_pred_num)
# Display the confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 413 305
## 1 10 19
##
## Accuracy : 0.5783
## 95% CI : (0.542, 0.614)
## No Information Rate : 0.5663
## P-Value [Acc > NIR] : 0.2655
##
## Kappa : 0.0392
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.97636
## Specificity : 0.05864
## Pos Pred Value : 0.57521
## Neg Pred Value : 0.65517
## Prevalence : 0.56627
## Detection Rate : 0.55288
## Detection Prevalence : 0.96118
## Balanced Accuracy : 0.51750
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|--------:|
| Sensitivity          |  0.9764 |
| Specificity          |  0.0586 |
| Pos Pred Value       |  0.5752 |
| Neg Pred Value       |  0.6552 |
| Precision            |  0.5752 |
| Recall               |  0.9764 |
| F1                   |  0.7239 |
| Prevalence           |  0.5663 |
| Detection Rate       |  0.5529 |
| Detection Prevalence |  0.9612 |
| Balanced Accuracy    |  0.5175 |
#############################################
# Probit Regression
#############################################
probitmod <- glm(DV ~ ., family = binomial(link = "probit"), data = trainData)
summary(probitmod)
##
## Call:
## glm(formula = DV ~ ., family = binomial(link = "probit"), data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.42650 1.01984 -2.379 0.01735 *
## Age_Group 0.03448 0.03842 0.897 0.36946
## Region -0.03284 0.03126 -1.050 0.29357
## Residence 0.06907 0.05576 1.239 0.21542
## Women_Educationlevel 0.06571 0.08564 0.767 0.44292
## Listening_Radio 0.13697 0.08854 1.547 0.12185
## Watching_Television 0.07954 0.07237 1.099 0.27170
## Wealth_Quantile 0.07072 0.03736 1.893 0.05835 .
## Contraceptive_Method -0.03050 0.07975 -0.382 0.70214
## Conctraceptive_Use -0.07985 0.12303 -0.649 0.51632
## Health_Access 0.27795 0.08999 3.089 0.00201 **
## Husband_Education_Level -0.20451 0.12192 -1.677 0.09347 .
## Husband_Employment_Status 0.18748 0.09654 1.942 0.05215 .
## Women_Employment_Status -0.13447 0.34102 -0.394 0.69335
## Number_of_Children 0.31136 0.10619 2.932 0.00337 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1044.6 on 2989 degrees of freedom
## Residual deviance: 1004.0 on 2975 degrees of freedom
## AIC: 1034
##
## Number of Fisher Scoring iterations: 6
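# Aside: logit and probit links typically yield almost identical fitted
# probabilities; a one-line check on these two fits (output omitted):
cor(predict(logitmod2, type = "response"), predict(probitmod, type = "response"))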
# Apply the model to predict on the test data
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00453 0.02295 0.03757 0.04191 0.05449 0.15276
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.04,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
# Result: prediction accuracy (deterministic, no seed needed)
mean(y_pred==y_act)
## [1] 0.5609103
# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.6685957
# Create a confusion table showing where each respondent was placed and whether the placement was correct
confusion_table <- table(testData$DV, y_pred_num)
# Display the confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 400 318
## 1 10 19
##
## Accuracy : 0.5609
## 95% CI : (0.5245, 0.5969)
## No Information Rate : 0.5489
## P-Value [Acc > NIR] : 0.2662
##
## Kappa : 0.0348
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.97561
## Specificity : 0.05638
## Pos Pred Value : 0.55710
## Neg Pred Value : 0.65517
## Prevalence : 0.54886
## Detection Rate : 0.53548
## Detection Prevalence : 0.96118
## Balanced Accuracy : 0.51599
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|--------:|
| Sensitivity          |  0.9756 |
| Specificity          |  0.0564 |
| Pos Pred Value       |  0.5571 |
| Neg Pred Value       |  0.6552 |
| Precision            |  0.5571 |
| Recall               |  0.9756 |
| F1                   |  0.7092 |
| Prevalence           |  0.5489 |
| Detection Rate       |  0.5355 |
| Detection Prevalence |  0.9612 |
| Balanced Accuracy    |  0.5160 |
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
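# Caveat: |coefficient| is only a rough importance ranking; it is comparable
# across predictors only insofar as they share a scale, and here the
# predictors are unstandardised integer codes.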

################################################
# Random Forest
################################################
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
# Random forest with the default mtry: with 14 predictors, floor(sqrt(14)) = 3 variables are tried at each split (see the output below).
fit_rf <- randomForest(factor(DV) ~., data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(DV) ~ ., data = trainData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 4.21%
## Confusion matrix:
## 0 1 class.error
## 0 2864 0 0
## 1 126 0 1
varImpPlot(fit_rf)
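# The OOB confusion matrix above shows the forest never predicts class 1, a
# symptom of the ~4% class imbalance. One standard remedy (a sketch only, not
# used for the results below) is stratified down-sampling so each tree sees
# both classes equally:
# fit_rf_bal <- randomForest(factor(DV) ~ ., data = trainData,
#                            strata = factor(trainData$DV), sampsize = c(126, 126))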

#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 746 1
summary(predrf)
## 0 1
## 746 1
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$DV
#Result : Prediction Accuracy
mean(y_predrf==y_actrf)
## [1] 0.9598394
# Create a confusion table showing where each respondent was placed and whether the placement was correct
confusion_tablerf <- table(testData$DV, y_pred_numrf)
# Display the confusion matrix and its associated statistics
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 717 1
## 1 29 0
##
## Accuracy : 0.9598
## 95% CI : (0.9432, 0.9727)
## No Information Rate : 0.9987
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0026
##
## Mcnemar's Test P-Value : 8.244e-07
##
## Sensitivity : 0.9611
## Specificity : 0.0000
## Pos Pred Value : 0.9986
## Neg Pred Value : 0.0000
## Prevalence : 0.9987
## Detection Rate : 0.9598
## Detection Prevalence : 0.9612
## Balanced Accuracy : 0.4806
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|----------------------|----------:|
| Sensitivity          |    0.9611 |
| Specificity          |    0.0000 |
| Pos Pred Value       |    0.9986 |
| Neg Pred Value       |    0.0000 |
| Precision            |    0.9986 |
| Recall               |    0.9611 |
| F1                   |    0.9795 |
| Prevalence           |    0.9987 |
| Detection Rate       |    0.9598 |
| Detection Prevalence |    0.9612 |
| Balanced Accuracy    |    0.4806 |
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$DV, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$DV~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$DV~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
svm<-svm(DV~., data = trainData, kernel='linear', cost=10, scale=FALSE)
# plot.svm only draws classification fits, so it cannot visualise this
# regression-type model:
# plot(svm, final)
summary(svm)
##
## Call:
## svm(formula = DV ~ ., data = trainData, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 10
## gamma: 0.07142857
## epsilon: 0.1
##
##
## Number of Support Vectors: 533
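# Note the "eps-regression" type above: because DV is numeric, svm() silently
# fit a regression rather than a classifier. A classification sketch (not run,
# so the downstream results are unchanged):
# svm_clf <- svm(factor(DV) ~ ., data = trainData, kernel = "linear",
#                cost = 10, type = "C-classification", probability = TRUE)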
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09953 0.09986 0.10000 0.09999 0.10012 0.10044
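# The fitted values barely move off 0.10 (range ~0.0995 to 0.1004), so the 0.1
# cutoff below is splitting near-constant scores; this is why the model's
# accuracy lands near chance level.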
# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM<0.1,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$DV
#Result : Prediction Accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.5033467
# Create a confusion table showing where each respondent was placed and whether the placement was correct
confusion_tableSVM <- table(testData$DV, y_predSVM)
# Display the confusion matrix and its associated statistics
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 361 357
## 1 14 15
##
## Accuracy : 0.5033
## 95% CI : (0.4669, 0.5398)
## No Information Rate : 0.502
## P-Value [Acc > NIR] : 0.4854
##
## Kappa : 0.003
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.96267
## Specificity : 0.04032
## Pos Pred Value : 0.50279
## Neg Pred Value : 0.51724
## Prevalence : 0.50201
## Detection Rate : 0.48327
## Detection Prevalence : 0.96118
## Balanced Accuracy : 0.50149
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|----------------------|-----------:|
| Sensitivity          |     0.9627 |
| Specificity          |     0.0403 |
| Pos Pred Value       |     0.5028 |
| Neg Pred Value       |     0.5172 |
| Precision            |     0.5028 |
| Recall               |     0.9627 |
| F1                   |     0.6606 |
| Prevalence           |     0.5020 |
| Detection Rate       |     0.4833 |
| Detection Prevalence |     0.9612 |
| Balanced Accuracy    |     0.5015 |
#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$DV, test_prob4, plot = TRUE, main = "Support Vector Machine", col = 'darkblue', print.auc = TRUE, percent = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

coef(svm)[2:15]
## Age_Group Region Residence
## 3.844677e-05 3.637503e-06 -7.235190e-05
## Women_Educationlevel Listening_Radio Watching_Television
## 7.425885e-05 -7.779376e-05 -3.979415e-05
## Wealth_Quantile Contraceptive_Method Conctraceptive_Use
## -9.079849e-05 1.878193e-05 7.595457e-05
## Health_Access Husband_Education_Level Husband_Employment_Status
## 5.815800e-05 -3.459137e-05 -4.565393e-05
## Women_Employment_Status Number_of_Children
## -1.582482e-04 6.895394e-05
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:15])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###############################################
# KNN Model
###############################################
# knnreg() is provided by caret (already loaded above), so no extra package is required
# Fit the model. Use DV ~ . (not trainData$DV ~ .): with the latter, the "."
# expansion would include DV itself and leak the outcome into the predictors.
fit <- knnreg(DV ~ ., data = trainData, k = 1)
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
#Predict Output
predicted= predict(fit,testData)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.03529 0.00000 1.00000
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$DV
#Result : Prediction Accuracy
mean(y_predknn==y_actknn)
## [1] 0.9558233
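# Caveat: 1-nearest-neighbour on unscaled codes lets wide-ranging variables
# (e.g. Region, coded from 11 upwards) dominate the Euclidean distance. A
# hedged alternative via caret (not run; resampling consumes the RNG stream):
# knn_scaled <- train(factor(DV) ~ ., data = trainData, method = "knn",
#                     preProcess = c("center", "scale"),
#                     tuneGrid = data.frame(k = c(1, 5, 9)))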
# Create a confusion table showing where each respondent was placed and whether the placement was correct
confusion_tableknn <- table(testData$DV, y_predknn)
# Display the confusion matrix and its associated statistics
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_predknn
## 0 1
## 0 713 5
## 1 28 1
##
## Accuracy : 0.9558
## 95% CI : (0.9385, 0.9694)
## No Information Rate : 0.992
## P-Value [Acc > NIR] : 1.0000000
##
## Kappa : 0.0444
##
## Mcnemar's Test P-Value : 0.0001283
##
## Sensitivity : 0.96221
## Specificity : 0.16667
## Pos Pred Value : 0.99304
## Neg Pred Value : 0.03448
## Prevalence : 0.99197
## Detection Rate : 0.95448
## Detection Prevalence : 0.96118
## Balanced Accuracy : 0.56444
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|----------------------|-----------:|
| Sensitivity          |     0.9622 |
| Specificity          |     0.1667 |
| Pos Pred Value       |     0.9930 |
| Neg Pred Value       |     0.0345 |
| Precision            |     0.9930 |
| Recall               |     0.9622 |
| F1                   |     0.9774 |
| Prevalence           |     0.9920 |
| Detection Rate       |     0.9545 |
| Detection Prevalence |     0.9612 |
| Balanced Accuracy    |     0.5644 |
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Number_of_Children 100.00000
## Age_Group 52.76462
## Health_Access 45.56273
## Listening_Radio 13.04052
## Residence 10.45667
## Husband_Education_Level 10.14241
## Wealth_Quantile 7.11500
## Husband_Employment_Status 5.65956
## Region 4.63386
## Watching_Television 3.80817
## Women_Employment_Status 0.69541
## Contraceptive_Method 0.01595
## Women_Educationlevel 0.00863
## Conctraceptive_Use 0.00000
# Plot the feature importance (extract the data frame from the varImp object
# so ggplot has explicit columns to map)
imp_df <- importance$importance
imp_df$Feature <- rownames(imp_df)
plot <- ggplot(imp_df, aes(x = reorder(Feature, Overall), y = Overall)) +
geom_bar(stat = "identity", fill = "darkgreen") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData, type = "response")
test_roc5= roc(testData$DV, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area Under the Curve (all models overlaid)
############################################
#windows()
plot(test_roc1, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='gold', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='brown', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='cyan', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
"Logistic-Regression","Support Vector Machine","KNN Model") ,
pch = 15, bty = 'n', col = c("blue","gold", "brown", "green","cyan"))
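# Numeric AUCs for the five overlaid curves (pROC stores the AUC on each roc
# object; output omitted):
sapply(list(RF = test_roc1, Probit = test_roc2, Logit = test_roc3,
SVM = test_roc4, KNN = test_roc5), pROC::auc)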

###################################################
# Metrics
###############################################
# Accuracy, recall, F1-score, and precision values for the five models, taken from the confusion-matrix tables above
Accuracy <- c(0.578, 0.960, 0.561,0.503,0.956)
Recall <- c(0.976, 0.961, 0.976,0.963,0.962)
F1_score <- c(0.724, 0.980, 0.709,0.661,0.977)
Precision<-c(0.575,0.999,0.557,0.503,0.993)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression",
"Support Vector Machine","KNN Model")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "cyan3",
"Recall" = "pink4",
"F1_Score" = "purple4",
"Precision" = "green1")) +
theme_minimal()
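# Caveat on reading the chart: with ~4% positives, accuracy and precision
# reward majority-class guessing (compare Random Forest's 0.96 accuracy with
# its 0.48 balanced accuracy). Balanced accuracy per model, from the
# confusion-matrix tables above, in the same order as model_names:
Balanced_Accuracy <- c(0.518, 0.481, 0.516, 0.501, 0.564)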

#############################################
# Feature Selection
#############################################
#windows()
# Note: par(mfrow = c(3, 2)) affects base graphics only; the ggplot2 charts below still render one at a time
par(mfrow = c(3, 2))
# Logistic Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta4") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "cyan4") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Number_of_Children 100.00000
## Age_Group 52.76462
## Health_Access 45.56273
## Listening_Radio 13.04052
## Residence 10.45667
## Husband_Education_Level 10.14241
## Wealth_Quantile 7.11500
## Husband_Employment_Status 5.65956
## Region 4.63386
## Watching_Television 3.80817
## Women_Employment_Status 0.69541
## Contraceptive_Method 0.01595
## Women_Educationlevel 0.00863
## Conctraceptive_Use 0.00000
#windows()
# Plot the feature importance (as before, extract the data frame from the
# varImp object so ggplot has explicit columns)
imp_df <- importance$importance
imp_df$Feature <- rownames(imp_df)
plot <- ggplot(imp_df, aes(x = reorder(Feature, Overall), y = Overall)) +
geom_bar(stat = "identity", fill = "darkblue") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

########################################
coef(svm)[2:15]
## Age_Group Region Residence
## 3.844677e-05 3.637503e-06 -7.235190e-05
## Women_Educationlevel Listening_Radio Watching_Television
## 7.425885e-05 -7.779376e-05 -3.979415e-05
## Wealth_Quantile Contraceptive_Method Conctraceptive_Use
## -9.079849e-05 1.878193e-05 7.595457e-05
## Health_Access Husband_Education_Level Husband_Employment_Status
## 5.815800e-05 -3.459137e-05 -4.565393e-05
## Women_Employment_Status Number_of_Children
## -1.582482e-04 6.895394e-05
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:15])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkgreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
