Amoud University

Leila Project

# 5 Supervised Machine Learning Models
# Unintended Childbirth in Somaliland
library(haven)
final<-read_dta("~/Leyla.dta")
str(final)

## tibble [3,737 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Age_Group                : dbl+lbl [1:3737] 2, 3, 3, 3, 5, 5, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 1, 5,...
##    ..@ label       : chr "Age in 5-year groups"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:7] 1 2 3 4 5 6 7
##    .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
##  $ Region                   : dbl+lbl [1:3737] 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
##    ..@ label       : chr "Region"
##    ..@ format.stata: chr "%2.0f"
##    ..@ labels      : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
##    .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
##  $ Residence                : dbl+lbl [1:3737] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...
##    ..@ label       : chr "Type of place of residence"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:6] 1 2 3 4 5 6
##    .. ..- attr(*, "names")= chr [1:6] "Rural" "Urban" "Nomadic" "Rural IDP" ...
##  $ Women_Educationlevel     : dbl+lbl [1:3737] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0,...
##    ..@ label       : chr "Highest educational level"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:4] 0 1 2 3
##    .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
##  $ Listening_Radio          : dbl+lbl [1:3737] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3,...
##    ..@ label       : chr "Frequency of listening to radio"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:3] 1 2 3
##    .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
##  $ Watching_Television      : dbl+lbl [1:3737] 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 3,...
##    ..@ label       : chr "Frequency of watching television"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:3] 1 2 3
##    .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
##  $ Wealth_Quantile          : dbl+lbl [1:3737] 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, 3, 3, 3, 3, 2,...
##    ..@ label       : chr "Wealth index combined"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:5] 1 2 3 4 5
##    .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
##  $ Contraceptive_Method     : dbl+lbl [1:3737] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
##    ..@ label       : chr "Current contraceptive method"
##    ..@ format.stata: chr "%2.0f"
##    ..@ labels      : Named num [1:14] 0 1 2 3 4 5 6 7 8 9 ...
##    .. ..- attr(*, "names")= chr [1:14] "Not currently using(pregnant)" "IUD" "Injectables" "Implants" ...
##  $ Conctraceptive_Use       : dbl+lbl [1:3737] 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,...
##    ..@ label       : chr "Contraceptive use and intention"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:5] 1 2 3 4 5
##    .. ..- attr(*, "names")= chr [1:5] "Using mordern method" "Using traditional method" "Non-User, intends to use later" "Does not intend to use" ...
##  $ Health_Access            : dbl+lbl [1:3737] 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1,...
##    ..@ label       : chr "Getting medical help for self: distance to health facility"
##    ..@ format.stata: chr "%10.0f"
##    ..@ labels      : Named num [1:2] 1 2
##    .. ..- attr(*, "names")= chr [1:2] "No problem" "Problem"
##  $ Husband_Education_Level  : dbl+lbl [1:3737] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1,...
##    ..@ label       : chr "Husband/partner's  ever attended school"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:3] 1 2 8
##    .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
##  $ Husband_Employment_Status: dbl+lbl [1:3737] 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,...
##    ..@ label       : chr "Husband/partner worked in last 12 months"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:3] 1 2 8
##    .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
##  $ Women_Employment_Status  : dbl+lbl [1:3737] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,...
##    ..@ label       : chr "Respondent worked in last 12 months"
##    ..@ format.stata: chr "%1.0f"
##    ..@ labels      : Named num [1:2] 1 2
##    .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
##  $ DV                       : dbl+lbl [1:3737] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
##    ..@ label       : chr "Wanted pregnancy when became pregnant"
##    ..@ format.stata: chr "%10.0f"
##    ..@ labels      : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "Wanted" "Unwanted"
##  $ Number_of_Children       : dbl+lbl [1:3737] 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,...
##    ..@ label       : chr "Total Children Ever Born"
##    ..@ format.stata: chr "%20.0g"
##    ..@ labels      : Named num [1:2] 0 1
##    .. ..- attr(*, "names")= chr [1:2] "Less than 5 Children" "5 Children+"

write.csv(final,"~/Leyla.csv")
final<-read.csv("~/Leyla.csv")
final<-final[,-1]
# Logistic Regression
logistic<-glm(DV~.,data=final,family = binomial())
summary(logistic)

## 
## Call:
## glm(formula = DV ~ ., family = binomial(), data = final)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -4.15319    2.09284  -1.984 0.047203 *  
## Age_Group                  0.10117    0.07391   1.369 0.171046    
## Region                    -0.10721    0.06149  -1.744 0.081216 .  
## Residence                  0.17885    0.11141   1.605 0.108436    
## Women_Educationlevel       0.03472    0.17782   0.195 0.845183    
## Listening_Radio            0.20495    0.16820   1.218 0.223051    
## Watching_Television        0.12066    0.14380   0.839 0.401448    
## Wealth_Quantile            0.15088    0.07334   2.057 0.039646 *  
## Contraceptive_Method      -0.12628    0.17266  -0.731 0.464562    
## Conctraceptive_Use        -0.24896    0.24527  -1.015 0.310078    
## Health_Access              0.57360    0.17474   3.283 0.001029 ** 
## Husband_Education_Level   -0.39456    0.24500  -1.610 0.107301    
## Husband_Employment_Status  0.46855    0.18699   2.506 0.012222 *  
## Women_Employment_Status   -0.09938    0.73760  -0.135 0.892825    
## Number_of_Children         0.70398    0.21334   3.300 0.000968 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1290.1  on 3736  degrees of freedom
## Residual deviance: 1236.4  on 3722  degrees of freedom
## AIC: 1266.4
## 
## Number of Fisher Scoring iterations: 6

round(coef(summary(logistic)),4)

##                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                -4.1532     2.0928 -1.9845   0.0472
## Age_Group                   0.1012     0.0739  1.3689   0.1710
## Region                     -0.1072     0.0615 -1.7437   0.0812
## Residence                   0.1788     0.1114  1.6053   0.1084
## Women_Educationlevel        0.0347     0.1778  0.1953   0.8452
## Listening_Radio             0.2049     0.1682  1.2185   0.2231
## Watching_Television         0.1207     0.1438  0.8390   0.4014
## Wealth_Quantile             0.1509     0.0733  2.0574   0.0396
## Contraceptive_Method       -0.1263     0.1727 -0.7314   0.4646
## Conctraceptive_Use         -0.2490     0.2453 -1.0151   0.3101
## Health_Access               0.5736     0.1747  3.2825   0.0010
## Husband_Education_Level    -0.3946     0.2450 -1.6104   0.1073
## Husband_Employment_Status   0.4685     0.1870  2.5057   0.0122
## Women_Employment_Status    -0.0994     0.7376 -0.1347   0.8928
## Number_of_Children          0.7040     0.2133  3.2998   0.0010

# Logistic Regression without factors
logistic<-glm(DV~.,data=final,family = binomial())
summary(logistic)

## 
## Call:
## glm(formula = DV ~ ., family = binomial(), data = final)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -4.15319    2.09284  -1.984 0.047203 *  
## Age_Group                  0.10117    0.07391   1.369 0.171046    
## Region                    -0.10721    0.06149  -1.744 0.081216 .  
## Residence                  0.17885    0.11141   1.605 0.108436    
## Women_Educationlevel       0.03472    0.17782   0.195 0.845183    
## Listening_Radio            0.20495    0.16820   1.218 0.223051    
## Watching_Television        0.12066    0.14380   0.839 0.401448    
## Wealth_Quantile            0.15088    0.07334   2.057 0.039646 *  
## Contraceptive_Method      -0.12628    0.17266  -0.731 0.464562    
## Conctraceptive_Use        -0.24896    0.24527  -1.015 0.310078    
## Health_Access              0.57360    0.17474   3.283 0.001029 ** 
## Husband_Education_Level   -0.39456    0.24500  -1.610 0.107301    
## Husband_Employment_Status  0.46855    0.18699   2.506 0.012222 *  
## Women_Employment_Status   -0.09938    0.73760  -0.135 0.892825    
## Number_of_Children         0.70398    0.21334   3.300 0.000968 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1290.1  on 3736  degrees of freedom
## Residual deviance: 1236.4  on 3722  degrees of freedom
## AIC: 1266.4
## 
## Number of Fisher Scoring iterations: 6

# Prediction Models
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Prep Training and Test dataset
# First, create a Train-test split with 0% data included in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$DV, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]

###################################################

# Logistic with only significant variables
###################################################
logitmod2<-glm(DV~ .,
               family="binomial", data=trainData)
summary(logitmod2)

## 
## Call:
## glm(formula = DV ~ ., family = "binomial", data = trainData)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)   
## (Intercept)               -4.77706    2.24337  -2.129  0.03322 * 
## Age_Group                  0.06012    0.08343   0.721  0.47118   
## Region                    -0.07494    0.06828  -1.098  0.27240   
## Residence                  0.15535    0.12236   1.270  0.20424   
## Women_Educationlevel       0.12964    0.18947   0.684  0.49385   
## Listening_Radio            0.31853    0.20631   1.544  0.12262   
## Watching_Television        0.17045    0.16012   1.064  0.28712   
## Wealth_Quantile            0.15235    0.08082   1.885  0.05943 . 
## Contraceptive_Method      -0.06165    0.17495  -0.352  0.72456   
## Conctraceptive_Use        -0.16790    0.27070  -0.620  0.53510   
## Health_Access              0.58578    0.19447   3.012  0.00259 **
## Husband_Education_Level   -0.44541    0.27109  -1.643  0.10037   
## Husband_Employment_Status  0.40141    0.20919   1.919  0.05500 . 
## Women_Employment_Status   -0.21423    0.74201  -0.289  0.77280   
## Number_of_Children         0.70175    0.23753   2.954  0.00313 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1044.6  on 2989  degrees of freedom
## Residual deviance: 1004.7  on 2975  degrees of freedom
## AIC: 1034.7
## 
## Number of Fisher Scoring iterations: 6

# Apply the model to predict the testdata
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.005977 0.023367 0.037275 0.041888 0.053084 0.162106

# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.041,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV

#Result : Prediction Accuracy 
set.seed(123)
mean(y_pred==y_act)

## [1] 0.5783133

# Plot ROC Curve
library(InformationValue)

## 
## Attaching package: 'InformationValue'

## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, precision, sensitivity, specificity

InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)

## [1] 0.6785371

#Creates confusion table displaying where each client was placed and if they were placed in the right group
confusion_table <- table(testData$DV, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result

## Confusion Matrix and Statistics
## 
##    y_pred_num
##       0   1
##   0 413 305
##   1  10  19
##                                         
##                Accuracy : 0.5783        
##                  95% CI : (0.542, 0.614)
##     No Information Rate : 0.5663        
##     P-Value [Acc > NIR] : 0.2655        
##                                         
##                   Kappa : 0.0392        
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.97636       
##             Specificity : 0.05864       
##          Pos Pred Value : 0.57521       
##          Neg Pred Value : 0.65517       
##              Prevalence : 0.56627       
##          Detection Rate : 0.55288       
##    Detection Prevalence : 0.96118       
##       Balanced Accuracy : 0.51750       
##                                         
##        'Positive' Class : 0             
##

metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)

F1-score, Precision and Recall
	metrics
Sensitivity	0.9764
Specificity	0.0586
Pos Pred Value	0.5752
Neg Pred Value	0.6552
Precision	0.5752
Recall	0.9764
F1	0.7239
Prevalence	0.5663
Detection Rate	0.5529
Detection Prevalence	0.9612
Balanced Accuracy	0.5175

#############################################3
# Probit Regression
#############################################

# Probit Regression
probitmod<-glm(DV~ .,
               family=binomial(link="probit"), data=trainData)
summary(probitmod)

## 
## Call:
## glm(formula = DV ~ ., family = binomial(link = "probit"), data = trainData)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)   
## (Intercept)               -2.42650    1.01984  -2.379  0.01735 * 
## Age_Group                  0.03448    0.03842   0.897  0.36946   
## Region                    -0.03284    0.03126  -1.050  0.29357   
## Residence                  0.06907    0.05576   1.239  0.21542   
## Women_Educationlevel       0.06571    0.08564   0.767  0.44292   
## Listening_Radio            0.13697    0.08854   1.547  0.12185   
## Watching_Television        0.07954    0.07237   1.099  0.27170   
## Wealth_Quantile            0.07072    0.03736   1.893  0.05835 . 
## Contraceptive_Method      -0.03050    0.07975  -0.382  0.70214   
## Conctraceptive_Use        -0.07985    0.12303  -0.649  0.51632   
## Health_Access              0.27795    0.08999   3.089  0.00201 **
## Husband_Education_Level   -0.20451    0.12192  -1.677  0.09347 . 
## Husband_Employment_Status  0.18748    0.09654   1.942  0.05215 . 
## Women_Employment_Status   -0.13447    0.34102  -0.394  0.69335   
## Number_of_Children         0.31136    0.10619   2.932  0.00337 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1044.6  on 2989  degrees of freedom
## Residual deviance: 1004.0  on 2975  degrees of freedom
## AIC: 1034
## 
## Number of Fisher Scoring iterations: 6

# Apply the model to predict the testdata
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00453 0.02295 0.03757 0.04191 0.05449 0.15276

# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred<0.04,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV

#Result : Prediction Accuracy 
set.seed(123)
mean(y_pred==y_act)

## [1] 0.5609103

# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)

## [1] 0.6685957

#Creates confusion table displaying where each client was placed and if they were placed in the right group
confusion_table <- table(testData$DV, y_pred_num)
#Displays confusion matrix and the statistics associated with the confusion matrix
result<-caret::confusionMatrix(confusion_table)
result

## Confusion Matrix and Statistics
## 
##    y_pred_num
##       0   1
##   0 400 318
##   1  10  19
##                                           
##                Accuracy : 0.5609          
##                  95% CI : (0.5245, 0.5969)
##     No Information Rate : 0.5489          
##     P-Value [Acc > NIR] : 0.2662          
##                                           
##                   Kappa : 0.0348          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.97561         
##             Specificity : 0.05638         
##          Pos Pred Value : 0.55710         
##          Neg Pred Value : 0.65517         
##              Prevalence : 0.54886         
##          Detection Rate : 0.53548         
##    Detection Prevalence : 0.96118         
##       Balanced Accuracy : 0.51599         
##                                           
##        'Positive' Class : 0               
##

metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)

F1-score, Precision and Recall
	metrics
Sensitivity	0.9756
Specificity	0.0564
Pos Pred Value	0.5571
Neg Pred Value	0.6552
Precision	0.5571
Recall	0.9756
F1	0.7092
Prevalence	0.5489
Detection Rate	0.5355
Detection Prevalence	0.9612
Balanced Accuracy	0.5160

# Probit Regression

# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])

# Create feature importance plot
feature_importance_df2<-data.frame(
  Feature = names(feature_importance2),
  Importance = feature_importance2
)

feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "darkred") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Probit Regression - Feature Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

################################################3
# Random Forest 
################################################

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

set.seed(123)

#Random Forest for variables. mtry = 5 since there are 24 variables (square root of 24 is close to 5).
fit_rf <- randomForest(factor(DV) ~., data = trainData)
fit_rf

## 
## Call:
##  randomForest(formula = factor(DV) ~ ., data = trainData) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 4.21%
## Confusion matrix:
##      0 1 class.error
## 0 2864 0           0
## 1  126 0           1

varImpPlot(fit_rf)

#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)

## predrf
##   0   1 
## 746   1

summary(predrf)

##   0   1 
## 746   1

# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$DV

#Result : Prediction Accuracy 
mean(y_predrf==y_actrf)

## [1] 0.9598394

#Creates confusion table displaying where each client was placed and if they were placed in the right group
confusion_tablerf<- table(testData$DV, y_pred_numrf)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf

## Confusion Matrix and Statistics
## 
##    y_pred_numrf
##       0   1
##   0 717   1
##   1  29   0
##                                           
##                Accuracy : 0.9598          
##                  95% CI : (0.9432, 0.9727)
##     No Information Rate : 0.9987          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0026         
##                                           
##  Mcnemar's Test P-Value : 8.244e-07       
##                                           
##             Sensitivity : 0.9611          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9986          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9987          
##          Detection Rate : 0.9598          
##    Detection Prevalence : 0.9612          
##       Balanced Accuracy : 0.4806          
##                                           
##        'Positive' Class : 0               
##

metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)

F1-score, Precision and Recall
	metricsrf
Sensitivity	0.9611
Specificity	0.0000
Pos Pred Value	0.9986
Neg Pred Value	0.0000
Precision	0.9986
Recall	0.9611
F1	0.9795
Prevalence	0.9987
Detection Rate	0.9598
Detection Prevalence	0.9612
Balanced Accuracy	0.4806

# Random Forest 

# Get feature importance
feature_importance <- importance(fit_rf)

# Convert feature importance to data frame
feature_importance_df <- data.frame(
  Feature = row.names(feature_importance),
  Importance = feature_importance[, "MeanDecreaseGini"]
)

# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Random Forest - Feature Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#ROC
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$DV, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$DV~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$DV~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
svm<-svm(DV~., data = trainData, kernel='linear', cost=10, scale=FALSE)

# Plot the SVC obtained
#windows()
plot(svm, final)
summary(svm)

## 
## Call:
## svm(formula = DV ~ ., data = trainData, kernel = "linear", cost = 10, 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  10 
##       gamma:  0.07142857 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  533

#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.09953 0.09986 0.10000 0.09999 0.10012 0.10044

# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM<0.1,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$DV

#Result : Prediction Accuracy 
mean(y_predSVM==y_actSVM)

## [1] 0.5033467

#Creates confusion table displaying where each client was placed and if they were placed in the right group
confusion_tableSVM<- table(testData$DV, y_predSVM)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM

## Confusion Matrix and Statistics
## 
##    y_predSVM
##       0   1
##   0 361 357
##   1  14  15
##                                           
##                Accuracy : 0.5033          
##                  95% CI : (0.4669, 0.5398)
##     No Information Rate : 0.502           
##     P-Value [Acc > NIR] : 0.4854          
##                                           
##                   Kappa : 0.003           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.96267         
##             Specificity : 0.04032         
##          Pos Pred Value : 0.50279         
##          Neg Pred Value : 0.51724         
##              Prevalence : 0.50201         
##          Detection Rate : 0.48327         
##    Detection Prevalence : 0.96118         
##       Balanced Accuracy : 0.50149         
##                                           
##        'Positive' Class : 0               
##

metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)

F1-score, Precision and Recall
	metricsSVM
Sensitivity	0.9627
Specificity	0.0403
Pos Pred Value	0.5028
Neg Pred Value	0.5172
Precision	0.5028
Recall	0.9627
F1	0.6606
Prevalence	0.5020
Detection Rate	0.4833
Detection Prevalence	0.9612
Balanced Accuracy	0.5015

#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$DV, main="Support Vector Machine", test_prob4,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

coef(svm)[2:15]

##                 Age_Group                    Region                 Residence 
##              3.844677e-05              3.637503e-06             -7.235190e-05 
##      Women_Educationlevel           Listening_Radio       Watching_Television 
##              7.425885e-05             -7.779376e-05             -3.979415e-05 
##           Wealth_Quantile      Contraceptive_Method        Conctraceptive_Use 
##             -9.079849e-05              1.878193e-05              7.595457e-05 
##             Health_Access   Husband_Education_Level Husband_Employment_Status 
##              5.815800e-05             -3.459137e-05             -4.565393e-05 
##   Women_Employment_Status        Number_of_Children 
##             -1.582482e-04              6.895394e-05

# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:15])

# Create feature importance plot
feature_importance_df <- data.frame(
  Feature = names(feature_importance1),
  Importance = feature_importance1
)

feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "magenta") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Support Vector Machine - Feature Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#################################################



############################################
# KNN Model
library(knn)

# Fitting model
fit <-knnreg(trainData$DV ~ ., data = trainData,k=1)
summary(fit)

##         Length Class  Mode   
## learn   2      -none- list   
## k       1      -none- numeric
## terms   3      terms  call   
## xlevels 0      -none- list   
## theDots 0      -none- list

#Predict Output 
predicted= predict(fit,testData)
summary(predicted)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.03529 0.00000 1.00000

# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$DV

#Result : Prediction Accuracy 
mean(y_predknn==y_actknn)

## [1] 0.9558233

#Creates confusion table displaying where each client was placed and if they were placed in the right group
confusion_tableknn<- table(testData$DV, y_predknn)
#Displays confusion matrix and the statistics associated with the confusion matrix
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn

## Confusion Matrix and Statistics
## 
##    y_predknn
##       0   1
##   0 713   5
##   1  28   1
##                                           
##                Accuracy : 0.9558          
##                  95% CI : (0.9385, 0.9694)
##     No Information Rate : 0.992           
##     P-Value [Acc > NIR] : 1.0000000       
##                                           
##                   Kappa : 0.0444          
##                                           
##  Mcnemar's Test P-Value : 0.0001283       
##                                           
##             Sensitivity : 0.96221         
##             Specificity : 0.16667         
##          Pos Pred Value : 0.99304         
##          Neg Pred Value : 0.03448         
##              Prevalence : 0.99197         
##          Detection Rate : 0.95448         
##    Detection Prevalence : 0.96118         
##       Balanced Accuracy : 0.56444         
##                                           
##        'Positive' Class : 0               
##

metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsrf)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)

F1-score, Precision and Recall
	resultknn$byClass
Sensitivity	0.9622
Specificity	0.1667
Pos Pred Value	0.9930
Neg Pred Value	0.0345
Precision	0.9930
Recall	0.9622
F1	0.9774
Prevalence	0.9920
Detection Rate	0.9545
Detection Prevalence	0.9612
Balanced Accuracy	0.5644

knnmodel <- train(DV ~ ., data = trainData, method = "knn")

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.

# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)

## loess r-squared variable importance
## 
##                             Overall
## Number_of_Children        100.00000
## Age_Group                  52.76462
## Health_Access              45.56273
## Listening_Radio            13.04052
## Residence                  10.45667
## Husband_Education_Level    10.14241
## Wealth_Quantile             7.11500
## Husband_Employment_Status   5.65956
## Region                      4.63386
## Watching_Television         3.80817
## Women_Employment_Status     0.69541
## Contraceptive_Method        0.01595
## Women_Educationlevel        0.00863
## Conctraceptive_Use          0.00000

# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  labs(x = "Features", y = "Importance") +
  ggtitle("KNN Features Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData, type = "response")
test_roc5= roc(testData$DV, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)

## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area_Under Curve
############################################
#Area-under Curve
#windows()
plot(test_roc1,  print.auc = TRUE,  col='blue',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='gold', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='brown', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='cyan', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
                             "Logistic-Regression","Support Vector Machine","KNN Model") ,
       pch = 15, bty = 'n', col = c("blue","gold", "brown", "green","cyan"))

###################################################
# Metrics
###############################################
# Sample accuracy, sensitivity, F1 Score, and precision values for three models
Accuracy <- c(0.578, 0.960, 0.561,0.503,0.956)
Recall <- c(0.976, 0.961, 0.976,0.963,0.962)
F1_score <- c(0.724, 0.980, 0.709,0.661,0.977)
Precision<-c(0.575,0.999,0.557,0.503,0.993)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression",
                 "Support Vector Machine","KNN Model")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)


# Loading the ggplot2 package
library(ggplot2)

# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparison of Supervised Machine Learning Models",
       x = "Model",
       y = "Value") +
  scale_fill_manual(values = c("Accuracy" = "cyan3",
                               "Recall" = "pink4",
                               "F1_Score" = "purple4",
                               "Precision" = "green1")) +
  theme_minimal()

#############################################
# Feature Selection
#############################################
#windows()
par(mfrow=(c(3,2)))


# Logistic Regression

# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])

# Create feature importance plot
feature_importance_df2<-data.frame(
  Feature = names(feature_importance2),
  Importance = feature_importance2
)

feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "magenta4") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Logistic Regression - Feature Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################3
# Probit Regression

# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])

# Create feature importance plot
feature_importance_df2<-data.frame(
  Feature = names(feature_importance2),
  Importance = feature_importance2
)

feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "cyan4") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Probit Regression - Feature Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest 

# Get feature importance
feature_importance <- importance(fit_rf)

# Convert feature importance to data frame
feature_importance_df <- data.frame(
  Feature = row.names(feature_importance),
  Importance = feature_importance[, "MeanDecreaseGini"]
)

# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "darkred") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Random Forest - Feature Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################

knnmodel <- train(DV ~ ., data = trainData, method = "knn")

## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.

# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)

## loess r-squared variable importance
## 
##                             Overall
## Number_of_Children        100.00000
## Age_Group                  52.76462
## Health_Access              45.56273
## Listening_Radio            13.04052
## Residence                  10.45667
## Husband_Education_Level    10.14241
## Wealth_Quantile             7.11500
## Husband_Employment_Status   5.65956
## Region                      4.63386
## Watching_Television         3.80817
## Women_Employment_Status     0.69541
## Contraceptive_Method        0.01595
## Women_Educationlevel        0.00863
## Conctraceptive_Use          0.00000

#windows()
# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  labs(x = "Features", y = "Importance") +
  ggtitle("KNN Features Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Print the plot
print(plot)

########################################
coef(svm)[2:15]

##                 Age_Group                    Region                 Residence 
##              3.844677e-05              3.637503e-06             -7.235190e-05 
##      Women_Educationlevel           Listening_Radio       Watching_Television 
##              7.425885e-05             -7.779376e-05             -3.979415e-05 
##           Wealth_Quantile      Contraceptive_Method        Conctraceptive_Use 
##             -9.079849e-05              1.878193e-05              7.595457e-05 
##             Health_Access   Husband_Education_Level Husband_Employment_Status 
##              5.815800e-05             -3.459137e-05             -4.565393e-05 
##   Women_Employment_Status        Number_of_Children 
##             -1.582482e-04              6.895394e-05

# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[2:15])

# Create feature importance plot
feature_importance_df <- data.frame(
  Feature = names(feature_importance1),
  Importance = feature_importance1
)

feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "darkgreen") +
  xlab("Feature") +
  ylab("Importance") +
  ggtitle("Support Vector Machine - Feature Importance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Leyla Project

Leila M. Farah, and Abdisalam Hassan Muse (Ph.D.)

2024-05-22

Leila Project