Dr Genanka Project
# Five Supervised Machine Learning Models
# Contraceptive Use in Somalia
library(haven)
final<-read_dta("~/Genanka.dta")
str(final)
## tibble [22,776 × 17] (S3: tbl_df/tbl/data.frame)
## $ Age : dbl+lbl [1:22776] 2, 2, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 3, 3, 3...
## ..@ label : chr "Age in 5-year groups"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
## $ Region : dbl+lbl [1:22776] 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, ...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
## .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
## $ Residence : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "Urban" "Rural" "Nomadic"
## $ Women_Educationlevel : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ Household_size : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0...
## ..@ label : chr "Number of household members (listed)"
## ..@ format.stata: chr "%21.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 4 Members" " Four or More Members"
## $ Listening_Radio : dbl+lbl [1:22776] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## ..@ label : chr "Frequency of listening to radio"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Watching_TV : dbl+lbl [1:22776] 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 3, 3, 3...
## ..@ label : chr "Frequency of watching television"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Internet_Usage : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Ever used internet"
## ..@ format.stata: chr "%3.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Wealth_Quantile : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ Number_of_Children : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Total children ever born"
## ..@ format.stata: chr "%22.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 4 Children" " Four or More Children"
## $ DV : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Contraceptive use and intention"
## ..@ format.stata: chr "%9.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Non-Users" "Users"
## $ Health_Access : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Getting medical help for self: getting permission to go"
## ..@ format.stata: chr "%10.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Problem" "No problem"
## $ Distance_of_Healthcentre : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Getting medical help for self: distance to health facility"
## ..@ format.stata: chr "%10.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Problem" "No problem"
## $ Husband_Education : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Husband/partner's ever attended school"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Husband_Occupation_status: dbl+lbl [1:22776] 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1...
## ..@ label : chr "Husband/partner worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Women_Occupation_status : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Respondent worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ sanitationfacility : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ format.stata: chr "%10.0g"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Unimproved" "Improved"
# Round-trip through CSV to strip the Stata value labels, leaving plain
# numeric codes for modelling
write.csv(final,"~/Genanka.csv")
final<-read.csv("~/Genanka.csv")
final<-final[,-1] # drop the row-index column added by write.csv
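# Equivalent alternative (a sketch): haven can strip the Stata value
# labels in memory, avoiding the CSV round-trip above
final <- as.data.frame(haven::zap_labels(final))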
# Logistic Regression
logistic<-glm(DV~.,data=final,family = binomial())
summary(logistic)
##
## Call:
## glm(formula = DV ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.88055 2.16253 -2.719 0.006542 **
## Age -0.20089 0.04464 -4.500 6.80e-06 ***
## Region -0.12401 0.01351 -9.178 < 2e-16 ***
## Residence -0.14846 0.10920 -1.359 0.174005
## Women_Educationlevel 0.29935 0.08832 3.389 0.000701 ***
## Household_size 0.96174 0.17945 5.359 8.35e-08 ***
## Listening_Radio 0.05679 0.09060 0.627 0.530764
## Watching_TV -0.32496 0.07257 -4.478 7.54e-06 ***
## Internet_Usage -0.08734 0.16394 -0.533 0.594195
## Wealth_Quantile 0.33654 0.06195 5.433 5.54e-08 ***
## Number_of_Children 0.52053 0.15604 3.336 0.000850 ***
## Health_Access -0.03429 0.13805 -0.248 0.803835
## Distance_of_Healthcentre 0.45850 0.13504 3.395 0.000686 ***
## Husband_Education 0.39290 0.12493 3.145 0.001661 **
## Husband_Occupation_status -0.37756 0.12963 -2.913 0.003585 **
## Women_Occupation_status 1.47997 1.00790 1.468 0.142004
## sanitationfacility -0.42958 0.11947 -3.596 0.000324 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3725.1 on 22775 degrees of freedom
## Residual deviance: 3159.4 on 22759 degrees of freedom
## AIC: 3193.4
##
## Number of Fisher Scoring iterations: 8
round(coef(summary(logistic)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.8805 2.1625 -2.7193 0.0065
## Age -0.2009 0.0446 -4.4999 0.0000
## Region -0.1240 0.0135 -9.1783 0.0000
## Residence -0.1485 0.1092 -1.3594 0.1740
## Women_Educationlevel 0.2994 0.0883 3.3894 0.0007
## Household_size 0.9617 0.1794 5.3595 0.0000
## Listening_Radio 0.0568 0.0906 0.6268 0.5308
## Watching_TV -0.3250 0.0726 -4.4779 0.0000
## Internet_Usage -0.0873 0.1639 -0.5328 0.5942
## Wealth_Quantile 0.3365 0.0619 5.4329 0.0000
## Number_of_Children 0.5205 0.1560 3.3359 0.0009
## Health_Access -0.0343 0.1380 -0.2484 0.8038
## Distance_of_Healthcentre 0.4585 0.1350 3.3952 0.0007
## Husband_Education 0.3929 0.1249 3.1449 0.0017
## Husband_Occupation_status -0.3776 0.1296 -2.9126 0.0036
## Women_Occupation_status 1.4800 1.0079 1.4684 0.1420
## sanitationfacility -0.4296 0.1195 -3.5957 0.0003
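# Odds ratios (a sketch): exponentiating the coefficients gives effect
# sizes on the odds scale; confint() profiles the likelihood and can be
# slow on 22,776 rows
round(exp(cbind(OR = coef(logistic), confint(logistic))), 4)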
# Prediction Models
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Prepare training and test datasets
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$DV, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
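# Sanity check (a sketch): createDataPartition() samples within the
# outcome's strata, so the heavy class imbalance in DV should be
# preserved in both halves of the split
prop.table(table(trainData$DV))
prop.table(table(testData$DV))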
###################################################
# Logistic regression on the training data (all predictors)
###################################################
logitmod2<-glm(DV~ ., family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = DV ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.18579 2.19610 -2.361 0.018208 *
## Age -0.20427 0.04949 -4.128 3.67e-05 ***
## Region -0.13028 0.01498 -8.700 < 2e-16 ***
## Residence -0.19526 0.12266 -1.592 0.111426
## Women_Educationlevel 0.31178 0.09760 3.194 0.001401 **
## Household_size 0.94680 0.19716 4.802 1.57e-06 ***
## Listening_Radio 0.08126 0.10182 0.798 0.424835
## Watching_TV -0.28355 0.08038 -3.528 0.000419 ***
## Internet_Usage -0.19774 0.17999 -1.099 0.271922
## Wealth_Quantile 0.33404 0.06900 4.841 1.29e-06 ***
## Number_of_Children 0.65709 0.17725 3.707 0.000210 ***
## Health_Access -0.03907 0.15275 -0.256 0.798098
## Distance_of_Healthcentre 0.43994 0.14972 2.938 0.003299 **
## Husband_Education 0.38294 0.13856 2.764 0.005714 **
## Husband_Occupation_status -0.35366 0.14272 -2.478 0.013214 *
## Women_Occupation_status 1.21883 1.00999 1.207 0.227521
## sanitationfacility -0.48619 0.13293 -3.658 0.000255 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3018.0 on 18220 degrees of freedom
## Residual deviance: 2552.4 on 18204 degrees of freedom
## AIC: 2586.4
##
## Number of Fisher Scoring iterations: 8
# Apply the model to predict the test data
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0002909 0.0032716 0.0068923 0.0159015 0.0171588 0.3909683
# Measure the accuracy of prediction in the test data (the 0.015 cutoff
# sits near the mean predicted probability, given the heavy class imbalance)
y_pred_num<-ifelse(pred<0.015,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
# Result: prediction accuracy
mean(y_pred==y_act)
## [1] 0.7306257
# Plot ROC Curve
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.7427618
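# Rather than hand-picking the 0.015 cutoff above, InformationValue can
# search for the probability cutoff that maximises sensitivity plus
# specificity (a sketch):
InformationValue::optimalCutoff(y_act, pred, optimiseFor = "Both")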
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_table <- table(testData$DV, y_pred_num)
# Confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 3277 1210
## 1 17 51
##
## Accuracy : 0.7306
## 95% CI : (0.7175, 0.7435)
## No Information Rate : 0.7232
## P-Value [Acc > NIR] : 0.1335
##
## Kappa : 0.0498
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99484
## Specificity : 0.04044
## Pos Pred Value : 0.73033
## Neg Pred Value : 0.75000
## Prevalence : 0.72316
## Detection Rate : 0.71943
## Detection Prevalence : 0.98507
## Balanced Accuracy : 0.51764
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.9948 |
| Specificity          |  0.0404 |
| Pos Pred Value       |  0.7303 |
| Neg Pred Value       |  0.7500 |
| Precision            |  0.7303 |
| Recall               |  0.9948 |
| F1                   |  0.8423 |
| Prevalence           |  0.7232 |
| Detection Rate       |  0.7194 |
| Detection Prevalence |  0.9851 |
| Balanced Accuracy    |  0.5176 |
#############################################
# Probit Regression
#############################################
probitmod<-glm(DV~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = DV ~ ., family = binomial(link = "probit"), data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.278144 0.804149 -2.833 0.004612 **
## Age -0.088674 0.020791 -4.265 2.00e-05 ***
## Region -0.058380 0.006403 -9.118 < 2e-16 ***
## Residence -0.069512 0.049913 -1.393 0.163720
## Women_Educationlevel 0.140517 0.045014 3.122 0.001798 **
## Household_size 0.392549 0.078507 5.000 5.73e-07 ***
## Listening_Radio 0.028669 0.044614 0.643 0.520479
## Watching_TV -0.131604 0.036878 -3.569 0.000359 ***
## Internet_Usage -0.098466 0.081866 -1.203 0.229067
## Wealth_Quantile 0.135231 0.028024 4.826 1.40e-06 ***
## Number_of_Children 0.269605 0.076050 3.545 0.000392 ***
## Health_Access -0.041160 0.063997 -0.643 0.520125
## Distance_of_Healthcentre 0.184673 0.063452 2.910 0.003609 **
## Husband_Education 0.186205 0.061025 3.051 0.002279 **
## Husband_Occupation_status -0.150504 0.059085 -2.547 0.010858 *
## Women_Occupation_status 0.452069 0.359175 1.259 0.208164
## sanitationfacility -0.205046 0.056846 -3.607 0.000310 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3018.0 on 18220 degrees of freedom
## Residual deviance: 2546.2 on 18204 degrees of freedom
## AIC: 2580.2
##
## Number of Fisher Scoring iterations: 8
# Apply the model to predict the test data
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.634e-05 2.627e-03 6.601e-03 1.589e-02 1.778e-02 3.000e-01
# Measure the accuracy of prediction in the test data (same 0.015 cutoff as above)
y_pred_num<-ifelse(pred<0.015,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
# Result: prediction accuracy
mean(y_pred==y_act)
## [1] 0.7220637
# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.7456607
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_table <- table(testData$DV, y_pred_num)
# Confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 3238 1249
## 1 17 51
##
## Accuracy : 0.7221
## 95% CI : (0.7088, 0.735)
## No Information Rate : 0.7146
## P-Value [Acc > NIR] : 0.1357
##
## Kappa : 0.0475
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99478
## Specificity : 0.03923
## Pos Pred Value : 0.72164
## Neg Pred Value : 0.75000
## Prevalence : 0.71460
## Detection Rate : 0.71087
## Detection Prevalence : 0.98507
## Balanced Accuracy : 0.51700
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.9948 |
| Specificity          |  0.0392 |
| Pos Pred Value       |  0.7216 |
| Neg Pred Value       |  0.7500 |
| Precision            |  0.7216 |
| Recall               |  0.9948 |
| F1                   |  0.8365 |
| Prevalence           |  0.7146 |
| Detection Rate       |  0.7109 |
| Detection Prevalence |  0.9851 |
| Balanced Accuracy    |  0.5170 |
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

################################################
# Random Forest
################################################
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
# Random Forest with the default mtry = 4 (floor of the square root of
# the 16 predictors, as the output below confirms)
fit_rf <- randomForest(factor(DV) ~., data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(DV) ~ ., data = trainData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 0.68%
## Confusion matrix:
## 0 1 class.error
## 0 17920 6 0.0003347094
## 1 117 178 0.3966101695
varImpPlot(fit_rf)

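# Tuning sketch: tuneRF() searches over mtry for a lower OOB error
# (the settings here are illustrative, not from the original run)
set.seed(123)
tuneRF(trainData[, setdiff(names(trainData), "DV")], factor(trainData$DV),
       ntreeTry = 200, stepFactor = 1.5, improve = 0.01)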
#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 4510 45
summary(predrf)
## 0 1
## 4510 45
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$DV
# Result: prediction accuracy
mean(y_predrf==y_actrf)
## [1] 0.9949506
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_tablerf<- table(testData$DV, y_pred_numrf)
# Confusion matrix and its associated statistics
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 4487 0
## 1 23 45
##
## Accuracy : 0.995
## 95% CI : (0.9924, 0.9968)
## No Information Rate : 0.9901
## P-Value [Acc > NIR] : 0.0002174
##
## Kappa : 0.794
##
## Mcnemar's Test P-Value : 4.49e-06
##
## Sensitivity : 0.9949
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.6618
## Prevalence : 0.9901
## Detection Rate : 0.9851
## Detection Prevalence : 0.9851
## Balanced Accuracy : 0.9975
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|----------------------|-----------|
| Sensitivity          |    0.9949 |
| Specificity          |    1.0000 |
| Pos Pred Value       |    1.0000 |
| Neg Pred Value       |    0.6618 |
| Precision            |    1.0000 |
| Recall               |    0.9949 |
| F1                   |    0.9974 |
| Prevalence           |    0.9901 |
| Detection Rate       |    0.9851 |
| Detection Prevalence |    0.9851 |
| Balanced Accuracy    |    0.9975 |
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$DV, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$DV~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$DV~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
# Note: with a numeric 0/1 outcome, svm() fits eps-regression rather
# than a classifier (see "SVM-Type" in the summary below)
svm<-svm(DV~., data = trainData, kernel='linear', cost=10, scale=FALSE)
# Plot the fitted model
#windows()
plot(svm, final)
summary(svm)
##
## Call:
## svm(formula = DV ~ ., data = trainData, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 10
## gamma: 0.0625
## epsilon: 0.1
##
##
## Number of Support Vectors: 1894
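# A sketch of the classification variant: coercing DV to a factor makes
# svm() fit C-classification instead (svm_clf is an illustrative name)
svm_clf <- svm(factor(DV) ~ ., data = trainData, kernel = "linear",
               cost = 10, probability = TRUE)
summary(svm_clf)  # should report "SVM-Type: C-classification"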
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09958 0.09987 0.09997 0.09997 0.10007 0.10042
# Measure the accuracy of prediction in the test data (the regression
# predictions cluster tightly around 0.1, which serves as the cutoff)
y_pred_numSVM<-ifelse(predictionSVM<0.1,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$DV
# Result: prediction accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.5815587
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_tableSVM<- table(testData$DV, y_predSVM)
# Confusion matrix and its associated statistics
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 2609 1878
## 1 28 40
##
## Accuracy : 0.5816
## 95% CI : (0.5671, 0.5959)
## No Information Rate : 0.5789
## P-Value [Acc > NIR] : 0.3653
##
## Kappa : 0.0118
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98938
## Specificity : 0.02086
## Pos Pred Value : 0.58146
## Neg Pred Value : 0.58824
## Prevalence : 0.57892
## Detection Rate : 0.57278
## Detection Prevalence : 0.98507
## Balanced Accuracy : 0.50512
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|----------------------|------------|
| Sensitivity          |     0.9894 |
| Specificity          |     0.0209 |
| Pos Pred Value       |     0.5815 |
| Neg Pred Value       |     0.5882 |
| Precision            |     0.5815 |
| Recall               |     0.9894 |
| F1                   |     0.7325 |
| Prevalence           |     0.5789 |
| Detection Rate       |     0.5728 |
| Detection Prevalence |     0.9851 |
| Balanced Accuracy    |     0.5051 |
#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$DV, test_prob4, plot = TRUE, main="Support Vector Machine", col='darkblue', print.auc = TRUE, percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Note: coef(svm)[2:15] keeps only the first 14 predictor coefficients;
# coef(svm)[-1] would retain all 16
coef(svm)[2:15]
## Age Region Residence
## 3.562587e-05 -1.566845e-05 5.605488e-05
## Women_Educationlevel Household_size Listening_Radio
## -2.563011e-05 -7.175976e-05 -3.870384e-05
## Watching_TV Internet_Usage Wealth_Quantile
## 6.940195e-06 -9.375122e-05 1.846147e-05
## Number_of_Children Health_Access Distance_of_Healthcentre
## -8.917798e-06 1.442841e-04 -5.295778e-05
## Husband_Education Husband_Occupation_status
## 4.111935e-05 -5.776952e-05
# Support Vector Machine
# Get feature importance (absolute coefficients; this inherits the
# [2:15] subset above, so two predictors are omitted)
feature_importance1<- abs(coef(svm)[2:15])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

############################################
# KNN Model
############################################
# knnreg() comes from the caret package loaded earlier
# Fit a 1-nearest-neighbour regression; the formula must be DV ~ .
# (writing trainData$DV ~ . would leak the outcome into the predictors)
fit <- knnreg(DV ~ ., data = trainData, k = 1)
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
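# A sketch of choosing k by cross-validation instead of fixing k = 1
# (which memorises the training data); knn_cv is an illustrative name
set.seed(123)
knn_cv <- train(factor(DV) ~ ., data = trainData, method = "knn",
                trControl = trainControl(method = "cv", number = 5),
                tuneGrid = data.frame(k = c(1, 3, 5, 7, 9)))
knn_cv$bestTune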
#Predict Output
predicted= predict(fit,testData)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01516 0.00000 1.00000
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$DV
# Result: prediction accuracy
mean(y_predknn==y_actknn)
## [1] 0.997146
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_tableknn<- table(testData$DV, y_predknn)
# Confusion matrix and its associated statistics
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_predknn
## 0 1
## 0 4485 2
## 1 11 57
##
## Accuracy : 0.9971
## 95% CI : (0.9951, 0.9985)
## No Information Rate : 0.987
## P-Value [Acc > NIR] : 4.042e-13
##
## Kappa : 0.8962
##
## Mcnemar's Test P-Value : 0.0265
##
## Sensitivity : 0.9976
## Specificity : 0.9661
## Pos Pred Value : 0.9996
## Neg Pred Value : 0.8382
## Prevalence : 0.9870
## Detection Rate : 0.9846
## Detection Prevalence : 0.9851
## Balanced Accuracy : 0.9818
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|----------------------|------------|
| Sensitivity          |     0.9976 |
| Specificity          |     0.9661 |
| Pos Pred Value       |     0.9996 |
| Neg Pred Value       |     0.8382 |
| Precision            |     0.9996 |
| Recall               |     0.9976 |
| F1                   |     0.9986 |
| Prevalence           |     0.9870 |
| Detection Rate       |     0.9846 |
| Detection Prevalence |     0.9851 |
| Balanced Accuracy    |     0.9818 |
# Fit KNN through caret to extract variable importance (DV is left
# numeric here, hence the regression warning below)
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Watching_TV 100.0000
## Wealth_Quantile 85.7159
## Women_Educationlevel 71.7999
## Husband_Education 63.3490
## Internet_Usage 51.8555
## Residence 37.6352
## Husband_Occupation_status 34.6371
## Region 18.2522
## Distance_of_Healthcentre 15.9418
## Household_size 10.8305
## Age 7.5547
## Listening_Radio 4.1220
## Health_Access 4.0223
## sanitationfacility 0.7499
## Women_Occupation_status 0.6048
## Number_of_Children 0.0000
# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkgreen") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData, type = "response")
test_roc5= roc(testData$DV, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area Under the Curve (AUC)
####################################################
#windows()
plot(test_roc1, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='gold', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='brown', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='cyan', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
"Logistic-Regression","Support Vector Machine","KNN Model") ,
pch = 15, bty = 'n', col = c("blue","gold", "brown", "green","cyan"))

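# Numeric AUCs for the five curves, pulled straight from the stored roc
# objects (a sketch; values are in percent because percent = TRUE above)
sapply(list(RandomForest = test_roc1, Probit = test_roc2,
            Logistic = test_roc3, SVM = test_roc4, KNN = test_roc5),
       pROC::auc)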
###################################################
# Metrics
###############################################
# Accuracy, recall, F1-score, and precision values for the five models
Accuracy <- c(0.578, 0.960, 0.561,0.503,0.956)
Recall <- c(0.976, 0.961, 0.976,0.963,0.962)
F1_score <- c(0.724, 0.980, 0.709,0.661,0.977)
Precision<-c(0.575,0.999,0.557,0.503,0.993)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression",
"Support Vector Machine","KNN Model")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)
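# A sketch of assembling this table programmatically rather than typing
# the values by hand, from the stored caret::confusionMatrix objects
# (note: `result` was reused for the logistic and probit fits above, so
# each model's matrix needs its own name for all five rows)
collect <- function(cm) c(Accuracy = unname(cm$overall["Accuracy"]),
                          Recall = unname(cm$byClass["Recall"]),
                          F1_Score = unname(cm$byClass["F1"]),
                          Precision = unname(cm$byClass["Precision"]))
rbind(RandomForest = collect(resultrf),
      SVM = collect(resultSVM),
      KNN = collect(resultknn))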
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "cyan3",
"Recall" = "pink4",
"F1_Score" = "purple4",
"Precision" = "green1")) +
theme_minimal()

#############################################
# Feature Selection
#############################################
#windows()
# Note: par(mfrow = ...) affects base graphics only; the ggplot2 panels
# below are drawn one at a time regardless
par(mfrow = c(3, 2))
# Logistic Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta4") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

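# Caveat: raw |coefficient| importance is scale-dependent (a predictor
# coded 1-7 gets a smaller coefficient than a 0/1 one for a comparable
# effect). caret's varImp() ranks glm predictors by the absolute
# z-statistic instead, which is scale-free (a sketch):
caret::varImp(logitmod2)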
###########################################
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "cyan4") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
# Fit KNN through caret to extract variable importance (DV is left
# numeric here, hence the regression warning below)
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Watching_TV 100.0000
## Wealth_Quantile 85.7159
## Women_Educationlevel 71.7999
## Husband_Education 63.3490
## Internet_Usage 51.8555
## Residence 37.6352
## Husband_Occupation_status 34.6371
## Region 18.2522
## Distance_of_Healthcentre 15.9418
## Household_size 10.8305
## Age 7.5547
## Listening_Radio 4.1220
## Health_Access 4.0223
## sanitationfacility 0.7499
## Women_Occupation_status 0.6048
## Number_of_Children 0.0000
#windows()
# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkblue") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

########################################
# Note: [2:15] keeps only the first 14 predictor coefficients (see the
# note in the SVM section above)
coef(svm)[2:15]
## Age Region Residence
## 3.562587e-05 -1.566845e-05 5.605488e-05
## Women_Educationlevel Household_size Listening_Radio
## -2.563011e-05 -7.175976e-05 -3.870384e-05
## Watching_TV Internet_Usage Wealth_Quantile
## 6.940195e-06 -9.375122e-05 1.846147e-05
## Number_of_Children Health_Access Distance_of_Healthcentre
## -8.917798e-06 1.442841e-04 -5.295778e-05
## Husband_Education Husband_Occupation_status
## 4.111935e-05 -5.776952e-05
# Support Vector Machine
# Get feature importance (absolute coefficients; inherits the [2:15]
# subset, so two predictors are omitted)
feature_importance1<- abs(coef(svm)[2:15])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkgreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
