Dr Genanka Project
# Five Supervised Machine Learning Models
# Contraceptive Use in Somalia
library(haven)
final<-read_dta("~/Genanka.dta")
str(final)
## tibble [22,776 × 17] (S3: tbl_df/tbl/data.frame)
## $ Age : dbl+lbl [1:22776] 2, 2, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 3, 3, 3...
## ..@ label : chr "Age in 5-year groups"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:7] 1 2 3 4 5 6 7
## .. ..- attr(*, "names")= chr [1:7] "15 - 19" "20 - 24" "25 - 29" "30 - 34" ...
## $ Region : dbl+lbl [1:22776] 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, ...
## ..@ label : chr "Region"
## ..@ format.stata: chr "%2.0f"
## ..@ labels : Named num [1:18] 11 12 13 14 15 16 17 18 19 20 ...
## .. ..- attr(*, "names")= chr [1:18] "Awdal" "Woqooyi Galbeed" "Togdheer" "Sool" ...
## $ Residence : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Type of place of residence"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "Urban" "Rural" "Nomadic"
## $ Women_Educationlevel : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Highest educational level"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:4] 0 1 2 3
## .. ..- attr(*, "names")= chr [1:4] "No Education" "Primary" "Secondary" "Higher"
## $ Household_size : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0...
## ..@ label : chr "Number of household members (listed)"
## ..@ format.stata: chr "%21.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 4 Members" " Four or More Members"
## $ Listening_Radio : dbl+lbl [1:22776] 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## ..@ label : chr "Frequency of listening to radio"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Watching_TV : dbl+lbl [1:22776] 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 3, 3, 3...
## ..@ label : chr "Frequency of watching television"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 3
## .. ..- attr(*, "names")= chr [1:3] "At least once a week" "Less than once a week" "Not at all"
## $ Internet_Usage : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Ever used internet"
## ..@ format.stata: chr "%3.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ Wealth_Quantile : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Wealth index combined"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Lowest" "Second" "Middle" "Fourth" ...
## $ Number_of_Children : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Total children ever born"
## ..@ format.stata: chr "%22.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Less than 4 Children" " Four or More Children"
## $ DV : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ label : chr "Contraceptive use and intention"
## ..@ format.stata: chr "%9.0f"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Non-Users" "Users"
## $ Health_Access : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Getting medical help for self: getting permission to go"
## ..@ format.stata: chr "%10.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Problem" "No problem"
## $ Distance_of_Healthcentre : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Getting medical help for self: distance to health facility"
## ..@ format.stata: chr "%10.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Problem" "No problem"
## $ Husband_Education : dbl+lbl [1:22776] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## ..@ label : chr "Husband/partner's ever attended school"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Husband_Occupation_status: dbl+lbl [1:22776] 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1...
## ..@ label : chr "Husband/partner worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:3] 1 2 8
## .. ..- attr(*, "names")= chr [1:3] "Yes" "No" "Don't Know"
## $ Women_Occupation_status : dbl+lbl [1:22776] 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## ..@ label : chr "Respondent worked in last 12 months"
## ..@ format.stata: chr "%1.0f"
## ..@ labels : Named num [1:2] 1 2
## .. ..- attr(*, "names")= chr [1:2] "Yes" "No"
## $ sanitationfacility : dbl+lbl [1:22776] 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## ..@ format.stata: chr "%10.0g"
## ..@ labels : Named num [1:2] 0 1
## .. ..- attr(*, "names")= chr [1:2] "Unimproved" "Improved"
# Round-trip through CSV to strip the Stata value labels, leaving plain
# numeric codes for modelling
write.csv(final,"~/Genanka.csv")
final<-read.csv("~/Genanka.csv")
final<-final[,-1] # drop the row-index column added by write.csv
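# Equivalent alternative (a sketch): haven can strip the Stata value
# labels in memory, avoiding the CSV round-trip above
final <- as.data.frame(haven::zap_labels(final))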
# Logistic Regression
logistic<-glm(DV~.,data=final,family = binomial())
summary(logistic)
##
## Call:
## glm(formula = DV ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.88055 2.16253 -2.719 0.006542 **
## Age -0.20089 0.04464 -4.500 6.80e-06 ***
## Region -0.12401 0.01351 -9.178 < 2e-16 ***
## Residence -0.14846 0.10920 -1.359 0.174005
## Women_Educationlevel 0.29935 0.08832 3.389 0.000701 ***
## Household_size 0.96174 0.17945 5.359 8.35e-08 ***
## Listening_Radio 0.05679 0.09060 0.627 0.530764
## Watching_TV -0.32496 0.07257 -4.478 7.54e-06 ***
## Internet_Usage -0.08734 0.16394 -0.533 0.594195
## Wealth_Quantile 0.33654 0.06195 5.433 5.54e-08 ***
## Number_of_Children 0.52053 0.15604 3.336 0.000850 ***
## Health_Access -0.03429 0.13805 -0.248 0.803835
## Distance_of_Healthcentre 0.45850 0.13504 3.395 0.000686 ***
## Husband_Education 0.39290 0.12493 3.145 0.001661 **
## Husband_Occupation_status -0.37756 0.12963 -2.913 0.003585 **
## Women_Occupation_status 1.47997 1.00790 1.468 0.142004
## sanitationfacility -0.42958 0.11947 -3.596 0.000324 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3725.1 on 22775 degrees of freedom
## Residual deviance: 3159.4 on 22759 degrees of freedom
## AIC: 3193.4
##
## Number of Fisher Scoring iterations: 8
round(coef(summary(logistic)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.8805 2.1625 -2.7193 0.0065
## Age -0.2009 0.0446 -4.4999 0.0000
## Region -0.1240 0.0135 -9.1783 0.0000
## Residence -0.1485 0.1092 -1.3594 0.1740
## Women_Educationlevel 0.2994 0.0883 3.3894 0.0007
## Household_size 0.9617 0.1794 5.3595 0.0000
## Listening_Radio 0.0568 0.0906 0.6268 0.5308
## Watching_TV -0.3250 0.0726 -4.4779 0.0000
## Internet_Usage -0.0873 0.1639 -0.5328 0.5942
## Wealth_Quantile 0.3365 0.0619 5.4329 0.0000
## Number_of_Children 0.5205 0.1560 3.3359 0.0009
## Health_Access -0.0343 0.1380 -0.2484 0.8038
## Distance_of_Healthcentre 0.4585 0.1350 3.3952 0.0007
## Husband_Education 0.3929 0.1249 3.1449 0.0017
## Husband_Occupation_status -0.3776 0.1296 -2.9126 0.0036
## Women_Occupation_status 1.4800 1.0079 1.4684 0.1420
## sanitationfacility -0.4296 0.1195 -3.5957 0.0003
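# Odds ratios (a sketch): exponentiating the coefficients gives effect
# sizes on the odds scale; confint() profiles the likelihood and can be
# slow on 22,776 rows
round(exp(cbind(OR = coef(logistic), confint(logistic))), 4)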
# Prediction Models
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Prepare training and test datasets
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$DV, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
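# Sanity check (a sketch): createDataPartition() samples within the
# outcome's strata, so the heavy class imbalance in DV should be
# preserved in both halves of the split
prop.table(table(trainData$DV))
prop.table(table(testData$DV))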
###################################################
# Logistic regression on the training data (all predictors)
###################################################
logitmod2<-glm(DV~ ., family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = DV ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.18579 2.19610 -2.361 0.018208 *
## Age -0.20427 0.04949 -4.128 3.67e-05 ***
## Region -0.13028 0.01498 -8.700 < 2e-16 ***
## Residence -0.19526 0.12266 -1.592 0.111426
## Women_Educationlevel 0.31178 0.09760 3.194 0.001401 **
## Household_size 0.94680 0.19716 4.802 1.57e-06 ***
## Listening_Radio 0.08126 0.10182 0.798 0.424835
## Watching_TV -0.28355 0.08038 -3.528 0.000419 ***
## Internet_Usage -0.19774 0.17999 -1.099 0.271922
## Wealth_Quantile 0.33404 0.06900 4.841 1.29e-06 ***
## Number_of_Children 0.65709 0.17725 3.707 0.000210 ***
## Health_Access -0.03907 0.15275 -0.256 0.798098
## Distance_of_Healthcentre 0.43994 0.14972 2.938 0.003299 **
## Husband_Education 0.38294 0.13856 2.764 0.005714 **
## Husband_Occupation_status -0.35366 0.14272 -2.478 0.013214 *
## Women_Occupation_status 1.21883 1.00999 1.207 0.227521
## sanitationfacility -0.48619 0.13293 -3.658 0.000255 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3018.0 on 18220 degrees of freedom
## Residual deviance: 2552.4 on 18204 degrees of freedom
## AIC: 2586.4
##
## Number of Fisher Scoring iterations: 8
# Apply the model to predict the test data
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0002909 0.0032716 0.0068923 0.0159015 0.0171588 0.3909683
# Measure the accuracy of prediction in the test data (the 0.015 cutoff
# sits near the mean predicted probability, given the heavy class imbalance)
y_pred_num<-ifelse(pred<0.015,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
# Result: prediction accuracy
mean(y_pred==y_act)
## [1] 0.7306257
# Plot ROC Curve
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.7427618
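# Rather than hand-picking the 0.015 cutoff above, InformationValue can
# search for the probability cutoff that maximises sensitivity plus
# specificity (a sketch):
InformationValue::optimalCutoff(y_act, pred, optimiseFor = "Both")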
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_table <- table(testData$DV, y_pred_num)
# Confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 3277 1210
## 1 17 51
##
## Accuracy : 0.7306
## 95% CI : (0.7175, 0.7435)
## No Information Rate : 0.7232
## P-Value [Acc > NIR] : 0.1335
##
## Kappa : 0.0498
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99484
## Specificity : 0.04044
## Pos Pred Value : 0.73033
## Neg Pred Value : 0.75000
## Prevalence : 0.72316
## Detection Rate : 0.71943
## Detection Prevalence : 0.98507
## Balanced Accuracy : 0.51764
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.9948 |
| Specificity          |  0.0404 |
| Pos Pred Value       |  0.7303 |
| Neg Pred Value       |  0.7500 |
| Precision            |  0.7303 |
| Recall               |  0.9948 |
| F1                   |  0.8423 |
| Prevalence           |  0.7232 |
| Detection Rate       |  0.7194 |
| Detection Prevalence |  0.9851 |
| Balanced Accuracy    |  0.5176 |
#############################################
# Probit Regression
#############################################
probitmod<-glm(DV~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = DV ~ ., family = binomial(link = "probit"), data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.278144 0.804149 -2.833 0.004612 **
## Age -0.088674 0.020791 -4.265 2.00e-05 ***
## Region -0.058380 0.006403 -9.118 < 2e-16 ***
## Residence -0.069512 0.049913 -1.393 0.163720
## Women_Educationlevel 0.140517 0.045014 3.122 0.001798 **
## Household_size 0.392549 0.078507 5.000 5.73e-07 ***
## Listening_Radio 0.028669 0.044614 0.643 0.520479
## Watching_TV -0.131604 0.036878 -3.569 0.000359 ***
## Internet_Usage -0.098466 0.081866 -1.203 0.229067
## Wealth_Quantile 0.135231 0.028024 4.826 1.40e-06 ***
## Number_of_Children 0.269605 0.076050 3.545 0.000392 ***
## Health_Access -0.041160 0.063997 -0.643 0.520125
## Distance_of_Healthcentre 0.184673 0.063452 2.910 0.003609 **
## Husband_Education 0.186205 0.061025 3.051 0.002279 **
## Husband_Occupation_status -0.150504 0.059085 -2.547 0.010858 *
## Women_Occupation_status 0.452069 0.359175 1.259 0.208164
## sanitationfacility -0.205046 0.056846 -3.607 0.000310 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3018.0 on 18220 degrees of freedom
## Residual deviance: 2546.2 on 18204 degrees of freedom
## AIC: 2580.2
##
## Number of Fisher Scoring iterations: 8
# Apply the model to predict the test data
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.634e-05 2.627e-03 6.601e-03 1.589e-02 1.778e-02 3.000e-01
# Measure the accuracy of prediction in the test data (same 0.015 cutoff as above)
y_pred_num<-ifelse(pred<0.015,0,1)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$DV
# Result: prediction accuracy
mean(y_pred==y_act)
## [1] 0.7220637
# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)

InformationValue::AUROC(y_act,pred)
## [1] 0.7456607
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_table <- table(testData$DV, y_pred_num)
# Confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 3238 1249
## 1 17 51
##
## Accuracy : 0.7221
## 95% CI : (0.7088, 0.735)
## No Information Rate : 0.7146
## P-Value [Acc > NIR] : 0.1357
##
## Kappa : 0.0475
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99478
## Specificity : 0.03923
## Pos Pred Value : 0.72164
## Neg Pred Value : 0.75000
## Prevalence : 0.71460
## Detection Rate : 0.71087
## Detection Prevalence : 0.98507
## Balanced Accuracy : 0.51700
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|----------------------|---------|
| Sensitivity          |  0.9948 |
| Specificity          |  0.0392 |
| Pos Pred Value       |  0.7216 |
| Neg Pred Value       |  0.7500 |
| Precision            |  0.7216 |
| Recall               |  0.9948 |
| F1                   |  0.8365 |
| Prevalence           |  0.7146 |
| Detection Rate       |  0.7109 |
| Detection Prevalence |  0.9851 |
| Balanced Accuracy    |  0.5170 |
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

################################################
# Random Forest
################################################
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
# Random Forest with the default mtry = 4 (floor of the square root of
# the 16 predictors, as the output below confirms)
fit_rf <- randomForest(factor(DV) ~., data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(DV) ~ ., data = trainData)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 0.68%
## Confusion matrix:
## 0 1 class.error
## 0 17920 6 0.0003347094
## 1 117 178 0.3966101695
varImpPlot(fit_rf)

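# Tuning sketch: tuneRF() searches over mtry for a lower OOB error
# (the settings here are illustrative, not from the original run)
set.seed(123)
tuneRF(trainData[, setdiff(names(trainData), "DV")], factor(trainData$DV),
       ntreeTry = 200, stepFactor = 1.5, improve = 0.01)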
#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 4510 45
summary(predrf)
## 0 1
## 4510 45
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$DV
# Result: prediction accuracy
mean(y_predrf==y_actrf)
## [1] 0.9949506
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_tablerf<- table(testData$DV, y_pred_numrf)
# Confusion matrix and its associated statistics
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 4487 0
## 1 23 45
##
## Accuracy : 0.995
## 95% CI : (0.9924, 0.9968)
## No Information Rate : 0.9901
## P-Value [Acc > NIR] : 0.0002174
##
## Kappa : 0.794
##
## Mcnemar's Test P-Value : 4.49e-06
##
## Sensitivity : 0.9949
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.6618
## Prevalence : 0.9901
## Detection Rate : 0.9851
## Detection Prevalence : 0.9851
## Balanced Accuracy : 0.9975
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|----------------------|-----------|
| Sensitivity          |    0.9949 |
| Specificity          |    1.0000 |
| Pos Pred Value       |    1.0000 |
| Neg Pred Value       |    0.6618 |
| Precision            |    1.0000 |
| Recall               |    0.9949 |
| F1                   |    0.9974 |
| Prevalence           |    0.9901 |
| Detection Rate       |    0.9851 |
| Detection Prevalence |    0.9851 |
| Balanced Accuracy    |    0.9975 |
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$DV, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Probit Regression
test_prob2= predict(probitmod, newdata=testData, type = "response")
test_roc2= roc(testData$DV~test_prob2,plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$DV~test_prob3, plot = TRUE,col="darkblue", print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

###############################################
# Support Vector Machine (SVM) Model
###############################################
library(e1071)
set.seed(123)
# Note: with a numeric 0/1 outcome, svm() fits eps-regression rather
# than a classifier (see "SVM-Type" in the summary below)
svm<-svm(DV~., data = trainData, kernel='linear', cost=10, scale=FALSE)
# Plot the fitted model
#windows()
plot(svm, final)
summary(svm)
##
## Call:
## svm(formula = DV ~ ., data = trainData, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 10
## gamma: 0.0625
## epsilon: 0.1
##
##
## Number of Support Vectors: 1894
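# A sketch of the classification variant: coercing DV to a factor makes
# svm() fit C-classification instead (svm_clf is an illustrative name)
svm_clf <- svm(factor(DV) ~ ., data = trainData, kernel = "linear",
               cost = 10, probability = TRUE)
summary(svm_clf)  # should report "SVM-Type: C-classification"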
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09958 0.09987 0.09997 0.09997 0.10007 0.10042
# Measure the accuracy of prediction in the test data (the regression
# predictions cluster tightly around 0.1, which serves as the cutoff)
y_pred_numSVM<-ifelse(predictionSVM<0.1,0,1)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$DV
# Result: prediction accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.5815587
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_tableSVM<- table(testData$DV, y_predSVM)
# Confusion matrix and its associated statistics
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 2609 1878
## 1 28 40
##
## Accuracy : 0.5816
## 95% CI : (0.5671, 0.5959)
## No Information Rate : 0.5789
## P-Value [Acc > NIR] : 0.3653
##
## Kappa : 0.0118
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98938
## Specificity : 0.02086
## Pos Pred Value : 0.58146
## Neg Pred Value : 0.58824
## Prevalence : 0.57892
## Detection Rate : 0.57278
## Detection Prevalence : 0.98507
## Balanced Accuracy : 0.50512
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|----------------------|------------|
| Sensitivity          |     0.9894 |
| Specificity          |     0.0209 |
| Pos Pred Value       |     0.5815 |
| Neg Pred Value       |     0.5882 |
| Precision            |     0.5815 |
| Recall               |     0.9894 |
| F1                   |     0.7325 |
| Prevalence           |     0.5789 |
| Detection Rate       |     0.5728 |
| Detection Prevalence |     0.9851 |
| Balanced Accuracy    |     0.5051 |
#SVM
library(pROC)
#windows()
test_prob4= predict(svm, newdata=testData)
test_roc4= roc(testData$DV, test_prob4, plot = TRUE, main="Support Vector Machine", col='darkblue', print.auc = TRUE, percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Note: coef(svm)[2:15] keeps only the first 14 predictor coefficients;
# coef(svm)[-1] would retain all 16
coef(svm)[2:15]
## Age Region Residence
## 3.562587e-05 -1.566845e-05 5.605488e-05
## Women_Educationlevel Household_size Listening_Radio
## -2.563011e-05 -7.175976e-05 -3.870384e-05
## Watching_TV Internet_Usage Wealth_Quantile
## 6.940195e-06 -9.375122e-05 1.846147e-05
## Number_of_Children Health_Access Distance_of_Healthcentre
## -8.917798e-06 1.442841e-04 -5.295778e-05
## Husband_Education Husband_Occupation_status
## 4.111935e-05 -5.776952e-05
# Support Vector Machine
# Get feature importance (absolute coefficients; this inherits the
# [2:15] subset above, so two predictors are omitted)
feature_importance1<- abs(coef(svm)[2:15])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

############################################
# KNN Model
############################################
# knnreg() comes from the caret package loaded earlier
# Fit a 1-nearest-neighbour regression; the formula must be DV ~ .
# (writing trainData$DV ~ . would leak the outcome into the predictors)
fit <- knnreg(DV ~ ., data = trainData, k = 1)
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
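# A sketch of choosing k by cross-validation instead of fixing k = 1
# (which memorises the training data); knn_cv is an illustrative name
set.seed(123)
knn_cv <- train(factor(DV) ~ ., data = trainData, method = "knn",
                trControl = trainControl(method = "cv", number = 5),
                tuneGrid = data.frame(k = c(1, 3, 5, 7, 9)))
knn_cv$bestTune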
#Predict Output
predicted= predict(fit,testData)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01516 0.00000 1.00000
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$DV
# Result: prediction accuracy
mean(y_predknn==y_actknn)
## [1] 0.997146
# Confusion table showing where each respondent was placed and whether
# the placement was correct
confusion_tableknn<- table(testData$DV, y_predknn)
# Confusion matrix and its associated statistics
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_predknn
## 0 1
## 0 4485 2
## 1 11 57
##
## Accuracy : 0.9971
## 95% CI : (0.9951, 0.9985)
## No Information Rate : 0.987
## P-Value [Acc > NIR] : 4.042e-13
##
## Kappa : 0.8962
##
## Mcnemar's Test P-Value : 0.0265
##
## Sensitivity : 0.9976
## Specificity : 0.9661
## Pos Pred Value : 0.9996
## Neg Pred Value : 0.8382
## Prevalence : 0.9870
## Detection Rate : 0.9846
## Detection Prevalence : 0.9851
## Balanced Accuracy : 0.9818
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|----------------------|------------|
| Sensitivity          |     0.9976 |
| Specificity          |     0.9661 |
| Pos Pred Value       |     0.9996 |
| Neg Pred Value       |     0.8382 |
| Precision            |     0.9996 |
| Recall               |     0.9976 |
| F1                   |     0.9986 |
| Prevalence           |     0.9870 |
| Detection Rate       |     0.9846 |
| Detection Prevalence |     0.9851 |
| Balanced Accuracy    |     0.9818 |
# Fit KNN through caret to extract variable importance (DV is left
# numeric here, hence the regression warning below)
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Watching_TV 100.0000
## Wealth_Quantile 85.7159
## Women_Educationlevel 71.7999
## Husband_Education 63.3490
## Internet_Usage 51.8555
## Residence 37.6352
## Husband_Occupation_status 34.6371
## Region 18.2522
## Distance_of_Healthcentre 15.9418
## Household_size 10.8305
## Age 7.5547
## Listening_Radio 4.1220
## Health_Access 4.0223
## sanitationfacility 0.7499
## Women_Occupation_status 0.6048
## Number_of_Children 0.0000
# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkgreen") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

#ROC
library(pROC)
# KNN
#windows()
test_prob5= predict(fit, newdata=testData, type = "response")
test_roc5= roc(testData$DV, test_prob5,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

####################################################
# Area Under the Curve (AUC)
####################################################
#windows()
plot(test_roc1, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='gold', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='brown', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='cyan', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Probit Regression",
"Logistic-Regression","Support Vector Machine","KNN Model") ,
pch = 15, bty = 'n', col = c("blue","gold", "brown", "green","cyan"))

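# Numeric AUCs for the five curves, pulled straight from the stored roc
# objects (a sketch; values are in percent because percent = TRUE above)
sapply(list(RandomForest = test_roc1, Probit = test_roc2,
            Logistic = test_roc3, SVM = test_roc4, KNN = test_roc5),
       pROC::auc)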
###################################################
# Metrics
###############################################
# Accuracy, recall, F1-score, and precision values for the five models
Accuracy <- c(0.578, 0.960, 0.561,0.503,0.956)
Recall <- c(0.976, 0.961, 0.976,0.963,0.962)
F1_score <- c(0.724, 0.980, 0.709,0.661,0.977)
Precision<-c(0.575,0.999,0.557,0.503,0.993)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Probit Regression",
"Support Vector Machine","KNN Model")
data <- data.frame(Model = model_names, Accuracy = Accuracy, Recall = Recall, F1_Score = F1_score, Precision=Precision)
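# A sketch of assembling this table programmatically rather than typing
# the values by hand, from the stored caret::confusionMatrix objects
# (note: `result` was reused for the logistic and probit fits above, so
# each model's matrix needs its own name for all five rows)
collect <- function(cm) c(Accuracy = unname(cm$overall["Accuracy"]),
                          Recall = unname(cm$byClass["Recall"]),
                          F1_Score = unname(cm$byClass["F1"]),
                          Precision = unname(cm$byClass["Precision"]))
rbind(RandomForest = collect(resultrf),
      SVM = collect(resultSVM),
      KNN = collect(resultknn))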
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
#windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "cyan3",
"Recall" = "pink4",
"F1_Score" = "purple4",
"Precision" = "green1")) +
theme_minimal()

#############################################
# Feature Selection
#############################################
#windows()
# Note: par(mfrow = ...) affects base graphics only; the ggplot2 panels
# below are drawn one at a time regardless
par(mfrow = c(3, 2))
# Logistic Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "magenta4") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

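# Caveat: raw |coefficient| importance is scale-dependent (a predictor
# coded 1-7 gets a smaller coefficient than a 0/1 one for a comparable
# effect). caret's varImp() ranks glm predictors by the absolute
# z-statistic instead, which is scale-free (a sketch):
caret::varImp(logitmod2)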
###########################################
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "cyan4") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#############################################
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkred") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

###########################################
# Fit KNN through caret to extract variable importance (DV is left
# numeric here, hence the regression warning below)
knnmodel <- train(DV ~ ., data = trainData, method = "knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Watching_TV 100.0000
## Wealth_Quantile 85.7159
## Women_Educationlevel 71.7999
## Husband_Education 63.3490
## Internet_Usage 51.8555
## Residence 37.6352
## Husband_Occupation_status 34.6371
## Region 18.2522
## Distance_of_Healthcentre 15.9418
## Household_size 10.8305
## Age 7.5547
## Listening_Radio 4.1220
## Health_Access 4.0223
## sanitationfacility 0.7499
## Women_Occupation_status 0.6048
## Number_of_Children 0.0000
#windows()
# Plot the feature importance
plot <- ggplot(importance, aes(x = rownames(importance), y = Overall)) +
geom_bar(stat = "identity", fill = "darkblue") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)

########################################
# Note: [2:15] keeps only the first 14 predictor coefficients (see the
# note in the SVM section above)
coef(svm)[2:15]
## Age Region Residence
## 3.562587e-05 -1.566845e-05 5.605488e-05
## Women_Educationlevel Household_size Listening_Radio
## -2.563011e-05 -7.175976e-05 -3.870384e-05
## Watching_TV Internet_Usage Wealth_Quantile
## 6.940195e-06 -9.375122e-05 1.846147e-05
## Number_of_Children Health_Access Distance_of_Healthcentre
## -8.917798e-06 1.442841e-04 -5.295778e-05
## Husband_Education Husband_Occupation_status
## 4.111935e-05 -5.776952e-05
# Support Vector Machine
# Get feature importance (absolute coefficients; inherits the [2:15]
# subset, so two predictors are omitted)
feature_importance1<- abs(coef(svm)[2:15])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
#windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "darkgreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
