R Markdown
This is an R Markdown document. Markdown is a simple formatting
syntax for authoring HTML, PDF, and MS Word documents. For more details
on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# 10 Supervised Machine Learning Models
final1<-read.csv("~/mldropout.csv")
str(final1)
## 'data.frame': 4490 obs. of 11 variables:
## $ Age_of_Household_head: chr "Below 20 years" "20Years and Above" "Below 20 years" "Below 20 years" ...
## $ Household_size : chr "Below 5 Members" "5 Members or mor" "5 Members or mor" "5 Members or mor" ...
## $ Sex_of_Household_head: chr "female" "female" "male" "female" ...
## $ Grade : chr "first_degree" "first_degree" "first_degree" "first_degree" ...
## $ Region : chr "Marodijeh" "Marodijeh" "Marodijeh" "Marodijeh" ...
## $ District : chr "Hargeisa" "Hargeisa" "Hargeisa" "Hargeisa" ...
## $ Location_Type : chr "Urban" "Rural" "Rural" "Rural" ...
## $ Occupation_status : chr "Daily wager" "Government Employee" "Government Employee" "Government Employee" ...
## $ Type_of_Housing : chr "Daar no fence" "Daar no fence" "Daar no fence" "Daar no fence" ...
## $ School_Type : chr "Public" "Private" "Private" "Private" ...
## $ dv : chr "No" "No" "No" "Yes" ...
#Age
dada<-final1$Age_of_Household_head
table(dada)
## dada
## 20Years and Above Below 20 years
## 513 3977
dada<-as.factor(dada)
age<-unclass(dada)
table(age)
## age
## 1 2
## 513 3977
#Household Size
guriga<-final1$Household_size
table(guriga)
## guriga
## 5 Members or mor Below 5 Members
## 3817 673
guriga<-as.factor(guriga)
HHSize<-unclass(guriga)
table(HHSize)
## HHSize
## 1 2
## 3817 673
#Sex
jinsi<-final1$Sex_of_Household_head
table(jinsi)
## jinsi
## female male
## 2164 2326
jinsi<-as.factor(jinsi)
Sex<-unclass(jinsi)
table(Sex)
## Sex
## 1 2
## 2164 2326
#Grade
classka<-final1$Grade
table(classka)
## classka
## first_degree form_1 form_2 form_3 form_4
## 3935 31 5 29 18
## grade_1 grade_2 grade_3 grade_4 grade_5
## 64 60 46 52 63
## grade_6 grade_7 grade_8 post_graduate
## 34 61 89 3
classka<-as.factor(classka)
Grade<-unclass(classka)
table(Grade)
## Grade
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 3935 31 5 29 18 64 60 46 52 63 34 61 89 3
#Region
gobol<-final1$Region
table(gobol)
## gobol
## Awdal Daadmadheedh Hawd Marodijeh Sahil Sanaag
## 562 105 127 1433 589 472
## Saraar Sool Togdheer
## 152 351 699
gobol<-as.factor(gobol)
Region<-unclass(gobol)
table(Region)
## Region
## 1 2 3 4 5 6 7 8 9
## 562 105 127 1433 589 472 152 351 699
#District
degmo<-final1$District
table(degmo)
## degmo
## balligubadle Berbera Borama Burco Caynabo Ceel-Afweyn
## 127 475 458 702 148 115
## Ceerigaabo Hargeisa Laascaanood Lughaya oodweyne Sheikh
## 357 1434 351 104 103 116
degmo<-as.factor(degmo)
District<-unclass(degmo)
table(District)
## District
## 1 2 3 4 5 6 7 8 9 10 11 12
## 127 475 458 702 148 115 357 1434 351 104 103 116
#Residence
degsiimo<-final1$Location_Type
table(degsiimo)
## degsiimo
## Nomadic Rural Urban
## 207 1319 2964
degsiimo<-as.factor(degsiimo)
Residence<-unclass(degsiimo)
table(Residence)
## Residence
## 1 2 3
## 207 1319 2964
#Occupation
shaqo<-final1$Occupation_status
table(shaqo)
## shaqo
## Agriculture or Agro-pastoralist Daily wager
## 416 676
## Fishing Government Employee
## 16 467
## Household Chores Nomadic Pastoralist
## 1075 163
## Not working but looking for work Other specify
## 477 242
## Private Employee Self Employed
## 313 645
shaqo<-as.factor(shaqo)
Occupation<-unclass(shaqo)
table(Occupation)
## Occupation
## 1 2 3 4 5 6 7 8 9 10
## 416 676 16 467 1075 163 477 242 313 645
#House Type
nooca<-final1$Type_of_Housing
table(nooca)
## nooca
## Bungalow Buul/bus/somali hut Concrete building
## 477 1079 15
## Daar no fence Derked/cariish/Mudul Sandaqad
## 1765 249 905
nooca<-as.factor(nooca)
Typehousing<-unclass(nooca)
table(Typehousing)
## Typehousing
## 1 2 3 4 5 6
## 477 1079 15 1765 249 905
#School Type
dugsiga<-final1$School_Type
table(dugsiga)
## dugsiga
## Private Public
## 1250 3240
dugsiga<-as.factor(dugsiga)
Schooltype<-unclass(dugsiga)
table(Schooltype)
## Schooltype
## 1 2
## 1250 3240
#DV
dv<-final1$dv
dv<-ifelse(dv=="Yes",1,0)
table(dv)
## dv
## 0 1
## 3921 569
prop.table(table(dv))*100
## dv
## 0 1
## 87.32739 12.67261
#Final Data
final<-data.frame(dv,age,HHSize,Sex,Grade,Region,District,Residence,Occupation,Typehousing,Schooltype)
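The column-by-column recoding above can also be written in one step. A minimal sketch that should reproduce the same integer codes (final_alt and predictors are hypothetical names; as.factor() codes levels alphabetically, exactly as unclass() does above):
# Compact alternative to the step-by-step recoding (not used for the results below)
predictors <- c("Age_of_Household_head", "Household_size", "Sex_of_Household_head", "Grade",
"Region", "District", "Location_Type", "Occupation_status", "Type_of_Housing", "School_Type")
final_alt <- data.frame(dv = as.integer(final1$dv == "Yes"),
lapply(final1[predictors], function(col) as.integer(as.factor(col))))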
# Logistic Regression
dropout<-glm(dv~.,data=final,family = binomial())
summary(dropout)
##
## Call:
## glm(formula = dv ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74351 1.02704 -1.698 0.0896 .
## age -1.58277 0.21601 -7.327 2.35e-13 ***
## HHSize 0.01548 0.26549 0.058 0.9535
## Sex -0.31890 0.18956 -1.682 0.0925 .
## Grade 1.05031 0.04839 21.703 < 2e-16 ***
## Region -0.01147 0.03932 -0.292 0.7706
## District 0.02419 0.03541 0.683 0.4946
## Residence 0.13170 0.17938 0.734 0.4628
## Occupation -0.03633 0.03342 -1.087 0.2771
## Typehousing -0.08687 0.05782 -1.502 0.1330
## Schooltype 0.18760 0.24775 0.757 0.4489
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3413.44 on 4489 degrees of freedom
## Residual deviance: 989.15 on 4479 degrees of freedom
## AIC: 1011.2
##
## Number of Fisher Scoring iterations: 7
round(coef(summary(dropout)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7435 1.0270 -1.6976 0.0896
## age -1.5828 0.2160 -7.3272 0.0000
## HHSize 0.0155 0.2655 0.0583 0.9535
## Sex -0.3189 0.1896 -1.6823 0.0925
## Grade 1.0503 0.0484 21.7033 0.0000
## Region -0.0115 0.0393 -0.2916 0.7706
## District 0.0242 0.0354 0.6830 0.4946
## Residence 0.1317 0.1794 0.7342 0.4628
## Occupation -0.0363 0.0334 -1.0869 0.2771
## Typehousing -0.0869 0.0578 -1.5024 0.1330
## Schooltype 0.1876 0.2477 0.7572 0.4489
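Because the estimates are on the log-odds scale, exponentiating them gives odds ratios. A quick sketch with approximate Wald-type 95% intervals (not part of the original output):
round(exp(cbind(OR = coef(dropout), confint.default(dropout))), 3)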
# Logistic Regression (re-fit as dropout1 for the effect plots below)
dropout1<-glm(dv~.,data=final,family = binomial())
summary(dropout1)
##
## Call:
## glm(formula = dv ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74351 1.02704 -1.698 0.0896 .
## age -1.58277 0.21601 -7.327 2.35e-13 ***
## HHSize 0.01548 0.26549 0.058 0.9535
## Sex -0.31890 0.18956 -1.682 0.0925 .
## Grade 1.05031 0.04839 21.703 < 2e-16 ***
## Region -0.01147 0.03932 -0.292 0.7706
## District 0.02419 0.03541 0.683 0.4946
## Residence 0.13170 0.17938 0.734 0.4628
## Occupation -0.03633 0.03342 -1.087 0.2771
## Typehousing -0.08687 0.05782 -1.502 0.1330
## Schooltype 0.18760 0.24775 0.757 0.4489
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3413.44 on 4489 degrees of freedom
## Residual deviance: 989.15 on 4479 degrees of freedom
## AIC: 1011.2
##
## Number of Fisher Scoring iterations: 7
# Visualization of Parameter Effects
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
windows()
plot(allEffects(dropout1),cex.main=.75, cex.lab=.1, cex.axis=0.1)
# Prediction Models
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
# Prep Training and Test dataset
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$dv, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
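A quick sanity check on the split sizes (a sketch; an 80/20 partition of the 4,490 rows should leave 3,592 rows for training and 898 for testing, which the str(trainData) output further below confirms):
dim(trainData)
dim(testData)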
# Build logistic model (using all features)
logitmod1<-glm(dv~ .,
family="binomial", data=trainData)
summary(logitmod1)
##
## Call:
## glm(formula = dv ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.99588 1.11971 -1.782 0.0747 .
## age -1.56637 0.23480 -6.671 2.54e-11 ***
## HHSize 0.12275 0.28318 0.433 0.6647
## Sex -0.34251 0.20558 -1.666 0.0957 .
## Grade 1.01050 0.05296 19.079 < 2e-16 ***
## Region -0.01430 0.04235 -0.338 0.7357
## District 0.03114 0.03808 0.818 0.4135
## Residence 0.20095 0.19284 1.042 0.2974
## Occupation -0.04141 0.03578 -1.157 0.2472
## Typehousing -0.08473 0.06266 -1.352 0.1764
## Schooltype 0.22647 0.26741 0.847 0.3971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2722.24 on 3591 degrees of freedom
## Residual deviance: 834.51 on 3581 degrees of freedom
## AIC: 856.51
##
## Number of Fisher Scoring iterations: 7
# Re-fit the logistic model (note: all features are retained here, not only the significant ones)
logitmod2<-glm(dv~ .,
family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = dv ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.99588 1.11971 -1.782 0.0747 .
## age -1.56637 0.23480 -6.671 2.54e-11 ***
## HHSize 0.12275 0.28318 0.433 0.6647
## Sex -0.34251 0.20558 -1.666 0.0957 .
## Grade 1.01050 0.05296 19.079 < 2e-16 ***
## Region -0.01430 0.04235 -0.338 0.7357
## District 0.03114 0.03808 0.818 0.4135
## Residence 0.20095 0.19284 1.042 0.2974
## Occupation -0.04141 0.03578 -1.157 0.2472
## Typehousing -0.08473 0.06266 -1.352 0.1764
## Schooltype 0.22647 0.26741 0.847 0.3971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2722.24 on 3591 degrees of freedom
## Residual deviance: 834.51 on 3581 degrees of freedom
## AIC: 856.51
##
## Number of Fisher Scoring iterations: 7
# Apply the model to predict the testdata
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.007423 0.015635 0.020036 0.134010 0.027302 0.999952
table(pred)
## (output abridged: table() on the continuous predicted probabilities simply lists each of the
## several hundred distinct values, from about 0.0074 to 0.99995, with counts of 1-8;
## summary(pred) above already characterizes this distribution)
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred>0.13,1,0)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$dv
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.9821826
# Plot ROC Curve
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
InformationValue::plotROC(y_act,pred)
InformationValue::AUROC(y_act,pred)
## [1] 0.9798152
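The 0.13 cutoff used above is ad hoc. InformationValue also provides optimalCutoff(), which can suggest a threshold from the predicted scores; a sketch (not used for the results below):
InformationValue::optimalCutoff(y_act, pred, optimiseFor = "Both")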
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_table <- table(testData$dv, y_pred_num)
# Display the confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 773 9
## 1 7 109
##
## Accuracy : 0.9822
## 95% CI : (0.9712, 0.9898)
## No Information Rate : 0.8686
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9214
##
## Mcnemar's Test P-Value : 0.8026
##
## Sensitivity : 0.9910
## Specificity : 0.9237
## Pos Pred Value : 0.9885
## Neg Pred Value : 0.9397
## Prevalence : 0.8686
## Detection Rate : 0.8608
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9574
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.9910 |
| Specificity          |  0.9237 |
| Pos Pred Value       |  0.9885 |
| Neg Pred Value       |  0.9397 |
| Precision            |  0.9885 |
| Recall               |  0.9910 |
| F1                   |  0.9898 |
| Prevalence           |  0.8686 |
| Detection Rate       |  0.8608 |
| Detection Prevalence |  0.8708 |
| Balanced Accuracy    |  0.9574 |
# Logistic Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "deepskyblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Probit Regression
# Fit a probit model using the same set of features
probitmod<-glm(dv~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = dv ~ ., family = binomial(link = "probit"), data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.936905 0.486415 -1.926 0.0541 .
## age -0.663262 0.111871 -5.929 3.05e-09 ***
## HHSize 0.095257 0.118923 0.801 0.4231
## Sex -0.149536 0.088111 -1.697 0.0897 .
## Grade 0.441234 0.019237 22.937 < 2e-16 ***
## Region -0.003458 0.017995 -0.192 0.8476
## District 0.009077 0.016174 0.561 0.5747
## Residence 0.012473 0.080377 0.155 0.8767
## Occupation -0.024647 0.015379 -1.603 0.1090
## Typehousing -0.047607 0.027280 -1.745 0.0810 .
## Schooltype 0.055396 0.114456 0.484 0.6284
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2722.24 on 3591 degrees of freedom
## Residual deviance: 885.98 on 3581 degrees of freedom
## AIC: 907.98
##
## Number of Fisher Scoring iterations: 9
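Probit coefficients are on the latent-index scale, so their magnitudes are not directly comparable to the logit estimates. A minimal hand-rolled sketch of average marginal effects for the fitted probitmod (not part of the original output):
# Average marginal effect = mean of dnorm(linear predictor) times each slope coefficient
lp <- predict(probitmod, type = "link")
round(mean(dnorm(lp)) * coef(probitmod)[-1], 4)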
# Apply the model to predict the testdata
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.006944 0.015695 0.021191 0.124165 0.031720 0.999963
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred>0.12,1,0)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$dv
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.9821826
# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)
InformationValue::AUROC(y_act,pred)
## [1] 0.9820421
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_table <- table(testData$dv, y_pred_num)
# Display the confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 773 9
## 1 7 109
##
## Accuracy : 0.9822
## 95% CI : (0.9712, 0.9898)
## No Information Rate : 0.8686
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9214
##
## Mcnemar's Test P-Value : 0.8026
##
## Sensitivity : 0.9910
## Specificity : 0.9237
## Pos Pred Value : 0.9885
## Neg Pred Value : 0.9397
## Prevalence : 0.8686
## Detection Rate : 0.8608
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9574
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.9910 |
| Specificity          |  0.9237 |
| Pos Pred Value       |  0.9885 |
| Neg Pred Value       |  0.9397 |
| Precision            |  0.9885 |
| Recall               |  0.9910 |
| F1                   |  0.9898 |
| Prevalence           |  0.8686 |
| Detection Rate       |  0.8608 |
| Detection Prevalence |  0.8708 |
| Balanced Accuracy    |  0.9574 |
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "chocolate") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Naive Bayes
# Check the class balance of the full data
prop.table(table(final$dv))*100
##
## 0 1
## 87.32739 12.67261
# Check the class balance of the training data
prop.table(table(trainData$dv))*100
##
## 0 1
## 87.38864 12.61136
# Check the class balance of the testing data
prop.table(table(testData$dv))*100
##
## 0 1
## 87.08241 12.91759
head(trainData)
# Create object x, which holds the predictor variables, and object y, which holds the response variable
str(trainData)
## 'data.frame': 3592 obs. of 11 variables:
## $ dv : num 0 0 0 1 0 0 0 0 0 0 ...
## $ age : int 2 1 2 2 2 2 2 2 2 2 ...
## $ HHSize : int 2 1 1 1 1 1 1 1 1 1 ...
## $ Sex : int 1 1 2 1 1 1 1 1 1 2 ...
## $ Grade : int 1 1 1 1 1 1 1 4 1 1 ...
## $ Region : int 4 4 4 4 4 4 4 4 4 4 ...
## $ District : int 8 8 8 8 8 8 8 8 8 8 ...
## $ Residence : int 3 2 2 2 2 2 2 2 2 2 ...
## $ Occupation : int 2 4 4 4 4 4 5 5 5 5 ...
## $ Typehousing: int 4 4 4 4 4 4 4 4 4 4 ...
## $ Schooltype : int 2 1 1 1 2 2 2 2 1 2 ...
y=trainData$dv
x=trainData[,2:10]
y<-as.factor(y)
defaultW<-getOption("warn")
options(warn = -1)
nbmodel<-train(x,y,'nb',trControl = trainControl(method = 'cv',number = 17))
nbmodel
## Naive Bayes
##
## 3592 samples
## 9 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (17 fold)
## Summary of sample sizes: 3380, 3380, 3380, 3380, 3381, 3382, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.9760479 0.8875553
## TRUE 0.9253968 0.5504372
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
## = 1.
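The per-fold accuracies behind this cross-validated summary are stored on the train object and can be inspected directly; a sketch:
head(nbmodel$resample)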
# Prediction
Predict<-predict(nbmodel,newdata = testData)
y_pred_nb<-ifelse(Predict==1,1,0)
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tablenb<- table(testData$dv, y_pred_nb)
# Display the confusion matrix and its associated statistics
resultnb<-caret::confusionMatrix(confusion_tablenb)
resultnb
## Confusion Matrix and Statistics
##
## y_pred_nb
## 0 1
## 0 777 5
## 1 7 109
##
## Accuracy : 0.9866
## 95% CI : (0.9768, 0.9931)
## No Information Rate : 0.8731
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9402
##
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.9911
## Specificity : 0.9561
## Pos Pred Value : 0.9936
## Neg Pred Value : 0.9397
## Prevalence : 0.8731
## Detection Rate : 0.8653
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9736
##
## 'Positive' Class : 0
##
metricsnb<-as.data.frame(resultnb$byClass)
colnames(metricsnb)<-"metricsnb"
library(dplyr)
library(kableExtra)
kable(round(metricsnb,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsnb |
|:---------------------|----------:|
| Sensitivity          |    0.9911 |
| Specificity          |    0.9561 |
| Pos Pred Value       |    0.9936 |
| Neg Pred Value       |    0.9397 |
| Precision            |    0.9936 |
| Recall               |    0.9911 |
| F1                   |    0.9923 |
| Prevalence           |    0.8731 |
| Detection Rate       |    0.8653 |
| Detection Prevalence |    0.8708 |
| Balanced Accuracy    |    0.9736 |
library(ggplot2)
# Create a data frame with feature names and their importance
# (nbmodel$finalModel$varnames only stores the predictor names, so use caret's
#  model-free ROC-based importance from varImp() instead)
nb_imp <- varImp(nbmodel)$importance
feature_importance <- data.frame(
Feature = rownames(nb_imp),
Importance = nb_imp[, 1],
stringsAsFactors = FALSE
)
# Sort the data frame by importance (optional)
feature_importance <- feature_importance[order(feature_importance$Importance, decreasing = TRUE), ]
# Plot feature importance
ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "moccasin") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Naive Bayes - Feature Selection") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
# Random forest; mtry = 17 exceeds the 10 available predictors, so randomForest caps it at 10
# (every predictor is tried at each split, as the fitted object below reports)
fit_rf <- randomForest(factor(dv) ~., mtry = 17, data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(dv) ~ ., data = trainData, mtry = 17)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 10
##
## OOB estimate of error rate: 1.92%
## Confusion matrix:
## 0 1 class.error
## 0 3113 26 0.008282893
## 1 43 410 0.094922737
varImpPlot(fit_rf)
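Since mtry is capped at the 10 available predictors, the forest effectively tries every variable at each split. Instead of fixing mtry, randomForest::tuneRF() can search for a value with lower out-of-bag error; a sketch (not used for the results that follow):
set.seed(123)
tuneRF(x = trainData[, -1], y = factor(trainData$dv), ntreeTry = 500, stepFactor = 1.5, trace = FALSE)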
#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 779 119
accuracy <- table(predrf, testData$dv)
sum(diag(accuracy))/sum(accuracy)
## [1] 0.9855234
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$dv
#Result : Prediction Accuracy
mean(y_predrf==y_actrf)
## [1] 0.9855234
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tablerf<- table(testData$dv, y_pred_numrf)
# Display the confusion matrix and its associated statistics
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 774 8
## 1 5 111
##
## Accuracy : 0.9855
## 95% CI : (0.9754, 0.9923)
## No Information Rate : 0.8675
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9364
##
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.9936
## Specificity : 0.9328
## Pos Pred Value : 0.9898
## Neg Pred Value : 0.9569
## Prevalence : 0.8675
## Detection Rate : 0.8619
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9632
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|:---------------------|----------:|
| Sensitivity          |    0.9936 |
| Specificity          |    0.9328 |
| Pos Pred Value       |    0.9898 |
| Neg Pred Value       |    0.9569 |
| Precision            |    0.9898 |
| Recall               |    0.9936 |
| F1                   |    0.9917 |
| Prevalence           |    0.8675 |
| Detection Rate       |    0.8619 |
| Detection Prevalence |    0.8708 |
| Balanced Accuracy    |    0.9632 |
#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$dv, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Naive Bayes
test_prob2= predict(nbmodel, newdata=testData, type = "prob")
test_roc2= roc(testData$dv, test_prob2[,c(2)],plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$dv~test_prob3, plot = TRUE, print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Probit Regression
test_prob7= predict(probitmod, newdata=testData, type = "response")
test_roc7= roc(testData$dv~test_prob7, plot = TRUE, print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
windows()
plot(test_roc1, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Random Forest-Model", "Naive Bayes",
"Logistic-Regression") , pch = 15, bty = 'n', col = c("blue","red", "darkgoldenrod4"))
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "palegreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
##############################################
# Decision Tree
library(rpart)
library(rpart.plot)
ensemble <- rpart(dv~., data = trainData, method = 'class')
rpart.plot(ensemble)
#Prediction
predictionDT <- predict(ensemble, newdata = testData, type = 'class')
# Measure the accuracy of prediction in the test data
y_pred_numDT<-ifelse(predictionDT ==1,1,0)
y_predDT<-factor(y_pred_numDT,levels = c(0,1))
y_act<-testData$dv
#Result : Prediction Accuracy
mean(y_predDT==y_act)
## [1] 0.9888641
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tableDT<- table(testData$dv, y_pred_numDT)
# Display the confusion matrix and its associated statistics
resultDT<-caret::confusionMatrix(confusion_tableDT)
resultDT
## Confusion Matrix and Statistics
##
## y_pred_numDT
## 0 1
## 0 777 5
## 1 5 111
##
## Accuracy : 0.9889
## 95% CI : (0.9796, 0.9946)
## No Information Rate : 0.8708
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9505
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9936
## Specificity : 0.9569
## Pos Pred Value : 0.9936
## Neg Pred Value : 0.9569
## Prevalence : 0.8708
## Detection Rate : 0.8653
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9753
##
## 'Positive' Class : 0
##
metricsDT<-as.data.frame(resultDT$byClass)
colnames(metricsDT)<-"metricsDT"
library(dplyr)
library(kableExtra)
kable(round(metricsDT,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsDT |
|:---------------------|----------:|
| Sensitivity          |    0.9936 |
| Specificity          |    0.9569 |
| Pos Pred Value       |    0.9936 |
| Neg Pred Value       |    0.9569 |
| Precision            |    0.9936 |
| Recall               |    0.9936 |
| F1                   |    0.9936 |
| Prevalence           |    0.8708 |
| Detection Rate       |    0.8653 |
| Detection Prevalence |    0.8708 |
| Balanced Accuracy    |    0.9753 |
#Decision Tree
test_prob4= predict(ensemble, newdata=testData, type = "prob")
test_roc4= roc(testData$dv, test_prob4[,c(2)],plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Decision Tree - Feature Importance
# Get feature importance
variable_importance <- ensemble$variable.importance
# Convert feature importance to data frame
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(variable_importance),
Importance = variable_importance
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "sienna") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Decision Tree - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert Importance column to numeric
feature_importance_df$Importance <- as.numeric(as.character(feature_importance_df$Importance))
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, -Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "gold") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Decision Tree - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
###############################################
# Support Vector Machine (SVM) Model
library(e1071)
set.seed(123)
svm<-svm(dv~., data = trainData, kernel='linear', cost=10, scale=FALSE)
# Plot the SVC obtained
windows()
plot(svm, final)
summary(svm)
##
## Call:
## svm(formula = dv ~ ., data = trainData, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 10
## gamma: 0.1
## epsilon: 0.1
##
##
## Number of Support Vectors: 1276
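Note that summary(svm) reports SVM-Type: eps-regression because dv is numeric; svm() only performs classification when the response is a factor, so the results below threshold the regression scores at 0.18. A sketch of the classification variant (svm_cls is a hypothetical name; its predictions would differ from those shown):
svm_cls <- svm(factor(dv) ~ ., data = trainData, kernel = "linear", cost = 10, probability = TRUE)
pred_cls <- predict(svm_cls, newdata = testData)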
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09964 0.09989 0.10003 0.18784 0.10016 1.19092
# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM>0.18,1,0)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$dv
#Result : Prediction Accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.9888641
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tableSVM<- table(testData$dv, y_predSVM)
# Display the confusion matrix and its associated statistics
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 777 5
## 1 5 111
##
## Accuracy : 0.9889
## 95% CI : (0.9796, 0.9946)
## No Information Rate : 0.8708
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9505
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9936
## Specificity : 0.9569
## Pos Pred Value : 0.9936
## Neg Pred Value : 0.9569
## Prevalence : 0.8708
## Detection Rate : 0.8653
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9753
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|:---------------------|-----------:|
| Sensitivity          |     0.9936 |
| Specificity          |     0.9569 |
| Pos Pred Value       |     0.9936 |
| Neg Pred Value       |     0.9569 |
| Precision            |     0.9936 |
| Recall               |     0.9936 |
| F1                   |     0.9936 |
| Prevalence           |     0.8708 |
| Detection Rate       |     0.8653 |
| Detection Prevalence |     0.8708 |
| Balanced Accuracy    |     0.9753 |
#SVM
library(pROC)
windows()
test_prob5= predict(svm, newdata=testData)
test_roc5= roc(testData$dv, test_prob5, plot = TRUE, main="Support Vector Machine", col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
coef(svm)[2:10]
## age HHSize Sex Grade Region
## -1.300808e-04 -8.469098e-07 -7.340655e-05 9.088803e-02 -3.159863e-05
## District Residence Occupation Typehousing
## 4.450492e-05 -1.300809e-04 -5.830921e-06 2.365703e-10
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[-1])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "turquoise") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#################################################
############################################
# KNN Model
# knnreg() is provided by the caret package (loaded above); there is no separate "knn" package to load
# Fitting model
# Note: with trainData$dv on the left-hand side, the "." on the right still expands to every
# column of trainData, including dv, so the response leaks into the predictor set;
# the predictions below reflect that specification
fit <-knnreg(trainData$dv ~ ., data = trainData,k=1)
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
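A 1-nearest-neighbour fit on unscaled integer codes is very sensitive to the ranges of the coded variables. A sketch of a centred, scaled, and cross-validated alternative via caret (knn_alt is a hypothetical name; not the model whose results are shown below):
set.seed(123)
knn_alt <- train(factor(dv) ~ ., data = trainData, method = "knn",
preProcess = c("center", "scale"), tuneLength = 5,
trControl = trainControl(method = "cv", number = 5))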
#Predict Output
predicted<- predict(fit,testData)
table(predicted)
## predicted
## 0 0.0555555555555556 0.0714285714285714 0.0769230769230769
## 749 2 4 1
## 0.08 0.0833333333333333 0.1 0.111111111111111
## 5 1 12 4
## 0.125 0.142857142857143 0.25 0.5
## 1 1 2 6
## 1
## 110
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$dv
#Result : Prediction Accuracy
mean(y_predknn==y_actknn)
## [1] 0.9799555
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tableknn<- table(testData$dv, y_pred_numknn)
# Display the confusion matrix and its associated statistics
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_pred_numknn
## 0 1
## 0 776 6
## 1 12 104
##
## Accuracy : 0.98
## 95% CI : (0.9685, 0.9881)
## No Information Rate : 0.8775
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9089
##
## Mcnemar's Test P-Value : 0.2386
##
## Sensitivity : 0.9848
## Specificity : 0.9455
## Pos Pred Value : 0.9923
## Neg Pred Value : 0.8966
## Prevalence : 0.8775
## Detection Rate : 0.8641
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9651
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|:---------------------|-----------:|
| Sensitivity          |     0.9848 |
| Specificity          |     0.9455 |
| Pos Pred Value       |     0.9923 |
| Neg Pred Value       |     0.8966 |
| Precision            |     0.9923 |
| Recall               |     0.9848 |
| F1                   |     0.9885 |
| Prevalence           |     0.8775 |
| Detection Rate       |     0.8641 |
| Detection Prevalence |     0.8708 |
| Balanced Accuracy    |     0.9651 |
knnmodel <- train(dv ~ ., data = trainData, method = "knn")
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Grade 1.000e+02
## age 1.253e+01
## Schooltype 5.151e+00
## Residence 1.509e+00
## Sex 8.287e-01
## Region 2.419e-01
## HHSize 1.323e-01
## Occupation 3.936e-02
## District 8.319e-03
## Typehousing 0.000e+00
# Plot the feature importance (use the $importance data frame inside the varImp object)
imp_df <- importance$importance
imp_df$Feature <- rownames(imp_df)
plot <- ggplot(imp_df, aes(x = reorder(Feature, Overall), y = Overall)) +
geom_bar(stat = "identity", fill = "peru") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)
#ROC
library(pROC)
# KNN
windows()
test_prob6= predict(fit, newdata=testData)  # knnreg returns numeric predictions (no type = "prob" option)
test_roc6= roc(testData$dv, test_prob6,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
############################################
#Area-under Curve
windows()
plot(test_roc1, print.auc = TRUE, col='brown',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='purple', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
plot(test_roc6 , add = TRUE, col='magenta', print.auc = TRUE, print.auc.y = 14, print.auc.x = 0)
plot(test_roc7 , add = TRUE, col='darkblue', print.auc = TRUE, print.auc.y = 7, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Naive Bayes",
"Logistic-Regression","Decision Tree","Support Vector Machine","KNN","Probit Regression") ,
pch = 15, bty = 'n', col = c("brown","red", "darkgoldenrod4", "green","purple","magenta","darkblue"))
#############################
# Metrics
#
# Accuracy, sensitivity, AUROC, and F1-score values for the seven models fitted above
accuracy <- c(0.982, 0.985, 0.986,0.988,0.988,0.980,0.982)
sensitivity <- c(0.991, 0.993, 0.991,0.993,0.993,0.984,0.991)
AUROC<- c(0.985, 0.988, 0.971,0.986,0.975,0.973,0.984)
f1_score <- c(0.989, 0.991, 0.992,0.993,0.993,0.988,0.989)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Naive Bayes","Support Vector Machine",
"Decision Tree","KNN","Probit Regression")
data <- data.frame(Model = model_names, Accuracy = accuracy, Sensitivity = sensitivity, F1_Score = f1_score, AUROC = AUROC)
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "#3498db",
"Sensitivity" = "seagreen",
"F1_Score" = "slateblue",
"AUROC" = "#f1c40f")) +
theme_minimal()
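The same comparison can also be reported as a table, reusing the kable styling applied to the per-model metrics above; a sketch:
kable(data, caption = "Comparison of Supervised Machine Learning Models") %>%
kable_styling(font_size = 16)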