R Markdown
This is an R Markdown document. Markdown is a simple formatting
syntax for authoring HTML, PDF, and MS Word documents. For more details
on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# 10 Supervised Machine Learning Models
final1<-read.csv("~/mldropout.csv")
str(final1)
## 'data.frame': 4490 obs. of 11 variables:
## $ Age_of_Household_head: chr "Below 20 years" "20Years and Above" "Below 20 years" "Below 20 years" ...
## $ Household_size : chr "Below 5 Members" "5 Members or mor" "5 Members or mor" "5 Members or mor" ...
## $ Sex_of_Household_head: chr "female" "female" "male" "female" ...
## $ Grade : chr "first_degree" "first_degree" "first_degree" "first_degree" ...
## $ Region : chr "Marodijeh" "Marodijeh" "Marodijeh" "Marodijeh" ...
## $ District : chr "Hargeisa" "Hargeisa" "Hargeisa" "Hargeisa" ...
## $ Location_Type : chr "Urban" "Rural" "Rural" "Rural" ...
## $ Occupation_status : chr "Daily wager" "Government Employee" "Government Employee" "Government Employee" ...
## $ Type_of_Housing : chr "Daar no fence" "Daar no fence" "Daar no fence" "Daar no fence" ...
## $ School_Type : chr "Public" "Private" "Private" "Private" ...
## $ dv : chr "No" "No" "No" "Yes" ...
#Age
dada<-final1$Age_of_Household_head
table(dada)
## dada
## 20Years and Above Below 20 years
## 513 3977
dada<-as.factor(dada)
age<-unclass(dada)
table(age)
## age
## 1 2
## 513 3977
#Household Size
guriga<-final1$Household_size
table(guriga)
## guriga
## 5 Members or mor Below 5 Members
## 3817 673
guriga<-as.factor(guriga)
HHSize<-unclass(guriga)
table(HHSize)
## HHSize
## 1 2
## 3817 673
#Sex
jinsi<-final1$Sex_of_Household_head
table(jinsi)
## jinsi
## female male
## 2164 2326
jinsi<-as.factor(jinsi)
Sex<-unclass(jinsi)
table(Sex)
## Sex
## 1 2
## 2164 2326
#Grade
classka<-final1$Grade
table(classka)
## classka
## first_degree form_1 form_2 form_3 form_4
## 3935 31 5 29 18
## grade_1 grade_2 grade_3 grade_4 grade_5
## 64 60 46 52 63
## grade_6 grade_7 grade_8 post_graduate
## 34 61 89 3
classka<-as.factor(classka)
Grade<-unclass(classka)
table(Grade)
## Grade
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 3935 31 5 29 18 64 60 46 52 63 34 61 89 3
#Region
gobol<-final1$Region
table(gobol)
## gobol
## Awdal Daadmadheedh Hawd Marodijeh Sahil Sanaag
## 562 105 127 1433 589 472
## Saraar Sool Togdheer
## 152 351 699
gobol<-as.factor(gobol)
Region<-unclass(gobol)
table(Region)
## Region
## 1 2 3 4 5 6 7 8 9
## 562 105 127 1433 589 472 152 351 699
#District
degmo<-final1$District
table(degmo)
## degmo
## balligubadle Berbera Borama Burco Caynabo Ceel-Afweyn
## 127 475 458 702 148 115
## Ceerigaabo Hargeisa Laascaanood Lughaya oodweyne Sheikh
## 357 1434 351 104 103 116
degmo<-as.factor(degmo)
District<-unclass(degmo)
table(District)
## District
## 1 2 3 4 5 6 7 8 9 10 11 12
## 127 475 458 702 148 115 357 1434 351 104 103 116
#Residence
degsiimo<-final1$Location_Type
table(degsiimo)
## degsiimo
## Nomadic Rural Urban
## 207 1319 2964
degsiimo<-as.factor(degsiimo)
Residence<-unclass(degsiimo)
table(Residence)
## Residence
## 1 2 3
## 207 1319 2964
#Occupation
shaqo<-final1$Occupation_status
table(shaqo)
## shaqo
## Agriculture or Agro-pastoralist Daily wager
## 416 676
## Fishing Government Employee
## 16 467
## Household Chores Nomadic Pastoralist
## 1075 163
## Not working but looking for work Other specify
## 477 242
## Private Employee Self Employed
## 313 645
shaqo<-as.factor(shaqo)
Occupation<-unclass(shaqo)
table(Occupation)
## Occupation
## 1 2 3 4 5 6 7 8 9 10
## 416 676 16 467 1075 163 477 242 313 645
#House Type
nooca<-final1$Type_of_Housing
table(nooca)
## nooca
## Bungalow Buul/bus/somali hut Concrete building
## 477 1079 15
## Daar no fence Derked/cariish/Mudul Sandaqad
## 1765 249 905
nooca<-as.factor(nooca)
Typehousing<-unclass(nooca)
table(Typehousing)
## Typehousing
## 1 2 3 4 5 6
## 477 1079 15 1765 249 905
#School Type
dugsiga<-final1$School_Type
table(dugsiga)
## dugsiga
## Private Public
## 1250 3240
dugsiga<-as.factor(dugsiga)
Schooltype<-unclass(dugsiga)
table(Schooltype)
## Schooltype
## 1 2
## 1250 3240
#DV
dv<-final1$dv
dv<-ifelse(dv=="Yes",1,0)
table(dv)
## dv
## 0 1
## 3921 569
prop.table(table(dv))*100
## dv
## 0 1
## 87.32739 12.67261
#Final Data
final<-data.frame(dv,age,HHSize,Sex,Grade,Region,District,Residence,Occupation,Typehousing,Schooltype)
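The column-by-column recoding above can also be written in one step. A minimal sketch that should reproduce the same integer codes (final_alt and predictors are hypothetical names; as.factor() codes levels alphabetically, exactly as unclass() does above):
# Compact alternative to the step-by-step recoding (not used for the results below)
predictors <- c("Age_of_Household_head", "Household_size", "Sex_of_Household_head", "Grade",
"Region", "District", "Location_Type", "Occupation_status", "Type_of_Housing", "School_Type")
final_alt <- data.frame(dv = as.integer(final1$dv == "Yes"),
lapply(final1[predictors], function(col) as.integer(as.factor(col))))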
# Logistic Regression
dropout<-glm(dv~.,data=final,family = binomial())
summary(dropout)
##
## Call:
## glm(formula = dv ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74351 1.02704 -1.698 0.0896 .
## age -1.58277 0.21601 -7.327 2.35e-13 ***
## HHSize 0.01548 0.26549 0.058 0.9535
## Sex -0.31890 0.18956 -1.682 0.0925 .
## Grade 1.05031 0.04839 21.703 < 2e-16 ***
## Region -0.01147 0.03932 -0.292 0.7706
## District 0.02419 0.03541 0.683 0.4946
## Residence 0.13170 0.17938 0.734 0.4628
## Occupation -0.03633 0.03342 -1.087 0.2771
## Typehousing -0.08687 0.05782 -1.502 0.1330
## Schooltype 0.18760 0.24775 0.757 0.4489
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3413.44 on 4489 degrees of freedom
## Residual deviance: 989.15 on 4479 degrees of freedom
## AIC: 1011.2
##
## Number of Fisher Scoring iterations: 7
round(coef(summary(dropout)),4)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7435 1.0270 -1.6976 0.0896
## age -1.5828 0.2160 -7.3272 0.0000
## HHSize 0.0155 0.2655 0.0583 0.9535
## Sex -0.3189 0.1896 -1.6823 0.0925
## Grade 1.0503 0.0484 21.7033 0.0000
## Region -0.0115 0.0393 -0.2916 0.7706
## District 0.0242 0.0354 0.6830 0.4946
## Residence 0.1317 0.1794 0.7342 0.4628
## Occupation -0.0363 0.0334 -1.0869 0.2771
## Typehousing -0.0869 0.0578 -1.5024 0.1330
## Schooltype 0.1876 0.2477 0.7572 0.4489
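Because the estimates are on the log-odds scale, exponentiating them gives odds ratios. A quick sketch with approximate Wald-type 95% intervals (not part of the original output):
round(exp(cbind(OR = coef(dropout), confint.default(dropout))), 3)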
# Logistic Regression (re-fit as dropout1 for the effect plots below)
dropout1<-glm(dv~.,data=final,family = binomial())
summary(dropout1)
##
## Call:
## glm(formula = dv ~ ., family = binomial(), data = final)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74351 1.02704 -1.698 0.0896 .
## age -1.58277 0.21601 -7.327 2.35e-13 ***
## HHSize 0.01548 0.26549 0.058 0.9535
## Sex -0.31890 0.18956 -1.682 0.0925 .
## Grade 1.05031 0.04839 21.703 < 2e-16 ***
## Region -0.01147 0.03932 -0.292 0.7706
## District 0.02419 0.03541 0.683 0.4946
## Residence 0.13170 0.17938 0.734 0.4628
## Occupation -0.03633 0.03342 -1.087 0.2771
## Typehousing -0.08687 0.05782 -1.502 0.1330
## Schooltype 0.18760 0.24775 0.757 0.4489
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3413.44 on 4489 degrees of freedom
## Residual deviance: 989.15 on 4479 degrees of freedom
## AIC: 1011.2
##
## Number of Fisher Scoring iterations: 7
# Visualization of Parameter Effects
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
windows()
plot(allEffects(dropout1),cex.main=.75, cex.lab=.1, cex.axis=0.1)
# Prediction Models
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
# Prep Training and Test dataset
# First, create a train-test split with 80% of the data in the training set
set.seed(123)
trainDataIndex<-createDataPartition(final$dv, p=0.8, list = F)
trainData<-final[trainDataIndex,]
testData<-final[-trainDataIndex,]
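A quick sanity check on the split sizes (a sketch; an 80/20 partition of the 4,490 rows should leave 3,592 rows for training and 898 for testing, which the str(trainData) output further below confirms):
dim(trainData)
dim(testData)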
# Build logistic model (using all features)
logitmod1<-glm(dv~ .,
family="binomial", data=trainData)
summary(logitmod1)
##
## Call:
## glm(formula = dv ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.99588 1.11971 -1.782 0.0747 .
## age -1.56637 0.23480 -6.671 2.54e-11 ***
## HHSize 0.12275 0.28318 0.433 0.6647
## Sex -0.34251 0.20558 -1.666 0.0957 .
## Grade 1.01050 0.05296 19.079 < 2e-16 ***
## Region -0.01430 0.04235 -0.338 0.7357
## District 0.03114 0.03808 0.818 0.4135
## Residence 0.20095 0.19284 1.042 0.2974
## Occupation -0.04141 0.03578 -1.157 0.2472
## Typehousing -0.08473 0.06266 -1.352 0.1764
## Schooltype 0.22647 0.26741 0.847 0.3971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2722.24 on 3591 degrees of freedom
## Residual deviance: 834.51 on 3581 degrees of freedom
## AIC: 856.51
##
## Number of Fisher Scoring iterations: 7
# Re-fit the logistic model (note: all features are retained here, not only the significant ones)
logitmod2<-glm(dv~ .,
family="binomial", data=trainData)
summary(logitmod2)
##
## Call:
## glm(formula = dv ~ ., family = "binomial", data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.99588 1.11971 -1.782 0.0747 .
## age -1.56637 0.23480 -6.671 2.54e-11 ***
## HHSize 0.12275 0.28318 0.433 0.6647
## Sex -0.34251 0.20558 -1.666 0.0957 .
## Grade 1.01050 0.05296 19.079 < 2e-16 ***
## Region -0.01430 0.04235 -0.338 0.7357
## District 0.03114 0.03808 0.818 0.4135
## Residence 0.20095 0.19284 1.042 0.2974
## Occupation -0.04141 0.03578 -1.157 0.2472
## Typehousing -0.08473 0.06266 -1.352 0.1764
## Schooltype 0.22647 0.26741 0.847 0.3971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2722.24 on 3591 degrees of freedom
## Residual deviance: 834.51 on 3581 degrees of freedom
## AIC: 856.51
##
## Number of Fisher Scoring iterations: 7
# Apply the model to predict the testdata
pred<-predict(logitmod2,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.007423 0.015635 0.020036 0.134010 0.027302 0.999952
table(pred)
## (output abridged: table() on the continuous predicted probabilities simply lists each of the
## several hundred distinct values, from about 0.0074 to 0.99995, with counts of 1-8;
## summary(pred) above already characterizes this distribution)
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred>0.13,1,0)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$dv
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.9821826
# Plot ROC Curve
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, sensitivity, specificity
InformationValue::plotROC(y_act,pred)
InformationValue::AUROC(y_act,pred)
## [1] 0.9798152
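The 0.13 cutoff used above is ad hoc. InformationValue also provides optimalCutoff(), which can suggest a threshold from the predicted scores; a sketch (not used for the results below):
InformationValue::optimalCutoff(y_act, pred, optimiseFor = "Both")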
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_table <- table(testData$dv, y_pred_num)
# Display the confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 773 9
## 1 7 109
##
## Accuracy : 0.9822
## 95% CI : (0.9712, 0.9898)
## No Information Rate : 0.8686
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9214
##
## Mcnemar's Test P-Value : 0.8026
##
## Sensitivity : 0.9910
## Specificity : 0.9237
## Pos Pred Value : 0.9885
## Neg Pred Value : 0.9397
## Prevalence : 0.8686
## Detection Rate : 0.8608
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9574
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.9910 |
| Specificity          |  0.9237 |
| Pos Pred Value       |  0.9885 |
| Neg Pred Value       |  0.9397 |
| Precision            |  0.9885 |
| Recall               |  0.9910 |
| F1                   |  0.9898 |
| Prevalence           |  0.8686 |
| Detection Rate       |  0.8608 |
| Detection Prevalence |  0.8708 |
| Balanced Accuracy    |  0.9574 |
# Logistic Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(logitmod2)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "deepskyblue") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Logistic Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Probit Regression
# Fit a probit model using the same set of features
probitmod<-glm(dv~ .,
family=binomial(link="probit"), data=trainData)
summary(probitmod)
##
## Call:
## glm(formula = dv ~ ., family = binomial(link = "probit"), data = trainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.936905 0.486415 -1.926 0.0541 .
## age -0.663262 0.111871 -5.929 3.05e-09 ***
## HHSize 0.095257 0.118923 0.801 0.4231
## Sex -0.149536 0.088111 -1.697 0.0897 .
## Grade 0.441234 0.019237 22.937 < 2e-16 ***
## Region -0.003458 0.017995 -0.192 0.8476
## District 0.009077 0.016174 0.561 0.5747
## Residence 0.012473 0.080377 0.155 0.8767
## Occupation -0.024647 0.015379 -1.603 0.1090
## Typehousing -0.047607 0.027280 -1.745 0.0810 .
## Schooltype 0.055396 0.114456 0.484 0.6284
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2722.24 on 3591 degrees of freedom
## Residual deviance: 885.98 on 3581 degrees of freedom
## AIC: 907.98
##
## Number of Fisher Scoring iterations: 9
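Probit coefficients are on the latent-index scale, so their magnitudes are not directly comparable to the logit estimates. A minimal hand-rolled sketch of average marginal effects for the fitted probitmod (not part of the original output):
# Average marginal effect = mean of dnorm(linear predictor) times each slope coefficient
lp <- predict(probitmod, type = "link")
round(mean(dnorm(lp)) * coef(probitmod)[-1], 4)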
# Apply the model to predict the testdata
pred<-predict(probitmod,newdata = testData, type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.006944 0.015695 0.021191 0.124165 0.031720 0.999963
# Measure the accuracy of prediction in the test data
y_pred_num<-ifelse(pred>0.12,1,0)
y_pred<-factor(y_pred_num,levels = c(0,1))
y_act<-testData$dv
#Result : Prediction Accuracy
set.seed(123)
mean(y_pred==y_act)
## [1] 0.9821826
# Plot ROC Curve
library(InformationValue)
InformationValue::plotROC(y_act,pred)
InformationValue::AUROC(y_act,pred)
## [1] 0.9820421
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_table <- table(testData$dv, y_pred_num)
# Display the confusion matrix and its associated statistics
result<-caret::confusionMatrix(confusion_table)
result
## Confusion Matrix and Statistics
##
## y_pred_num
## 0 1
## 0 773 9
## 1 7 109
##
## Accuracy : 0.9822
## 95% CI : (0.9712, 0.9898)
## No Information Rate : 0.8686
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9214
##
## Mcnemar's Test P-Value : 0.8026
##
## Sensitivity : 0.9910
## Specificity : 0.9237
## Pos Pred Value : 0.9885
## Neg Pred Value : 0.9397
## Prevalence : 0.8686
## Detection Rate : 0.8608
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9574
##
## 'Positive' Class : 0
##
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.9910 |
| Specificity          |  0.9237 |
| Pos Pred Value       |  0.9885 |
| Neg Pred Value       |  0.9397 |
| Precision            |  0.9885 |
| Recall               |  0.9910 |
| F1                   |  0.9898 |
| Prevalence           |  0.8686 |
| Detection Rate       |  0.8608 |
| Detection Prevalence |  0.8708 |
| Balanced Accuracy    |  0.9574 |
# Probit Regression
# Get feature importance (coefficients)
feature_importance2 <- abs(coef(probitmod)[-1])
# Create feature importance plot
feature_importance_df2<-data.frame(
Feature = names(feature_importance2),
Importance = feature_importance2
)
feature_importance_df2 <- feature_importance_df2[order(feature_importance_df2$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df2, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "chocolate") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Probit Regression - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Naive Bayes
# Check the class balance of the full data
prop.table(table(final$dv))*100
##
## 0 1
## 87.32739 12.67261
# Check the class balance of the training data
prop.table(table(trainData$dv))*100
##
## 0 1
## 87.38864 12.61136
# Check the class balance of the testing data
prop.table(table(testData$dv))*100
##
## 0 1
## 87.08241 12.91759
head(trainData)
# Create object x, which holds the predictor variables, and object y, which holds the response variable
str(trainData)
## 'data.frame': 3592 obs. of 11 variables:
## $ dv : num 0 0 0 1 0 0 0 0 0 0 ...
## $ age : int 2 1 2 2 2 2 2 2 2 2 ...
## $ HHSize : int 2 1 1 1 1 1 1 1 1 1 ...
## $ Sex : int 1 1 2 1 1 1 1 1 1 2 ...
## $ Grade : int 1 1 1 1 1 1 1 4 1 1 ...
## $ Region : int 4 4 4 4 4 4 4 4 4 4 ...
## $ District : int 8 8 8 8 8 8 8 8 8 8 ...
## $ Residence : int 3 2 2 2 2 2 2 2 2 2 ...
## $ Occupation : int 2 4 4 4 4 4 5 5 5 5 ...
## $ Typehousing: int 4 4 4 4 4 4 4 4 4 4 ...
## $ Schooltype : int 2 1 1 1 2 2 2 2 1 2 ...
y=trainData$dv
x=trainData[,2:10]
y<-as.factor(y)
defaultW<-getOption("warn")
options(warn = -1)
nbmodel<-train(x,y,'nb',trControl = trainControl(method = 'cv',number = 17))
nbmodel
## Naive Bayes
##
## 3592 samples
## 9 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (17 fold)
## Summary of sample sizes: 3380, 3380, 3380, 3380, 3381, 3382, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.9760479 0.8875553
## TRUE 0.9253968 0.5504372
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
## = 1.
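The per-fold accuracies behind this cross-validated summary are stored on the train object and can be inspected directly; a sketch:
head(nbmodel$resample)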
# Prediction
Predict<-predict(nbmodel,newdata = testData)
y_pred_nb<-ifelse(Predict==1,1,0)
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tablenb<- table(testData$dv, y_pred_nb)
# Display the confusion matrix and its associated statistics
resultnb<-caret::confusionMatrix(confusion_tablenb)
resultnb
## Confusion Matrix and Statistics
##
## y_pred_nb
## 0 1
## 0 777 5
## 1 7 109
##
## Accuracy : 0.9866
## 95% CI : (0.9768, 0.9931)
## No Information Rate : 0.8731
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9402
##
## Mcnemar's Test P-Value : 0.7728
##
## Sensitivity : 0.9911
## Specificity : 0.9561
## Pos Pred Value : 0.9936
## Neg Pred Value : 0.9397
## Prevalence : 0.8731
## Detection Rate : 0.8653
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9736
##
## 'Positive' Class : 0
##
metricsnb<-as.data.frame(resultnb$byClass)
colnames(metricsnb)<-"metricsnb"
library(dplyr)
library(kableExtra)
kable(round(metricsnb,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsnb |
|:---------------------|----------:|
| Sensitivity          |    0.9911 |
| Specificity          |    0.9561 |
| Pos Pred Value       |    0.9936 |
| Neg Pred Value       |    0.9397 |
| Precision            |    0.9936 |
| Recall               |    0.9911 |
| F1                   |    0.9923 |
| Prevalence           |    0.8731 |
| Detection Rate       |    0.8653 |
| Detection Prevalence |    0.8708 |
| Balanced Accuracy    |    0.9736 |
library(ggplot2)
# Create a data frame with feature names and their importance
# (nbmodel$finalModel$varnames only stores the predictor names, so use caret's
#  model-free ROC-based importance from varImp() instead)
nb_imp <- varImp(nbmodel)$importance
feature_importance <- data.frame(
Feature = rownames(nb_imp),
Importance = nb_imp[, 1],
stringsAsFactors = FALSE
)
# Sort the data frame by importance (optional)
feature_importance <- feature_importance[order(feature_importance$Importance, decreasing = TRUE), ]
# Plot feature importance
ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "moccasin") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Naive Bayes - Feature Selection") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(123)
# Random forest; mtry = 17 exceeds the 10 available predictors, so randomForest caps it at 10
# (every predictor is tried at each split, as the fitted object below reports)
fit_rf <- randomForest(factor(dv) ~., mtry = 17, data = trainData)
fit_rf
##
## Call:
## randomForest(formula = factor(dv) ~ ., data = trainData, mtry = 17)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 10
##
## OOB estimate of error rate: 1.92%
## Confusion matrix:
## 0 1 class.error
## 0 3113 26 0.008282893
## 1 43 410 0.094922737
varImpPlot(fit_rf)
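Since mtry is capped at the 10 available predictors, the forest effectively tries every variable at each split. Instead of fixing mtry, randomForest::tuneRF() can search for a value with lower out-of-bag error; a sketch (not used for the results that follow):
set.seed(123)
tuneRF(x = trainData[, -1], y = factor(trainData$dv), ntreeTry = 500, stepFactor = 1.5, trace = FALSE)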
#Performance of Random Forest Model on Testing Data
predrf = predict(fit_rf, newdata=testData)
table(predrf)
## predrf
## 0 1
## 779 119
accuracy <- table(predrf, testData$dv)
sum(diag(accuracy))/sum(accuracy)
## [1] 0.9855234
# Measure the accuracy of prediction in the test data
y_pred_numrf<-ifelse(predrf==1,1,0)
y_predrf<-factor(y_pred_numrf,levels = c(0,1))
y_actrf<-testData$dv
#Result : Prediction Accuracy
mean(y_predrf==y_actrf)
## [1] 0.9855234
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tablerf<- table(testData$dv, y_pred_numrf)
# Display the confusion matrix and its associated statistics
resultrf<-caret::confusionMatrix(confusion_tablerf)
resultrf
## Confusion Matrix and Statistics
##
## y_pred_numrf
## 0 1
## 0 774 8
## 1 5 111
##
## Accuracy : 0.9855
## 95% CI : (0.9754, 0.9923)
## No Information Rate : 0.8675
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9364
##
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.9936
## Specificity : 0.9328
## Pos Pred Value : 0.9898
## Neg Pred Value : 0.9569
## Prevalence : 0.8675
## Detection Rate : 0.8619
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9632
##
## 'Positive' Class : 0
##
metricsrf<-as.data.frame(resultrf$byClass)
colnames(metricsrf)<-"metricsrf"
library(dplyr)
library(kableExtra)
kable(round(metricsrf,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsrf |
|:---------------------|----------:|
| Sensitivity          |    0.9936 |
| Specificity          |    0.9328 |
| Pos Pred Value       |    0.9898 |
| Neg Pred Value       |    0.9569 |
| Precision            |    0.9898 |
| Recall               |    0.9936 |
| F1                   |    0.9917 |
| Prevalence           |    0.8675 |
| Detection Rate       |    0.8619 |
| Detection Prevalence |    0.8708 |
| Balanced Accuracy    |    0.9632 |
#ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# RandomForest
test_prob1= predict(fit_rf, newdata=testData, type = "prob")
test_roc1= roc(testData$dv, test_prob1[,c(2)],plot = TRUE, col='darkgoldenrod4', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Naive Bayes
test_prob2= predict(nbmodel, newdata=testData, type = "prob")
test_roc2= roc(testData$dv, test_prob2[,c(2)],plot = TRUE, col='darkorange', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Logistic Regression
test_prob3= predict(logitmod2, newdata=testData, type = "response")
test_roc3= roc(testData$dv~test_prob3, plot = TRUE, print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#Probit Regression
test_prob7= predict(probitmod, newdata=testData, type = "response")
test_roc7= roc(testData$dv~test_prob7, plot = TRUE, print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
windows()
plot(test_roc1, print.auc = TRUE, col='blue',percent=TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
abline(a=0, b=1, lty=2, lwd=3, col="black")
legend("topleft", legend = c("Random Forest-Model", "Naive Bayes",
"Logistic-Regression") , pch = 15, bty = 'n', col = c("blue","red", "darkgoldenrod4"))
# Random Forest
# Get feature importance
feature_importance <- importance(fit_rf)
# Convert feature importance to data frame
feature_importance_df <- data.frame(
Feature = row.names(feature_importance),
Importance = feature_importance[, "MeanDecreaseGini"]
)
# Sort feature importance by importance score
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
windows()
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "palegreen") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Random Forest - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
##############################################
# Decision Tree
library(rpart)
library(rpart.plot)
ensemble <- rpart(dv~., data = trainData, method = 'class')
rpart.plot(ensemble)
#Prediction
predictionDT <- predict(ensemble, newdata = testData, type = 'class')
# Measure the accuracy of prediction in the test data
y_pred_numDT<-ifelse(predictionDT ==1,1,0)
y_predDT<-factor(y_pred_numDT,levels = c(0,1))
y_act<-testData$dv
#Result : Prediction Accuracy
mean(y_predDT==y_act)
## [1] 0.9888641
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tableDT<- table(testData$dv, y_pred_numDT)
# Display the confusion matrix and its associated statistics
resultDT<-caret::confusionMatrix(confusion_tableDT)
resultDT
## Confusion Matrix and Statistics
##
## y_pred_numDT
## 0 1
## 0 777 5
## 1 5 111
##
## Accuracy : 0.9889
## 95% CI : (0.9796, 0.9946)
## No Information Rate : 0.8708
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9505
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9936
## Specificity : 0.9569
## Pos Pred Value : 0.9936
## Neg Pred Value : 0.9569
## Prevalence : 0.8708
## Detection Rate : 0.8653
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9753
##
## 'Positive' Class : 0
##
metricsDT<-as.data.frame(resultDT$byClass)
colnames(metricsDT)<-"metricsDT"
library(dplyr)
library(kableExtra)
kable(round(metricsDT,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsDT |
|:---------------------|----------:|
| Sensitivity          |    0.9936 |
| Specificity          |    0.9569 |
| Pos Pred Value       |    0.9936 |
| Neg Pred Value       |    0.9569 |
| Precision            |    0.9936 |
| Recall               |    0.9936 |
| F1                   |    0.9936 |
| Prevalence           |    0.8708 |
| Detection Rate       |    0.8653 |
| Detection Prevalence |    0.8708 |
| Balanced Accuracy    |    0.9753 |
#Decision Tree
test_prob4= predict(ensemble, newdata=testData, type = "prob")
test_roc4= roc(testData$dv, test_prob4[,c(2)],plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Decision Tree - Feature Importance
# Get feature importance
variable_importance <- ensemble$variable.importance
# Convert feature importance to data frame
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(variable_importance),
Importance = variable_importance
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "sienna") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Decision Tree - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert Importance column to numeric
feature_importance_df$Importance <- as.numeric(as.character(feature_importance_df$Importance))
# Create feature importance plot
ggplot(feature_importance_df, aes(x = reorder(Feature, -Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "gold") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Decision Tree - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
###############################################
# Support Vector Machine (SVM) Model
library(e1071)
set.seed(123)
svm<-svm(dv~., data = trainData, kernel='linear', cost=10, scale=FALSE)
# Plot the SVC obtained
windows()
plot(svm, final)
summary(svm)
##
## Call:
## svm(formula = dv ~ ., data = trainData, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 10
## gamma: 0.1
## epsilon: 0.1
##
##
## Number of Support Vectors: 1276
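Note that summary(svm) reports SVM-Type: eps-regression because dv is numeric; svm() only performs classification when the response is a factor, so the results below threshold the regression scores at 0.18. A sketch of the classification variant (svm_cls is a hypothetical name; its predictions would differ from those shown):
svm_cls <- svm(factor(dv) ~ ., data = trainData, kernel = "linear", cost = 10, probability = TRUE)
pred_cls <- predict(svm_cls, newdata = testData)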
#Prediction
set.seed(123)
predictionSVM<- predict(svm, newdata = testData)
summary(predictionSVM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09964 0.09989 0.10003 0.18784 0.10016 1.19092
# Measure the accuracy of prediction in the test data
y_pred_numSVM<-ifelse(predictionSVM>0.18,1,0)
y_predSVM<-factor(y_pred_numSVM,levels = c(0,1))
y_actSVM<-testData$dv
#Result : Prediction Accuracy
mean(y_predSVM==y_actSVM)
## [1] 0.9888641
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tableSVM<- table(testData$dv, y_predSVM)
# Display the confusion matrix and its associated statistics
resultSVM<-caret::confusionMatrix(confusion_tableSVM)
resultSVM
## Confusion Matrix and Statistics
##
## y_predSVM
## 0 1
## 0 777 5
## 1 5 111
##
## Accuracy : 0.9889
## 95% CI : (0.9796, 0.9946)
## No Information Rate : 0.8708
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9505
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9936
## Specificity : 0.9569
## Pos Pred Value : 0.9936
## Neg Pred Value : 0.9569
## Prevalence : 0.8708
## Detection Rate : 0.8653
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9753
##
## 'Positive' Class : 0
##
metricsSVM<-as.data.frame(resultSVM$byClass)
colnames(metricsSVM)<-"metricsSVM"
library(dplyr)
library(kableExtra)
kable(round(metricsSVM,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsSVM |
|:---------------------|-----------:|
| Sensitivity          |     0.9936 |
| Specificity          |     0.9569 |
| Pos Pred Value       |     0.9936 |
| Neg Pred Value       |     0.9569 |
| Precision            |     0.9936 |
| Recall               |     0.9936 |
| F1                   |     0.9936 |
| Prevalence           |     0.8708 |
| Detection Rate       |     0.8653 |
| Detection Prevalence |     0.8708 |
| Balanced Accuracy    |     0.9753 |
#SVM
library(pROC)
windows()
test_prob5= predict(svm, newdata=testData)
test_roc5= roc(testData$dv, test_prob5, plot = TRUE, main="Support Vector Machine", col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
coef(svm)[2:10]
## age HHSize Sex Grade Region
## -1.300808e-04 -8.469098e-07 -7.340655e-05 9.088803e-02 -3.159863e-05
## District Residence Occupation Typehousing
## 4.450492e-05 -1.300809e-04 -5.830921e-06 2.365703e-10
# Support Vector Machine
# Get feature importance (coefficients)
feature_importance1<- abs(coef(svm)[-1])
# Create feature importance plot
feature_importance_df <- data.frame(
Feature = names(feature_importance1),
Importance = feature_importance1
)
feature_importance_df <- feature_importance_df[order(feature_importance_df$Importance, decreasing = TRUE), ]
windows()
ggplot(feature_importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "turquoise") +
xlab("Feature") +
ylab("Importance") +
ggtitle("Support Vector Machine - Feature Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#################################################
############################################
# KNN Model
# knnreg() is provided by the caret package (loaded above); there is no separate "knn" package to load
# Fitting model
# Note: with trainData$dv on the left-hand side, the "." on the right still expands to every
# column of trainData, including dv, so the response leaks into the predictor set;
# the predictions below reflect that specification
fit <-knnreg(trainData$dv ~ ., data = trainData,k=1)
summary(fit)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## terms 3 terms call
## xlevels 0 -none- list
## theDots 0 -none- list
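A 1-nearest-neighbour fit on unscaled integer codes is very sensitive to the ranges of the coded variables. A sketch of a centred, scaled, and cross-validated alternative via caret (knn_alt is a hypothetical name; not the model whose results are shown below):
set.seed(123)
knn_alt <- train(factor(dv) ~ ., data = trainData, method = "knn",
preProcess = c("center", "scale"), tuneLength = 5,
trControl = trainControl(method = "cv", number = 5))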
#Predict Output
predicted<- predict(fit,testData)
table(predicted)
## predicted
## 0 0.0555555555555556 0.0714285714285714 0.0769230769230769
## 749 2 4 1
## 0.08 0.0833333333333333 0.1 0.111111111111111
## 5 1 12 4
## 0.125 0.142857142857143 0.25 0.5
## 1 1 2 6
## 1
## 110
# Measure the accuracy of prediction in the test data
y_pred_numknn<-ifelse(predicted==1,1,0)
y_predknn<-factor(y_pred_numknn,levels = c(0,1))
y_actknn<-testData$dv
#Result : Prediction Accuracy
mean(y_predknn==y_actknn)
## [1] 0.9799555
# Create a confusion table showing how each observation was classified and whether it was correct
confusion_tableknn<- table(testData$dv, y_pred_numknn)
# Display the confusion matrix and its associated statistics
resultknn<-caret::confusionMatrix(confusion_tableknn)
resultknn
## Confusion Matrix and Statistics
##
## y_pred_numknn
## 0 1
## 0 776 6
## 1 12 104
##
## Accuracy : 0.98
## 95% CI : (0.9685, 0.9881)
## No Information Rate : 0.8775
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9089
##
## Mcnemar's Test P-Value : 0.2386
##
## Sensitivity : 0.9848
## Specificity : 0.9455
## Pos Pred Value : 0.9923
## Neg Pred Value : 0.8966
## Prevalence : 0.8775
## Detection Rate : 0.8641
## Detection Prevalence : 0.8708
## Balanced Accuracy : 0.9651
##
## 'Positive' Class : 0
##
metricsknn<-as.data.frame(resultknn$byClass)
colnames(metricsknn)<-"metricsknn"
library(dplyr)
library(kableExtra)
kable(round(metricsknn,4), caption = "F1-score, Precision and Recall ") %>%
kable_styling(font_size = 16)
F1-score, Precision and Recall

|                      | metricsknn |
|:---------------------|-----------:|
| Sensitivity          |     0.9848 |
| Specificity          |     0.9455 |
| Pos Pred Value       |     0.9923 |
| Neg Pred Value       |     0.8966 |
| Precision            |     0.9923 |
| Recall               |     0.9848 |
| F1                   |     0.9885 |
| Prevalence           |     0.8775 |
| Detection Rate       |     0.8641 |
| Detection Prevalence |     0.8708 |
| Balanced Accuracy    |     0.9651 |
knnmodel <- train(dv ~ ., data = trainData, method = "knn")
# Get feature importance
importance <- varImp(knnmodel)
# Print the feature importance
print(importance)
## loess r-squared variable importance
##
## Overall
## Grade 1.000e+02
## age 1.253e+01
## Schooltype 5.151e+00
## Residence 1.509e+00
## Sex 8.287e-01
## Region 2.419e-01
## HHSize 1.323e-01
## Occupation 3.936e-02
## District 8.319e-03
## Typehousing 0.000e+00
# Plot the feature importance (use the $importance data frame inside the varImp object)
imp_df <- importance$importance
imp_df$Feature <- rownames(imp_df)
plot <- ggplot(imp_df, aes(x = reorder(Feature, Overall), y = Overall)) +
geom_bar(stat = "identity", fill = "peru") +
labs(x = "Features", y = "Importance") +
ggtitle("KNN Features Importance") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)
#ROC
library(pROC)
# KNN
windows()
test_prob6= predict(fit, newdata=testData)  # knnreg returns numeric predictions (no type = "prob" option)
test_roc6= roc(testData$dv, test_prob6,plot = TRUE, col='darkblue', print.auc = TRUE,percent=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
############################################
#Area-under Curve
windows()
plot(test_roc1, print.auc = TRUE, col='brown',percent=TRUE, print.auc.y = 48, print.auc.x = 0)
plot(test_roc2 , add = TRUE, col='red', print.auc = TRUE, print.auc.y = 41, print.auc.x = 0)
plot(test_roc3 , add = TRUE, col='darkgoldenrod4', print.auc = TRUE, print.auc.y = 34, print.auc.x = 0)
plot(test_roc4 , add = TRUE, col='green', print.auc = TRUE, print.auc.y = 27, print.auc.x = 0)
plot(test_roc5 , add = TRUE, col='purple', print.auc = TRUE, print.auc.y = 20, print.auc.x = 0)
plot(test_roc6 , add = TRUE, col='magenta', print.auc = TRUE, print.auc.y = 14, print.auc.x = 0)
plot(test_roc7 , add = TRUE, col='darkblue', print.auc = TRUE, print.auc.y = 7, print.auc.x = 0)
abline(a=0, b=1, lty=3, lwd=4, col="black")
legend("topleft", legend = c("Random Forest-Model", "Naive Bayes",
"Logistic-Regression","Decision Tree","Support Vector Machine","KNN","Probit Regression") ,
pch = 15, bty = 'n', col = c("brown","red", "darkgoldenrod4", "green","purple","magenta","darkblue"))
#############################
# Metrics
#
# Accuracy, sensitivity, AUROC, and F1-score values for the seven models fitted above
accuracy <- c(0.982, 0.985, 0.986,0.988,0.988,0.980,0.982)
sensitivity <- c(0.991, 0.993, 0.991,0.993,0.993,0.984,0.991)
AUROC<- c(0.985, 0.988, 0.971,0.986,0.975,0.973,0.984)
f1_score <- c(0.989, 0.991, 0.992,0.993,0.993,0.988,0.989)
# Creating a data frame with the values
model_names <- c("Logistic Regression", "Random Forest", "Naive Bayes","Support Vector Machine",
"Decision Tree","KNN","Probit Regression")
data <- data.frame(Model = model_names, Accuracy = accuracy, Sensitivity = sensitivity, F1_Score = f1_score, AUROC = AUROC)
# Loading the ggplot2 package
library(ggplot2)
# Reshaping the data frame for plotting
data_long <- reshape2::melt(data, id.vars = "Model", variable.name = "Metric", value.name = "Value")
windows()
# Creating the bar chart
ggplot(data_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Comparison of Supervised Machine Learning Models",
x = "Model",
y = "Value") +
scale_fill_manual(values = c("Accuracy" = "#3498db",
"Sensitivity" = "seagreen",
"F1_Score" = "slateblue",
"AUROC" = "#f1c40f")) +
theme_minimal()
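The same comparison can also be reported as a table, reusing the kable styling applied to the per-model metrics above; a sketch:
kable(data, caption = "Comparison of Supervised Machine Learning Models") %>%
kable_styling(font_size = 16)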