#Data Prep
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
##Get TX Cities
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses
brfss_17<-brfss_17%>%
filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)
##Recodes
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))
#recoding employ to make it binomial
brfss_17$employa <- Recode(brfss_17$employ1, recodes="1:2=1; 2:6=0; else=NA", as.factor=T)
#Do I need to relevel with a binomial? Should the reference be employed?
brfss_17$employa<-relevel(brfss_17$employa, ref='1')
View(brfss_17)
names(brfss_17)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2" "medcost" "checkup1"
## [13] "bphigh4" "bpmeds" "cholchk1" "toldhi2" "cholmed1" "cvdinfr4"
## [19] "cvdcrhd4" "cvdstrk3" "asthma3" "asthnow" "chcscncr" "chcocncr"
## [25] "chccopd1" "havarth3" "addepev2" "chckidny" "diabete3" "diabage2"
## [31] "lmtjoin3" "arthdis2" "arthsocl" "joinpai1" "sex" "marital"
## [37] "educa" "renthom1" "numhhol2" "numphon2" "cpdemo1a" "veteran3"
## [43] "employ1" "children" "income2" "internet" "weight2" "height3"
## [49] "pregnant" "deaf" "blind" "decide" "diffwalk" "diffdres"
## [55] "diffalon" "smoke100" "smokday2" "stopsmk2" "lastsmk2" "usenow3"
## [61] "ecigaret" "ecignow" "alcday5" "avedrnk2" "drnk3ge5" "maxdrnks"
## [67] "fruit2" "fruitju2" "fvgreen1" "frenchf1" "potatoe1" "vegetab2"
## [73] "exerany2" "exract11" "exeroft1" "exerhmm1" "exract21" "exeroft2"
## [79] "exerhmm2" "strength" "seatbelt" "flushot6" "flshtmy2" "pneuvac3"
## [85] "shingle2" "hivtst6" "hivtstd3" "hivrisk5" "casthdx2" "casthno2"
## [91] "callbckz" "wdusenow" "wdinftrk" "wdhowoft" "wdshare" "namtribe"
## [97] "namothr" "urbnrrl" "ststr" "impsex" "rfhlth" "phys14d"
## [103] "ment14d" "hcvu651" "rfhype5" "cholch1" "rfchol1" "michd"
## [109] "ltasth1" "casthm1" "asthms1" "drdxar1" "lmtact1" "lmtwrk1"
## [115] "lmtscl1" "prace1" "mrace1" "hispanc" "race" "raceg21"
## [121] "racegr3" "ageg5yr" "age65yr" "age80" "ageg" "wtkg3"
## [127] "bmi5" "bmi5cat" "rfbmi5" "educag" "incomg" "smoker3"
## [133] "rfsmok3" "ecigsts" "curecig" "drnkany5" "rfbing5" "drnkwek"
## [139] "rfdrhv5" "ftjuda2" "frutda2" "grenda1" "frnchda" "potada1"
## [145] "vegeda2" "misfrt1" "misveg1" "frtres1" "vegres1" "frutsu1"
## [151] "vegesu1" "frtlt1a" "veglt1a" "frt16a" "veg23a" "fruite1"
## [157] "vegete1" "totinda" "minac11" "minac21" "pacat1" "paindx1"
## [163] "pa150r2" "pa300r2" "pa30021" "pastrng" "parec1" "pastae1"
## [169] "rfseat2" "rfseat3" "flshot6" "pneumo2" "aidtst3" "mmsa"
## [175] "mmsawt" "seqno" "mmsaname" "tx" "educ" "ins"
## [181] "marst" "badhealth" "employ" "agec" "smoke" "employa"
describe(brfss_17$employ1)
## [8633 obs.] EMPLOYMENT STATUS
## numeric: 1 1 1 6 1 2 2 1 7 1 ...
## min: 1 - max: 9 - NAs: 1 (0%) - 10 unique values
describe(brfss_17$agec)
## [8633 obs.]
## nominal factor: "(24,39]" "(24,39]" "(24,39]" "(0,24]" "(24,39]" "(24,39]" "(39,59]" "(0,24]" "(39,59]" "(39,59]" ...
## 5 levels: (0,24] | (24,39] | (39,59] | (59,79] | (79,99]
## NAs: 0 (0%)
##
## n %
## (0,24] 497 5.8
## (24,39] 1514 17.5
## (39,59] 2468 28.6
## (59,79] 3304 38.3
## (79,99] 850 9.8
## Total 8633 100.0
#1, Define a binary outcome of your choosing
#Outcome: Employment status
#2. Fit a predictive logistic regression model using as many predictor variables as you think you need
library(dplyr)
sub<-brfss_17%>%
select(badhealth, employa, agec,smoke, sex, marst, educ, hlthpln1, mmsawt, ststr) %>%
filter( complete.cases(.))
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
#Logit model
fit.logit<-svyglm(employa~sex+educ+marst+smoke+agec+hlthpln1,
design= des,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = employa ~ sex + educ + marst + smoke + agec +
## hlthpln1, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.10748 0.32927 6.401 1.69e-10 ***
## sexMale -1.76559 0.16047 -11.003 < 2e-16 ***
## educ2hsgrad -0.88738 0.24123 -3.678 0.000237 ***
## educ3somecol -0.58774 0.23674 -2.483 0.013076 *
## educ4colgrad -1.56936 0.24577 -6.385 1.87e-10 ***
## marstcohab -0.15281 0.25319 -0.604 0.546195
## marstdivorced -0.94313 0.29938 -3.150 0.001641 **
## marstnm -0.08153 0.19647 -0.415 0.678188
## marstseparated -0.04116 0.38870 -0.106 0.915663
## marstwidowed -0.30696 0.35797 -0.857 0.391215
## smokeCurrent -0.19150 0.23783 -0.805 0.420741
## smokeFormer -0.36826 0.22432 -1.642 0.100726
## agec(24,39] -1.59694 0.23313 -6.850 8.31e-12 ***
## agec(39,59] -1.86191 0.25334 -7.350 2.32e-13 ***
## agec(59,79] -1.89719 0.33264 -5.703 1.24e-08 ***
## agec(79,99] -0.15259 0.43180 -0.353 0.723809
## hlthpln1 -0.01835 0.19864 -0.092 0.926409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.029702)
##
## Number of Fisher Scoring iterations: 5
#3. Use a 80% training/20% test split for your data
sub1 <- sub%>%
# filter(sex=="Female") %>% #women only
dplyr::select(employa, sex, educ, marst, smoke,agec,hlthpln1)
knitr::kable(head(sub1))
1 |
Male |
4colgrad |
married |
NeverSmoked |
(24,39] |
1 |
1 |
Male |
4colgrad |
nm |
NeverSmoked |
(24,39] |
1 |
1 |
Female |
4colgrad |
married |
NeverSmoked |
(24,39] |
1 |
0 |
Male |
2hsgrad |
nm |
Current |
(0,24] |
1 |
1 |
Male |
4colgrad |
married |
NeverSmoked |
(24,39] |
0 |
1 |
Male |
2hsgrad |
nm |
Current |
(24,39] |
0 |
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1500)
train<- createDataPartition(y = sub1$employa,
p = .80,
list=F)
sub1train<-sub1[train,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
sub1test<-sub1[-train,]
table(sub1train$employa)
##
## 1 0
## 2984 967
prop.table(table(sub1train$employa))
##
## 1 0
## 0.7552518 0.2447482
summary(sub1train)
## employa sex educ marst smoke
## 1:2984 Female:2215 0hs : 377 married :2179 NeverSmoked:2641
## 0: 967 Male :1736 2hsgrad : 834 cohab : 177 Current : 501
## 3somecol: 987 divorced : 422 Former : 809
## 4colgrad:1753 nm : 822
## separated: 125
## widowed : 226
## agec hlthpln1
## (0,24] : 351 Min. :0.0000
## (24,39]:1068 1st Qu.:1.0000
## (39,59]:1604 Median :1.0000
## (59,79]: 814 Mean :0.8233
## (79,99]: 114 3rd Qu.:1.0000
## Max. :1.0000
#3 Report the % correct classification from the training data using the .5 decision rule and again using the mean of the outcome decision rule
#3a) Does changing the decision rule threshold affect your classification accuracy?
brfss_17$employa <- as.numeric(brfss_17$employa)
brfss_17$employa <- as.numeric(brfss_17$sex)
brfss_17$employa <- as.numeric(brfss_17$marst)
brfss_17$employa <- as.numeric(brfss_17$smoke)
brfss_17$employa <- as.numeric(brfss_17$agec)
brfss_17$employa <- as.numeric(brfss_17$hlthpln1)
na.omit(sub1train)
## # A tibble: 3,951 x 7
## employa sex educ marst smoke agec hlthpln1
## <fct> <fct> <fct> <fct> <fct> <fct> <dbl>
## 1 1 Male 4colgrad married NeverSmoked (24,39] 1
## 2 1 Male 4colgrad nm NeverSmoked (24,39] 1
## 3 0 Male 2hsgrad nm Current (0,24] 1
## 4 1 Male 4colgrad married NeverSmoked (24,39] 0
## 5 1 Male 2hsgrad nm Current (24,39] 0
## 6 1 Male 3somecol cohab NeverSmoked (39,59] 0
## 7 1 Female 4colgrad cohab NeverSmoked (0,24] 1
## 8 1 Female 4colgrad married Former (39,59] 1
## 9 1 Male 4colgrad married NeverSmoked (39,59] 1
## 10 1 Male 4colgrad married NeverSmoked (39,59] 1
## # ... with 3,941 more rows
glm1<-glm(employa~ factor(sex)+factor(educ)+factor(marst)+ factor(smoke)+factor(agec)+scale(hlthpln1),
data=sub1train,
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = employa ~ factor(sex) + factor(educ) + factor(marst) +
## factor(smoke) + factor(agec) + scale(hlthpln1), family = binomial,
## data = sub1train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0779 -0.7260 -0.4243 -0.2141 2.9011
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.67571 0.20207 8.293 < 2e-16 ***
## factor(sex)Male -1.61154 0.09787 -16.466 < 2e-16 ***
## factor(educ)2hsgrad -0.57102 0.14595 -3.913 9.13e-05 ***
## factor(educ)3somecol -0.67986 0.14582 -4.662 3.13e-06 ***
## factor(educ)4colgrad -1.29564 0.14484 -8.945 < 2e-16 ***
## factor(marst)cohab -0.30964 0.20367 -1.520 0.128431
## factor(marst)divorced -0.78595 0.16038 -4.901 9.56e-07 ***
## factor(marst)nm -0.28042 0.12515 -2.241 0.025047 *
## factor(marst)separated 0.01723 0.22239 0.077 0.938249
## factor(marst)widowed 0.19213 0.17609 1.091 0.275250
## factor(smoke)Current -0.08034 0.12670 -0.634 0.526027
## factor(smoke)Former -0.42886 0.11779 -3.641 0.000272 ***
## factor(agec)(24,39] -1.29681 0.15731 -8.243 < 2e-16 ***
## factor(agec)(39,59] -1.67319 0.16394 -10.206 < 2e-16 ***
## factor(agec)(59,79] -1.27462 0.17857 -7.138 9.47e-13 ***
## factor(agec)(79,99] 0.24494 0.28294 0.866 0.386652
## scale(hlthpln1) -0.15904 0.04285 -3.711 0.000206 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4397.4 on 3950 degrees of freedom
## Residual deviance: 3623.4 on 3934 degrees of freedom
## AIC: 3657.4
##
## Number of Fisher Scoring iterations: 5
describe(brfss_17$hlthpln1)
## [8633 obs.] HAVE ANY HEALTH CARE COVERAGE
## numeric: 1 1 1 1 0 0 0 1 1 1 ...
## min: 0 - max: 1 - NAs: 36 (0.4%) - 3 unique values
##
## n % val%
## 0 1131 13.1 13.2
## 1 7466 86.5 86.8
## NA 36 0.4 NA
## Total 8633 100.0 100.0
# is.numeric(brfss_17$hlthpln1)
# as.numeric(brfss_17$hlthpln1)
# class(brfss_17$educ)
# describe(brfss_17$employa)
#Create class probabilities
sub1_pred<- predict(glm1,
newdata = sub1train,
type = "response")
head(sub1_pred)
## 1 2 3 4 5 6
## 0.06901322 0.05303201 0.28064293 0.10110601 0.13929794 0.09489799
#Set decision rule for .5
sub1_predcl<-factor(ifelse(sub1_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=sub1_pred, gr=sub1_predcl, emp=sub1train$employa)
pred1%>%
ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = .5")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Truth")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table(sub1_predcl,sub1train$employa) #the results are pretty bad...
##
## sub1_predcl 1 0
## 0 2823 688
## 1 161 279
#Confusion Matrix
cm1<-confusionMatrix(data = sub1_predcl,
reference = sub1train$employa)
## Warning in confusionMatrix.default(data = sub1_predcl, reference =
## sub1train$employa): Levels are not in the same order for reference and data.
## Refactoring data to match.
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 161 279
## 0 2823 688
##
## Accuracy : 0.2149
## 95% CI : (0.2022, 0.228)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.1242
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.05395
## Specificity : 0.71148
## Pos Pred Value : 0.36591
## Neg Pred Value : 0.19596
## Prevalence : 0.75525
## Detection Rate : 0.04075
## Detection Prevalence : 0.11136
## Balanced Accuracy : 0.38272
##
## 'Positive' Class : 1
##
#Mean of the response decision rule set up
tr_predcl<-factor(ifelse(sub1_pred>mean(I(sub1train$employa==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=sub1_pred, gr=sub1_predcl, emp=sub1train$employa)
pred2%>%
ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = Mean")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Truth")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#4. Report the % correct classification from the test data using the .5 decision rule and again using the mean of the outcome decision rule
glm2<-glm(employa~ factor(sex)+factor(educ)+factor(marst)+ factor(smoke)+factor(agec)+scale(hlthpln1),
data=sub1test,
family = binomial)
summary(glm2)
##
## Call:
## glm(formula = employa ~ factor(sex) + factor(educ) + factor(marst) +
## factor(smoke) + factor(agec) + scale(hlthpln1), family = binomial,
## data = sub1test)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8983 -0.6912 -0.4834 -0.2130 2.5249
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.81538 0.40433 2.017 0.043732 *
## factor(sex)Male -1.37084 0.18915 -7.247 4.25e-13 ***
## factor(educ)2hsgrad -0.06375 0.30439 -0.209 0.834100
## factor(educ)3somecol -0.05041 0.30188 -0.167 0.867384
## factor(educ)4colgrad -0.87762 0.30562 -2.872 0.004084 **
## factor(marst)cohab -0.16008 0.35681 -0.449 0.653695
## factor(marst)divorced -0.86050 0.31081 -2.769 0.005630 **
## factor(marst)nm 0.02003 0.25312 0.079 0.936928
## factor(marst)separated 0.06757 0.54496 0.124 0.901327
## factor(marst)widowed 0.31069 0.35195 0.883 0.377356
## factor(smoke)Current -0.03889 0.26922 -0.144 0.885143
## factor(smoke)Former -0.26070 0.22096 -1.180 0.238066
## factor(agec)(24,39] -1.13282 0.30588 -3.703 0.000213 ***
## factor(agec)(39,59] -1.31662 0.31725 -4.150 3.32e-05 ***
## factor(agec)(59,79] -0.91072 0.34849 -2.613 0.008966 **
## factor(agec)(79,99] 0.69428 0.63437 1.094 0.273759
## scale(hlthpln1) -0.28038 0.08274 -3.388 0.000703 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1096.68 on 985 degrees of freedom
## Residual deviance: 920.76 on 969 degrees of freedom
## AIC: 954.76
##
## Number of Fisher Scoring iterations: 5
#Create class probabilities
sub2_pred<- predict(glm2,
newdata = sub1test,
type = "response")
head(sub2_pred)
## 1 2 3 4 5 6
## 0.20912914 0.65698312 0.26599544 0.05290912 0.08513931 0.16915437
#Set decision rule for .5
sub2_predcl<-factor(ifelse(sub2_pred>.5, 1, 0))
library(ggplot2)
pred2<-data.frame(pr=sub2_pred, gr=sub2_predcl, emp=sub1test$employa)
pred2%>%
ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = .5")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Truth")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Mean of the response decision rule set up for test
tr3_predcl<-factor(ifelse(sub2_pred>mean(I(sub1test$employa==1)), 1, 0)) #mean of response
pred3<-data.frame(pr=sub2_pred, gr=sub2_predcl, emp=sub1test$employa)
pred3%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.5)+
ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = Mean")+
geom_vline(xintercept=mean(I(sub1test$modcontra==1)))
## Warning: Unknown or uninitialised column: `modcontra`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing missing values (geom_vline).

pred3%>%
ggplot(aes(x=pr, fill=gr))+
geom_histogram(position="identity", alpha=.5)+
ggtitle(label = "Probability of Female Employment", subtitle = "Truth")+
geom_vline(xintercept=mean(I(sub1test$employa==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Confusion Matrix for Train
cm1<-confusionMatrix(data = sub1_predcl,
reference = sub1train$employa)
## Warning in confusionMatrix.default(data = sub1_predcl, reference =
## sub1train$employa): Levels are not in the same order for reference and data.
## Refactoring data to match.
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 161 279
## 0 2823 688
##
## Accuracy : 0.2149
## 95% CI : (0.2022, 0.228)
## No Information Rate : 0.7553
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.1242
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.05395
## Specificity : 0.71148
## Pos Pred Value : 0.36591
## Neg Pred Value : 0.19596
## Prevalence : 0.75525
## Detection Rate : 0.04075
## Detection Prevalence : 0.11136
## Balanced Accuracy : 0.38272
##
## 'Positive' Class : 1
##
#Confusion Matrix for test
cm_test<-confusionMatrix(data = tr3_predcl,
reference = sub1test$employa)
## Warning in confusionMatrix.default(data = tr3_predcl, reference =
## sub1test$employa): Levels are not in the same order for reference and data.
## Refactoring data to match.
cm_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 2 14
## 0 743 227
##
## Accuracy : 0.2323
## 95% CI : (0.2062, 0.2599)
## No Information Rate : 0.7556
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0274
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.002685
## Specificity : 0.941909
## Pos Pred Value : 0.125000
## Neg Pred Value : 0.234021
## Prevalence : 0.755578
## Detection Rate : 0.002028
## Detection Prevalence : 0.016227
## Balanced Accuracy : 0.472297
##
## 'Positive' Class : 1
##
pred_test<-predict(glm1,
newdata=sub1test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(sub1test$employa==1)), 1, 0))
table(sub1test$employa,pred_cl)
## pred_cl
## 0 1
## 1 743 2
## 0 230 11
confusionMatrix(data = pred_cl,sub1test$employa)
## Warning in confusionMatrix.default(data = pred_cl, sub1test$employa): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 0
## 1 2 11
## 0 743 230
##
## Accuracy : 0.2353
## 95% CI : (0.2091, 0.263)
## No Information Rate : 0.7556
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0212
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.002685
## Specificity : 0.954357
## Pos Pred Value : 0.153846
## Neg Pred Value : 0.236382
## Prevalence : 0.755578
## Detection Rate : 0.002028
## Detection Prevalence : 0.013185
## Balanced Accuracy : 0.478521
##
## 'Positive' Class : 1
##
#The test model scores similarly as the train model (0.68 and 0.70 respectively).