HW4

#Data Prep
nams<-names(brfss_17)
head(nams, n=10)

##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

##Get TX Cities 
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)

##Recodes
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)

brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
                   
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))

#recoding employ to make it binomial 
brfss_17$employa <- Recode(brfss_17$employ1, recodes="1:2=1; 2:6=0; else=NA", as.factor=T)
#Do I need to relevel with a binomial? Should the reference be employed?
brfss_17$employa<-relevel(brfss_17$employa, ref='1')



View(brfss_17)
names(brfss_17)

##   [1] "dispcode"  "statere1"  "safetime"  "hhadult"   "genhlth"   "physhlth" 
##   [7] "menthlth"  "poorhlth"  "hlthpln1"  "persdoc2"  "medcost"   "checkup1" 
##  [13] "bphigh4"   "bpmeds"    "cholchk1"  "toldhi2"   "cholmed1"  "cvdinfr4" 
##  [19] "cvdcrhd4"  "cvdstrk3"  "asthma3"   "asthnow"   "chcscncr"  "chcocncr" 
##  [25] "chccopd1"  "havarth3"  "addepev2"  "chckidny"  "diabete3"  "diabage2" 
##  [31] "lmtjoin3"  "arthdis2"  "arthsocl"  "joinpai1"  "sex"       "marital"  
##  [37] "educa"     "renthom1"  "numhhol2"  "numphon2"  "cpdemo1a"  "veteran3" 
##  [43] "employ1"   "children"  "income2"   "internet"  "weight2"   "height3"  
##  [49] "pregnant"  "deaf"      "blind"     "decide"    "diffwalk"  "diffdres" 
##  [55] "diffalon"  "smoke100"  "smokday2"  "stopsmk2"  "lastsmk2"  "usenow3"  
##  [61] "ecigaret"  "ecignow"   "alcday5"   "avedrnk2"  "drnk3ge5"  "maxdrnks" 
##  [67] "fruit2"    "fruitju2"  "fvgreen1"  "frenchf1"  "potatoe1"  "vegetab2" 
##  [73] "exerany2"  "exract11"  "exeroft1"  "exerhmm1"  "exract21"  "exeroft2" 
##  [79] "exerhmm2"  "strength"  "seatbelt"  "flushot6"  "flshtmy2"  "pneuvac3" 
##  [85] "shingle2"  "hivtst6"   "hivtstd3"  "hivrisk5"  "casthdx2"  "casthno2" 
##  [91] "callbckz"  "wdusenow"  "wdinftrk"  "wdhowoft"  "wdshare"   "namtribe" 
##  [97] "namothr"   "urbnrrl"   "ststr"     "impsex"    "rfhlth"    "phys14d"  
## [103] "ment14d"   "hcvu651"   "rfhype5"   "cholch1"   "rfchol1"   "michd"    
## [109] "ltasth1"   "casthm1"   "asthms1"   "drdxar1"   "lmtact1"   "lmtwrk1"  
## [115] "lmtscl1"   "prace1"    "mrace1"    "hispanc"   "race"      "raceg21"  
## [121] "racegr3"   "ageg5yr"   "age65yr"   "age80"     "ageg"      "wtkg3"    
## [127] "bmi5"      "bmi5cat"   "rfbmi5"    "educag"    "incomg"    "smoker3"  
## [133] "rfsmok3"   "ecigsts"   "curecig"   "drnkany5"  "rfbing5"   "drnkwek"  
## [139] "rfdrhv5"   "ftjuda2"   "frutda2"   "grenda1"   "frnchda"   "potada1"  
## [145] "vegeda2"   "misfrt1"   "misveg1"   "frtres1"   "vegres1"   "frutsu1"  
## [151] "vegesu1"   "frtlt1a"   "veglt1a"   "frt16a"    "veg23a"    "fruite1"  
## [157] "vegete1"   "totinda"   "minac11"   "minac21"   "pacat1"    "paindx1"  
## [163] "pa150r2"   "pa300r2"   "pa30021"   "pastrng"   "parec1"    "pastae1"  
## [169] "rfseat2"   "rfseat3"   "flshot6"   "pneumo2"   "aidtst3"   "mmsa"     
## [175] "mmsawt"    "seqno"     "mmsaname"  "tx"        "educ"      "ins"      
## [181] "marst"     "badhealth" "employ"    "agec"      "smoke"     "employa"

describe(brfss_17$employ1)

## [8633 obs.] EMPLOYMENT STATUS
## numeric: 1 1 1 6 1 2 2 1 7 1 ...
## min: 1 - max: 9 - NAs: 1 (0%) - 10 unique values

describe(brfss_17$agec)

## [8633 obs.] 
## nominal factor: "(24,39]" "(24,39]" "(24,39]" "(0,24]" "(24,39]" "(24,39]" "(39,59]" "(0,24]" "(39,59]" "(39,59]" ...
## 5 levels: (0,24] | (24,39] | (39,59] | (59,79] | (79,99]
## NAs: 0 (0%)
## 
##            n     %
## (0,24]   497   5.8
## (24,39] 1514  17.5
## (39,59] 2468  28.6
## (59,79] 3304  38.3
## (79,99]  850   9.8
## Total   8633 100.0

#1, Define a binary outcome of your choosing
#Outcome: Employment status

#2. Fit a predictive logistic regression model using as many predictor variables as you think you need
library(dplyr)
sub<-brfss_17%>%
  select(badhealth, employa, agec,smoke, sex, marst, educ, hlthpln1, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )

#Logit model
fit.logit<-svyglm(employa~sex+educ+marst+smoke+agec+hlthpln1,
                  design= des,
                  family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.logit)

## 
## Call:
## svyglm(formula = employa ~ sex + educ + marst + smoke + agec + 
##     hlthpln1, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.10748    0.32927   6.401 1.69e-10 ***
## sexMale        -1.76559    0.16047 -11.003  < 2e-16 ***
## educ2hsgrad    -0.88738    0.24123  -3.678 0.000237 ***
## educ3somecol   -0.58774    0.23674  -2.483 0.013076 *  
## educ4colgrad   -1.56936    0.24577  -6.385 1.87e-10 ***
## marstcohab     -0.15281    0.25319  -0.604 0.546195    
## marstdivorced  -0.94313    0.29938  -3.150 0.001641 ** 
## marstnm        -0.08153    0.19647  -0.415 0.678188    
## marstseparated -0.04116    0.38870  -0.106 0.915663    
## marstwidowed   -0.30696    0.35797  -0.857 0.391215    
## smokeCurrent   -0.19150    0.23783  -0.805 0.420741    
## smokeFormer    -0.36826    0.22432  -1.642 0.100726    
## agec(24,39]    -1.59694    0.23313  -6.850 8.31e-12 ***
## agec(39,59]    -1.86191    0.25334  -7.350 2.32e-13 ***
## agec(59,79]    -1.89719    0.33264  -5.703 1.24e-08 ***
## agec(79,99]    -0.15259    0.43180  -0.353 0.723809    
## hlthpln1       -0.01835    0.19864  -0.092 0.926409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.029702)
## 
## Number of Fisher Scoring iterations: 5

#3. Use a 80% training/20% test split for your data

sub1 <- sub%>% 
 # filter(sex=="Female") %>% #women only
 dplyr::select(employa, sex, educ, marst, smoke,agec,hlthpln1)
  
  knitr::kable(head(sub1))

employa	sex	educ	marst	smoke	agec	hlthpln1
1	Male	4colgrad	married	NeverSmoked	(24,39]	1
1	Male	4colgrad	nm	NeverSmoked	(24,39]	1
1	Female	4colgrad	married	NeverSmoked	(24,39]	1
0	Male	2hsgrad	nm	Current	(0,24]	1
1	Male	4colgrad	married	NeverSmoked	(24,39]	0
1	Male	2hsgrad	nm	Current	(24,39]	0

library(caret)

## Warning: package 'caret' was built under R version 4.0.3

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

set.seed(1500)
train<- createDataPartition(y = sub1$employa,
                            p = .80,
                            list=F)

sub1train<-sub1[train,]

## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

sub1test<-sub1[-train,]

table(sub1train$employa)

## 
##    1    0 
## 2984  967

prop.table(table(sub1train$employa))

## 
##         1         0 
## 0.7552518 0.2447482

summary(sub1train)

##  employa      sex             educ            marst              smoke     
##  1:2984   Female:2215   0hs     : 377   married  :2179   NeverSmoked:2641  
##  0: 967   Male  :1736   2hsgrad : 834   cohab    : 177   Current    : 501  
##                         3somecol: 987   divorced : 422   Former     : 809  
##                         4colgrad:1753   nm       : 822                     
##                                         separated: 125                     
##                                         widowed  : 226                     
##       agec         hlthpln1     
##  (0,24] : 351   Min.   :0.0000  
##  (24,39]:1068   1st Qu.:1.0000  
##  (39,59]:1604   Median :1.0000  
##  (59,79]: 814   Mean   :0.8233  
##  (79,99]: 114   3rd Qu.:1.0000  
##                 Max.   :1.0000

#3 Report the % correct classification from the training data using the .5 decision rule and again using the mean of the outcome decision rule

  #3a) Does changing the decision rule threshold affect your classification accuracy?

brfss_17$employa <- as.numeric(brfss_17$employa) 
brfss_17$employa <- as.numeric(brfss_17$sex) 
brfss_17$employa <- as.numeric(brfss_17$marst) 
brfss_17$employa <- as.numeric(brfss_17$smoke) 
brfss_17$employa <- as.numeric(brfss_17$agec) 
brfss_17$employa <- as.numeric(brfss_17$hlthpln1) 

na.omit(sub1train)

## # A tibble: 3,951 x 7
##    employa sex    educ     marst   smoke       agec    hlthpln1
##    <fct>   <fct>  <fct>    <fct>   <fct>       <fct>      <dbl>
##  1 1       Male   4colgrad married NeverSmoked (24,39]        1
##  2 1       Male   4colgrad nm      NeverSmoked (24,39]        1
##  3 0       Male   2hsgrad  nm      Current     (0,24]         1
##  4 1       Male   4colgrad married NeverSmoked (24,39]        0
##  5 1       Male   2hsgrad  nm      Current     (24,39]        0
##  6 1       Male   3somecol cohab   NeverSmoked (39,59]        0
##  7 1       Female 4colgrad cohab   NeverSmoked (0,24]         1
##  8 1       Female 4colgrad married Former      (39,59]        1
##  9 1       Male   4colgrad married NeverSmoked (39,59]        1
## 10 1       Male   4colgrad married NeverSmoked (39,59]        1
## # ... with 3,941 more rows

glm1<-glm(employa~ factor(sex)+factor(educ)+factor(marst)+ factor(smoke)+factor(agec)+scale(hlthpln1),
          data=sub1train,
          family = binomial)
summary(glm1)

## 
## Call:
## glm(formula = employa ~ factor(sex) + factor(educ) + factor(marst) + 
##     factor(smoke) + factor(agec) + scale(hlthpln1), family = binomial, 
##     data = sub1train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0779  -0.7260  -0.4243  -0.2141   2.9011  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             1.67571    0.20207   8.293  < 2e-16 ***
## factor(sex)Male        -1.61154    0.09787 -16.466  < 2e-16 ***
## factor(educ)2hsgrad    -0.57102    0.14595  -3.913 9.13e-05 ***
## factor(educ)3somecol   -0.67986    0.14582  -4.662 3.13e-06 ***
## factor(educ)4colgrad   -1.29564    0.14484  -8.945  < 2e-16 ***
## factor(marst)cohab     -0.30964    0.20367  -1.520 0.128431    
## factor(marst)divorced  -0.78595    0.16038  -4.901 9.56e-07 ***
## factor(marst)nm        -0.28042    0.12515  -2.241 0.025047 *  
## factor(marst)separated  0.01723    0.22239   0.077 0.938249    
## factor(marst)widowed    0.19213    0.17609   1.091 0.275250    
## factor(smoke)Current   -0.08034    0.12670  -0.634 0.526027    
## factor(smoke)Former    -0.42886    0.11779  -3.641 0.000272 ***
## factor(agec)(24,39]    -1.29681    0.15731  -8.243  < 2e-16 ***
## factor(agec)(39,59]    -1.67319    0.16394 -10.206  < 2e-16 ***
## factor(agec)(59,79]    -1.27462    0.17857  -7.138 9.47e-13 ***
## factor(agec)(79,99]     0.24494    0.28294   0.866 0.386652    
## scale(hlthpln1)        -0.15904    0.04285  -3.711 0.000206 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4397.4  on 3950  degrees of freedom
## Residual deviance: 3623.4  on 3934  degrees of freedom
## AIC: 3657.4
## 
## Number of Fisher Scoring iterations: 5

describe(brfss_17$hlthpln1)

## [8633 obs.] HAVE ANY HEALTH CARE COVERAGE
## numeric: 1 1 1 1 0 0 0 1 1 1 ...
## min: 0 - max: 1 - NAs: 36 (0.4%) - 3 unique values
## 
##          n     %  val%
## 0     1131  13.1  13.2
## 1     7466  86.5  86.8
## NA      36   0.4    NA
## Total 8633 100.0 100.0

# is.numeric(brfss_17$hlthpln1)
# as.numeric(brfss_17$hlthpln1) 

# class(brfss_17$educ)  
# describe(brfss_17$employa)

#Create class probabilities 
sub1_pred<- predict(glm1,
                  newdata = sub1train,
                  type = "response")
head(sub1_pred)

##          1          2          3          4          5          6 
## 0.06901322 0.05303201 0.28064293 0.10110601 0.13929794 0.09489799

#Set decision rule for .5
sub1_predcl<-factor(ifelse(sub1_pred>.5, 1, 0))

library(ggplot2)
pred1<-data.frame(pr=sub1_pred, gr=sub1_predcl, emp=sub1train$employa)

pred1%>%
  ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = .5")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Truth")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table(sub1_predcl,sub1train$employa) #the results are pretty bad...

##            
## sub1_predcl    1    0
##           0 2823  688
##           1  161  279

#Confusion Matrix 
cm1<-confusionMatrix(data = sub1_predcl,
                     reference = sub1train$employa)

## Warning in confusionMatrix.default(data = sub1_predcl, reference =
## sub1train$employa): Levels are not in the same order for reference and data.
## Refactoring data to match.

cm1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1  161  279
##          0 2823  688
##                                          
##                Accuracy : 0.2149         
##                  95% CI : (0.2022, 0.228)
##     No Information Rate : 0.7553         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.1242        
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.05395        
##             Specificity : 0.71148        
##          Pos Pred Value : 0.36591        
##          Neg Pred Value : 0.19596        
##              Prevalence : 0.75525        
##          Detection Rate : 0.04075        
##    Detection Prevalence : 0.11136        
##       Balanced Accuracy : 0.38272        
##                                          
##        'Positive' Class : 1              
##

#Mean of the response decision rule set up 
tr_predcl<-factor(ifelse(sub1_pred>mean(I(sub1train$employa==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=sub1_pred, gr=sub1_predcl, emp=sub1train$employa)

pred2%>%
  ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = Mean")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Truth")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#4. Report the % correct classification from the test data using the .5 decision rule and again using the mean of the outcome decision rule

glm2<-glm(employa~ factor(sex)+factor(educ)+factor(marst)+ factor(smoke)+factor(agec)+scale(hlthpln1),
          data=sub1test,
          family = binomial)
summary(glm2)

## 
## Call:
## glm(formula = employa ~ factor(sex) + factor(educ) + factor(marst) + 
##     factor(smoke) + factor(agec) + scale(hlthpln1), family = binomial, 
##     data = sub1test)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8983  -0.6912  -0.4834  -0.2130   2.5249  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.81538    0.40433   2.017 0.043732 *  
## factor(sex)Male        -1.37084    0.18915  -7.247 4.25e-13 ***
## factor(educ)2hsgrad    -0.06375    0.30439  -0.209 0.834100    
## factor(educ)3somecol   -0.05041    0.30188  -0.167 0.867384    
## factor(educ)4colgrad   -0.87762    0.30562  -2.872 0.004084 ** 
## factor(marst)cohab     -0.16008    0.35681  -0.449 0.653695    
## factor(marst)divorced  -0.86050    0.31081  -2.769 0.005630 ** 
## factor(marst)nm         0.02003    0.25312   0.079 0.936928    
## factor(marst)separated  0.06757    0.54496   0.124 0.901327    
## factor(marst)widowed    0.31069    0.35195   0.883 0.377356    
## factor(smoke)Current   -0.03889    0.26922  -0.144 0.885143    
## factor(smoke)Former    -0.26070    0.22096  -1.180 0.238066    
## factor(agec)(24,39]    -1.13282    0.30588  -3.703 0.000213 ***
## factor(agec)(39,59]    -1.31662    0.31725  -4.150 3.32e-05 ***
## factor(agec)(59,79]    -0.91072    0.34849  -2.613 0.008966 ** 
## factor(agec)(79,99]     0.69428    0.63437   1.094 0.273759    
## scale(hlthpln1)        -0.28038    0.08274  -3.388 0.000703 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1096.68  on 985  degrees of freedom
## Residual deviance:  920.76  on 969  degrees of freedom
## AIC: 954.76
## 
## Number of Fisher Scoring iterations: 5

#Create class probabilities 
sub2_pred<- predict(glm2,
                  newdata = sub1test,
                  type = "response")
head(sub2_pred)

##          1          2          3          4          5          6 
## 0.20912914 0.65698312 0.26599544 0.05290912 0.08513931 0.16915437

#Set decision rule for .5
sub2_predcl<-factor(ifelse(sub2_pred>.5, 1, 0))

library(ggplot2)
pred2<-data.frame(pr=sub2_pred, gr=sub2_predcl, emp=sub1test$employa)

pred2%>%
  ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = .5")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot()+geom_histogram(aes(x=pr, color=gr, fill=gr))+ggtitle(label = "Probability of Female Employment", subtitle = "Truth")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Mean of the response decision rule set up for test
tr3_predcl<-factor(ifelse(sub2_pred>mean(I(sub1test$employa==1)), 1, 0)) #mean of response

pred3<-data.frame(pr=sub2_pred, gr=sub2_predcl, emp=sub1test$employa)


pred3%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.5)+
  ggtitle(label = "Probability of Female Employment", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(sub1test$modcontra==1)))

## Warning: Unknown or uninitialised column: `modcontra`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1 rows containing missing values (geom_vline).

pred3%>%
  ggplot(aes(x=pr, fill=gr))+
  geom_histogram(position="identity", alpha=.5)+
  ggtitle(label = "Probability of Female Employment", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(sub1test$employa==1)))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Confusion Matrix for Train
cm1<-confusionMatrix(data = sub1_predcl,
                     reference = sub1train$employa)

## Warning in confusionMatrix.default(data = sub1_predcl, reference =
## sub1train$employa): Levels are not in the same order for reference and data.
## Refactoring data to match.

cm1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    0
##          1  161  279
##          0 2823  688
##                                          
##                Accuracy : 0.2149         
##                  95% CI : (0.2022, 0.228)
##     No Information Rate : 0.7553         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.1242        
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.05395        
##             Specificity : 0.71148        
##          Pos Pred Value : 0.36591        
##          Neg Pred Value : 0.19596        
##              Prevalence : 0.75525        
##          Detection Rate : 0.04075        
##    Detection Prevalence : 0.11136        
##       Balanced Accuracy : 0.38272        
##                                          
##        'Positive' Class : 1              
##

#Confusion Matrix for test
cm_test<-confusionMatrix(data = tr3_predcl,
                     reference = sub1test$employa)

## Warning in confusionMatrix.default(data = tr3_predcl, reference =
## sub1test$employa): Levels are not in the same order for reference and data.
## Refactoring data to match.

cm_test

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   0
##          1   2  14
##          0 743 227
##                                           
##                Accuracy : 0.2323          
##                  95% CI : (0.2062, 0.2599)
##     No Information Rate : 0.7556          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0274         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.002685        
##             Specificity : 0.941909        
##          Pos Pred Value : 0.125000        
##          Neg Pred Value : 0.234021        
##              Prevalence : 0.755578        
##          Detection Rate : 0.002028        
##    Detection Prevalence : 0.016227        
##       Balanced Accuracy : 0.472297        
##                                           
##        'Positive' Class : 1               
##

pred_test<-predict(glm1,
                   newdata=sub1test,
                   type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(sub1test$employa==1)), 1, 0))
table(sub1test$employa,pred_cl)

##    pred_cl
##       0   1
##   1 743   2
##   0 230  11

confusionMatrix(data = pred_cl,sub1test$employa)

## Warning in confusionMatrix.default(data = pred_cl, sub1test$employa): Levels are
## not in the same order for reference and data. Refactoring data to match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   0
##          1   2  11
##          0 743 230
##                                          
##                Accuracy : 0.2353         
##                  95% CI : (0.2091, 0.263)
##     No Information Rate : 0.7556         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : -0.0212        
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.002685       
##             Specificity : 0.954357       
##          Pos Pred Value : 0.153846       
##          Neg Pred Value : 0.236382       
##              Prevalence : 0.755578       
##          Detection Rate : 0.002028       
##    Detection Prevalence : 0.013185       
##       Balanced Accuracy : 0.478521       
##                                          
##        'Positive' Class : 1              
##

#The test model scores similarly as the train model (0.68 and 0.70 respectively).

HW4

Coda Rayo-Garza

2/28/2021