# libraries
library(car, quietly = T)
library(stargazer, quietly = T)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
library(haven)
# read the 2019 state BRFSS public-use data file (Stata format)
tb19 <- read_dta("state_19_working_pudf.dta")

nams<-names(tb19)
head(nams, n=10)
##  [1] "year"     "ststr"    "state"    "fmonth"   "dispcode" "orgseqno"
##  [7] "c01q01"   "fairpoor" "c02q01"   "phyng"
# standardize variable names: lower-case and strip underscores
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(tb19)<-newnames

# recode variables

#age
tb19$agegr4 = as.factor(tb19$agegr4)
tb19$age<-Recode(tb19$agegr4, recodes="1='18-19'; 2='30-44'; 3='45-64'; 4='65+'; else=NA", as.factor=T)

#insurance
tb19$c03q01 = as.factor(tb19$c03q01)
tb19$ins<-Recode(tb19$c03q01, recodes ="1=1;2=0")

#employment
tb19$c08q14 = as.factor(tb19$c08q14)

tb19$employ<-Recode(tb19$c08q14, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
 tb19$employ<-relevel(tb19$employ, ref='Employed')

#education level
 tb19$educat4 = as.factor(tb19$educat4)
tb19$educ<-Recode(tb19$educat4, recodes="1='leshs'; 2='hsgrad'; 3='sumcol'; 4='colgrad'", as.factor=T)
tb19$educ<-relevel(tb19$educ, ref='hsgrad')

#race/ethnicity
tb19$raceeth = as.factor(tb19$raceeth)

tb19$black<-Recode(tb19$raceeth, recodes="2=1; 9=NA; else=0")
tb19$white<-Recode(tb19$raceeth, recodes="1=1; 9=NA; else=0")
tb19$other<-Recode(tb19$raceeth, recodes="3:4=1; 9=NA; else=0")
tb19$hispanic<-Recode(tb19$raceeth, recodes="5=1; 9=NA; else=0")

tb19$race_eth<-Recode(tb19$raceeth, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)

#smoking currently
tb19$smoker2 = as.factor(tb19$smoker2)

tb19$smoke<-Recode(tb19$smoker2, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
tb19$smoke<-relevel(tb19$smoke, ref = "NeverSmoked")

#Poor or fair self rated health
tb19$fairpoor = as.factor(tb19$fairpoor)
tb19$genhlth<-Recode(tb19$fairpoor, recodes="1='poor'; 2='good'")


#sex
tb19$sex<-as.factor(ifelse(tb19$sex==1, "Male", "Female"))

#marital status
tb19$c08q05 = as.factor(tb19$c08q05)
tb19$marital<-Recode(tb19$c08q05, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
tb19$marital<-relevel(tb19$marital, ref='married')

# Harvey: home damage
tb19$tx04q04 = as.factor(tb19$tx04q04)
tb19$hdamage<-Recode(tb19$tx04q04, recodes="1='mildly';2='moderate'; 3='severe';4='none'; else=NA", as.factor=T)

# Harvey: health affected
tb19$tx04q05 = as.factor(tb19$tx04q05)
tb19$h_hlth<-Recode(tb19$tx04q05, recodes="1=1;2=0", as.factor=T)

# Harvey: child asthma
tb19$tx05q01 = as.factor(tb19$tx05q01)
tb19$c_asth<-Recode(tb19$tx05q01, recodes="1=1;2=0; 3='notaffected'")

# Harvey: mold in the home
tb19$tx05q06 = as.factor(tb19$tx05q06)
tb19$mold<-Recode(tb19$tx05q06, recodes="1=1;2=0")

# Harvey: vermin in the home (variable kept as vermon)
tb19$tx05q09 = as.factor(tb19$tx05q09)
tb19$vermon<-Recode(tb19$tx05q09, recodes="1=1;2=0")

#copd
tb19$c06q08 = as.factor(tb19$c06q08)
tb19$copd<-Recode(tb19$c06q08, recodes="1=1;2=0")
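A quick cross-tabulation of each source item against its recoded version is a useful sanity check; a minimal sketch for the COPD recode (the same pattern applies to the other recodes above), which also helps spot source codes such as don't know/refused that still need to be set to NA:

# check that the recode maps the source codes to the new categories as intended
table(tb19$c06q08, tb19$copd, useNA = "ifany")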
  1. Define a binary outcome of your choosing
# My binary outcome is Chronic Obstructive Pulmonary Disease (COPD).
tb19$copd<-Recode(tb19$c06q08, recodes="1=1; 2=0; else=NA", as.factor=T)
summary(tb19$copd)
##     0     1  NA's 
## 11188  1036    73
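COPD is a rare outcome here (roughly 8.5% of respondents with a valid answer), which matters later when picking a classification threshold; a quick unweighted check:

# unweighted proportion with and without COPD
prop.table(table(tb19$copd))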
  2. Fit a predictive logistic regression model using as many predictor variables as you think you need.
# keep only the analysis variables and drop rows with any missing value
tb19_mod<-tb19 %>% 
          dplyr::select(age,ins,employ,educ,black,white,other,hispanic,race_eth,smoke,genhlth,sex,marital,hdamage,h_hlth,c_asth,mold,vermon,copd) %>%
  filter(complete.cases(.))

knitr::kable(head(tb19_mod))
| age   | ins | employ   | educ    | black | white | other | hispanic | race_eth | smoke       | genhlth | sex    | marital   | hdamage  | h_hlth | c_asth | mold | vermon | copd |
|-------|-----|----------|---------|-------|-------|-------|----------|----------|-------------|---------|--------|-----------|----------|--------|--------|------|--------|------|
| 45-64 | 1   | Employed | hsgrad  | 0     | 1     | 0     | 0        | nhwhite  | NeverSmoked | good    | Female | married   | none     | 0      | 0      | 0    | 0      | 0    |
| 65+   | 1   | retired  | sumcol  | 0     | 1     | 0     | 0        | nhwhite  | NeverSmoked | good    | Female | married   | moderate | 0      | 0      | 0    | 0      | 0    |
| 45-64 | 1   | nilf     | colgrad | 1     | 0     | 0     | 0        | nh black | NeverSmoked | good    | Female | separated | moderate | 1      | 1      | 1    | 0      | 1    |
| 18-19 | 1   | Employed | hsgrad  | 1     | 0     | 0     | 0        | nh black | NeverSmoked | good    | Male   | cohab     | moderate | 0      | 0      | 1    | 0      | 0    |
| 30-44 | 1   | unable   | hsgrad  | 1     | 0     | 0     | 0        | nh black | NeverSmoked | poor    | Female | married   | none     | 0      | 0      | 0    | 0      | 0    |
| 30-44 | 1   | Employed | sumcol  | 0     | 1     | 0     | 0        | nhwhite  | Current     | good    | Female | widowed   | severe   | 0      | 0      | 1    | 0      | 0    |
subj<-tb19%>%
  select(age,ins,employ,educ,black,white,other,hispanic,race_eth,smoke,genhlth,sex,marital,hdamage,h_hlth,c_asth,mold,vermon,copd) %>%
  filter( complete.cases(.))

options(survey.lonely.psu = "adjust")
# survey design: single-stage, stratified on ststr, weighted by llcpwt
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data =tb19)
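The design object can also give a design-based (weighted) prevalence of the outcome before modeling; a minimal sketch:

# survey-weighted proportion in each COPD category
svymean(~copd, des, na.rm = TRUE)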
# survey-weighted logistic regression predicting COPD
fit.logit<-svyglm(copd~sex+ins+employ+educ+race_eth+smoke+genhlth+marital+hdamage+h_hlth+c_asth+mold+vermon,
                  design= des,
                  family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.logit)
## 
## Call:
## svyglm(formula = copd ~ sex + ins + employ + educ + race_eth + 
##     smoke + genhlth + marital + hdamage + h_hlth + c_asth + mold + 
##     vermon, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = tb19)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -57.584      3.873 -14.870 2.75e-13 ***
## sexMale                32.117      2.631  12.208 1.57e-11 ***
## ins1                    2.725      1.212   2.249 0.034370 *  
## employnilf             41.660      1.821  22.874  < 2e-16 ***
## employretired         113.784      3.516  32.358  < 2e-16 ***
## employunable          -17.091      3.562  -4.798 7.71e-05 ***
## educcolgrad             1.165      1.414   0.824 0.418441    
## educleshs              31.501      2.288  13.767 1.36e-12 ***
## educsumcol              6.376      1.354   4.708 9.64e-05 ***
## race_ethnh multirace   -9.724      2.395  -4.060 0.000484 ***
## race_ethnh other      -62.091      4.053 -15.319 1.47e-13 ***
## race_ethnhwhite        -8.878      2.199  -4.038 0.000512 ***
## smokeCurrent           12.921      2.417   5.346 1.98e-05 ***
## smokeFormer           -49.481      3.054 -16.202 4.50e-14 ***
## genhlthpoor            19.397      2.276   8.522 1.43e-08 ***
## maritalcohab           47.760      1.295  36.889  < 2e-16 ***
## maritaldivorced         8.584      1.534   5.595 1.08e-05 ***
## maritalnm             -48.246      2.011 -23.990  < 2e-16 ***
## maritalseparated       -1.721      1.839  -0.936 0.358911    
## maritalwidowed         56.888      3.631  15.668 9.16e-14 ***
## hdamagemoderate       -81.945      1.531 -53.520  < 2e-16 ***
## hdamagenone           -72.190      2.340 -30.857  < 2e-16 ***
## hdamagesevere         -70.863      2.718 -26.068  < 2e-16 ***
## h_hlth1               141.048      2.817  50.070  < 2e-16 ***
## c_asth1               -58.657      1.791 -32.744  < 2e-16 ***
## mold1                  33.555      1.962  17.105 1.42e-14 ***
## vermon1               105.749      2.977  35.518  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.476337e-11)
## 
## Number of Fisher Scoring iterations: 25
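For reporting, the coefficients are easier to read as odds ratios, though given the convergence warnings above these estimates should be treated with caution; a minimal sketch:

# odds ratios and 95% confidence intervals from the survey-weighted model
round(exp(cbind(OR = coef(fit.logit), confint(fit.logit))), 3)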
  3. Use an 80% training/20% test split for your data.
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
#tb19_mod <- na.omit(tb19) 
set.seed(1115)
# 80/20 split, stratified on the outcome
trainset<- createDataPartition(y = tb19_mod$copd,
                            p = .80,
                            list=F)


tb19_train<-tb19_mod[trainset,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
tb19_test<-tb19_mod[-trainset,]
table(tb19_train$copd)
## 
##  0  1 
## 55  7
prop.table(table(tb19_train$copd))
## 
##         0         1 
## 0.8870968 0.1129032
summary(tb19_train)
##     age     ins         employ        educ    black  white  other  hispanic
##  18-19: 4   0:15   Employed:32   hsgrad :13   0:51   0:21   0:52   0:62    
##  30-44:34   1:47   nilf    :19   colgrad:20   1:11   1:41   1:10           
##  45-64:21          retired : 2   leshs  : 9                                
##  65+  : 3          unable  : 9   sumcol :20                                
##                                                                            
##                                                                            
##          race_eth          smoke    genhlth       sex          marital  
##  nh black    :11   NeverSmoked:39   good:46   Female:43   married  :34  
##  nh multirace: 2   Current    :11   poor:16   Male  :19   cohab    : 6  
##  nh other    : 8   Former     :12                         divorced :10  
##  nhwhite     :41                                          nm       : 4  
##                                                           separated: 4  
##                                                           widowed  : 4  
##      hdamage   h_hlth         c_asth   mold   vermon copd  
##  mildly  :16   0:53   0          :46   0:41   0:58   0:55  
##  moderate:18   1: 9   1          :16   1:21   1: 4   1: 7  
##  none    :18          notaffected: 0                       
##  severe  :10                                               
##                                                            
## 
  4. Report the % correct classification from the training data using the .5 decision rule and again using the mean of the outcome decision rule.
#attach(tb19_train)

#tb19_train$copd <- as.numeric(tb19_train$copd)

# unweighted logistic regression on the training data
# (tb19_train[-1] drops the first column, age, which is not used as a predictor here)
glm1<-glm(copd~factor(h_hlth)+factor(employ)+factor(educ)+factor(race_eth)+factor(smoke)+factor(genhlth)+factor(sex)+factor(marital)+factor(hdamage)+factor(c_asth)+factor(mold)+factor(vermon),
          data=tb19_train[-1],
          family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm1)
## 
## Call:
## glm(formula = copd ~ factor(h_hlth) + factor(employ) + factor(educ) + 
##     factor(race_eth) + factor(smoke) + factor(genhlth) + factor(sex) + 
##     factor(marital) + factor(hdamage) + factor(c_asth) + factor(mold) + 
##     factor(vermon), family = binomial, data = tb19_train[-1])
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.330e-05  -3.558e-06  -2.110e-08  -2.110e-08   1.348e-05  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)
## (Intercept)                  -3.329e+01  6.245e+05       0        1
## factor(h_hlth)1               6.963e+01  5.884e+05       0        1
## factor(employ)nilf            1.867e+01  3.007e+05       0        1
## factor(employ)retired         2.990e+01  2.911e+05       0        1
## factor(employ)unable          2.940e+00  5.841e+05       0        1
## factor(educ)colgrad           2.799e+00  4.738e+05       0        1
## factor(educ)leshs             4.307e+01  1.147e+06       0        1
## factor(educ)sumcol            1.158e+01  6.925e+05       0        1
## factor(race_eth)nh multirace -4.309e+01  9.709e+05       0        1
## factor(race_eth)nh other     -9.107e+01  5.976e+05       0        1
## factor(race_eth)nhwhite      -1.133e+01  3.223e+05       0        1
## factor(smoke)Current          4.237e+01  5.320e+05       0        1
## factor(smoke)Former          -4.573e+01  7.885e+05       0        1
## factor(genhlth)poor          -1.516e+01  2.131e+05       0        1
## factor(sex)Male               7.256e+00  3.274e+05       0        1
## factor(marital)cohab          4.900e+01  1.697e+05       0        1
## factor(marital)divorced       1.566e+01  2.417e+05       0        1
## factor(marital)nm            -4.893e+01  2.520e+05       0        1
## factor(marital)separated      2.565e+01  3.177e+05       0        1
## factor(marital)widowed        3.465e+00  3.137e+05       0        1
## factor(hdamage)moderate      -5.883e+01  3.372e+05       0        1
## factor(hdamage)none          -1.361e+01  3.392e+05       0        1
## factor(hdamage)severe        -4.543e+01  3.867e+05       0        1
## factor(c_asth)1              -9.176e+00  3.624e+05       0        1
## factor(mold)1                 7.660e+00  3.277e+05       0        1
## factor(vermon)1               2.553e+01  5.155e+05       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4.3715e+01  on 61  degrees of freedom
## Residual deviance: 1.6203e-09  on 36  degrees of freedom
## AIC: 52
## 
## Number of Fisher Scoring iterations: 25
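The convergence warning and the near-zero residual deviance suggest the model separates the 62 training observations perfectly; a quick check is to count how many fitted probabilities are pushed to 0 or 1:

# fitted probabilities that are essentially 0 or 1 indicate complete separation
table(fitted(glm1) < 0.001 | fitted(glm1) > 0.999)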

# classify using the 0.5 decision rule

tb19_pred<- predict(glm1,
                  newdata = tb19_train,
                  type = "response")

head(tb19_pred)
##            1            2            3            4            5            6 
## 2.220446e-16 1.000000e+00 5.625563e-13 1.423922e-11 1.000000e+00 2.220446e-16
tb19_predcl<-factor(ifelse(tb19_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tb19_pred, gr=tb19_predcl, modcon=tb19_train$copd)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of COPD", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of COPD", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tb19_predcl,tb19_train$copd)
##            
## tb19_predcl  0  1
##           0 55  0
##           1  0  7
cm1<-confusionMatrix(data = tb19_predcl,
                     reference = tb19_train$copd )
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 55  0
##          1  0  7
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9422, 1)
##     No Information Rate : 0.8871     
##     P-Value [Acc > NIR] : 0.0005946  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8871     
##          Detection Rate : 0.8871     
##    Detection Prevalence : 0.8871     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
## 
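The confusion matrix already implies it, but the % correctly classified in the training data under the 0.5 rule can also be reported directly (here it is 100%, matching the Accuracy above):

# proportion of training observations correctly classified with the 0.5 rule
mean(as.character(tb19_predcl) == as.character(tb19_train$copd))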
tb19_predcl<-factor(ifelse(tb19_pred>mean(I(tb19_train$copd==1)), 1, 0)) # mean of the training outcome as the threshold

pred2<-data.frame(pr=tb19_pred, gr=tb19_predcl, modcon=tb19_train$copd)

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of COPD", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(tb19_train$copd==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of COPD", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(tb19_train$copd==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
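For comparison with the 0.5 rule, the % correct classification in the training data under the mean-of-the-outcome rule (the reclassified tb19_predcl above) can be reported the same way:

# confusion table and % correctly classified with the mean-of-outcome threshold
table(tb19_predcl, tb19_train$copd)
mean(as.character(tb19_predcl) == as.character(tb19_train$copd))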

pred_test<-predict(glm1,
                   newdata=tb19_test,
                   type="response")

# note: this uses the test-set mean as the cutoff; the training-set mean is the more usual choice
pred_cl<-factor(ifelse(pred_test > mean( I(tb19_test$copd==1)), 1, 0))

table(tb19_test$copd,pred_cl)
##    pred_cl
##      0  1
##   0 11  2
##   1  1  0
confusionMatrix(data = pred_cl,tb19_test$copd )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 11  1
##          1  2  0
##                                          
##                Accuracy : 0.7857         
##                  95% CI : (0.492, 0.9534)
##     No Information Rate : 0.9286         
##     P-Value [Acc > NIR] : 0.9854         
##                                          
##                   Kappa : -0.1053        
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.8462         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.9167         
##          Neg Pred Value : 0.0000         
##              Prevalence : 0.9286         
##          Detection Rate : 0.7857         
##    Detection Prevalence : 0.8571         
##       Balanced Accuracy : 0.4231         
##                                          
##        'Positive' Class : 0              
## 
#The training and test models score very differently: the model fits the training data perfectly but does worse than the
#no-information rate on the test data. The underlying problem appears to be sample size rather than model form:
#filter(complete.cases(.)) drops any respondent missing any of the 19 selected variables (most likely the Harvey-specific
#items), leaving only 76 observations (62 training, 14 test) with just 7 and 1 COPD cases respectively. With so few cases
#and this many predictors, the model separates the training data completely and cannot generalize to the test set. I don't
#know if my recoding also contributed, but the complete-case filter is the most likely source of the error.
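One way to confirm where the observations are lost is to count missing values in each analysis variable before the complete.cases() filter; a minimal sketch (vars is just a helper vector listing the analysis variables):

# vars: helper vector of the analysis variables used in tb19_mod
vars <- c("age","ins","employ","educ","black","white","other","hispanic","race_eth",
          "smoke","genhlth","sex","marital","hdamage","h_hlth","c_asth","mold","vermon","copd")
# missing values per analysis variable, before any filtering
colSums(is.na(tb19[, vars]))
# rows that survive listwise deletion across all of these variables
sum(complete.cases(tb19[, vars]))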