# libraries
library(car, quietly = T)
library(stargazer, quietly = T)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
library(haven)
# read the 2019 state BRFSS public-use data file (Stata format)
tb19 <- read_dta("state_19_working_pudf.dta")

nams<-names(tb19)
head(nams, n=10)
##  [1] "year"     "ststr"    "state"    "fmonth"   "dispcode" "orgseqno"
##  [7] "c01q01"   "fairpoor" "c02q01"   "phyng"
# standardize variable names: lower-case and strip underscores
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(tb19)<-newnames

# recode variables

#age
tb19$agegr4 = as.factor(tb19$agegr4)
tb19$age<-Recode(tb19$agegr4, recodes="1='18-19'; 2='30-44'; 3='45-64'; 4='65+'; else=NA", as.factor=T)

#insurance
tb19$c03q01 = as.factor(tb19$c03q01)
tb19$ins<-Recode(tb19$c03q01, recodes ="1=1;2=0")

#employment
tb19$c08q14 = as.factor(tb19$c08q14)

tb19$employ<-Recode(tb19$c08q14, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
 tb19$employ<-relevel(tb19$employ, ref='Employed')

#education level
 tb19$educat4 = as.factor(tb19$educat4)
tb19$educ<-Recode(tb19$educat4, recodes="1='leshs'; 2='hsgrad'; 3='sumcol'; 4='colgrad'", as.factor=T)
tb19$educ<-relevel(tb19$educ, ref='hsgrad')

#race/ethnicity
tb19$raceeth = as.factor(tb19$raceeth)

tb19$black<-Recode(tb19$raceeth, recodes="2=1; 9=NA; else=0")
tb19$white<-Recode(tb19$raceeth, recodes="1=1; 9=NA; else=0")
tb19$other<-Recode(tb19$raceeth, recodes="3:4=1; 9=NA; else=0")
tb19$hispanic<-Recode(tb19$raceeth, recodes="5=1; 9=NA; else=0")

tb19$race_eth<-Recode(tb19$raceeth, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)

#smoking currently
tb19$smoker2 = as.factor(tb19$smoker2)

tb19$smoke<-Recode(tb19$smoker2, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
tb19$smoke<-relevel(tb19$smoke, ref = "NeverSmoked")

#Poor or fair self rated health
tb19$fairpoor = as.factor(tb19$fairpoor)
tb19$genhlth<-Recode(tb19$fairpoor, recodes="1='poor'; 2='good'")


#sex
tb19$sex<-as.factor(ifelse(tb19$sex==1, "Male", "Female"))

#marital status
tb19$c08q05 = as.factor(tb19$c08q05)
tb19$marital<-Recode(tb19$c08q05, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
tb19$marital<-relevel(tb19$marital, ref='married')

# Harvey: home damage
tb19$tx04q04 = as.factor(tb19$tx04q04)
tb19$hdamage<-Recode(tb19$tx04q04, recodes="1='mildly';2='moderate'; 3='severe';4='none'; else=NA", as.factor=T)

# Harvey: health affected
tb19$tx04q05 = as.factor(tb19$tx04q05)
tb19$h_hlth<-Recode(tb19$tx04q05, recodes="1=1;2=0", as.factor=T)

# Harvey: child asthma
tb19$tx05q01 = as.factor(tb19$tx05q01)
tb19$c_asth<-Recode(tb19$tx05q01, recodes="1=1;2=0; 3='notaffected'")

# Harvey: mold in the home
tb19$tx05q06 = as.factor(tb19$tx05q06)
tb19$mold<-Recode(tb19$tx05q06, recodes="1=1;2=0")

# Harvey: vermin in the home (variable kept as vermon)
tb19$tx05q09 = as.factor(tb19$tx05q09)
tb19$vermon<-Recode(tb19$tx05q09, recodes="1=1;2=0")

#copd
tb19$c06q08 = as.factor(tb19$c06q08)
tb19$copd<-Recode(tb19$c06q08, recodes="1=1;2=0")
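A quick cross-tabulation of each source item against its recoded version is a useful sanity check; a minimal sketch for the COPD recode (the same pattern applies to the other recodes above), which also helps spot source codes such as don't know/refused that still need to be set to NA:

# check that the recode maps the source codes to the new categories as intended
table(tb19$c06q08, tb19$copd, useNA = "ifany")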
  1. Define a binary outcome of your choosing
# My binary outcome is Chronic Obstructive Pulmonary Disease (COPD).
tb19$copd<-Recode(tb19$c06q08, recodes="1=1; 2=0; else=NA", as.factor=T)
summary(tb19$copd)
##     0     1  NA's 
## 11188  1036    73
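COPD is a rare outcome here (roughly 8.5% of respondents with a valid answer), which matters later when picking a classification threshold; a quick unweighted check:

# unweighted proportion with and without COPD
prop.table(table(tb19$copd))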
  2. Fit a predictive logistic regression model using as many predictor variables as you think you need.
# keep only the analysis variables and drop rows with any missing value
tb19_mod<-tb19 %>% 
          dplyr::select(age,ins,employ,educ,black,white,other,hispanic,race_eth,smoke,genhlth,sex,marital,hdamage,h_hlth,c_asth,mold,vermon,copd) %>%
  filter(complete.cases(.))

knitr::kable(head(tb19_mod))
| age   | ins | employ   | educ    | black | white | other | hispanic | race_eth | smoke       | genhlth | sex    | marital   | hdamage  | h_hlth | c_asth | mold | vermon | copd |
|-------|-----|----------|---------|-------|-------|-------|----------|----------|-------------|---------|--------|-----------|----------|--------|--------|------|--------|------|
| 45-64 | 1   | Employed | hsgrad  | 0     | 1     | 0     | 0        | nhwhite  | NeverSmoked | good    | Female | married   | none     | 0      | 0      | 0    | 0      | 0    |
| 65+   | 1   | retired  | sumcol  | 0     | 1     | 0     | 0        | nhwhite  | NeverSmoked | good    | Female | married   | moderate | 0      | 0      | 0    | 0      | 0    |
| 45-64 | 1   | nilf     | colgrad | 1     | 0     | 0     | 0        | nh black | NeverSmoked | good    | Female | separated | moderate | 1      | 1      | 1    | 0      | 1    |
| 18-19 | 1   | Employed | hsgrad  | 1     | 0     | 0     | 0        | nh black | NeverSmoked | good    | Male   | cohab     | moderate | 0      | 0      | 1    | 0      | 0    |
| 30-44 | 1   | unable   | hsgrad  | 1     | 0     | 0     | 0        | nh black | NeverSmoked | poor    | Female | married   | none     | 0      | 0      | 0    | 0      | 0    |
| 30-44 | 1   | Employed | sumcol  | 0     | 1     | 0     | 0        | nhwhite  | Current     | good    | Female | widowed   | severe   | 0      | 0      | 1    | 0      | 0    |
subj<-tb19%>%
  select(age,ins,employ,educ,black,white,other,hispanic,race_eth,smoke,genhlth,sex,marital,hdamage,h_hlth,c_asth,mold,vermon,copd) %>%
  filter( complete.cases(.))

options(survey.lonely.psu = "adjust")
# survey design: single-stage, stratified on ststr, weighted by llcpwt
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data =tb19)
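The design object can also give a design-based (weighted) prevalence of the outcome before modeling; a minimal sketch:

# survey-weighted proportion in each COPD category
svymean(~copd, des, na.rm = TRUE)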
# survey-weighted logistic regression predicting COPD
fit.logit<-svyglm(copd~sex+ins+employ+educ+race_eth+smoke+genhlth+marital+hdamage+h_hlth+c_asth+mold+vermon,
                  design= des,
                  family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.logit)
## 
## Call:
## svyglm(formula = copd ~ sex + ins + employ + educ + race_eth + 
##     smoke + genhlth + marital + hdamage + h_hlth + c_asth + mold + 
##     vermon, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = tb19)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -57.584      3.873 -14.870 2.75e-13 ***
## sexMale                32.117      2.631  12.208 1.57e-11 ***
## ins1                    2.725      1.212   2.249 0.034370 *  
## employnilf             41.660      1.821  22.874  < 2e-16 ***
## employretired         113.784      3.516  32.358  < 2e-16 ***
## employunable          -17.091      3.562  -4.798 7.71e-05 ***
## educcolgrad             1.165      1.414   0.824 0.418441    
## educleshs              31.501      2.288  13.767 1.36e-12 ***
## educsumcol              6.376      1.354   4.708 9.64e-05 ***
## race_ethnh multirace   -9.724      2.395  -4.060 0.000484 ***
## race_ethnh other      -62.091      4.053 -15.319 1.47e-13 ***
## race_ethnhwhite        -8.878      2.199  -4.038 0.000512 ***
## smokeCurrent           12.921      2.417   5.346 1.98e-05 ***
## smokeFormer           -49.481      3.054 -16.202 4.50e-14 ***
## genhlthpoor            19.397      2.276   8.522 1.43e-08 ***
## maritalcohab           47.760      1.295  36.889  < 2e-16 ***
## maritaldivorced         8.584      1.534   5.595 1.08e-05 ***
## maritalnm             -48.246      2.011 -23.990  < 2e-16 ***
## maritalseparated       -1.721      1.839  -0.936 0.358911    
## maritalwidowed         56.888      3.631  15.668 9.16e-14 ***
## hdamagemoderate       -81.945      1.531 -53.520  < 2e-16 ***
## hdamagenone           -72.190      2.340 -30.857  < 2e-16 ***
## hdamagesevere         -70.863      2.718 -26.068  < 2e-16 ***
## h_hlth1               141.048      2.817  50.070  < 2e-16 ***
## c_asth1               -58.657      1.791 -32.744  < 2e-16 ***
## mold1                  33.555      1.962  17.105 1.42e-14 ***
## vermon1               105.749      2.977  35.518  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.476337e-11)
## 
## Number of Fisher Scoring iterations: 25
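For reporting, the coefficients are easier to read as odds ratios, though given the convergence warnings above these estimates should be treated with caution; a minimal sketch:

# odds ratios and 95% confidence intervals from the survey-weighted model
round(exp(cbind(OR = coef(fit.logit), confint(fit.logit))), 3)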
  3. Use an 80% training/20% test split for your data.
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
#tb19_mod <- na.omit(tb19) 
set.seed(1115)
# 80/20 split, stratified on the outcome
trainset<- createDataPartition(y = tb19_mod$copd,
                            p = .80,
                            list=F)


tb19_train<-tb19_mod[trainset,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
tb19_test<-tb19_mod[-trainset,]
table(tb19_train$copd)
## 
##  0  1 
## 55  7
prop.table(table(tb19_train$copd))
## 
##         0         1 
## 0.8870968 0.1129032
summary(tb19_train)
##     age     ins         employ        educ    black  white  other  hispanic
##  18-19: 4   0:15   Employed:32   hsgrad :13   0:51   0:21   0:52   0:62    
##  30-44:34   1:47   nilf    :19   colgrad:20   1:11   1:41   1:10           
##  45-64:21          retired : 2   leshs  : 9                                
##  65+  : 3          unable  : 9   sumcol :20                                
##                                                                            
##                                                                            
##          race_eth          smoke    genhlth       sex          marital  
##  nh black    :11   NeverSmoked:39   good:46   Female:43   married  :34  
##  nh multirace: 2   Current    :11   poor:16   Male  :19   cohab    : 6  
##  nh other    : 8   Former     :12                         divorced :10  
##  nhwhite     :41                                          nm       : 4  
##                                                           separated: 4  
##                                                           widowed  : 4  
##      hdamage   h_hlth         c_asth   mold   vermon copd  
##  mildly  :16   0:53   0          :46   0:41   0:58   0:55  
##  moderate:18   1: 9   1          :16   1:21   1: 4   1: 7  
##  none    :18          notaffected: 0                       
##  severe  :10                                               
##                                                            
## 
  4. Report the % correct classification from the training data using the .5 decision rule and again using the mean of the outcome decision rule.
#attach(tb19_train)

#tb19_train$copd <- as.numeric(tb19_train$copd)

# unweighted logistic regression on the training data
# (tb19_train[-1] drops the first column, age, which is not used as a predictor here)
glm1<-glm(copd~factor(h_hlth)+factor(employ)+factor(educ)+factor(race_eth)+factor(smoke)+factor(genhlth)+factor(sex)+factor(marital)+factor(hdamage)+factor(c_asth)+factor(mold)+factor(vermon),
          data=tb19_train[-1],
          family = binomial)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm1)
## 
## Call:
## glm(formula = copd ~ factor(h_hlth) + factor(employ) + factor(educ) + 
##     factor(race_eth) + factor(smoke) + factor(genhlth) + factor(sex) + 
##     factor(marital) + factor(hdamage) + factor(c_asth) + factor(mold) + 
##     factor(vermon), family = binomial, data = tb19_train[-1])
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.330e-05  -3.558e-06  -2.110e-08  -2.110e-08   1.348e-05  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)
## (Intercept)                  -3.329e+01  6.245e+05       0        1
## factor(h_hlth)1               6.963e+01  5.884e+05       0        1
## factor(employ)nilf            1.867e+01  3.007e+05       0        1
## factor(employ)retired         2.990e+01  2.911e+05       0        1
## factor(employ)unable          2.940e+00  5.841e+05       0        1
## factor(educ)colgrad           2.799e+00  4.738e+05       0        1
## factor(educ)leshs             4.307e+01  1.147e+06       0        1
## factor(educ)sumcol            1.158e+01  6.925e+05       0        1
## factor(race_eth)nh multirace -4.309e+01  9.709e+05       0        1
## factor(race_eth)nh other     -9.107e+01  5.976e+05       0        1
## factor(race_eth)nhwhite      -1.133e+01  3.223e+05       0        1
## factor(smoke)Current          4.237e+01  5.320e+05       0        1
## factor(smoke)Former          -4.573e+01  7.885e+05       0        1
## factor(genhlth)poor          -1.516e+01  2.131e+05       0        1
## factor(sex)Male               7.256e+00  3.274e+05       0        1
## factor(marital)cohab          4.900e+01  1.697e+05       0        1
## factor(marital)divorced       1.566e+01  2.417e+05       0        1
## factor(marital)nm            -4.893e+01  2.520e+05       0        1
## factor(marital)separated      2.565e+01  3.177e+05       0        1
## factor(marital)widowed        3.465e+00  3.137e+05       0        1
## factor(hdamage)moderate      -5.883e+01  3.372e+05       0        1
## factor(hdamage)none          -1.361e+01  3.392e+05       0        1
## factor(hdamage)severe        -4.543e+01  3.867e+05       0        1
## factor(c_asth)1              -9.176e+00  3.624e+05       0        1
## factor(mold)1                 7.660e+00  3.277e+05       0        1
## factor(vermon)1               2.553e+01  5.155e+05       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4.3715e+01  on 61  degrees of freedom
## Residual deviance: 1.6203e-09  on 36  degrees of freedom
## AIC: 52
## 
## Number of Fisher Scoring iterations: 25
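The convergence warning and the near-zero residual deviance suggest the model separates the 62 training observations perfectly; a quick check is to count how many fitted probabilities are pushed to 0 or 1:

# fitted probabilities that are essentially 0 or 1 indicate complete separation
table(fitted(glm1) < 0.001 | fitted(glm1) > 0.999)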

# classify using the 0.5 decision rule

tb19_pred<- predict(glm1,
                  newdata = tb19_train,
                  type = "response")

head(tb19_pred)
##            1            2            3            4            5            6 
## 2.220446e-16 1.000000e+00 5.625563e-13 1.423922e-11 1.000000e+00 2.220446e-16
tb19_predcl<-factor(ifelse(tb19_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tb19_pred, gr=tb19_predcl, modcon=tb19_train$copd)

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=gr, fill=gr))+
  ggtitle(label = "Probability of COPD", subtitle = "Threshold = .5")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred1%>%
  ggplot()+
  geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
  ggtitle(label = "Probability of COPD", subtitle = "Truth")+
  geom_vline(xintercept=.5)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table( tb19_predcl,tb19_train$copd)
##            
## tb19_predcl  0  1
##           0 55  0
##           1  0  7
cm1<-confusionMatrix(data = tb19_predcl,
                     reference = tb19_train$copd )
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 55  0
##          1  0  7
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9422, 1)
##     No Information Rate : 0.8871     
##     P-Value [Acc > NIR] : 0.0005946  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8871     
##          Detection Rate : 0.8871     
##    Detection Prevalence : 0.8871     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
## 
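The confusion matrix already implies it, but the % correctly classified in the training data under the 0.5 rule can also be reported directly (here it is 100%, matching the Accuracy above):

# proportion of training observations correctly classified with the 0.5 rule
mean(as.character(tb19_predcl) == as.character(tb19_train$copd))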
tb19_predcl<-factor(ifelse(tb19_pred>mean(I(tb19_train$copd==1)), 1, 0)) # mean of the training outcome as the threshold

pred2<-data.frame(pr=tb19_pred, gr=tb19_predcl, modcon=tb19_train$copd)

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of COPD", subtitle = "Threshold = Mean")+
  geom_vline(xintercept=mean(I(tb19_train$copd==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pred2%>%
  ggplot(aes(x=pr, fill=modcon))+
  geom_histogram(position="identity", alpha=.2)+
  ggtitle(label = "Probability of COPD", subtitle = "Truth")+
  geom_vline(xintercept=mean(I(tb19_train$copd==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
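For comparison with the 0.5 rule, the % correct classification in the training data under the mean-of-the-outcome rule (the reclassified tb19_predcl above) can be reported the same way:

# confusion table and % correctly classified with the mean-of-outcome threshold
table(tb19_predcl, tb19_train$copd)
mean(as.character(tb19_predcl) == as.character(tb19_train$copd))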

pred_test<-predict(glm1,
                   newdata=tb19_test,
                   type="response")

# note: this uses the test-set mean as the cutoff; the training-set mean is the more usual choice
pred_cl<-factor(ifelse(pred_test > mean( I(tb19_test$copd==1)), 1, 0))

table(tb19_test$copd,pred_cl)
##    pred_cl
##      0  1
##   0 11  2
##   1  1  0
confusionMatrix(data = pred_cl,tb19_test$copd )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 11  1
##          1  2  0
##                                          
##                Accuracy : 0.7857         
##                  95% CI : (0.492, 0.9534)
##     No Information Rate : 0.9286         
##     P-Value [Acc > NIR] : 0.9854         
##                                          
##                   Kappa : -0.1053        
##                                          
##  Mcnemar's Test P-Value : 1.0000         
##                                          
##             Sensitivity : 0.8462         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.9167         
##          Neg Pred Value : 0.0000         
##              Prevalence : 0.9286         
##          Detection Rate : 0.7857         
##    Detection Prevalence : 0.8571         
##       Balanced Accuracy : 0.4231         
##                                          
##        'Positive' Class : 0              
## 
#The training and test models score very differently: the model fits the training data perfectly but does worse than the
#no-information rate on the test data. The underlying problem appears to be sample size rather than model form:
#filter(complete.cases(.)) drops any respondent missing any of the 19 selected variables (most likely the Harvey-specific
#items), leaving only 76 observations (62 training, 14 test) with just 7 and 1 COPD cases respectively. With so few cases
#and this many predictors, the model separates the training data completely and cannot generalize to the test set. I don't
#know if my recoding also contributed, but the complete-case filter is the most likely source of the error.
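One way to confirm where the observations are lost is to count missing values in each analysis variable before the complete.cases() filter; a minimal sketch (vars is just a helper vector listing the analysis variables):

# vars: helper vector of the analysis variables used in tb19_mod
vars <- c("age","ins","employ","educ","black","white","other","hispanic","race_eth",
          "smoke","genhlth","sex","marital","hdamage","h_hlth","c_asth","mold","vermon","copd")
# missing values per analysis variable, before any filtering
colSums(is.na(tb19[, vars]))
# rows that survive listwise deletion across all of these variables
sum(complete.cases(tb19[, vars]))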