## Loading required package: carData
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

a.) Define a binary outcome variable of your choosing and define how you recode the original variable

brfss_17$chcocncr<-Recode(brfss_17$chcocncr, recodes ="7:9=NA; 1=1;2=0") 


#I selected chcocncr, Have you ever been told you have other types of cancer? Where 1=1 is yes, 2=0 is no. 

b.) State a research question about what factors you believe will affect your outcome variable.

#Does gender correlate with cancer?

c.) Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

#Income, zip code, race/ethnicity, age, and gender. For assignment purposes I will select gender and health insurance which are both categorical variables. 

#gender
brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))


#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

d.) Perform a descriptive analysis of the outcome variable by each of the variables you defined in part c. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

#Texas
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(chcocncr)==F)

dc<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)

print(dc, format="p")
##                       Stratified by chcocncr
##                        0           1           p      test
##   n                    7805         815                   
##   sex = Male (%)       42.3        35.8        <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001
print(dc)
##                       Stratified by chcocncr
##                        0            1            p      test
##   n                    7805          815                    
##   sex = Male (%)       3305 (42.3)   292 (35.8)  <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57)  1.06 (0.30)  <0.001

d-a.) Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

#with sex variable weights and no weights 
#counts
#raw, unweighted table 
table(brfss_17$chcocncr, brfss_17$sex)
##    
##     Female Male
##   0   4500 3305
##   1    523  292
csex<-wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt)
print(csex)
##      Female      Male
## 0 7395078.7 7207020.8
## 1  498331.8  285301.3
#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt), margin=2)
##       Female       Male
## 0 0.93686737 0.96192085
## 1 0.06313263 0.03807915
#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$sex), margin=2)
##    
##         Female       Male
##   0 0.89587896 0.91882124
##   1 0.10412104 0.08117876

d-a #with insurance variable weights and no weights

#No weights
table(brfss_17$chcocncr, brfss_17$hlthpln1)
##    
##        1    2    7    9
##   0 6685 1087   16   17
##   1  775   39    1    0
cins<-wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt)
print(cins)
##              1            2            7            9
## 0 1.104828e+07 3.490948e+06 4.170268e+04 2.116835e+04
## 1 7.055679e+05 7.806041e+04 4.705983e+00 0.000000e+00
#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt), margin=2)
##              1            2            7            9
## 0 0.9399713253 0.9781282661 0.9998871667 1.0000000000
## 1 0.0600286747 0.0218717339 0.0001128333 0.0000000000
#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$hlthpln1), margin=2)
##    
##              1          2          7          9
##   0 0.89611260 0.96536412 0.94117647 1.00000000
##   1 0.10388740 0.03463588 0.05882353 0.00000000

d-b.) Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (table one package helps)

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
##                       Stratified by chcocncr
##                        0           1           p      test
##   n                    7805         815                   
##   sex = Male (%)       42.3        35.8        <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001
#using survey design
des<-svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17)
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
##                       Stratified by chcocncr
##                        0                  1                p      test
##   n                     14602099.4         783633.0                   
##   sex = Male (%)              49.4             36.4         0.005     
##   hlthpln1 (mean (SD))        1.27 (0.60)      1.10 (0.30) <0.001

d-c.) Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

#Yes, there are substantive differences. Numbers with no survey design were higher while numbers with a survey design were slightly lower. 

#Homework 3 1. perform a logistic regression (or probit regression) of a binary outcome, using the dataset of your choice. I ask you specify a research question for your analysis and generate appropriate predictors in order to examine your question.

#Logit model
fit.logit<-svyglm(chcocncr ~ sex + hlthpln1,
                  design = des,
                  family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
library(broom)
fit.logit%>%
  tidy()%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) -1.548 0.366 -4.225 0.000
sexMale -0.480 0.197 -2.434 0.015
hlthpln1 -1.005 0.342 -2.939 0.003
  1. Present results from a model with sample weights and design effects, if your data allow for this. Present the results in tabular form, with Parameter estimates, odds ratios (if using the logit model) and confidence intervals for the odds ratios.
fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR
(Intercept) -1.548 0.366 -4.225 0.000 0.213
sexMale -0.480 0.197 -2.434 0.015 0.619
hlthpln1 -1.005 0.342 -2.939 0.003 0.366
myexp<-function(x){exp(x)}

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR LowerOR_Ci UpperOR_Ci
(Intercept) -1.548 0.366 -4.225 0.000 0.213 0.104 0.436
sexMale -0.480 0.197 -2.434 0.015 0.619 0.421 0.911
hlthpln1 -1.005 0.342 -2.939 0.003 0.366 0.187 0.715

Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.

library(emmeans)
rg<-ref_grid(fit.logit)

marg_logit<-emmeans(object = rg,
              specs = c( "sex",  "hlthpln1"),
              type="response" )

knitr::kable(marg_logit,  digits = 4)
sex hlthpln1 prob SE df asymp.LCL asymp.UCL
Female 1.1582 0.0622 0.0074 Inf 0.0493 0.0784
Male 1.1582 0.0395 0.0057 Inf 0.0297 0.0522