Homework 3

## Loading required package: carData

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

nams<-names(brfss_17)
head(nams, n=10)

##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

a.) Define a binary outcome variable of your choosing and define how you recode the original variable

brfss_17$chcocncr<-Recode(brfss_17$chcocncr, recodes ="7:9=NA; 1=1;2=0") 


#I selected chcocncr, Have you ever been told you have other types of cancer? Where 1=1 is yes, 2=0 is no.

b.) State a research question about what factors you believe will affect your outcome variable.

#Does gender correlate with cancer?

c.) Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

#Income, zip code, race/ethnicity, age, and gender. For assignment purposes I will select gender and health insurance which are both categorical variables. 

#gender
brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))


#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

d.) Perform a descriptive analysis of the outcome variable by each of the variables you defined in part c. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

#Texas
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(chcocncr)==F)

dc<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)

print(dc, format="p")

##                       Stratified by chcocncr
##                        0           1           p      test
##   n                    7805         815                   
##   sex = Male (%)       42.3        35.8        <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001

print(dc)

##                       Stratified by chcocncr
##                        0            1            p      test
##   n                    7805          815                    
##   sex = Male (%)       3305 (42.3)   292 (35.8)  <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57)  1.06 (0.30)  <0.001

d-a.) Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

#with sex variable weights and no weights 
#counts
#raw, unweighted table 
table(brfss_17$chcocncr, brfss_17$sex)

##    
##     Female Male
##   0   4500 3305
##   1    523  292

csex<-wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt)
print(csex)

##      Female      Male
## 0 7395078.7 7207020.8
## 1  498331.8  285301.3

#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt), margin=2)

##       Female       Male
## 0 0.93686737 0.96192085
## 1 0.06313263 0.03807915

#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$sex), margin=2)

##    
##         Female       Male
##   0 0.89587896 0.91882124
##   1 0.10412104 0.08117876

d-a #with insurance variable weights and no weights

#No weights
table(brfss_17$chcocncr, brfss_17$hlthpln1)

##    
##        1    2    7    9
##   0 6685 1087   16   17
##   1  775   39    1    0

cins<-wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt)
print(cins)

##              1            2            7            9
## 0 1.104828e+07 3.490948e+06 4.170268e+04 2.116835e+04
## 1 7.055679e+05 7.806041e+04 4.705983e+00 0.000000e+00

#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt), margin=2)

##              1            2            7            9
## 0 0.9399713253 0.9781282661 0.9998871667 1.0000000000
## 1 0.0600286747 0.0218717339 0.0001128333 0.0000000000

#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$hlthpln1), margin=2)

##    
##              1          2          7          9
##   0 0.89611260 0.96536412 0.94117647 1.00000000
##   1 0.10388740 0.03463588 0.05882353 0.00000000

d-b.) Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (table one package helps)

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")

##                       Stratified by chcocncr
##                        0           1           p      test
##   n                    7805         815                   
##   sex = Male (%)       42.3        35.8        <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001

#using survey design
des<-svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17)
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")

##                       Stratified by chcocncr
##                        0                  1                p      test
##   n                     14602099.4         783633.0                   
##   sex = Male (%)              49.4             36.4         0.005     
##   hlthpln1 (mean (SD))        1.27 (0.60)      1.10 (0.30) <0.001

d-c.) Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

#Yes, there are substantive differences. Numbers with no survey design were higher while numbers with a survey design were slightly lower.

#Homework 3 1. perform a logistic regression (or probit regression) of a binary outcome, using the dataset of your choice. I ask you specify a research question for your analysis and generate appropriate predictors in order to examine your question.

#Logit model
fit.logit<-svyglm(chcocncr ~ sex + hlthpln1,
                  design = des,
                  family = binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

library(broom)
fit.logit%>%
  tidy()%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	-1.548	0.366	-4.225	0.000
sexMale	-0.480	0.197	-2.434	0.015
hlthpln1	-1.005	0.342	-2.939	0.003

Present results from a model with sample weights and design effects, if your data allow for this. Present the results in tabular form, with Parameter estimates, odds ratios (if using the logit model) and confidence intervals for the odds ratios.

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate))%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value	OR
(Intercept)	-1.548	0.366	-4.225	0.000	0.213
sexMale	-0.480	0.197	-2.434	0.015	0.619
hlthpln1	-1.005	0.342	-2.939	0.003	0.366

myexp<-function(x){exp(x)}

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value	OR	LowerOR_Ci	UpperOR_Ci
(Intercept)	-1.548	0.366	-4.225	0.000	0.213	0.104	0.436
sexMale	-0.480	0.197	-2.434	0.015	0.619	0.421	0.911
hlthpln1	-1.005	0.342	-2.939	0.003	0.366	0.187	0.715

Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.

library(emmeans)
rg<-ref_grid(fit.logit)

marg_logit<-emmeans(object = rg,
              specs = c( "sex",  "hlthpln1"),
              type="response" )

knitr::kable(marg_logit,  digits = 4)

sex	hlthpln1	prob	SE	df	asymp.LCL	asymp.UCL
Female	1.1582	0.0622	0.0074	Inf	0.0493	0.0784
Male	1.1582	0.0395	0.0057	Inf	0.0297	0.0522

Homework 3

Julie A. Gonzalez

2/15/2021