## Loading required package: carData
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
a.) Define a binary outcome variable of your choosing and define how you recode the original variable
brfss_17$chcocncr<-Recode(brfss_17$chcocncr, recodes ="7:9=NA; 1=1;2=0")
#I selected chcocncr, Have you ever been told you have other types of cancer? Where 1=1 is yes, 2=0 is no.
b.) State a research question about what factors you believe will affect your outcome variable.
#Does gender correlate with cancer?
c.) Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.
#Income, zip code, race/ethnicity, age, and gender. For assignment purposes I will select gender and health insurance which are both categorical variables.
#gender
brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
d.) Perform a descriptive analysis of the outcome variable by each of the variables you defined in part c. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!
#Texas
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses
brfss_17<-brfss_17%>%
filter(tx==1, is.na(mmsawt)==F, is.na(chcocncr)==F)
dc<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)
print(dc, format="p")
## Stratified by chcocncr
## 0 1 p test
## n 7805 815
## sex = Male (%) 42.3 35.8 <0.001
## hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001
print(dc)
## Stratified by chcocncr
## 0 1 p test
## n 7805 815
## sex = Male (%) 3305 (42.3) 292 (35.8) <0.001
## hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001
d-a.) Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.
#with sex variable weights and no weights
#counts
#raw, unweighted table
table(brfss_17$chcocncr, brfss_17$sex)
##
## Female Male
## 0 4500 3305
## 1 523 292
csex<-wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt)
print(csex)
## Female Male
## 0 7395078.7 7207020.8
## 1 498331.8 285301.3
#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt), margin=2)
## Female Male
## 0 0.93686737 0.96192085
## 1 0.06313263 0.03807915
#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$sex), margin=2)
##
## Female Male
## 0 0.89587896 0.91882124
## 1 0.10412104 0.08117876
d-a #with insurance variable weights and no weights
#No weights
table(brfss_17$chcocncr, brfss_17$hlthpln1)
##
## 1 2 7 9
## 0 6685 1087 16 17
## 1 775 39 1 0
cins<-wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt)
print(cins)
## 1 2 7 9
## 0 1.104828e+07 3.490948e+06 4.170268e+04 2.116835e+04
## 1 7.055679e+05 7.806041e+04 4.705983e+00 0.000000e+00
#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt), margin=2)
## 1 2 7 9
## 0 0.9399713253 0.9781282661 0.9998871667 1.0000000000
## 1 0.0600286747 0.0218717339 0.0001128333 0.0000000000
#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$hlthpln1), margin=2)
##
## 1 2 7 9
## 0 0.89611260 0.96536412 0.94117647 1.00000000
## 1 0.10388740 0.03463588 0.05882353 0.00000000
d-b.) Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (table one package helps)
library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
## Stratified by chcocncr
## 0 1 p test
## n 7805 815
## sex = Male (%) 42.3 35.8 <0.001
## hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001
#using survey design
des<-svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17)
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
## Stratified by chcocncr
## 0 1 p test
## n 14602099.4 783633.0
## sex = Male (%) 49.4 36.4 0.005
## hlthpln1 (mean (SD)) 1.27 (0.60) 1.10 (0.30) <0.001
d-c.) Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?
#Yes, there are substantive differences. Numbers with no survey design were higher while numbers with a survey design were slightly lower.
#Homework 3 1. perform a logistic regression (or probit regression) of a binary outcome, using the dataset of your choice. I ask you specify a research question for your analysis and generate appropriate predictors in order to examine your question.
#Logit model
fit.logit<-svyglm(chcocncr ~ sex + hlthpln1,
design = des,
family = binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
library(broom)
fit.logit%>%
tidy()%>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -1.548 | 0.366 | -4.225 | 0.000 |
| sexMale | -0.480 | 0.197 | -2.434 | 0.015 |
| hlthpln1 | -1.005 | 0.342 | -2.939 | 0.003 |
fit.logit%>%
tidy()%>%
mutate(OR = exp(estimate))%>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value | OR |
|---|---|---|---|---|---|
| (Intercept) | -1.548 | 0.366 | -4.225 | 0.000 | 0.213 |
| sexMale | -0.480 | 0.197 | -2.434 | 0.015 | 0.619 |
| hlthpln1 | -1.005 | 0.342 | -2.939 | 0.003 | 0.366 |
myexp<-function(x){exp(x)}
fit.logit%>%
tidy()%>%
mutate(OR = exp(estimate),
LowerOR_Ci = exp(estimate - 1.96*std.error),
UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value | OR | LowerOR_Ci | UpperOR_Ci |
|---|---|---|---|---|---|---|---|
| (Intercept) | -1.548 | 0.366 | -4.225 | 0.000 | 0.213 | 0.104 | 0.436 |
| sexMale | -0.480 | 0.197 | -2.434 | 0.015 | 0.619 | 0.421 | 0.911 |
| hlthpln1 | -1.005 | 0.342 | -2.939 | 0.003 | 0.366 | 0.187 | 0.715 |
Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.
library(emmeans)
rg<-ref_grid(fit.logit)
marg_logit<-emmeans(object = rg,
specs = c( "sex", "hlthpln1"),
type="response" )
knitr::kable(marg_logit, digits = 4)
| sex | hlthpln1 | prob | SE | df | asymp.LCL | asymp.UCL |
|---|---|---|---|---|---|---|
| Female | 1.1582 | 0.0622 | 0.0074 | Inf | 0.0493 | 0.0784 |
| Male | 1.1582 | 0.0395 | 0.0057 | Inf | 0.0297 | 0.0522 |