load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
##Get TX Cities
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses
brfss_17<-brfss_17%>%
filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)
# 1. Define a binary outcome variable of your choosing and define how you recode the original variable.
##I will use the variable hlthpln1 as my binary variable. The question is: Do you have any kind of health care coverage, including health insurance, prepaid plans such as HMOs, or government plans such as Medicare,or Indian Health Service?
#I used your code to recode the original variable.
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
View(brfss_17)
#2. State a research question about what factors you believe will affect your outcome variable.
##What effect does gender and education have on health insurance coverage?
#3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.
##I will use incomeg and educag and here is how I recoded the variables.
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
##Gender variable recode
brfss_17$gender<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))
# brfss_17$employ<-Recode(brfss_17$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
# brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#
# as.factor(brfss_17$employ)
#
# as.factor(brfss_17$educ)
#
# table(brfss_17$employ)
#4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!
library(tableone)
t1<-CreateTableOne(vars = c("educ", "gender"), strata = "hlthpln1", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
## Stratified by hlthpln1
## 0 1 p test
## n 1131 7466
## educ (%) <0.001
## 0hs 27.9 7.2
## 2hsgrad 33.0 20.7
## 3somecol 23.3 26.3
## 4colgrad 15.8 45.8
## gender = Male (%) 47.3 40.8 <0.001
print(t1)
## Stratified by hlthpln1
## 0 1 p test
## n 1131 7466
## educ (%) <0.001
## 0hs 314 (27.9) 535 ( 7.2)
## 2hsgrad 372 (33.0) 1539 (20.7)
## 3somecol 262 (23.3) 1956 (26.3)
## 4colgrad 178 (15.8) 3405 (45.8)
## gender = Male (%) 535 (47.3) 3049 (40.8) <0.001
#4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.
#For educ variable
##Raw frequencies - No Weights
table(brfss_17$hlthpln1, brfss_17$educ)
##
## 0hs 2hsgrad 3somecol 4colgrad
## 0 314 372 262 178
## 1 535 1539 1956 3405
##Frequencies with Weights
teduc<-wtd.table(brfss_17$hlthpln1, brfss_17$educ, weights = brfss_17$mmsawt)
print(teduc)
## 0hs 2hsgrad 3somecol 4colgrad
## 0 1284062.4 1139395.3 881802.7 266394.6
## 1 1225410.5 2644849.0 3857401.4 3986029.0
##column percentages - No Weights
prop.table(table(brfss_17$hlthpln1, brfss_17$educ), margin=2)
##
## 0hs 2hsgrad 3somecol 4colgrad
## 0 0.36984688 0.19466248 0.11812444 0.04967904
## 1 0.63015312 0.80533752 0.88187556 0.95032096
##Percentages - Weights
prop.table(wtd.table(brfss_17$hlthpln1, brfss_17$educ, weights = brfss_17$mmsawt), margin=2)
## 0hs 2hsgrad 3somecol 4colgrad
## 0 0.51168609 0.30108926 0.18606557 0.06264536
## 1 0.48831391 0.69891074 0.81393443 0.93735464
##basic chi square test of independence
chisq.test(table(brfss_17$hlthpln1, brfss_17$educ))
##
## Pearson's Chi-squared test
##
## data: table(brfss_17$hlthpln1, brfss_17$educ)
## X-squared = 702.45, df = 3, p-value < 2.2e-16
#For gender variable
##Raw frequencies - No Weights
table(brfss_17$hlthpln1, brfss_17$gender)
##
## Female Male
## 0 596 535
## 1 4417 3049
##Frequencies - With Weights
tgen<-wtd.table(brfss_17$hlthpln1, brfss_17$gender, weights = brfss_17$mmsawt)
print(tgen)
## Female Male
## 0 1566721 2012799
## 1 6302505 5452694
##column percentages - No weights
prop.table(table(brfss_17$hlthpln1, brfss_17$gender), margin=2)
##
## Female Male
## 0 0.1188909 0.1492746
## 1 0.8811091 0.8507254
##Percentages - Weights
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt), margin=2)
## 1 2 9
## 1 0.0380162823 0.0633983186 0.0000000000
## 2 0.9603326609 0.9363069333 1.0000000000
## 7 0.0016510568 0.0001889718 0.0000000000
## 9 0.0000000000 0.0001057764 0.0000000000
##basic chi square test of independence
chisq.test(table(brfss_17$hlthpln1, brfss_17$gender))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(brfss_17$hlthpln1, brfss_17$gender)
## X-squared = 16.622, df = 1, p-value = 4.563e-05
#4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)
library(tableone)
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17)
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("gender", "educ"), strata = "hlthpln1", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
## Stratified by hlthpln1
## 0 1 p test
## n 3579519.3 11755198.9
## gender = Male (%) 56.2 46.4 0.001
## educ (%) <0.001
## 0hs 36.0 10.5
## 2hsgrad 31.9 22.6
## 3somecol 24.7 32.9
## 4colgrad 7.5 34.0
#4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?
##Yes, the numbers are slightly larger in the responses to the table in 4.2, which did not include a survey design. Once the survey design is included, the weights gave me the higher numbers. For example, in table 4, 47.3% of males do not have insurance, whereas 56.2% of males do not have insurance in the table with the survey design. In the table without survey design, 7.2% of people without a high school diploma in Texas have insurance compared to 10.5% in the table with the survey design. Those numbers also change for people with high school diploma, where 20.7% people have insurance in the original table versus 22.6% in the table with the survey design.
#Homework #3
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17)
#Create a research question and perform logistic regression:
##Research question: How does gender and education impact health insurance status?
fit.logit<-svyglm(hlthpln1 ~ gender + educ,
design = des,
family = quasibinomial)
#Present results from a model with sample weights and design effects, if your data allow for this. Present the results in tabular form, with Parameter estimates, odds ratios (if using the logit model) and confidence intervals for the odds ratios.
library(broom)
fit.logit%>%
tidy()%>%
knitr::kable(digits = 3)
(Intercept) |
0.157 |
0.147 |
1.069 |
0.285 |
genderMale |
-0.403 |
0.124 |
-3.236 |
0.001 |
educ2hsgrad |
0.901 |
0.167 |
5.398 |
0.000 |
educ3somecol |
1.513 |
0.176 |
8.594 |
0.000 |
educ4colgrad |
2.763 |
0.186 |
14.824 |
0.000 |
fit.logit%>%
tidy()%>%
mutate(OR = exp(estimate))%>%
knitr::kable(digits = 3)
(Intercept) |
0.157 |
0.147 |
1.069 |
0.285 |
1.170 |
genderMale |
-0.403 |
0.124 |
-3.236 |
0.001 |
0.669 |
educ2hsgrad |
0.901 |
0.167 |
5.398 |
0.000 |
2.461 |
educ3somecol |
1.513 |
0.176 |
8.594 |
0.000 |
4.539 |
educ4colgrad |
2.763 |
0.186 |
14.824 |
0.000 |
15.854 |
fit.logit%>%
tidy()%>%
mutate(OR = exp(estimate),
LowerOR_Ci = exp(estimate - 1.96*std.error),
UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
knitr::kable(digits = 3)
(Intercept) |
0.157 |
0.147 |
1.069 |
0.285 |
1.170 |
0.877 |
1.560 |
genderMale |
-0.403 |
0.124 |
-3.236 |
0.001 |
0.669 |
0.524 |
0.853 |
educ2hsgrad |
0.901 |
0.167 |
5.398 |
0.000 |
2.461 |
1.775 |
3.414 |
educ3somecol |
1.513 |
0.176 |
8.594 |
0.000 |
4.539 |
3.215 |
6.410 |
educ4colgrad |
2.763 |
0.186 |
14.824 |
0.000 |
15.854 |
11.002 |
22.847 |
#Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.
library(emmeans)
rg<-ref_grid(fit.logit, data = brfss_17)
marg_logit<-emmeans(object = rg,
specs = c( "gender", "educ"),
type="response" )
knitr::kable(marg_logit, digits = 4)
Female |
0hs |
0.5392 |
0.0365 |
Inf |
0.4673 |
0.6094 |
Male |
0hs |
0.4389 |
0.0348 |
Inf |
0.3722 |
0.5079 |
Female |
2hsgrad |
0.7423 |
0.0238 |
Inf |
0.6929 |
0.7861 |
Male |
2hsgrad |
0.6582 |
0.0268 |
Inf |
0.6039 |
0.7086 |
Female |
3somecol |
0.8416 |
0.0181 |
Inf |
0.8028 |
0.8739 |
Male |
3somecol |
0.7803 |
0.0226 |
Inf |
0.7327 |
0.8214 |
Female |
4colgrad |
0.9488 |
0.0073 |
Inf |
0.9325 |
0.9614 |
Male |
4colgrad |
0.9254 |
0.0099 |
Inf |
0.9035 |
0.9426 |