HW2 & 3

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

nams<-names(brfss_17)
head(nams, n=10)

##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"

newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

##Get TX Cities 
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)

# 1. Define a binary outcome variable of your choosing and define how you recode the original variable.
##I will use the variable hlthpln1 as my binary variable. The question is: Do you have any kind of health care coverage, including health insurance, prepaid plans such as HMOs, or government plans such as Medicare,or Indian Health Service?

#I used your code to recode the original variable. 
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

View(brfss_17)

#2. State a research question about what factors you believe will affect your outcome variable.
##What effect does gender and education have on health insurance coverage?

#3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

##I will use incomeg and educag and here is how I recoded the variables.
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)

##Gender variable recode
brfss_17$gender<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))


# brfss_17$employ<-Recode(brfss_17$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
# brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
# 
# as.factor(brfss_17$employ)
# 
# as.factor(brfss_17$educ)
# 
# table(brfss_17$employ)

#4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

library(tableone)
t1<-CreateTableOne(vars = c("educ", "gender"), strata = "hlthpln1", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")

##                    Stratified by hlthpln1
##                     0    1    p      test
##   n                 1131 7466            
##   educ (%)                    <0.001     
##      0hs            27.9  7.2            
##      2hsgrad        33.0 20.7            
##      3somecol       23.3 26.3            
##      4colgrad       15.8 45.8            
##   gender = Male (%) 47.3 40.8 <0.001

print(t1)

##                    Stratified by hlthpln1
##                     0            1            p      test
##   n                 1131         7466                    
##   educ (%)                                    <0.001     
##      0hs             314 (27.9)   535 ( 7.2)             
##      2hsgrad         372 (33.0)  1539 (20.7)             
##      3somecol        262 (23.3)  1956 (26.3)             
##      4colgrad        178 (15.8)  3405 (45.8)             
##   gender = Male (%)  535 (47.3)  3049 (40.8)  <0.001

#4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

#For educ variable 
##Raw frequencies - No Weights
table(brfss_17$hlthpln1, brfss_17$educ)

##    
##      0hs 2hsgrad 3somecol 4colgrad
##   0  314     372      262      178
##   1  535    1539     1956     3405

##Frequencies with Weights
teduc<-wtd.table(brfss_17$hlthpln1, brfss_17$educ, weights = brfss_17$mmsawt)
print(teduc)

##         0hs   2hsgrad  3somecol  4colgrad
## 0 1284062.4 1139395.3  881802.7  266394.6
## 1 1225410.5 2644849.0 3857401.4 3986029.0

##column percentages - No Weights
prop.table(table(brfss_17$hlthpln1, brfss_17$educ), margin=2)

##    
##            0hs    2hsgrad   3somecol   4colgrad
##   0 0.36984688 0.19466248 0.11812444 0.04967904
##   1 0.63015312 0.80533752 0.88187556 0.95032096

##Percentages - Weights
prop.table(wtd.table(brfss_17$hlthpln1, brfss_17$educ, weights = brfss_17$mmsawt), margin=2)

##          0hs    2hsgrad   3somecol   4colgrad
## 0 0.51168609 0.30108926 0.18606557 0.06264536
## 1 0.48831391 0.69891074 0.81393443 0.93735464

##basic chi square test of independence
chisq.test(table(brfss_17$hlthpln1, brfss_17$educ))

## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss_17$hlthpln1, brfss_17$educ)
## X-squared = 702.45, df = 3, p-value < 2.2e-16

#For gender variable 
##Raw frequencies - No Weights
table(brfss_17$hlthpln1, brfss_17$gender)

##    
##     Female Male
##   0    596  535
##   1   4417 3049

##Frequencies - With Weights 
tgen<-wtd.table(brfss_17$hlthpln1, brfss_17$gender, weights = brfss_17$mmsawt)
print(tgen)

##    Female    Male
## 0 1566721 2012799
## 1 6302505 5452694

##column percentages - No weights
prop.table(table(brfss_17$hlthpln1, brfss_17$gender), margin=2)

##    
##        Female      Male
##   0 0.1188909 0.1492746
##   1 0.8811091 0.8507254

##Percentages - Weights
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt), margin=2)

##              1            2            9
## 1 0.0380162823 0.0633983186 0.0000000000
## 2 0.9603326609 0.9363069333 1.0000000000
## 7 0.0016510568 0.0001889718 0.0000000000
## 9 0.0000000000 0.0001057764 0.0000000000

##basic chi square test of independence
chisq.test(table(brfss_17$hlthpln1, brfss_17$gender))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(brfss_17$hlthpln1, brfss_17$gender)
## X-squared = 16.622, df = 1, p-value = 4.563e-05

#4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)

library(tableone)
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17)
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("gender", "educ"), strata = "hlthpln1", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")

##                    Stratified by hlthpln1
##                     0         1          p      test
##   n                 3579519.3 11755198.9            
##   gender = Male (%)      56.2       46.4  0.001     
##   educ (%)                               <0.001     
##      0hs                 36.0       10.5            
##      2hsgrad             31.9       22.6            
##      3somecol            24.7       32.9            
##      4colgrad             7.5       34.0

#4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?
##Yes, the numbers are slightly larger in the responses to the table in 4.2, which did not include a survey design. Once the survey design is included, the weights gave me the higher numbers. For example, in table 4, 47.3% of males do not have insurance, whereas 56.2% of males do not have insurance in the table with the survey design. In the table without survey design, 7.2% of people without a high school diploma in Texas have insurance compared to 10.5% in the table with the survey design. Those numbers also change for people with high school diploma, where 20.7% people have insurance in the original table versus 22.6% in the table with the survey design.

#Homework #3
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17)

#Create a research question and perform logistic regression: 
  ##Research question: How does gender and education impact health insurance status?
fit.logit<-svyglm(hlthpln1 ~ gender + educ,
                  design = des,
                  family = quasibinomial)

#Present results from a model with sample weights and design effects, if your data allow for this. Present the results in tabular form, with Parameter estimates, odds ratios (if using the logit model) and confidence intervals for the odds ratios.
library(broom)
fit.logit%>%
  tidy()%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	0.157	0.147	1.069	0.285
genderMale	-0.403	0.124	-3.236	0.001
educ2hsgrad	0.901	0.167	5.398	0.000
educ3somecol	1.513	0.176	8.594	0.000
educ4colgrad	2.763	0.186	14.824	0.000

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate))%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value	OR
(Intercept)	0.157	0.147	1.069	0.285	1.170
genderMale	-0.403	0.124	-3.236	0.001	0.669
educ2hsgrad	0.901	0.167	5.398	0.000	2.461
educ3somecol	1.513	0.176	8.594	0.000	4.539
educ4colgrad	2.763	0.186	14.824	0.000	15.854

fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)

term	estimate	std.error	statistic	p.value	OR	LowerOR_Ci	UpperOR_Ci
(Intercept)	0.157	0.147	1.069	0.285	1.170	0.877	1.560
genderMale	-0.403	0.124	-3.236	0.001	0.669	0.524	0.853
educ2hsgrad	0.901	0.167	5.398	0.000	2.461	1.775	3.414
educ3somecol	1.513	0.176	8.594	0.000	4.539	3.215	6.410
educ4colgrad	2.763	0.186	14.824	0.000	15.854	11.002	22.847

#Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.

library(emmeans)
rg<-ref_grid(fit.logit, data = brfss_17)

marg_logit<-emmeans(object = rg,
              specs = c( "gender",  "educ"),
              type="response" )

knitr::kable(marg_logit,  digits = 4)

gender	educ	prob	SE	df	asymp.LCL	asymp.UCL
Female	0hs	0.5392	0.0365	Inf	0.4673	0.6094
Male	0hs	0.4389	0.0348	Inf	0.3722	0.5079
Female	2hsgrad	0.7423	0.0238	Inf	0.6929	0.7861
Male	2hsgrad	0.6582	0.0268	Inf	0.6039	0.7086
Female	3somecol	0.8416	0.0181	Inf	0.8028	0.8739
Male	3somecol	0.7803	0.0226	Inf	0.7327	0.8214
Female	4colgrad	0.9488	0.0073	Inf	0.9325	0.9614
Male	4colgrad	0.9254	0.0099	Inf	0.9035	0.9426

HW2 & 3

Coda Rayo-Garza

2/1/2021