load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

##Get TX Cities 
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(hlthpln1)==F)
# 1. Define a binary outcome variable of your choosing and define how you recode the original variable.
##I will use the variable hlthpln1 as my binary variable. The question is: Do you have any kind of health care coverage, including health insurance, prepaid plans such as HMOs, or government plans such as Medicare,or Indian Health Service?

#I used your code to recode the original variable. 
brfss_17$hlthpln1<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

View(brfss_17)
#2. State a research question about what factors you believe will affect your outcome variable.
##What effect does gender and education have on health insurance coverage?
#3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

##I will use incomeg and educag and here is how I recoded the variables.
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:3='0hs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)

##Gender variable recode
brfss_17$gender<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))


# brfss_17$employ<-Recode(brfss_17$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
# brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
# 
# as.factor(brfss_17$employ)
# 
# as.factor(brfss_17$educ)
# 
# table(brfss_17$employ)
#4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

library(tableone)
t1<-CreateTableOne(vars = c("educ", "gender"), strata = "hlthpln1", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
##                    Stratified by hlthpln1
##                     0    1    p      test
##   n                 1131 7466            
##   educ (%)                    <0.001     
##      0hs            27.9  7.2            
##      2hsgrad        33.0 20.7            
##      3somecol       23.3 26.3            
##      4colgrad       15.8 45.8            
##   gender = Male (%) 47.3 40.8 <0.001
print(t1)
##                    Stratified by hlthpln1
##                     0            1            p      test
##   n                 1131         7466                    
##   educ (%)                                    <0.001     
##      0hs             314 (27.9)   535 ( 7.2)             
##      2hsgrad         372 (33.0)  1539 (20.7)             
##      3somecol        262 (23.3)  1956 (26.3)             
##      4colgrad        178 (15.8)  3405 (45.8)             
##   gender = Male (%)  535 (47.3)  3049 (40.8)  <0.001
#4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

#For educ variable 
##Raw frequencies - No Weights
table(brfss_17$hlthpln1, brfss_17$educ)
##    
##      0hs 2hsgrad 3somecol 4colgrad
##   0  314     372      262      178
##   1  535    1539     1956     3405
##Frequencies with Weights
teduc<-wtd.table(brfss_17$hlthpln1, brfss_17$educ, weights = brfss_17$mmsawt)
print(teduc)
##         0hs   2hsgrad  3somecol  4colgrad
## 0 1284062.4 1139395.3  881802.7  266394.6
## 1 1225410.5 2644849.0 3857401.4 3986029.0
##column percentages - No Weights
prop.table(table(brfss_17$hlthpln1, brfss_17$educ), margin=2)
##    
##            0hs    2hsgrad   3somecol   4colgrad
##   0 0.36984688 0.19466248 0.11812444 0.04967904
##   1 0.63015312 0.80533752 0.88187556 0.95032096
##Percentages - Weights
prop.table(wtd.table(brfss_17$hlthpln1, brfss_17$educ, weights = brfss_17$mmsawt), margin=2)
##          0hs    2hsgrad   3somecol   4colgrad
## 0 0.51168609 0.30108926 0.18606557 0.06264536
## 1 0.48831391 0.69891074 0.81393443 0.93735464
##basic chi square test of independence
chisq.test(table(brfss_17$hlthpln1, brfss_17$educ))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss_17$hlthpln1, brfss_17$educ)
## X-squared = 702.45, df = 3, p-value < 2.2e-16
#For gender variable 
##Raw frequencies - No Weights
table(brfss_17$hlthpln1, brfss_17$gender)
##    
##     Female Male
##   0    596  535
##   1   4417 3049
##Frequencies - With Weights 
tgen<-wtd.table(brfss_17$hlthpln1, brfss_17$gender, weights = brfss_17$mmsawt)
print(tgen)
##    Female    Male
## 0 1566721 2012799
## 1 6302505 5452694
##column percentages - No weights
prop.table(table(brfss_17$hlthpln1, brfss_17$gender), margin=2)
##    
##        Female      Male
##   0 0.1188909 0.1492746
##   1 0.8811091 0.8507254
##Percentages - Weights
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt), margin=2)
##              1            2            9
## 1 0.0380162823 0.0633983186 0.0000000000
## 2 0.9603326609 0.9363069333 1.0000000000
## 7 0.0016510568 0.0001889718 0.0000000000
## 9 0.0000000000 0.0001057764 0.0000000000
##basic chi square test of independence
chisq.test(table(brfss_17$hlthpln1, brfss_17$gender))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(brfss_17$hlthpln1, brfss_17$gender)
## X-squared = 16.622, df = 1, p-value = 4.563e-05
#4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)

library(tableone)
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17)
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("gender", "educ"), strata = "hlthpln1", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
##                    Stratified by hlthpln1
##                     0         1          p      test
##   n                 3579519.3 11755198.9            
##   gender = Male (%)      56.2       46.4  0.001     
##   educ (%)                               <0.001     
##      0hs                 36.0       10.5            
##      2hsgrad             31.9       22.6            
##      3somecol            24.7       32.9            
##      4colgrad             7.5       34.0
#4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?
##Yes, the numbers are slightly larger in the responses to the table in 4.2, which did not include a survey design. Once the survey design is included, the weights gave me the higher numbers. For example, in table 4, 47.3% of males do not have insurance, whereas 56.2% of males do not have insurance in the table with the survey design. In the table without survey design, 7.2% of people without a high school diploma in Texas have insurance compared to 10.5% in the table with the survey design. Those numbers also change for people with high school diploma, where 20.7% people have insurance in the original table versus 22.6% in the table with the survey design. 
#Homework #3
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17)

#Create a research question and perform logistic regression: 
  ##Research question: How does gender and education impact health insurance status?
fit.logit<-svyglm(hlthpln1 ~ gender + educ,
                  design = des,
                  family = quasibinomial)
#Present results from a model with sample weights and design effects, if your data allow for this. Present the results in tabular form, with Parameter estimates, odds ratios (if using the logit model) and confidence intervals for the odds ratios.
library(broom)
fit.logit%>%
  tidy()%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 0.157 0.147 1.069 0.285
genderMale -0.403 0.124 -3.236 0.001
educ2hsgrad 0.901 0.167 5.398 0.000
educ3somecol 1.513 0.176 8.594 0.000
educ4colgrad 2.763 0.186 14.824 0.000
fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR
(Intercept) 0.157 0.147 1.069 0.285 1.170
genderMale -0.403 0.124 -3.236 0.001 0.669
educ2hsgrad 0.901 0.167 5.398 0.000 2.461
educ3somecol 1.513 0.176 8.594 0.000 4.539
educ4colgrad 2.763 0.186 14.824 0.000 15.854
fit.logit%>%
  tidy()%>%
  mutate(OR = exp(estimate),
         LowerOR_Ci = exp(estimate - 1.96*std.error),
         UpperOR_Ci = exp(estimate + 1.96*std.error))%>%
  knitr::kable(digits = 3)
term estimate std.error statistic p.value OR LowerOR_Ci UpperOR_Ci
(Intercept) 0.157 0.147 1.069 0.285 1.170 0.877 1.560
genderMale -0.403 0.124 -3.236 0.001 0.669 0.524 0.853
educ2hsgrad 0.901 0.167 5.398 0.000 2.461 1.775 3.414
educ3somecol 1.513 0.176 8.594 0.000 4.539 3.215 6.410
educ4colgrad 2.763 0.186 14.824 0.000 15.854 11.002 22.847
#Generate predicted probabilities for some “interesting” cases from your analysis, to highlight the effects from the model and your stated research question.

library(emmeans)
rg<-ref_grid(fit.logit, data = brfss_17)

marg_logit<-emmeans(object = rg,
              specs = c( "gender",  "educ"),
              type="response" )

knitr::kable(marg_logit,  digits = 4)
gender educ prob SE df asymp.LCL asymp.UCL
Female 0hs 0.5392 0.0365 Inf 0.4673 0.6094
Male 0hs 0.4389 0.0348 Inf 0.3722 0.5079
Female 2hsgrad 0.7423 0.0238 Inf 0.6929 0.7861
Male 2hsgrad 0.6582 0.0268 Inf 0.6039 0.7086
Female 3somecol 0.8416 0.0181 Inf 0.8028 0.8739
Male 3somecol 0.7803 0.0226 Inf 0.7327 0.8214
Female 4colgrad 0.9488 0.0073 Inf 0.9325 0.9614
Male 4colgrad 0.9254 0.0099 Inf 0.9035 0.9426