## Loading required package: carData
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

a.) Define a binary outcome variable of your choosing and define how you recode the original variable

brfss_17$chcocncr<-Recode(brfss_17$chcocncr, recodes ="7:9=NA; 1=1;2=0") 


#I selected chcocncr, Have you ever been told you have other types of cancer? Where 1=1 is yes, 2=0 is no. 

b.) State a research question about what factors you believe will affect your outcome variable.

#Does gender correlate with cancer?

c.) Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

#Income, zip code, race/ethnicity, age, and gender. For assignment purposes I will select gender and health insurance which are both categorical variables. 

#gender
brfss_17$sex<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))


#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

d.) Perform a descriptive analysis of the outcome variable by each of the variables you defined in part c. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

#Texas
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
##Remove Non-Responses 
brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(chcocncr)==F)

dc<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)

print(dc, format="p")
##                       Stratified by chcocncr
##                        0           1           p      test
##   n                    7805         815                   
##   sex = Male (%)       42.3        35.8        <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001
print(dc)
##                       Stratified by chcocncr
##                        0            1            p      test
##   n                    7805          815                    
##   sex = Male (%)       3305 (42.3)   292 (35.8)  <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57)  1.06 (0.30)  <0.001

d-a.) Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

#with sex variable weights and no weights 
#counts
#raw, unweighted table 
table(brfss_17$chcocncr, brfss_17$sex)
##    
##     Female Male
##   0   4500 3305
##   1    523  292
csex<-wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt)
print(csex)
##      Female      Male
## 0 7395078.7 7207020.8
## 1  498331.8  285301.3
#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$sex, weights = brfss_17$mmsawt), margin=2)
##       Female       Male
## 0 0.93686737 0.96192085
## 1 0.06313263 0.03807915
#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$sex), margin=2)
##    
##         Female       Male
##   0 0.89587896 0.91882124
##   1 0.10412104 0.08117876

d-a #with insurance variable weights and no weights

#No weights
table(brfss_17$chcocncr, brfss_17$hlthpln1)
##    
##        1    2    7    9
##   0 6685 1087   16   17
##   1  775   39    1    0
cins<-wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt)
print(cins)
##              1            2            7            9
## 0 1.104828e+07 3.490948e+06 4.170268e+04 2.116835e+04
## 1 7.055679e+05 7.806041e+04 4.705983e+00 0.000000e+00
#proportions
prop.table(wtd.table(brfss_17$chcocncr, brfss_17$hlthpln1, weights = brfss_17$mmsawt), margin=2)
##              1            2            7            9
## 0 0.9399713253 0.9781282661 0.9998871667 1.0000000000
## 1 0.0600286747 0.0218717339 0.0001128333 0.0000000000
#compare that with the original, unweighted proportions
prop.table(table(brfss_17$chcocncr, brfss_17$hlthpln1), margin=2)
##    
##              1          2          7          9
##   0 0.89611260 0.96536412 0.94117647 1.00000000
##   1 0.10388740 0.03463588 0.05882353 0.00000000

d-b.) Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (table one package helps)

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
##                       Stratified by chcocncr
##                        0           1           p      test
##   n                    7805         815                   
##   sex = Male (%)       42.3        35.8        <0.001     
##   hlthpln1 (mean (SD)) 1.17 (0.57) 1.06 (0.30) <0.001
#using survey design
des<-svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss_17)
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("sex", "hlthpln1"), strata = "chcocncr", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
##                       Stratified by chcocncr
##                        0                  1                p      test
##   n                     14602099.4         783633.0                   
##   sex = Male (%)              49.4             36.4         0.005     
##   hlthpln1 (mean (SD))        1.27 (0.60)      1.10 (0.30) <0.001

d-c.) Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

#Yes, there are substantive differences. Numbers with no survey design were higher while numbers with a survey design were slightly lower.