#load packages
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Warning: package 'survey' was built under R version 3.5.2
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#load NHIS data
nhis<-read.csv("~/Desktop/DataFolder/nhis_00003.csv")
head(nhis,n=10)
##    YEAR SERIAL STRATA PSU    NHISHID HHWEIGHT PERNUM   NHISPID HHX FMX PX
## 1  2013      1   6079   1 2013000001     3406      1 2.013e+13   1   1  1
## 2  2013      1   6079   1 2013000001     3406      2 2.013e+13   1   1  2
## 3  2013      1   6079   1 2013000001     3406      3 2.013e+13   1   1  3
## 4  2013      1   6079   1 2013000001     3406      4 2.013e+13   1   1  4
## 5  2013      1   6079   1 2013000001     3406      5 2.013e+13   1   1  5
## 6  2013      2   6008   1 2013000002     3996      1 2.013e+13   2   1  1
## 7  2013      3   6223   1 2013000003     1038      1 2.013e+13   3   1  1
## 8  2013      3   6223   1 2013000003     1038      2 2.013e+13   3   1  2
## 9  2013      3   6223   1 2013000003     1038      3 2.013e+13   3   1  3
## 10 2013      3   6223   1 2013000003     1038      4 2.013e+13   3   1  4
##    PERWEIGHT SAMPWEIGHT FWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN MARSTAT
## 1       5019          0    3950        3        0  44   1        0      10
## 2       4321          0    3950        3        0  41   2        0      10
## 3       4118      27268    3950        1        0  19   1        2      50
## 4       4070          0    3950        0        3  17   1        0      50
## 5       3950      15995    3950        0        1  15   2        0      50
## 6       5018       7431    5018        1        0  59   1        2      50
## 7       1315          0    1246        3        0  45   1        0      50
## 8       1342          0    1246        2        0  34   2        0      50
## 9       1320          0    1246        0        3   9   2        0       0
## 10      1246       3134    1246        0        1  13   2        0       0
##    MARST EDUC EARNINGS HEALTH BMICALC SAWMENT HINOTCOVE HINONE HIATWORK
## 1     11  500        7      3     0.0       0         1      1        2
## 2     11  401        0      4     0.0       0         1      1        0
## 3     50  301        1      2    20.1       1         1      1        1
## 4     50  202        0      2     0.0       0         1      1        0
## 5     50  109        0      2     0.0       1         1      1        0
## 6     50  403        0      1    29.8       1         1      1        0
## 7     50  401        6      1     0.0       0         2      2        1
## 8     50  401       99      1     0.0       0         1      1        2
## 9      0  105        0      1     0.0       0         1      1        0
## 10     0  108        0      1     0.0       1         1      1        0
##    HIP1HOWGOT HEARTCONEV ALCANYTP ALCDAYSWK SMOKESTATUS2 WTPROGDYR
## 1           1          0        0        96            0        NA
## 2           0          0        0        96            0        NA
## 3           0          1        3         0           30        NA
## 4           0          0        0        96            0        NA
## 5           0          0        0        96            0        NA
## 6           0          1        0        96           30        NA
## 7           0          0        0        96            0        NA
## 8           1          0        0        96            0        NA
## 9           0          0        0        96            0        NA
## 10          0          0        0        96            0        NA
##    WTPROGNOW CLIMHEART FLWEIGHT
## 1         NA         0        0
## 2         NA         1        0
## 3         NA         0        0
## 4         NA         0        0
## 5         NA         0        0
## 6         NA         0        0
## 7         NA         0        0
## 8         NA         0        0
## 9         NA         0        0
## 10        NA         0        0
#a) Define a binary outcome variable of your choosing and define how you recode the original variable.
#recode MARSTAT as binary, 1=married, 0=single
nhis$maritalst<-Recode(nhis$MARSTAT,recodes="10=1;50=0;else=NA")
table(nhis$maritalst)
## 
##      0      1 
## 124020 203426
#b) State a research question about what factors you believe will affect your outcome variable.
#The purposes for this analysis is to examine how education and sexual orientation influence marriage.  How is marriage influenced by an individuals sexual orientation and level of education?
#c) Define at least two predictor variables, based on your research question.
#recode EDUC as a factor
nhis$educ<-Recode(nhis$EDUC,recodes="100:204='a) Less HighSchool';300:301='b) HighSchool/GED';500:600='c) Bachelors&above';else=NA",as.factor=T)
table(nhis$educ)
## 
## a) Less HighSchool  b) HighSchool/GED c) Bachelors&above 
##             140667              88624              67571
#recode SEXORIEN as a factor
nhis$sexor<-Recode(nhis$SEXORIEN,recodes="1='b) gay';2='a) straight';3='c) bisexual';else=NA",as.factor=T)
table(nhis$sexor)
## 
## a) straight      b) gay c) bisexual 
##      154360        2720        1448
#d) Perform descriptive analysis of the outcome variable by each of the variables you defined in part b.
#Raw frequencies
table(nhis$sexor,nhis$educ)
##              
##               a) Less HighSchool b) HighSchool/GED c) Bachelors&above
##   a) straight              21059             34486              28962
##   b) gay                     187               414                704
##   c) bisexual                148               254                277
#column percentages
prop.table(table(nhis$sexor,nhis$educ),margin=2)
##              
##               a) Less HighSchool b) HighSchool/GED c) Bachelors&above
##   a) straight        0.984341404       0.980997895        0.967237752
##   b) gay             0.008740768       0.011776754        0.023511338
##   c) bisexual        0.006917827       0.007225351        0.009250910
#basic chi square test of independence
chisq.test(table(nhis$sexor,nhis$educ))
## 
##  Pearson's Chi-squared test
## 
## data:  table(nhis$sexor, nhis$educ)
## X-squared = 239.75, df = 4, p-value < 2.2e-16
#Create a survey design object
nhis<-nhis%>%filter(is.na(PERWEIGHT)==F)
options(survey.lonly.psu="adjust")
des<-svydesign(ids=~1,strata = ~STRATA,weights=~PERWEIGHT,data = nhis)
#simple weighted analysis
cat<-wtd.table(nhis$sexor,nhis$educ,weights = nhis$PERWEIGHT)
prop.table(wtd.table(nhis$sexor,nhis$educ,weights=nhis$PERWEIGHT),margin = 2)
##             a) Less HighSchool b) HighSchool/GED c) Bachelors&above
## a) straight        0.982865987       0.980580097        0.966490893
## b) gay             0.009364967       0.011721848        0.024020501
## c) bisexual        0.007769046       0.007698055        0.009488606
#compaired to original
prop.table(table(nhis$sexor,nhis$educ),margin = 2)
##              
##               a) Less HighSchool b) HighSchool/GED c) Bachelors&above
##   a) straight        0.984341404       0.980997895        0.967237752
##   b) gay             0.008740768       0.011776754        0.023511338
##   c) bisexual        0.006917827       0.007225351        0.009250910
#Let's get  n 
n<-table(is.na(nhis$sexor)==F)
n
## 
##  FALSE   TRUE 
## 337135 158528
#Let's get p
p<-prop.table(wtd.table(nhis$sexor,nhis$educ,weights = nhis$PERWEIGHT),margin=2)
se<-(p*(1-p))/n[2]
stargazer(data.frame(proportion=p,se=sqrt(se)),summary=F,type="text",digits = 2)
## 
## ===========================================================================================
##   proportion.Var1  proportion.Var2   proportion.Freq   se.Var1        se.Var2       se.Freq
## -------------------------------------------------------------------------------------------
## 1   a) straight   a) Less HighSchool      0.98       a) straight a) Less HighSchool 0.0003 
## 2     b) gay      a) Less HighSchool      0.01         b) gay    a) Less HighSchool 0.0002 
## 3   c) bisexual   a) Less HighSchool      0.01       c) bisexual a) Less HighSchool 0.0002 
## 4   a) straight   b) HighSchool/GED       0.98       a) straight b) HighSchool/GED  0.0003 
## 5     b) gay      b) HighSchool/GED       0.01         b) gay    b) HighSchool/GED  0.0003 
## 6   c) bisexual   b) HighSchool/GED       0.01       c) bisexual b) HighSchool/GED  0.0002 
## 7   a) straight   c) Bachelors&above      0.97       a) straight c) Bachelors&above 0.0005 
## 8     b) gay      c) Bachelors&above      0.02         b) gay    c) Bachelors&above 0.0004 
## 9   c) bisexual   c) Bachelors&above      0.01       c) bisexual c) Bachelors&above 0.0002 
## -------------------------------------------------------------------------------------------
#Now consider the full sample design + weights
cat2<-svytable(~sexor+educ,design=des)
stargazer(data.frame(prop.table(svytable(~sexor+educ,design=des),margin=2)),summary = F,type="text",digits=3)
## 
## ======================================
##      sexor           educ        Freq 
## --------------------------------------
## 1 a) straight a) Less HighSchool 0.983
## 2   b) gay    a) Less HighSchool 0.009
## 3 c) bisexual a) Less HighSchool 0.008
## 4 a) straight b) HighSchool/GED  0.981
## 5   b) gay    b) HighSchool/GED  0.012
## 6 c) bisexual b) HighSchool/GED  0.008
## 7 a) straight c) Bachelors&above 0.966
## 8   b) gay    c) Bachelors&above 0.024
## 9 c) bisexual c) Bachelors&above 0.009
## --------------------------------------
sv.table<-svyby(formula=~sexor,by=~educ,design=des,FUN=svymean,na.rm=T)
stargazer(sv.table,summary = F,type = "text",digits = 2)
## 
## ==========================================================================================================================================
##                           educ        sexora) straight sexorb) gay sexorc) bisexual se.sexora) straight se.sexorb) gay se.sexorc) bisexual
## ------------------------------------------------------------------------------------------------------------------------------------------
## a) Less HighSchool a) Less HighSchool       0.98          0.01           0.01              0.001            0.001             0.001       
## b) HighSchool/GED  b) HighSchool/GED        0.98          0.01           0.01              0.001            0.001             0.001       
## c) Bachelors&above c) Bachelors&above       0.97          0.02           0.01              0.001            0.001             0.001       
## ------------------------------------------------------------------------------------------------------------------------------------------
#The purpose for this exercise is to examine how education and sexual orientation influence marriage.  In order to examine this relationship a dataset from the National Health Interview Survey is used.  Variables for education, sexual orientation and marital status are used for this analysis.  Marital status is coded as a binary variable with education and sexual orientation coded as categorical variables.  A Chi-Squared test of independence was performed to see if there is a significant relationship between education and sexual orientation.  The data was explored using no weights or survey design in addition to using a full survey design with weights. Full survey design includes sample design information such as stratification by state and primary sampling units that are geographically defined with cluster sampling within PSUs. The weights reflect the possibility of a variable being selected and the identification of strata and PSUs for the survey design.  
#When comparing the raw data and the weighted survey design the results show little difference.  The only noticeable difference is that of bisexual/lessHighSchool of .001 and bisexual/HighSchool/GED by .001.