#load packages
library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Warning: package 'survey' was built under R version 3.5.2
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#load NHIS data
nhis<-read.csv("~/Desktop/DataFolder/nhis_00003.csv")
head(nhis,n=10)
## YEAR SERIAL STRATA PSU NHISHID HHWEIGHT PERNUM NHISPID HHX FMX PX
## 1 2013 1 6079 1 2013000001 3406 1 2.013e+13 1 1 1
## 2 2013 1 6079 1 2013000001 3406 2 2.013e+13 1 1 2
## 3 2013 1 6079 1 2013000001 3406 3 2.013e+13 1 1 3
## 4 2013 1 6079 1 2013000001 3406 4 2.013e+13 1 1 4
## 5 2013 1 6079 1 2013000001 3406 5 2.013e+13 1 1 5
## 6 2013 2 6008 1 2013000002 3996 1 2.013e+13 2 1 1
## 7 2013 3 6223 1 2013000003 1038 1 2.013e+13 3 1 1
## 8 2013 3 6223 1 2013000003 1038 2 2.013e+13 3 1 2
## 9 2013 3 6223 1 2013000003 1038 3 2.013e+13 3 1 3
## 10 2013 3 6223 1 2013000003 1038 4 2.013e+13 3 1 4
## PERWEIGHT SAMPWEIGHT FWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN MARSTAT
## 1 5019 0 3950 3 0 44 1 0 10
## 2 4321 0 3950 3 0 41 2 0 10
## 3 4118 27268 3950 1 0 19 1 2 50
## 4 4070 0 3950 0 3 17 1 0 50
## 5 3950 15995 3950 0 1 15 2 0 50
## 6 5018 7431 5018 1 0 59 1 2 50
## 7 1315 0 1246 3 0 45 1 0 50
## 8 1342 0 1246 2 0 34 2 0 50
## 9 1320 0 1246 0 3 9 2 0 0
## 10 1246 3134 1246 0 1 13 2 0 0
## MARST EDUC EARNINGS HEALTH BMICALC SAWMENT HINOTCOVE HINONE HIATWORK
## 1 11 500 7 3 0.0 0 1 1 2
## 2 11 401 0 4 0.0 0 1 1 0
## 3 50 301 1 2 20.1 1 1 1 1
## 4 50 202 0 2 0.0 0 1 1 0
## 5 50 109 0 2 0.0 1 1 1 0
## 6 50 403 0 1 29.8 1 1 1 0
## 7 50 401 6 1 0.0 0 2 2 1
## 8 50 401 99 1 0.0 0 1 1 2
## 9 0 105 0 1 0.0 0 1 1 0
## 10 0 108 0 1 0.0 1 1 1 0
## HIP1HOWGOT HEARTCONEV ALCANYTP ALCDAYSWK SMOKESTATUS2 WTPROGDYR
## 1 1 0 0 96 0 NA
## 2 0 0 0 96 0 NA
## 3 0 1 3 0 30 NA
## 4 0 0 0 96 0 NA
## 5 0 0 0 96 0 NA
## 6 0 1 0 96 30 NA
## 7 0 0 0 96 0 NA
## 8 1 0 0 96 0 NA
## 9 0 0 0 96 0 NA
## 10 0 0 0 96 0 NA
## WTPROGNOW CLIMHEART FLWEIGHT
## 1 NA 0 0
## 2 NA 1 0
## 3 NA 0 0
## 4 NA 0 0
## 5 NA 0 0
## 6 NA 0 0
## 7 NA 0 0
## 8 NA 0 0
## 9 NA 0 0
## 10 NA 0 0
#a) Define a binary outcome variable of your choosing and define how you recode the original variable.
#recode MARSTAT as binary, 1=married, 0=single
nhis$maritalst<-Recode(nhis$MARSTAT,recodes="10=1;50=0;else=NA")
table(nhis$maritalst)
##
## 0 1
## 124020 203426
#b) State a research question about what factors you believe will affect your outcome variable.
#The purposes for this analysis is to examine how education and sexual orientation influence marriage. How is marriage influenced by an individuals sexual orientation and level of education?
#c) Define at least two predictor variables, based on your research question.
#recode EDUC as a factor
nhis$educ<-Recode(nhis$EDUC,recodes="100:204='a) Less HighSchool';300:301='b) HighSchool/GED';500:600='c) Bachelors&above';else=NA",as.factor=T)
table(nhis$educ)
##
## a) Less HighSchool b) HighSchool/GED c) Bachelors&above
## 140667 88624 67571
#recode SEXORIEN as a factor
nhis$sexor<-Recode(nhis$SEXORIEN,recodes="1='b) gay';2='a) straight';3='c) bisexual';else=NA",as.factor=T)
table(nhis$sexor)
##
## a) straight b) gay c) bisexual
## 154360 2720 1448
#d) Perform descriptive analysis of the outcome variable by each of the variables you defined in part b.
#Raw frequencies
table(nhis$sexor,nhis$educ)
##
## a) Less HighSchool b) HighSchool/GED c) Bachelors&above
## a) straight 21059 34486 28962
## b) gay 187 414 704
## c) bisexual 148 254 277
#column percentages
prop.table(table(nhis$sexor,nhis$educ),margin=2)
##
## a) Less HighSchool b) HighSchool/GED c) Bachelors&above
## a) straight 0.984341404 0.980997895 0.967237752
## b) gay 0.008740768 0.011776754 0.023511338
## c) bisexual 0.006917827 0.007225351 0.009250910
#basic chi square test of independence
chisq.test(table(nhis$sexor,nhis$educ))
##
## Pearson's Chi-squared test
##
## data: table(nhis$sexor, nhis$educ)
## X-squared = 239.75, df = 4, p-value < 2.2e-16
#Create a survey design object
nhis<-nhis%>%filter(is.na(PERWEIGHT)==F)
options(survey.lonly.psu="adjust")
des<-svydesign(ids=~1,strata = ~STRATA,weights=~PERWEIGHT,data = nhis)
#simple weighted analysis
cat<-wtd.table(nhis$sexor,nhis$educ,weights = nhis$PERWEIGHT)
prop.table(wtd.table(nhis$sexor,nhis$educ,weights=nhis$PERWEIGHT),margin = 2)
## a) Less HighSchool b) HighSchool/GED c) Bachelors&above
## a) straight 0.982865987 0.980580097 0.966490893
## b) gay 0.009364967 0.011721848 0.024020501
## c) bisexual 0.007769046 0.007698055 0.009488606
#compaired to original
prop.table(table(nhis$sexor,nhis$educ),margin = 2)
##
## a) Less HighSchool b) HighSchool/GED c) Bachelors&above
## a) straight 0.984341404 0.980997895 0.967237752
## b) gay 0.008740768 0.011776754 0.023511338
## c) bisexual 0.006917827 0.007225351 0.009250910
#Let's get n
n<-table(is.na(nhis$sexor)==F)
n
##
## FALSE TRUE
## 337135 158528
#Let's get p
p<-prop.table(wtd.table(nhis$sexor,nhis$educ,weights = nhis$PERWEIGHT),margin=2)
se<-(p*(1-p))/n[2]
stargazer(data.frame(proportion=p,se=sqrt(se)),summary=F,type="text",digits = 2)
##
## ===========================================================================================
## proportion.Var1 proportion.Var2 proportion.Freq se.Var1 se.Var2 se.Freq
## -------------------------------------------------------------------------------------------
## 1 a) straight a) Less HighSchool 0.98 a) straight a) Less HighSchool 0.0003
## 2 b) gay a) Less HighSchool 0.01 b) gay a) Less HighSchool 0.0002
## 3 c) bisexual a) Less HighSchool 0.01 c) bisexual a) Less HighSchool 0.0002
## 4 a) straight b) HighSchool/GED 0.98 a) straight b) HighSchool/GED 0.0003
## 5 b) gay b) HighSchool/GED 0.01 b) gay b) HighSchool/GED 0.0003
## 6 c) bisexual b) HighSchool/GED 0.01 c) bisexual b) HighSchool/GED 0.0002
## 7 a) straight c) Bachelors&above 0.97 a) straight c) Bachelors&above 0.0005
## 8 b) gay c) Bachelors&above 0.02 b) gay c) Bachelors&above 0.0004
## 9 c) bisexual c) Bachelors&above 0.01 c) bisexual c) Bachelors&above 0.0002
## -------------------------------------------------------------------------------------------
#Now consider the full sample design + weights
cat2<-svytable(~sexor+educ,design=des)
stargazer(data.frame(prop.table(svytable(~sexor+educ,design=des),margin=2)),summary = F,type="text",digits=3)
##
## ======================================
## sexor educ Freq
## --------------------------------------
## 1 a) straight a) Less HighSchool 0.983
## 2 b) gay a) Less HighSchool 0.009
## 3 c) bisexual a) Less HighSchool 0.008
## 4 a) straight b) HighSchool/GED 0.981
## 5 b) gay b) HighSchool/GED 0.012
## 6 c) bisexual b) HighSchool/GED 0.008
## 7 a) straight c) Bachelors&above 0.966
## 8 b) gay c) Bachelors&above 0.024
## 9 c) bisexual c) Bachelors&above 0.009
## --------------------------------------
sv.table<-svyby(formula=~sexor,by=~educ,design=des,FUN=svymean,na.rm=T)
stargazer(sv.table,summary = F,type = "text",digits = 2)
##
## ==========================================================================================================================================
## educ sexora) straight sexorb) gay sexorc) bisexual se.sexora) straight se.sexorb) gay se.sexorc) bisexual
## ------------------------------------------------------------------------------------------------------------------------------------------
## a) Less HighSchool a) Less HighSchool 0.98 0.01 0.01 0.001 0.001 0.001
## b) HighSchool/GED b) HighSchool/GED 0.98 0.01 0.01 0.001 0.001 0.001
## c) Bachelors&above c) Bachelors&above 0.97 0.02 0.01 0.001 0.001 0.001
## ------------------------------------------------------------------------------------------------------------------------------------------
#The purpose for this exercise is to examine how education and sexual orientation influence marriage. In order to examine this relationship a dataset from the National Health Interview Survey is used. Variables for education, sexual orientation and marital status are used for this analysis. Marital status is coded as a binary variable with education and sexual orientation coded as categorical variables. A Chi-Squared test of independence was performed to see if there is a significant relationship between education and sexual orientation. The data was explored using no weights or survey design in addition to using a full survey design with weights. Full survey design includes sample design information such as stratification by state and primary sampling units that are geographically defined with cluster sampling within PSUs. The weights reflect the possibility of a variable being selected and the identification of strata and PSUs for the survey design.
#When comparing the raw data and the weighted survey design the results show little difference. The only noticeable difference is that of bisexual/lessHighSchool of .001 and bisexual/HighSchool/GED by .001.