#load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
#load data

library(haven)
nhis_4 <- read_stata("nhis_00004.dta.gz")
View(nhis_4)


nhis_4$ervisit<-Recode(nhis_4$eryrno, recodes = "20=1;30=2;41=3;40=4;42=5;43=6;50=7;60=8;61:62=9;00=NA;97:99=NA;10=NA")
hist(nhis_4$ervisit)

summary(nhis_4$ervisit)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     1.0     1.0     1.0     1.6     2.0     9.0  102684
#poor
nhis_4$poor<-Recode(nhis_4$pooryn, recodes =" 1=1;2=0;else=NA",as.factor=F)

##Alcohol
nhis_4$alcohol<-Recode(nhis_4$alclife, recodes ="1=1;2=0;else=NA")

##smoking
nhis_4$smoke<-Recode(nhis_4$smokev, recodes ="7:9=NA;1=1;2=0; else=NA")

##usborn
nhis_4$born<-Recode(nhis_4$usborn, recodes ="else=NA; 20=1;10:12=0")



##Recode
##citizen
nhis_4$mycitizen<-Recode(nhis_4$citizen, recodes ="7:9=NA; 1=1;2=0")



#Age cut into intervals
nhis_4$agec<-cut(nhis_4$age,breaks = c(18, 25, 35, 45, 55, 65, 75, 85), include.lowest = T )
  1. Define a count outcome for the dataset of your choosing
##I chose ER visits
  1. State a research question about your outcome
##The question is what is the number of times you visited the ER in the past 12 months 
  1. Is an offset term necessary? why or why not?
##No the offset term is not necessary because the parameter is within a certain number of visits and in the 12 months
  1. Consider a Poisson regression model for the outcome
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
sub<-nhis_4%>%
  select(alcohol, citizen,usborn,
         agec,ervisit,strata,psu, perweight, smoke) %>%
  filter( complete.cases(.))
 

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~strata, weights=~perweight, data =sub )
#OR THE BRFSS, R GAVE ME A WARNING AND I NEEDED TO ADD:
#YOU MAY NOT NEED TO DO THIS!!!!
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~strata, 
               weights=~perweight,
               data = nhis_4[is.na(nhis_4$perweight)==F,])
  1. Evaluate the level of dispersion in the outcome
#First I do some simple descriptives
svyhist(~ervisit, des)

svyby(~ervisit, ~usborn+smoke, des, svymean, na.rm=T)
##      usborn smoke  ervisit         se
## 11.0     11     0 1.792929 0.19732991
## 12.0     12     0 1.502240 0.08744699
## 20.0     20     0 1.696059 0.02940609
## 97.0     97     0 1.000000 0.00000000
## 99.0     99     0 0.000000 0.00000000
## 11.1     11     1 2.031835 0.38265859
## 12.1     12     1 1.476708 0.04524301
## 20.1     20     1 1.579261 0.02494510
## 97.1     97     1 0.000000 0.00000000
## 99.1     99     1 2.000000 0.00000000
svyby(~ervisit, ~agec, des, svymean, na.rm=T)
##            agec  ervisit         se
## [18,25] [18,25] 1.748836 0.06878496
## (25,35] (25,35] 1.636463 0.04171821
## (35,45] (35,45] 1.673345 0.05258919
## (45,55] (45,55] 1.697424 0.04769689
## (55,65] (55,65] 1.640019 0.04160352
## (65,75] (65,75] 1.455516 0.03446322
## (75,85] (75,85] 1.500475 0.03760934
#Poisson glm fit to survey data
fit1<-svyglm(ervisit~factor(usborn)+factor(smoke)+factor(agec), design=des, family=poisson)
summary(fit1)
## 
## Call:
## svyglm(formula = ervisit ~ factor(usborn) + factor(smoke) + factor(agec), 
##     design = des, family = poisson)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~strata, weights = ~perweight, data = nhis_4[is.na(nhis_4$perweight) == 
##     F, ])
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.77918    0.12794   6.090 1.19e-09 ***
## factor(usborn)12    -0.24915    0.12035  -2.070 0.038471 *  
## factor(usborn)20    -0.16630    0.11746  -1.416 0.156907    
## factor(usborn)97    -0.73396    0.11922  -6.156 7.89e-10 ***
## factor(usborn)99     0.03849    0.12040   0.320 0.749215    
## factor(smoke)1      -0.07313    0.02284  -3.202 0.001370 ** 
## factor(agec)(25,35] -0.07497    0.04710  -1.592 0.111529    
## factor(agec)(35,45] -0.05139    0.05077  -1.012 0.311473    
## factor(agec)(45,55] -0.04523    0.04934  -0.917 0.359422    
## factor(agec)(55,65] -0.07972    0.04734  -1.684 0.092178 .  
## factor(agec)(65,75] -0.19916    0.04681  -4.255 2.12e-05 ***
## factor(agec)(75,85] -0.16298    0.04699  -3.468 0.000527 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 0.9280652)
## 
## Number of Fisher Scoring iterations: 5
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
##    factor(usborn)12    factor(usborn)20    factor(usborn)97    factor(usborn)99 
##               0.779               0.847               0.480               1.039 
##      factor(smoke)1 factor(agec)(25,35] factor(agec)(35,45] factor(agec)(45,55] 
##               0.929               0.928               0.950               0.956 
## factor(agec)(55,65] factor(agec)(65,75] factor(agec)(75,85] 
##               0.923               0.819               0.850
  1. Is the Poisson model a good choice?
##Checking for over dispersion
fit2<-glm(ervisit~factor(usborn)+factor(smoke)+factor(poor), data=nhis_4, family=poisson)
summary(fit2)
## 
## Call:
## glm(formula = ervisit ~ factor(usborn) + factor(smoke) + factor(poor), 
##     family = poisson, data = nhis_4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9537  -0.5147  -0.4485   0.2989   4.3401  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.82333    0.08219  10.017  < 2e-16 ***
## factor(usborn)12 -0.21922    0.08597  -2.550  0.01078 *  
## factor(usborn)20 -0.10274    0.08198  -1.253  0.21012    
## factor(smoke)1   -0.05677    0.01923  -2.953  0.00315 ** 
## factor(poor)1    -0.24647    0.02048 -12.037  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4559.3  on 6740  degrees of freedom
## Residual deviance: 4386.5  on 6736  degrees of freedom
##   (105312 observations deleted due to missingness)
## AIC: 19946
## 
## Number of Fisher Scoring iterations: 5
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 0.8069699
##Using the chi square distribution
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 1
##The model fits the data since the p value is one

##Quasi distribution
fit3<-glm(ervisit~factor(usborn)+factor(smoke)+factor(agec), data=nhis_4, family=quasipoisson)
summary(fit3)
## 
## Call:
## glm(formula = ervisit ~ factor(usborn) + factor(smoke) + factor(agec), 
##     family = quasipoisson, data = nhis_4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8904  -0.5718  -0.4740   0.2352   4.3814  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.77570    0.08525   9.099  < 2e-16 ***
## factor(usborn)12    -0.25506    0.08433  -3.025   0.0025 ** 
## factor(usborn)20    -0.15773    0.08044  -1.961   0.0499 *  
## factor(usborn)97    -0.76119    0.99271  -0.767   0.4432    
## factor(usborn)99     0.05090    0.70441   0.072   0.9424    
## factor(smoke)1      -0.08135    0.01876  -4.338 1.46e-05 ***
## factor(agec)(25,35] -0.03480    0.03434  -1.013   0.3109    
## factor(agec)(35,45] -0.05210    0.03607  -1.444   0.1487    
## factor(agec)(45,55] -0.01452    0.03461  -0.419   0.6749    
## factor(agec)(55,65] -0.07198    0.03492  -2.061   0.0393 *  
## factor(agec)(65,75] -0.16193    0.03755  -4.313 1.64e-05 ***
## factor(agec)(75,85] -0.16129    0.03806  -4.238 2.28e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.9786789)
## 
##     Null deviance: 4742.4  on 7044  degrees of freedom
## Residual deviance: 4667.1  on 7033  degrees of freedom
##   (105008 observations deleted due to missingness)
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
  1. Consider a Negative binomial model
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="ststr"))
## 
## z test of coefficients:
## 
##                   Estimate Std. Error  z value  Pr(>|z|)    
## (Intercept)       0.823326   0.093061   8.8471 < 2.2e-16 ***
## factor(usborn)12 -0.219217   0.096802  -2.2646  0.023537 *  
## factor(usborn)20 -0.102739   0.093926  -1.0938  0.274026    
## factor(smoke)1   -0.056769   0.018835  -3.0141  0.002578 ** 
## factor(poor)1    -0.246474   0.022273 -11.0662 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Fit the Negative Binomial GLM
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
fit.nb1<-glm.nb(ervisit~factor(agec),
              data=sub,
              weights=perweight/mean(perweight, na.rm=T))

fit.nb2<-glm.nb(ervisit~factor(usborn)+factor(smoke)+factor(agec),
              data=sub,
              weights=perweight/mean(perweight, na.rm=T))
  1. Compare the model fits of the alternative models using AIC
#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
library(stargazer)

stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]),  se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4])   )
## 
## ---------------------------------------------------
##                                 ervisit            
##                         Model 1         Model 2    
## ---------------------------------------------------
## factor(usborn)12                        -0.334     
##                                         (0.179)    
## factor(usborn)20                        -0.245     
##                                         (0.175)    
## factor(usborn)97                       -0.961***   
##                                         (0.179)    
## factor(usborn)99                        -0.149     
##                                         (0.180)    
## factor(smoke)1                         -0.110**    
##                                         (0.039)    
## factor(agec)(25,35]     -0.008          -0.011     
##                         (0.078)         (0.078)    
## factor(agec)(35,45]      0.019           0.013     
##                         (0.084)         (0.084)    
## factor(agec)(45,55]      0.046           0.021     
##                         (0.075)         (0.075)    
## factor(agec)(55,65]     -0.046          -0.077     
##                         (0.070)         (0.070)    
## factor(agec)(65,75]    -0.228***       -0.254***   
##                         (0.069)         (0.069)    
## factor(agec)(75,85]     -0.153*        -0.170**    
##                         (0.065)         (0.065)    
## Constant                0.599**        0.940***    
##                         (0.195)         (0.195)    
## N                        2,818           2,818     
## Log Likelihood        -4,296.337      -4,283.931   
## theta               51.768 (29.248) 59.722 (38.498)
## AIC                    8,606.675       8,591.861   
## ---------------------------------------------------
## *p < .05; **p < .01; ***p < .001
##Based on the AIC, model 2(16,780.780) is better compared to model one(16,780.780 )
##
AIC(fit.nb1) ##Negative model 1
## [1] 8606.675
AIC(fit.nb2) ##Negative model 2
## [1] 8591.861
AIC(fit1) ##Poisson model 1
##       eff.p         AIC    deltabar 
##   12.195393 4471.971650    1.108672
AIC(fit2) ##Poisson model 2
## [1] 19946.48
AIC(fit3) ##Poisson model 3
## [1] NA