library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(tigris)
library(haven)
tb19 <- read_dta("state_19_working_pudf.dta")
nams<-names(tb19)
head(nams, n=10)
## [1] "year" "ststr" "state" "fmonth" "dispcode" "orgseqno"
## [7] "c01q01" "fairpoor" "c02q01" "phyng"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(tb19)<-newnames
Recode
#recode
tb19$dia<-Recode(tb19$m02q04, recodes = "88=0; 77=NA; 99=NA")
#age
tb19$agegr4 = as.factor(tb19$agegr4)
tb19$age<-Recode(tb19$agegr4, recodes="1='18-19'; 2='30-44'; 3='45-64'; 4='65+'; else=NA", as.factor=T)
#insurance
tb19$c03q01 = as.factor(tb19$c03q01)
tb19$ins<-Recode(tb19$c03q01, recodes ="1=1;2=0")
#employment
tb19$c08q14 = as.factor(tb19$c08q14)
tb19$employ<-Recode(tb19$c08q14, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
tb19$employ<-relevel(tb19$employ, ref='Employed')
#education level
tb19$educat4 = as.factor(tb19$educat4)
tb19$educ<-Recode(tb19$educat4, recodes="1='leshs'; 2='hsgrad'; 3='sumcol'; 4='colgrad'", as.factor=T)
tb19$educ<-relevel(tb19$educ, ref='hsgrad')
#race/ethnicity
tb19$race = as.factor(tb19$race)
tb19$black<-Recode(tb19$race, recodes="2=1; 9=NA; else=0")
tb19$white<-Recode(tb19$race, recodes="1=1; 9=NA; else=0")
tb19$other<-Recode(tb19$race, recodes="3:7=1; 9=NA; else=0")
tb19$hispanic<-Recode(tb19$race, recodes="8=1; else=0")
tb19$racegr5 = as.factor(tb19$racegr5)
tb19$ethn<-Recode(tb19$racegr5, recodes="1='nhwhite'; 2='nh black'; 3='Hispanic';4='othernh'; 5='multinh'; else=NA", as.factor = T)
#smoking currently
tb19$smoker2 = as.factor(tb19$smoker2)
tb19$smoke<-Recode(tb19$smoker2, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
tb19$smoke<-relevel(tb19$smoke, ref = "NeverSmoked")
#Poor or fair self rated health
tb19$fairpoor = as.factor(tb19$fairpoor)
tb19$genhlth<-Recode(tb19$fairpoor, recodes="1='poor'; 2='good'")
#sex
tb19$sex = as.factor(tb19$sex)
tb19$sex<-Recode(tb19$sex, recodes="1='Male'; 2='Female'")
#marital status
tb19$c08q05 = as.factor(tb19$c08q05)
tb19$marital<-Recode(tb19$c08q05, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
tb19$marital<-relevel(tb19$marital, ref='married')
#income
tb19$c08q16 = as.factor(tb19$c08q16)
tb19$income<-Recode(tb19$c08q16, recodes="1='less10k';2='10-15k'; 3='15-20k'; 4='20-25k'; 5='25-35';6='35-50k';7='50-75';8='75k'; else=NA")
Hist and Summary
hist(tb19$dia)
summary(tb19$dia)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 3.000 3.935 4.000 76.000 10504
Define count Outcome:
#1) Define a count outcome for the data set of your choosing: count count outcome is the times a health professional has been seen for diabetes in the past 12 months.
# a. State a research question about your outcome
#Which populations visit health professional for diabetes the most?
#b. Is an offset term necessary? why or why not?
# No, because the counts over time are the same for my outcome variable.
Subset Data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sub19<-tb19%>%
select(dia,smoke,genhlth,sex,age,ins,employ,educ,ethn,income,llcpwt, ststr) %>%
filter( complete.cases(.))
#survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data =sub19 )
Consider a Poisson regression model for the outcome
#2) #a. Evaluate the level of dispersion in the outcome
#b. Is the Poisson model a good choice? The scale number is 2.013713, the Poisson model is not a good choice.
#simple descriptives
svyhist(~dia, des)
svyby(~dia, ~ethn+ins, des, svymean, na.rm=T)
svyby(~dia, ~genhlth, des, svymean, na.rm=T)
#Poisson glm fit to survey data
fit1<-svyglm(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income), design=des, family=poisson)
summary(fit1)
##
## Call:
## svyglm(formula = dia ~ factor(smoke) + factor(genhlth) + factor(sex) +
## factor(age) + factor(ins) + factor(employ) + factor(educ) +
## factor(ethn) + factor(income), design = des, family = poisson)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = sub19)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09945 0.43877 0.227 0.82073
## factor(smoke)Current -0.04350 0.23157 -0.188 0.85103
## factor(smoke)Former -0.06422 0.13399 -0.479 0.63182
## factor(genhlth)poor 0.47614 0.11949 3.985 7.15e-05 ***
## factor(sex)Male 0.08183 0.13830 0.592 0.55415
## factor(age)30-44 0.43053 0.35624 1.209 0.22707
## factor(age)45-64 0.25331 0.20071 1.262 0.20716
## factor(age)65+ -0.02424 0.22935 -0.106 0.91584
## factor(ins)1 0.61324 0.32719 1.874 0.06113 .
## factor(employ)nilf 0.26355 0.23658 1.114 0.26550
## factor(employ)retired 0.49501 0.17919 2.763 0.00582 **
## factor(employ)unable 0.50446 0.21102 2.391 0.01697 *
## factor(educ)colgrad 0.29702 0.20665 1.437 0.15089
## factor(educ)leshs 0.18601 0.18166 1.024 0.30605
## factor(educ)sumcol 0.26037 0.17444 1.493 0.13579
## factor(ethn)multinh -0.51234 0.19779 -2.590 0.00970 **
## factor(ethn)nh black -0.31056 0.17657 -1.759 0.07885 .
## factor(ethn)nhwhite -0.19995 0.13663 -1.463 0.14359
## factor(ethn)othernh -0.43052 0.25415 -1.694 0.09052 .
## factor(income)15-20k -0.06997 0.19351 -0.362 0.71773
## factor(income)20-25k -0.03118 0.26980 -0.116 0.90803
## factor(income)25-35 -0.16057 0.23523 -0.683 0.49499
## factor(income)35-50k -0.19209 0.21457 -0.895 0.37084
## factor(income)50-75 -0.24500 0.22381 -1.095 0.27387
## factor(income)75k -0.19404 0.27701 -0.700 0.48375
## factor(income)less10k 0.17866 0.32567 0.549 0.58338
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 6.148811)
##
## Number of Fisher Scoring iterations: 6
#poisson risk ratios
round(exp(summary(fit1)$coef[-1,1]), 3)
## factor(smoke)Current factor(smoke)Former factor(genhlth)poor
## 0.957 0.938 1.610
## factor(sex)Male factor(age)30-44 factor(age)45-64
## 1.085 1.538 1.288
## factor(age)65+ factor(ins)1 factor(employ)nilf
## 0.976 1.846 1.302
## factor(employ)retired factor(employ)unable factor(educ)colgrad
## 1.641 1.656 1.346
## factor(educ)leshs factor(educ)sumcol factor(ethn)multinh
## 1.204 1.297 0.599
## factor(ethn)nh black factor(ethn)nhwhite factor(ethn)othernh
## 0.733 0.819 0.650
## factor(income)15-20k factor(income)20-25k factor(income)25-35
## 0.932 0.969 0.852
## factor(income)35-50k factor(income)50-75 factor(income)75k
## 0.825 0.783 0.824
## factor(income)less10k
## 1.196
Deviance
fit2<-glm(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income), data=tb19, family=poisson)
summary(fit2)
##
## Call:
## glm(formula = dia ~ factor(smoke) + factor(genhlth) + factor(sex) +
## factor(age) + factor(ins) + factor(employ) + factor(educ) +
## factor(ethn) + factor(income), family = poisson, data = tb19)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3483 -1.4197 -0.4070 0.3758 16.5912
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.529031 0.181763 2.911 0.00361 **
## factor(smoke)Current -0.240140 0.046484 -5.166 2.39e-07 ***
## factor(smoke)Former -0.135687 0.032190 -4.215 2.50e-05 ***
## factor(genhlth)poor 0.233341 0.030692 7.603 2.90e-14 ***
## factor(sex)Male -0.056083 0.029970 -1.871 0.06130 .
## factor(age)30-44 0.485152 0.179542 2.702 0.00689 **
## factor(age)45-64 0.384740 0.172650 2.228 0.02585 *
## factor(age)65+ 0.369221 0.174538 2.115 0.03439 *
## factor(ins)1 0.293865 0.056902 5.164 2.41e-07 ***
## factor(employ)nilf 0.181283 0.056551 3.206 0.00135 **
## factor(employ)retired 0.227503 0.044671 5.093 3.53e-07 ***
## factor(employ)unable 0.632710 0.047352 13.362 < 2e-16 ***
## factor(educ)colgrad -0.097366 0.041992 -2.319 0.02041 *
## factor(educ)leshs -0.078257 0.044221 -1.770 0.07678 .
## factor(educ)sumcol -0.022308 0.037676 -0.592 0.55378
## factor(ethn)multinh -0.158384 0.131809 -1.202 0.22951
## factor(ethn)nh black -0.242841 0.052035 -4.667 3.06e-06 ***
## factor(ethn)nhwhite -0.092144 0.034679 -2.657 0.00788 **
## factor(ethn)othernh -0.006529 0.076922 -0.085 0.93236
## factor(income)15-20k 0.030472 0.058567 0.520 0.60286
## factor(income)20-25k 0.049655 0.059272 0.838 0.40217
## factor(income)25-35 0.051704 0.063690 0.812 0.41690
## factor(income)35-50k -0.078388 0.063553 -1.233 0.21742
## factor(income)50-75 -0.075047 0.065047 -1.154 0.24861
## factor(income)75k 0.018270 0.063795 0.286 0.77459
## factor(income)less10k 0.322583 0.059490 5.422 5.88e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6016.3 on 1346 degrees of freedom
## Residual deviance: 5356.7 on 1321 degrees of freedom
## (10950 observations deleted due to missingness)
## AIC: 9016.5
##
## Number of Fisher Scoring iterations: 6
scale<-sqrt(fit2$deviance/fit2$df.residual)
scale
## [1] 2.013713
Deviance as test model with chi square and degrees of freedom.
1-pchisq(fit2$deviance, df = fit2$df.residual)
## [1] 0
#the model does not fit the data because the p-value is zero.
fit3<-glm(dia~factor(smoke) + factor(genhlth) + factor(sex) +
factor(age) + factor(ins) + factor(employ) + factor(educ) +
factor(ethn) + factor(income), data=tb19, family=quasipoisson)
summary(fit3)
##
## Call:
## glm(formula = dia ~ factor(smoke) + factor(genhlth) + factor(sex) +
## factor(age) + factor(ins) + factor(employ) + factor(educ) +
## factor(ethn) + factor(income), family = quasipoisson, data = tb19)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3483 -1.4197 -0.4070 0.3758 16.5912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.529031 0.493619 1.072 0.28403
## factor(smoke)Current -0.240140 0.126237 -1.902 0.05735 .
## factor(smoke)Former -0.135687 0.087420 -1.552 0.12087
## factor(genhlth)poor 0.233341 0.083351 2.800 0.00519 **
## factor(sex)Male -0.056083 0.081389 -0.689 0.49090
## factor(age)30-44 0.485152 0.487585 0.995 0.31991
## factor(age)45-64 0.384740 0.468869 0.821 0.41204
## factor(age)65+ 0.369221 0.473998 0.779 0.43615
## factor(ins)1 0.293865 0.154530 1.902 0.05743 .
## factor(employ)nilf 0.181283 0.153578 1.180 0.23805
## factor(employ)retired 0.227503 0.121313 1.875 0.06097 .
## factor(employ)unable 0.632710 0.128594 4.920 9.73e-07 ***
## factor(educ)colgrad -0.097366 0.114038 -0.854 0.39336
## factor(educ)leshs -0.078257 0.120093 -0.652 0.51475
## factor(educ)sumcol -0.022308 0.102318 -0.218 0.82744
## factor(ethn)multinh -0.158384 0.357957 -0.442 0.65823
## factor(ethn)nh black -0.242841 0.141312 -1.718 0.08595 .
## factor(ethn)nhwhite -0.092144 0.094177 -0.978 0.32805
## factor(ethn)othernh -0.006529 0.208898 -0.031 0.97507
## factor(income)15-20k 0.030472 0.159052 0.192 0.84810
## factor(income)20-25k 0.049655 0.160965 0.308 0.75776
## factor(income)25-35 0.051704 0.172963 0.299 0.76504
## factor(income)35-50k -0.078388 0.172593 -0.454 0.64978
## factor(income)50-75 -0.075047 0.176650 -0.425 0.67102
## factor(income)75k 0.018270 0.173250 0.105 0.91603
## factor(income)less10k 0.322583 0.161559 1.997 0.04606 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 7.375155)
##
## Null deviance: 6016.3 on 1346 degrees of freedom
## Residual deviance: 5356.7 on 1321 degrees of freedom
## (10950 observations deleted due to missingness)
## AIC: NA
##
## Number of Fisher Scoring iterations: 6
#3) Consider a Negative binomial model
#use sample weights to fit negative binomial model
library(lmtest)
library(sandwich)
coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="orgseqno"))
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.529031 0.288499 1.8337 0.0666928 .
## factor(smoke)Current -0.240140 0.126352 -1.9006 0.0573598 .
## factor(smoke)Former -0.135687 0.092659 -1.4644 0.1430947
## factor(genhlth)poor 0.233341 0.105647 2.2087 0.0271963 *
## factor(sex)Male -0.056083 0.084657 -0.6625 0.5076702
## factor(age)30-44 0.485152 0.305325 1.5890 0.1120671
## factor(age)45-64 0.384740 0.222489 1.7293 0.0837630 .
## factor(age)65+ 0.369221 0.241198 1.5308 0.1258227
## factor(ins)1 0.293865 0.254163 1.1562 0.2475957
## factor(employ)nilf 0.181283 0.172496 1.0509 0.2932857
## factor(employ)retired 0.227503 0.118418 1.9212 0.0547087 .
## factor(employ)unable 0.632710 0.182881 3.4597 0.0005408 ***
## factor(educ)colgrad -0.097366 0.146647 -0.6639 0.5067225
## factor(educ)leshs -0.078257 0.135179 -0.5789 0.5626449
## factor(educ)sumcol -0.022308 0.118454 -0.1883 0.8506207
## factor(ethn)multinh -0.158384 0.208579 -0.7593 0.4476456
## factor(ethn)nh black -0.242841 0.112079 -2.1667 0.0302581 *
## factor(ethn)nhwhite -0.092144 0.106277 -0.8670 0.3859344
## factor(ethn)othernh -0.006529 0.152175 -0.0429 0.9657776
## factor(income)15-20k 0.030472 0.162798 0.1872 0.8515232
## factor(income)20-25k 0.049655 0.179288 0.2770 0.7818131
## factor(income)25-35 0.051704 0.165020 0.3133 0.7540382
## factor(income)35-50k -0.078388 0.191578 -0.4092 0.6824152
## factor(income)50-75 -0.075047 0.162132 -0.4629 0.6434525
## factor(income)75k 0.018270 0.173912 0.1051 0.9163351
## factor(income)less10k 0.322583 0.172403 1.8711 0.0613318 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#fit negative binomail glm
#library(MASS)
#fit_nb<- glm.nb(lowbw1416 ~ offset(log(births1416)) + hpsa10,
#data=arf2018sub)
#summary(fit_nb)
library(stargazer)
library(MASS)
fit.nb1<-glm.nb(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income),
data=sub19,
weights=llcpwt/mean(llcpwt, na.rm=T))
fit.nb2<-glm.nb(dia~factor(smoke)+factor(genhlth)+factor(sex)+factor(age)+factor(ins)+factor(employ)+factor(educ)+factor(ethn)+factor(income),
data=sub19,
weights=llcpwt/mean(llcpwt, na.rm=T))
#clx2(fit.nb2,cluster =sub$ststr)
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="orgseqno"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="orgseqno"))
stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]), se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4]) )
##
## -------------------------------------------------------
## dia
## Model 1 Model 2
## -------------------------------------------------------
## factor(smoke)Current -0.155 -0.155
## (0.195) (0.195)
## factor(smoke)Former -0.048 -0.048
## (0.117) (0.117)
## factor(genhlth)poor 0.472*** 0.472***
## (0.104) (0.104)
## factor(sex)Male 0.045 0.045
## (0.120) (0.120)
## factor(age)30-44 0.283 0.283
## (0.312) (0.312)
## factor(age)45-64 0.298 0.298
## (0.199) (0.199)
## factor(age)65+ 0.125 0.125
## (0.217) (0.217)
## factor(ins)1 0.508 0.508
## (0.262) (0.262)
## factor(employ)nilf 0.207 0.207
## (0.207) (0.207)
## factor(employ)retired 0.360* 0.360*
## (0.155) (0.155)
## factor(employ)unable 0.452* 0.452*
## (0.204) (0.204)
## factor(educ)colgrad 0.199 0.199
## (0.172) (0.172)
## factor(educ)leshs 0.134 0.134
## (0.162) (0.162)
## factor(educ)sumcol 0.234 0.234
## (0.162) (0.162)
## factor(ethn)multinh -0.431* -0.431*
## (0.191) (0.191)
## factor(ethn)nh black -0.225 -0.225
## (0.167) (0.167)
## factor(ethn)nhwhite -0.156 -0.156
## (0.123) (0.123)
## factor(ethn)othernh -0.294 -0.294
## (0.221) (0.221)
## factor(income)15-20k -0.125 -0.125
## (0.199) (0.199)
## factor(income)20-25k 0.029 0.029
## (0.266) (0.266)
## factor(income)25-35 -0.197 -0.197
## (0.226) (0.226)
## factor(income)35-50k -0.220 -0.220
## (0.211) (0.211)
## factor(income)50-75 -0.247 -0.247
## (0.222) (0.222)
## factor(income)75k -0.219 -0.219
## (0.255) (0.255)
## factor(income)less10k 0.092 0.092
## (0.349) (0.349)
## Constant 0.259 0.259
## (0.392) (0.392)
## N 1,347 1,347
## Log Likelihood -3,164.959 -3,164.959
## theta 1.803*** (0.101) 1.803*** (0.101)
## AIC 6,381.918 6,381.918
## -------------------------------------------------------
## *p < .05; **p < .01; ***p < .001
#4) Compare the model fits of the alternative models using AIC
AIC(fit.nb1)
## [1] 6381.918
AIC(fit.nb2)
## [1] 6381.918
AIC(fit1)
## eff.p AIC deltabar
## 535.44355 5919.10504 21.41774
AIC(fit2)
## [1] 9016.534
#Based on the model numbers, the model that best fits is fit1 with AIC of 5919.10504 with the most parsimonious fit.