temp <- tempfile()
#Download the SAS dataset as a ZIP compressed archive
download.file("https://data.hrsa.gov/DataDownload/AHRF/AHRF_2019-2020_SAS.zip", temp)
#Read SAS data into R
ahrf<-haven::read_sas(unz(temp,
filename = "ahrf2020.sas7bdat"))
rm(temp)
#1) Define a count outcome for the dataset of your choosing, the Area Resource File used in class provides many options here Answer:
answer: Unemployment rate ### State a research question about your outcome
answer: Is race associated with unemployment rate?
education,urban/rural, median age
#Filter needed data
library(tidyverse)
ahrf2<-ahrf%>%
mutate(cofips = f00004,
coname = f00010,
state = f00011,
popn = f1198414,
laborfc14 = f1451014,
Unemploy14 = f1451214,
unemployrate14 = 1000*(f1451214/f1451014), #Rate per 1000 unemployment
majoritypop10 = f0453710,
hsdegree14 =f1448114,
medianage10= f1348310,
rucc = as.factor(f0002013) )%>%
mutate(rucc = droplevels(rucc, ""))%>%
dplyr::select(laborfc14,
Unemploy14,
unemployrate14,
state,
cofips,
coname,
popn,
medianage10,
rucc,
majoritypop10,
hsdegree14)%>%
filter(complete.cases(.))%>%
as.data.frame()
ahrf_m%>%
ggplot()+
geom_histogram(aes(x = unemployrate14))+
labs(title = "Distribution of Unemployment Rates in US Counties",
subtitle = "2015 - 2019")+
xlab("Rate per 1,000 Unemployment rate")+
ylab ("Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(tmap)
tm_shape(ahrf_m)+
tm_polygons(col = "unemployrate14",
border.col = NULL,
title="Unemployment Rt",
palette="Blues",
style="quantile",
n=5,
showNA=T, colorNA = "grey50")+
tm_format(format= "World",
main.title="US Unemployment Rate by County",
legend.position = c("left", "bottom"),
main.title.position =c("center"))+
tm_scale_bar(position = c(.1,0))+
tm_compass()+
tm_shape(sts)+
tm_lines( col = "black")
glm1<- glm(unemployrate14 ~ majoritypop10 + rucc + hsdegree14 + medianage10,
data = ahrf_m,
family =gaussian)
glm1<-glm1%>%
tbl_regression()
summary(glm1)
## Length Class Mode
## table_body 24 broom.helpers list
## table_styling 7 -none- list
## N 1 -none- numeric
## n 1 -none- numeric
## model_obj 30 glm list
## inputs 10 -none- list
## call_list 15 -none- list
glmb<- glm(cbind(Unemploy14, laborfc14-Unemploy14) ~ majoritypop10 + rucc + hsdegree14 + medianage10,
data = ahrf_m,
family = binomial)
glmb%>%
tbl_regression()
Characteristic | log(OR)1 | 95% CI1 | p-value |
---|---|---|---|
Percent White Population 2010 | -0.01 | -0.01, -0.01 | <0.001 |
rucc | |||
01 | — | — | |
02 | 0.13 | 0.13, 0.13 | <0.001 |
03 | 0.15 | 0.15, 0.16 | <0.001 |
04 | 0.17 | 0.17, 0.18 | <0.001 |
05 | 0.13 | 0.12, 0.13 | <0.001 |
06 | 0.09 | 0.08, 0.09 | <0.001 |
07 | 0.05 | 0.04, 0.05 | <0.001 |
08 | 0.08 | 0.07, 0.09 | <0.001 |
09 | 0.00 | -0.01, 0.01 | 0.5 |
% Persons 25+ w/HS Dipl or more 2014-18 | -0.02 | -0.02, -0.02 | <0.001 |
Median Age 2010 | 0.01 | 0.01, 0.01 | <0.001 |
1
OR = Odds Ratio, CI = Confidence Interval
|
summary(glmb)
##
## Call:
## glm(formula = cbind(Unemploy14, laborfc14 - Unemploy14) ~ majoritypop10 +
## rucc + hsdegree14 + medianage10, family = binomial, data = ahrf_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -94.826 -7.009 -1.591 5.253 116.472
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.990e-01 5.771e-03 -138.445 <2e-16 ***
## majoritypop10 -8.763e-03 2.695e-05 -325.105 <2e-16 ***
## rucc02 1.301e-01 9.085e-04 143.187 <2e-16 ***
## rucc03 1.536e-01 1.290e-03 119.058 <2e-16 ***
## rucc04 1.737e-01 1.801e-03 96.438 <2e-16 ***
## rucc05 1.275e-01 2.918e-03 43.687 <2e-16 ***
## rucc06 8.533e-02 1.834e-03 46.529 <2e-16 ***
## rucc07 4.893e-02 2.424e-03 20.185 <2e-16 ***
## rucc08 8.332e-02 4.434e-03 18.791 <2e-16 ***
## rucc09 -2.753e-03 4.333e-03 -0.635 0.525
## hsdegree14 -2.181e-02 7.255e-05 -300.652 <2e-16 ***
## medianage10 1.348e-02 1.028e-04 131.189 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 838027 on 3099 degrees of freedom
## Residual deviance: 489127 on 3088 degrees of freedom
## AIC: 514853
##
## Number of Fisher Scoring iterations: 4
glmb%>%
tbl_regression(exponentiate=TRUE)
Characteristic | OR1 | 95% CI1 | p-value |
---|---|---|---|
Percent White Population 2010 | 0.99 | 0.99, 0.99 | <0.001 |
rucc | |||
01 | — | — | |
02 | 1.14 | 1.14, 1.14 | <0.001 |
03 | 1.17 | 1.16, 1.17 | <0.001 |
04 | 1.19 | 1.19, 1.19 | <0.001 |
05 | 1.14 | 1.13, 1.14 | <0.001 |
06 | 1.09 | 1.09, 1.09 | <0.001 |
07 | 1.05 | 1.05, 1.06 | <0.001 |
08 | 1.09 | 1.08, 1.10 | <0.001 |
09 | 1.00 | 0.99, 1.01 | 0.5 |
% Persons 25+ w/HS Dipl or more 2014-18 | 0.98 | 0.98, 0.98 | <0.001 |
Median Age 2010 | 1.01 | 1.01, 1.01 | <0.001 |
1
OR = Odds Ratio, CI = Confidence Interval
|
summary(ahrf_m$Unemploy14)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 259 670 3028 1903 357178
ahrf_m$Unemploy14<- ahrf_m$Unemploy14 +1
Yes an offset term is neccesary. One of the conditions of using the poisson model is that each area or person has the same risk or population size. Since this is rear in demography and not the case for my population of study, poisson second type modelling( rate model) will be used.
The offset term is how unequal population size is incorporated. Possion is a strictly integer count so we can’t use the rate/probability so we log instead.
After doing the summary of the data, the outcome variable( unemployed) contain zero. This would bring an error if done directly. So I added 1 to the variable(population). I used 1 because they are individuals and we can’t have 0.9 or 0.1 individual.
glmp_s <- glm(Unemploy14 ~ offset(log(laborfc14)) + majoritypop10 + rucc + hsdegree14 + medianage10,
data=ahrf_m,
family=poisson)
glmp_s%>%
tbl_regression(exp = TRUE)
Characteristic | IRR1 | 95% CI1 | p-value |
---|---|---|---|
Percent White Population 2010 | 0.99 | 0.99, 0.99 | <0.001 |
rucc | |||
01 | — | — | |
02 | 1.13 | 1.13, 1.13 | <0.001 |
03 | 1.16 | 1.15, 1.16 | <0.001 |
04 | 1.18 | 1.17, 1.18 | <0.001 |
05 | 1.13 | 1.12, 1.13 | <0.001 |
06 | 1.08 | 1.08, 1.09 | <0.001 |
07 | 1.05 | 1.04, 1.05 | <0.001 |
08 | 1.08 | 1.07, 1.09 | <0.001 |
09 | 1.00 | 0.99, 1.01 | 0.5 |
% Persons 25+ w/HS Dipl or more 2014-18 | 0.98 | 0.98, 0.98 | <0.001 |
Median Age 2010 | 1.01 | 1.01, 1.01 | <0.001 |
1
IRR = Incidence Rate Ratio, CI = Confidence Interval
|
scale<-sqrt(glmp_s$deviance/glmp_s$df.residual)
scale
## [1] 12.18334
ANSWER: So we have about 12 times more variation in the data than we are supppose to have and thats is not good.
1-pchisq(glmp_s$deviance,
df = glmp_s$df.residual)
## [1] 0
ANSWER: No it is not a good choice since the goodness of fit results to zero. Since its zero it means the poisson distribution is a wrong choice
glmnb<- glm.nb(Unemploy14 ~ offset(log(laborfc14)) + majoritypop10 + rucc + hsdegree14 + medianage10,
data=ahrf_m)
glmnb%>%
tbl_regression( exp= T)
Characteristic | IRR1 | 95% CI1 | p-value |
---|---|---|---|
Percent White Population 2010 | 0.99 | 0.99, 0.99 | <0.001 |
rucc | |||
01 | — | — | |
02 | 1.07 | 1.02, 1.13 | 0.009 |
03 | 1.10 | 1.04, 1.16 | <0.001 |
04 | 1.16 | 1.09, 1.23 | <0.001 |
05 | 1.08 | 0.99, 1.18 | 0.072 |
06 | 1.04 | 0.99, 1.09 | 0.2 |
07 | 1.00 | 0.95, 1.06 | 0.9 |
08 | 1.05 | 0.98, 1.12 | 0.14 |
09 | 0.87 | 0.82, 0.92 | <0.001 |
% Persons 25+ w/HS Dipl or more 2014-18 | 0.97 | 0.97, 0.98 | <0.001 |
Median Age 2010 | 1.01 | 1.00, 1.01 | <0.001 |
1
IRR = Incidence Rate Ratio, CI = Confidence Interval
|
summary(glmnb)
##
## Call:
## glm.nb(formula = Unemploy14 ~ offset(log(laborfc14)) + majoritypop10 +
## rucc + hsdegree14 + medianage10, data = ahrf_m, init.theta = 7.016169231,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.8578 -0.6976 -0.0829 0.5104 4.3088
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0308562 0.1074249 -0.287 0.773932
## majoritypop10 -0.0095723 0.0004972 -19.252 < 2e-16 ***
## rucc02 0.0707748 0.0269282 2.628 0.008582 **
## rucc03 0.0927867 0.0274854 3.376 0.000736 ***
## rucc04 0.1480640 0.0320586 4.619 3.86e-06 ***
## rucc05 0.0799343 0.0443662 1.802 0.071593 .
## rucc06 0.0356470 0.0254254 1.402 0.160909
## rucc07 0.0038485 0.0271980 0.142 0.887474
## rucc08 0.0501769 0.0338368 1.483 0.138099
## rucc09 -0.1377090 0.0293620 -4.690 2.73e-06 ***
## hsdegree14 -0.0271606 0.0012568 -21.611 < 2e-16 ***
## medianage10 0.0065741 0.0016927 3.884 0.000103 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(7.0162) family taken to be 1)
##
## Null deviance: 4903.0 on 3099 degrees of freedom
## Residual deviance: 3336.3 on 3088 degrees of freedom
## AIC: 43702
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 7.016
## Std. Err.: 0.187
##
## 2 x log-likelihood: -43675.746
glmnb2<-gamlss(Unemploy14 ~ offset(log(laborfc14)) + majoritypop10 + rucc + hsdegree14 + medianage10,
family = NBII,
data=ahrf_m)
## GAMLSS-RS iteration 1: Global Deviance = 55395.73
## GAMLSS-RS iteration 2: Global Deviance = 54494.6
## GAMLSS-RS iteration 3: Global Deviance = 53380.76
## GAMLSS-RS iteration 4: Global Deviance = 51973.44
## GAMLSS-RS iteration 5: Global Deviance = 50176.26
## GAMLSS-RS iteration 6: Global Deviance = 47934.43
## GAMLSS-RS iteration 7: Global Deviance = 45653.22
## GAMLSS-RS iteration 8: Global Deviance = 44528.02
## GAMLSS-RS iteration 9: Global Deviance = 44387.13
## GAMLSS-RS iteration 10: Global Deviance = 44382.64
## GAMLSS-RS iteration 11: Global Deviance = 44382.56
## GAMLSS-RS iteration 12: Global Deviance = 44382.56
## GAMLSS-RS iteration 13: Global Deviance = 44382.56
summary(glmnb2)
## ******************************************************************
## Family: c("NBII", "Negative Binomial type II")
##
## Call: gamlss(formula = Unemploy14 ~ offset(log(laborfc14)) +
## majoritypop10 + rucc + hsdegree14 + medianage10,
## family = NBII, data = ahrf_m)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: log
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0627151 0.0675158 -15.740 < 2e-16 ***
## majoritypop10 -0.0078546 0.0003125 -25.133 < 2e-16 ***
## rucc02 0.1206288 0.0108562 11.112 < 2e-16 ***
## rucc03 0.1464894 0.0152583 9.601 < 2e-16 ***
## rucc04 0.1701897 0.0210841 8.072 9.81e-16 ***
## rucc05 0.1210175 0.0340058 3.559 0.000378 ***
## rucc06 0.1354850 0.0207976 6.514 8.49e-11 ***
## rucc07 0.1127672 0.0266848 4.226 2.45e-05 ***
## rucc08 0.2529747 0.0435144 5.814 6.74e-09 ***
## rucc09 0.2919244 0.0386000 7.563 5.17e-14 ***
## hsdegree14 -0.0205772 0.0008565 -24.026 < 2e-16 ***
## medianage10 0.0141798 0.0012051 11.766 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.01046 0.02676 187.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## No. of observations in the fit: 3100
## Degrees of Freedom for the fit: 13
## Residual Deg. of Freedom: 3087
## at cycle: 13
##
## Global Deviance: 44382.56
## AIC: 44408.56
## SBC: 44487.07
## ******************************************************************
#4) Compare the model fits of the alternative models using AIC
AIC(glm1, glmb, glmp_b, glmnb, glmnb2)
AIC(glmp_s, glmb,glmnb, glmnb2)
ANSWER: Among these four model specifications, the Binomial fits the worst and the NB1 model, estimated by glm.nb() fits the best, with the lowest AIC among these four models