temp <- tempfile()
#Download the SAS dataset as a ZIP compressed archive
download.file("https://data.hrsa.gov/DataDownload/AHRF/AHRF_2019-2020_SAS.zip", temp)
#Read SAS data into R
ahrf<-haven::read_sas(unz(temp,
filename = "ahrf2020.sas7bdat"))
rm(temp)
#1) Define a count outcome for the dataset of your choosing, the Area Resource File used in class provides many options here Answer:
Count outcome: answer: Insured rate
Predictors: race, education,urban/rural, median age
State a research question about your outcome
answer: Is rate of insured associated with uninsured rate?
library(tidyverse)
ahrf2<-ahrf%>%
mutate(cofips = f00004,
coname = f00010,
state = f00011,
popn = f1198414,
laborfc14 = f1451014,
uninsured14 = f1551715,
uninsuredrate14 = 1000*(f1551715/f1451014), #Rate per 1000 uninsured
majoritypop10 = f0453710,
hsdegree14 =f1448114,
medianage10= f1348310,
rucc = as.factor(f0002013) )%>%
mutate(rucc = droplevels(rucc, ""))%>%
dplyr::select(laborfc14,
uninsured14,
uninsuredrate14,
state,
cofips,
coname,
popn,
medianage10,
rucc,
majoritypop10,
hsdegree14)%>%
filter(complete.cases(.))%>%
as.data.frame()
options(tigris_class="sf")
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE
usco<-counties(cb = T, year= 2016)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======== | 12%
|
|============= | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=========================== | 39%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================ | 46%
|
|========================================= | 59%
|
|============================================ | 63%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 69%
|
|================================================= | 70%
|
|=========================================================== | 84%
|
|============================================================== | 88%
|
|================================================================ | 92%
|
|======================================================================| 100%
usco$cofips<-usco$GEOID
sts<-states(cb = T, year = 2016)
##
|
| | 0%
|
|== | 2%
|
|== | 3%
|
|==== | 5%
|
|===== | 7%
|
|======== | 11%
|
|======== | 12%
|
|========================================= | 59%
|
|======================================================== | 80%
|
|======================================================================| 100%
sts<-st_boundary(sts)%>%
filter(!STATEFP %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
st_transform(crs = 2163)
ahrf_m<-left_join(usco, ahrf2,
by = "cofips")%>%
filter(is.na(uninsuredrate14)==F,
!STATEFP %in% c("02", "15", "60", "66", "69", "72", "78"))%>%
st_transform(crs = 2163)
glimpse(ahrf_m)
## Rows: 3,100
## Columns: 21
## $ STATEFP <chr> "19", "19", "20", "20", "20", "21", "21", "21", "21", …
## $ COUNTYFP <chr> "107", "189", "093", "123", "187", "005", "029", "049"…
## $ COUNTYNS <chr> "00465242", "00465283", "00485011", "00485026", "00485…
## $ AFFGEOID <chr> "0500000US19107", "0500000US19189", "0500000US20093", …
## $ GEOID <chr> "19107", "19189", "20093", "20123", "20187", "21005", …
## $ NAME <chr> "Keokuk", "Winnebago", "Kearny", "Mitchell", "Stanton"…
## $ LSAD <chr> "06", "06", "06", "06", "06", "06", "06", "06", "06", …
## $ ALAND <dbl> 1500067253, 1037261946, 2254696689, 1817632928, 176210…
## $ AWATER <dbl> 1929323, 3182052, 1133601, 44979981, 178555, 6311537, …
## $ cofips <chr> "19107", "19189", "20093", "20123", "20187", "21005", …
## $ laborfc14 <dbl> 5255, 5551, 1755, 2840, 1046, 11242, 41271, 17270, 479…
## $ uninsured14 <dbl> 366, 309, 357, 291, 230, 725, 2441, 1351, 3432, 298, 3…
## $ uninsuredrate14 <dbl> 69.64795, 55.66565, 203.41880, 102.46479, 219.88528, 6…
## $ state <chr> "19", "19", "20", "20", "20", "21", "21", "21", "21", …
## $ coname <chr> "Keokuk", "Winnebago", "Kearny", "Mitchell", "Stanton"…
## $ popn <dbl> 10231, 10559, 3915, 6284, 2111, 21888, 77955, 35758, 9…
## $ medianage10 <dbl> 43.8, 43.5, 35.5, 45.6, 35.6, 38.4, 38.2, 39.8, 38.8, …
## $ rucc <fct> 08, 07, 09, 07, 09, 06, 01, 02, 03, 09, 03, 06, 05, 09…
## $ majoritypop10 <dbl> 98.4, 96.1, 87.3, 98.0, 83.7, 95.5, 96.8, 92.2, 91.2, …
## $ hsdegree14 <dbl> 92.5, 91.8, 74.1, 95.7, 81.8, 88.7, 88.0, 85.4, 90.0, …
## $ geometry <MULTIPOLYGON [m]> MULTIPOLYGON (((632645.1 -3..., MULTIPOLY…
ggplot() histogram of the uninsured rate for US counties.
ahrf_m%>%
ggplot()+
geom_histogram(aes(x = uninsuredrate14))+
labs(title = "Distribution of Uninsured Rates in US Counties",
subtitle = "2015 - 2019")+
xlab("Rate per 1,000 uninsured rate")+
ylab ("Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(tmap)
tm_shape(ahrf_m)+
tm_polygons(col = "uninsuredrate14",
border.col = NULL,
title="Uninsured Rt",
palette="Blues",
style="quantile",
n=5,
showNA=T, colorNA = "grey50")+
tm_format(format= "World",
main.title="US Uninsured Rate by County",
legend.position = c("left", "bottom"),
main.title.position =c("center"))+
tm_scale_bar(position = c(.1,0))+
tm_compass()+
tm_shape(sts)+
tm_lines( col = "black")
glm1<- glm(uninsuredrate14 ~ majoritypop10 + rucc + hsdegree14 + medianage10,
data = ahrf_m,
family =gaussian)
glm1<-glm1%>%
tbl_regression()
summary(glm1)
## Length Class Mode
## table_body 24 broom.helpers list
## table_styling 7 -none- list
## N 1 -none- numeric
## n 1 -none- numeric
## model_obj 30 glm list
## inputs 10 -none- list
## call_list 15 -none- list
glmb<- glm(cbind(uninsured14, laborfc14-uninsured14) ~ majoritypop10 + rucc + hsdegree14 + medianage10,
data = ahrf_m,
family = binomial)
glmb%>%
tbl_regression()
| Characteristic | log(OR)1 | 95% CI1 | p-value |
|---|---|---|---|
| Percent White Population 2010 | 0.00 | 0.00, 0.00 | <0.001 |
| rucc | |||
| 01 | — | — | |
| 02 | 0.13 | 0.12, 0.13 | <0.001 |
| 03 | 0.16 | 0.16, 0.16 | <0.001 |
| 04 | 0.16 | 0.16, 0.17 | <0.001 |
| 05 | 0.24 | 0.24, 0.24 | <0.001 |
| 06 | 0.22 | 0.22, 0.23 | <0.001 |
| 07 | 0.17 | 0.17, 0.17 | <0.001 |
| 08 | 0.26 | 0.26, 0.27 | <0.001 |
| 09 | 0.29 | 0.28, 0.30 | <0.001 |
| % Persons 25+ w/HS Dipl or more 2014-18 | -0.06 | -0.06, -0.06 | <0.001 |
| Median Age 2010 | -0.01 | -0.01, -0.01 | <0.001 |
|
1
OR = Odds Ratio, CI = Confidence Interval
|
|||
summary(glmb)
##
## Call:
## glm(formula = cbind(uninsured14, laborfc14 - uninsured14) ~ majoritypop10 +
## rucc + hsdegree14 + medianage10, family = binomial, data = ahrf_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -258.46 -12.70 0.22 11.23 314.69
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.966e+00 3.705e-03 1070.60 <2e-16 ***
## majoritypop10 -2.331e-03 1.840e-05 -126.65 <2e-16 ***
## rucc02 1.260e-01 6.310e-04 199.67 <2e-16 ***
## rucc03 1.579e-01 8.835e-04 178.67 <2e-16 ***
## rucc04 1.648e-01 1.240e-03 132.94 <2e-16 ***
## rucc05 2.400e-01 1.890e-03 127.01 <2e-16 ***
## rucc06 2.236e-01 1.196e-03 186.97 <2e-16 ***
## rucc07 1.713e-01 1.576e-03 108.70 <2e-16 ***
## rucc08 2.612e-01 2.884e-03 90.55 <2e-16 ***
## rucc09 2.902e-01 2.715e-03 106.91 <2e-16 ***
## hsdegree14 -6.043e-02 4.712e-05 -1282.40 <2e-16 ***
## medianage10 -1.187e-02 7.095e-05 -167.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6271466 on 3099 degrees of freedom
## Residual deviance: 3359574 on 3088 degrees of freedom
## AIC: 3388114
##
## Number of Fisher Scoring iterations: 4
glmb%>%
tbl_regression(exponentiate=TRUE)
| Characteristic | OR1 | 95% CI1 | p-value |
|---|---|---|---|
| Percent White Population 2010 | 1.00 | 1.00, 1.00 | <0.001 |
| rucc | |||
| 01 | — | — | |
| 02 | 1.13 | 1.13, 1.14 | <0.001 |
| 03 | 1.17 | 1.17, 1.17 | <0.001 |
| 04 | 1.18 | 1.18, 1.18 | <0.001 |
| 05 | 1.27 | 1.27, 1.28 | <0.001 |
| 06 | 1.25 | 1.25, 1.25 | <0.001 |
| 07 | 1.19 | 1.18, 1.19 | <0.001 |
| 08 | 1.30 | 1.29, 1.31 | <0.001 |
| 09 | 1.34 | 1.33, 1.34 | <0.001 |
| % Persons 25+ w/HS Dipl or more 2014-18 | 0.94 | 0.94, 0.94 | <0.001 |
| Median Age 2010 | 0.99 | 0.99, 0.99 | <0.001 |
|
1
OR = Odds Ratio, CI = Confidence Interval
|
|||
summary(ahrf_m$uninsured14)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.0 705.8 1767.0 7118.8 4531.5 867637.0
ahrf_m$uninsured14<- ahrf_m$uninsured14 +1
An offset term is necessary because of the conditions required for using the poisson model.In this model each area or person has the same risk or population size. This is not required for this population.
The offset term allows unequal populations to be added. Possion is based on integer count. Because of this we use the rate/probability which we log instead.
After doing the summary of the data, the outcome variable( uninsured) contains zero. This would bring an error if done directly. I added 1 to the variable to correct the error. The reason a 1 was used is because they are individuals and we cannot has a percentage of an individual like 0.5 or 0.1.
glmp_s <- glm(uninsured14 ~ offset(log(laborfc14)) + majoritypop10 + rucc + hsdegree14 + medianage10,
data=ahrf_m,
family=poisson)
glmp_s%>%
tbl_regression(exp = TRUE)
| Characteristic | IRR1 | 95% CI1 | p-value |
|---|---|---|---|
| Percent White Population 2010 | 1.00 | 1.00, 1.00 | <0.001 |
| rucc | |||
| 01 | — | — | |
| 02 | 1.11 | 1.11, 1.11 | <0.001 |
| 03 | 1.15 | 1.15, 1.15 | <0.001 |
| 04 | 1.15 | 1.15, 1.16 | <0.001 |
| 05 | 1.22 | 1.22, 1.23 | <0.001 |
| 06 | 1.22 | 1.22, 1.22 | <0.001 |
| 07 | 1.16 | 1.16, 1.16 | <0.001 |
| 08 | 1.26 | 1.25, 1.27 | <0.001 |
| 09 | 1.29 | 1.28, 1.29 | <0.001 |
| % Persons 25+ w/HS Dipl or more 2014-18 | 0.95 | 0.95, 0.95 | <0.001 |
| Median Age 2010 | 0.99 | 0.99, 0.99 | <0.001 |
|
1
IRR = Incidence Rate Ratio, CI = Confidence Interval
|
|||
scale<-sqrt(glmp_s$deviance/glmp_s$df.residual)
scale
## [1] 30.66545
There is about 31 times more variation in the data we used than there is supposed to be. This can lead to an inaccurate outcome.
1-pchisq(glmp_s$deviance,
df = glmp_s$df.residual)
## [1] 0
Since the outcome is 0 the Poisson model is not a good choice for the data
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:gtsummary':
##
## select
## The following object is masked from 'package:srvyr':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
glmnb<- glm.nb(uninsured14 ~ offset(log(laborfc14)) + majoritypop10 + rucc + hsdegree14 + medianage10,
data=ahrf_m)
glmnb%>%
tbl_regression( exp= T)
| Characteristic | IRR1 | 95% CI1 | p-value |
|---|---|---|---|
| Percent White Population 2010 | 0.99 | 0.99, 0.99 | <0.001 |
| rucc | |||
| 01 | — | — | |
| 02 | 1.15 | 1.09, 1.20 | <0.001 |
| 03 | 1.17 | 1.12, 1.23 | <0.001 |
| 04 | 1.16 | 1.10, 1.23 | <0.001 |
| 05 | 1.25 | 1.16, 1.36 | <0.001 |
| 06 | 1.19 | 1.14, 1.25 | <0.001 |
| 07 | 1.21 | 1.15, 1.27 | <0.001 |
| 08 | 1.26 | 1.19, 1.34 | <0.001 |
| 09 | 1.35 | 1.28, 1.42 | <0.001 |
| % Persons 25+ w/HS Dipl or more 2014-18 | 0.96 | 0.95, 0.96 | <0.001 |
| Median Age 2010 | 1.00 | 1.00, 1.00 | 0.2 |
|
1
IRR = Incidence Rate Ratio, CI = Confidence Interval
|
|||
summary(glmnb)
##
## Call:
## glm.nb(formula = uninsured14 ~ offset(log(laborfc14)) + majoritypop10 +
## rucc + hsdegree14 + medianage10, data = ahrf_m, init.theta = 8.197434548,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0394 -0.8819 -0.0987 0.6093 3.5618
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.4449548 0.0983509 24.860 < 2e-16 ***
## majoritypop10 -0.0088845 0.0004575 -19.419 < 2e-16 ***
## rucc02 0.1361363 0.0248725 5.473 4.42e-08 ***
## rucc03 0.1586354 0.0253733 6.252 4.05e-10 ***
## rucc04 0.1511642 0.0296195 5.104 3.33e-07 ***
## rucc05 0.2270307 0.0409797 5.540 3.02e-08 ***
## rucc06 0.1768223 0.0234467 7.541 4.65e-14 ***
## rucc07 0.1909920 0.0250577 7.622 2.50e-14 ***
## rucc08 0.2321529 0.0310239 7.483 7.26e-14 ***
## rucc09 0.2987172 0.0267571 11.164 < 2e-16 ***
## hsdegree14 -0.0444507 0.0011511 -38.617 < 2e-16 ***
## medianage10 0.0018084 0.0015539 1.164 0.244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(8.1974) family taken to be 1)
##
## Null deviance: 6539.8 on 3099 degrees of freedom
## Residual deviance: 3166.6 on 3088 degrees of freedom
## AIC: 49219
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 8.197
## Std. Err.: 0.206
##
## 2 x log-likelihood: -49193.210
library(gamlss)
## Loading required package: gamlss.data
##
## Attaching package: 'gamlss.data'
## The following object is masked from 'package:datasets':
##
## sleep
## Loading required package: gamlss.dist
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: parallel
## ********** GAMLSS Version 5.4-1 **********
## For more on GAMLSS look at https://www.gamlss.com/
## Type gamlssNews() to see new features/changes/bug fixes.
##
## Attaching package: 'gamlss'
## The following object is masked from 'package:caret':
##
## calibration
library(splines)
glmnb2<-gamlss(uninsured14 ~ offset(log(laborfc14)) + majoritypop10 + rucc + hsdegree14 + medianage10,
family = NBII,
data=ahrf_m)
## GAMLSS-RS iteration 1: Global Deviance = 62005.91
## GAMLSS-RS iteration 2: Global Deviance = 61193.27
## GAMLSS-RS iteration 3: Global Deviance = 60213.43
## GAMLSS-RS iteration 4: Global Deviance = 59020.41
## GAMLSS-RS iteration 5: Global Deviance = 57583.68
## GAMLSS-RS iteration 6: Global Deviance = 55966.76
## GAMLSS-RS iteration 7: Global Deviance = 54513.77
## GAMLSS-RS iteration 8: Global Deviance = 53761.5
## GAMLSS-RS iteration 9: Global Deviance = 53599.43
## GAMLSS-RS iteration 10: Global Deviance = 53584.53
## GAMLSS-RS iteration 11: Global Deviance = 53583.65
## GAMLSS-RS iteration 12: Global Deviance = 53583.6
## GAMLSS-RS iteration 13: Global Deviance = 53583.6
## GAMLSS-RS iteration 14: Global Deviance = 53583.59
summary(glmnb2)
## ******************************************************************
## Family: c("NBII", "Negative Binomial type II")
##
## Call: gamlss(formula = uninsured14 ~ offset(log(laborfc14)) +
## majoritypop10 + rucc + hsdegree14 + medianage10,
## family = NBII, data = ahrf_m)
##
## Fitting method: RS()
##
## ------------------------------------------------------------------
## Mu link function: log
## Mu Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2913534 0.0940738 24.357 < 2e-16 ***
## majoritypop10 -0.0029720 0.0004889 -6.080 1.35e-09 ***
## rucc02 0.1297826 0.0181763 7.140 1.16e-12 ***
## rucc03 0.2244048 0.0244084 9.194 < 2e-16 ***
## rucc04 0.2518788 0.0334834 7.523 7.01e-14 ***
## rucc05 0.3369010 0.0490856 6.864 8.09e-12 ***
## rucc06 0.4343882 0.0296230 14.664 < 2e-16 ***
## rucc07 0.4555658 0.0365300 12.471 < 2e-16 ***
## rucc08 0.7280141 0.0546660 13.317 < 2e-16 ***
## rucc09 0.9559378 0.0460720 20.749 < 2e-16 ***
## hsdegree14 -0.0456010 0.0012239 -37.258 < 2e-16 ***
## medianage10 -0.0056445 0.0020744 -2.721 0.00654 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## Sigma link function: log
## Sigma Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.88568 0.02661 258.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ------------------------------------------------------------------
## No. of observations in the fit: 3100
## Degrees of Freedom for the fit: 13
## Residual Deg. of Freedom: 3087
## at cycle: 14
##
## Global Deviance: 53583.59
## AIC: 53609.59
## SBC: 53688.1
## ******************************************************************
#4) Compare the model fits of the alternative models using AIC
AIC(glm1, glmb, glmp_b, glmnb, glmnb2)
AIC(glmp_s, glmb,glmnb, glmnb2)
Out of these four model specifications, the Binomial fits the worst and the NB1 model, estimated by glm.nb() fits the best. This one has the lowest AIC.