library(tidyverse, quietly = T)
library(ggplot2, quietly = T)
theme_set(theme_classic())
library(rstanarm)
library(tidycensus)
options(mc.cores = parallel::detectCores()-1)
library(extraoperators)
library(spdep)
library(MASS)
library(spatialreg)
library(ggplot2)
library(nortest)
library(car)
library(lmtest)
library(classInt)
library(sandwich)
library(tidyverse)
library(tidycensus)
library(dplyr)
library(haven)
library(spatialreg)
library(INLA)
library(haven)
AHRF2021 <- read_sas("G:/My Drive/PhD Demography/Fall 2021/Courses/Spatial Demography 7263/Term Paper/Data Analysis/AHRF_2020-2021_SAS/AHRF2021.sas7bdat", NULL)
Selecting data from AREA RESOURCE FILE
library(dplyr)
arf21<-AHRF2021%>%
mutate(cofips=f00004,
coname=f00010,
state = f00011,
totfpop15=f1390715,
totfpop17=f1390717,
nhwm15= f1392615,
nhwf15= f1392715,
hispm=f1392415,
hispf=f1392515,
blkm= f1391015,
blkf= f1391115,
fnins=f1553015,
unemprate= f0679515,
obgyn15= f1168415,
births1517= f1254615,
childpov15= f1332215,
ppoverty= f1332115,
lowbw1517=f1255315,
rucc= as.factor(f0002013),
hpsa10= as.factor(f0978710))%>%
dplyr:: select(totfpop15, totfpop17,births1517, lowbw1517,state, cofips,hpsa10, obgyn15, ppoverty, unemprate)%>%
filter(complete.cases(.))%>%
as.data.frame()
Create expected number of cases
# rates<-aggregate(dat$lbrate~1, dat, mean)
arf21$E<-arf21$births1517*(sum(arf21$lowbw1517)/sum(arf21$births1517))
arf21<-arf21[order(arf21$cofips),]
arf21$id<-1:dim(arf21)[1]
head(arf21)
## totfpop15 totfpop17 births1517 lowbw1517 state cofips hpsa10 obgyn15 ppoverty
## 1 28490 28497 667 57 01 01001 1 1 12.7
## 2 104423 109403 2304 183 01 01003 1 22 12.9
## 3 12341 11935 278 32 01 01005 2 1 32.0
## 4 10413 10530 266 28 01 01007 1 0 22.2
## 5 29178 29406 684 60 01 01009 1 0 14.7
## 6 4839 4694 129 19 01 01011 1 0 39.6
## unemprate E id
## 1 5.2 54.58666 1
## 2 5.5 188.55721 2
## 3 8.9 22.75126 3
## 4 6.6 21.76919 4
## 5 5.4 55.97792 5
## 6 7.8 10.55724 6
options(scipen=999)
Load social capital data
soccap <- readxl::read_xlsx("G:/My Drive/MSc Demography/Summer 2021/Github_repository/Soc-Cap-Smoking/soccap2014.xlsx")
soccap$cofips <- stringr::str_pad(string=soccap$FIPS, width=5,side = "left", pad = 0 )
soccap2 <- soccap[, c(19, 2, 18)]
head(soccap2)
## # A tibble: 6 x 3
## cofips County_Name sk2014
## <chr> <chr> <dbl>
## 1 01001 Autauga County, AL -0.631
## 2 01003 Baldwin County, AL -0.555
## 3 01005 Barbour County, AL -0.891
## 4 01007 Bibb County, AL -0.907
## 5 01009 Blount County, AL -1.01
## 6 01011 Bullock County, AL -0.502
Merge ARF, and social capital data.
dat <- merge(soccap2, arf21, by="cofips")
#yahoo... it ran. lets see the heads
head(dat)
## cofips County_Name sk2014 totfpop15 totfpop17 births1517 lowbw1517
## 1 01001 Autauga County, AL -0.6310033 28490 28497 667 57
## 2 01003 Baldwin County, AL -0.5553960 104423 109403 2304 183
## 3 01005 Barbour County, AL -0.8910361 12341 11935 278 32
## 4 01007 Bibb County, AL -0.9065815 10413 10530 266 28
## 5 01009 Blount County, AL -1.0132797 29178 29406 684 60
## 6 01011 Bullock County, AL -0.5024002 4839 4694 129 19
## state hpsa10 obgyn15 ppoverty unemprate E id
## 1 01 1 1 12.7 5.2 54.58666 1
## 2 01 1 22 12.9 5.5 188.55721 2
## 3 01 2 1 32.0 8.9 22.75126 3
## 4 01 1 0 22.2 6.6 21.76919 4
## 5 01 1 0 14.7 5.4 55.97792 5
## 6 01 1 0 39.6 7.8 10.55724 6
#creating a rate variable
dat$lbrate <- dat$lowbw1517/dat$births1517
Fit ols model to check county level characterisics are associated with low birth weight rate
Finding predictor variables
fit <- lm(lbrate ~ scale(ppoverty)+scale(sk2014)+ scale(unemprate), data=dat)
summary(fit)
##
## Call:
## lm(formula = lbrate ~ scale(ppoverty) + scale(sk2014) + scale(unemprate),
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.064307 -0.010026 -0.000478 0.009338 0.080991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0848523 0.0003255 260.713 < 0.0000000000000002 ***
## scale(ppoverty) 0.0117155 0.0004392 26.674 < 0.0000000000000002 ***
## scale(sk2014) 0.0005711 0.0003396 1.682 0.0927 .
## scale(unemprate) 0.0021064 0.0004344 4.849 0.00000132 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01574 on 2335 degrees of freedom
## Multiple R-squared: 0.4077, Adjusted R-squared: 0.4069
## F-statistic: 535.8 on 3 and 2335 DF, p-value: < 0.00000000000000022
Poission model
dat2<-filter(dat, is.na(lowbw1517)==F)
fit_pois<- glm(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty)+scale(sk2014)+ scale(unemprate),
family=poisson,
data=dat2)
summary(fit_pois)
##
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty) +
## scale(sk2014) + scale(unemprate), family = poisson, data = dat2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -12.1308 -0.9065 -0.0937 0.7824 9.9402
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.473276 0.002035 -1215.131 < 0.0000000000000002 ***
## scale(ppoverty) 0.119535 0.002663 44.887 < 0.0000000000000002 ***
## scale(sk2014) 0.024235 0.002256 10.743 < 0.0000000000000002 ***
## scale(unemprate) -0.014754 0.002546 -5.796 0.0000000068 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 9693.7 on 2338 degrees of freedom
## Residual deviance: 7147.3 on 2335 degrees of freedom
## AIC: 20565
##
## Number of Fisher Scoring iterations: 4
Total number of birth is the offset term here.
To check overdispersion
scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale
## [1] 1.74956
This is more than 1 which indicates this model over dispersion
1-pchisq(fit_pois$deviance, df = fit_pois$df.residual)
## [1] 0
Quasipoisson model
fit_q_pois<- glm(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty)+scale(sk2014)+ scale(unemprate),
family=quasipoisson,
data=dat2)
summary(fit_q_pois)
##
## Call:
## glm(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty) +
## scale(sk2014) + scale(unemprate), family = quasipoisson,
## data = dat2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -12.1308 -0.9065 -0.0937 0.7824 9.9402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.473276 0.003561 -694.525 < 0.0000000000000002 ***
## scale(ppoverty) 0.119535 0.004659 25.656 < 0.0000000000000002 ***
## scale(sk2014) 0.024235 0.003947 6.140 0.000000000965 ***
## scale(unemprate) -0.014754 0.004454 -3.313 0.000938 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 3.061048)
##
## Null deviance: 9693.7 on 2338 degrees of freedom
## Residual deviance: 7147.3 on 2335 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 4
The dispersion parameter is 3.06, in the regular Poisson model, it was assumed to be 1. This model is not a good fit for the data.
Alternative of poission model
Negative binomial
library(MASS)
fit_nb<- glm.nb(lowbw1517 ~ offset(log(births1517)) + scale(ppoverty)+scale(sk2014)+ scale(unemprate) ,
data=dat2)
summary(fit_nb)
##
## Call:
## glm.nb(formula = lowbw1517 ~ offset(log(births1517)) + scale(ppoverty) +
## scale(sk2014) + scale(unemprate), data = dat2, init.theta = 68.2992919,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.5877 -0.5865 -0.0461 0.5587 3.2064
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.479219 0.003927 -631.349 <0.0000000000000002 ***
## scale(ppoverty) 0.124490 0.005224 23.832 <0.0000000000000002 ***
## scale(sk2014) -0.001734 0.004622 -0.375 0.7076
## scale(unemprate) 0.011997 0.005045 2.378 0.0174 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(68.2993) family taken to be 1)
##
## Null deviance: 2979.8 on 2338 degrees of freedom
## Residual deviance: 1929.8 on 2335 degrees of freedom
## AIC: 16979
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 68.30
## Std. Err.: 3.65
##
## 2 x log-likelihood: -16968.63
Creating INLA dataframe
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
options(tigris_class="sf")
usco<-counties(cb=T, year=2010)
usco$cofips<-substr(usco$GEO_ID, 10, 15)
sts<-states(cb = T, year=2010)
sts<-st_boundary(sts) %>%
filter(!STATE %in% c("02", "15", "60", "66", "69", "72", "78"))
usco$struct<-1:dim(usco)[1]
nbs<-knearneigh(st_centroid(usco), k = 5, longlat = T) #k=5 nearest neighbors
nbs<-knn2nb(nbs, row.names = usco$struct, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat)
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])
nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")
library(sf)
usco<-st_as_sf(usco)
usco$cofips<-paste(usco$STATEFP, usco$COUNTYFP, sep="")
usco%>%
ggplot()+
geom_sf()+
coord_sf(crs = 2163)
final.dat<-merge( usco, dat, by="cofips")
final.dat<-final.dat[order(final.dat$cofips),]
head(final.dat)
## Simple feature collection with 6 features and 24 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -88.02927 ymin: 30.22113 xmax: -85.05307 ymax: 34.26048
## Geodetic CRS: NAD83
## cofips GEO_ID STATE COUNTY NAME LSAD CENSUSAREA COUNTYFP STATEFP
## 1 01001 0500000US01001 01 001 Autauga County 594.436 001 01
## 2 01003 0500000US01003 01 003 Baldwin County 1589.784 003 01
## 3 01005 0500000US01005 01 005 Barbour County 884.876 005 01
## 4 01007 0500000US01007 01 007 Bibb County 622.582 007 01
## 5 01009 0500000US01009 01 009 Blount County 644.776 009 01
## 6 01011 0500000US01011 01 011 Bullock County 622.805 011 01
## struct County_Name sk2014 totfpop15 totfpop17 births1517 lowbw1517
## 1 15 Autauga County, AL -0.6310033 28490 28497 667 57
## 2 16 Baldwin County, AL -0.5553960 104423 109403 2304 183
## 3 17 Barbour County, AL -0.8910361 12341 11935 278 32
## 4 18 Bibb County, AL -0.9065815 10413 10530 266 28
## 5 19 Blount County, AL -1.0132797 29178 29406 684 60
## 6 20 Bullock County, AL -0.5024002 4839 4694 129 19
## state hpsa10 obgyn15 ppoverty unemprate E id lbrate
## 1 01 1 1 12.7 5.2 54.58666 1 0.08545727
## 2 01 1 22 12.9 5.5 188.55721 2 0.07942708
## 3 01 2 1 32.0 8.9 22.75126 3 0.11510791
## 4 01 1 0 22.2 6.6 21.76919 4 0.10526316
## 5 01 1 0 14.7 5.4 55.97792 5 0.08771930
## 6 01 1 0 39.6 7.8 10.55724 6 0.14728682
## geometry
## 1 MULTIPOLYGON (((-86.52469 3...
## 2 MULTIPOLYGON (((-87.41247 3...
## 3 MULTIPOLYGON (((-85.13285 3...
## 4 MULTIPOLYGON (((-87.11632 3...
## 5 MULTIPOLYGON (((-86.73121 3...
## 6 MULTIPOLYGON (((-85.74209 3...
ggplot(data = final.dat)+geom_histogram(aes(x =log(lowbw1517/births1517) ,
y=0.5*..density..))+
#facet_wrap(~year)+
ggtitle(label = "Distribution of low birthweight rate",
subtitle = "US Counties, 2015-17")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Relative risk
ggplot(data = final.dat)+geom_histogram(aes(x =log(lowbw1517/E) ,
y=0.5*..density..))+
#facet_wrap(~year)+
ggtitle(label = "Distribution of relative risk of low birthweight",
subtitle = "US Counties, 2015-17")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
sts<- states(cb=T)%>%
st_transform(crs= 2163)%>%
st_boundary()%>%
subset(!STATEFP%in%c("02", "15", "60", "66", "69", "72", "78"))
final.dat%>%
mutate(qrr=cut(I(lowbw1517/E),
breaks = quantile(I(lowbw1517/E),
p=seq(0,1,length.out = 5)),
include.lowest = T,))%>%
ggplot()+
geom_sf(aes(fill=qrr, color=NA))+
geom_sf(data=sts, color="black")+
scale_colour_brewer(palette = "RdBu" )+
scale_fill_brewer(palette = "RdBu", na.value="grey")+
guides(fill=guide_legend(title="Relative Risk Quartile"))+
ggtitle(label="Relative Risk Quartile - Low birth weight 2015-17")+
coord_sf(crs = 2163)
#INLA model run (Besag)
f3<-lowbw1517~scale(ppoverty)+scale(unemprate)+
f(struct, model = "besag",scale.model = T, constr = T, graph = H)
mod3<-inla(formula = f3,data = final.dat,
family = "poisson",
E = E,
control.compute = list(waic=T),
control.predictor = list(link=1),
num.threads = 3,
verbose = F)
#total model summary
summary(mod3)
##
## Call:
## c("inla(formula = f3, family = \"poisson\", data = final.dat, E = E, ",
## " verbose = F, control.compute = list(waic = T), control.predictor =
## list(link = 1), ", " num.threads = 3)")
## Time used:
## Pre = 1.22, Running = 5.38, Post = 0.531, Total = 7.13
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) 0.007 0.004 0.000 0.007 0.015 0.007 0
## scale(ppoverty) 0.093 0.005 0.084 0.093 0.103 0.093 0
## scale(unemprate) 0.007 0.005 -0.002 0.007 0.016 0.007 0
##
## Random effects:
## Name Model
## struct Besags ICAR model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant 0.975quant mode
## Precision for struct 63.73 4.54 55.35 63.55 73.16 63.17
##
## Expected number of effective parameters(stdev): 604.22(22.42)
## Number of equivalent replicates : 3.87
##
## Watanabe-Akaike information criterion (WAIC) ...: 15559.82
## Effective number of parameters .................: 400.16
##
## Marginal log-Likelihood: -9875.74
## Posterior marginals for the linear predictor and
## the fitted values are computed