df<- haven::read_sas("C:/Users/victo/OneDrive - University of Texas at San Antonio/Classes/Fall 2021/Spatial Anaylsis/HW 4/AHRF_2020-2021_SAS/AHRF2021.sas7bdat")
labels <- sapply(1:ncol(df), function(x){attr(df[[x]], "label")}) %>%
cbind("Variable_Name"=names(df),"Label"=.)
I want to see if the number of doctors in a county are influenced by the number of old people and other variables. We can examine the labels in the data to explore what variables are in the dataset.
I have selected the following variables to model Active M.Ds
| Dependent Variable | Label |
|---|---|
| f1212919 | Tot Active M.D.s Non-Fed and Fed 2019 |
| Independent Variables | Label |
|---|---|
| f1318219 | Population (Persons) 2019 |
| f0954519 | Inpatient Days Incl Nurs Home;Total Hosp 2019 |
| f1547119 | Persons <65 Yrs 2019 |
| f1551919 | Persons 40-64 Yrs 2019 |
| f1434515 | Median Household Income 2015-19 |
| f1332119 | Percent Persons in Poverty 2019 |
| f1547419 | % <65 without Health Insurance 2019 |
| f00002 | Header - FIPS St and Cty Code |
df.subset <- df %>%
dplyr::select(f00002, f1212919, f1318219, f0954519, f1318219, f1551919, f1434515, f1332119, f1547419, f1547119) %>%
rename(FIPS = f00002,
Total_MD = f1212919,
Pop_2019 = f1318219,
Inpatient_Days = f0954519,
Pop_Old = f1547119,
Pop_Adult = f1551919,
Median_Income = f1434515,
Poverty_Percent = f1332119,
Old_No_Health = f1547419) %>%
filter(complete.cases(.))
summary(df.subset)
## FIPS Total_MD Pop_2019 Inpatient_Days
## Length:3113 Min. : 0.0 Min. : 169 Min. : 0
## Class :character 1st Qu.: 5.0 1st Qu.: 10907 1st Qu.: 843
## Mode :character Median : 18.0 Median : 26108 Median : 6089
## Mean : 309.4 Mean : 105442 Mean : 71190
## 3rd Qu.: 95.0 3rd Qu.: 69830 3rd Qu.: 36017
## Max. :31377.0 Max. :10039107 Max. :5825855
## Pop_Adult Median_Income Poverty_Percent Old_No_Health
## Min. : 41 Min. : 21504 Min. : 2.70 Min. : 2.40
## 1st Qu.: 3384 1st Qu.: 44226 1st Qu.:10.40 1st Qu.: 8.00
## Median : 8097 Median : 51775 Median :13.40 Median :11.10
## Mean : 32861 Mean : 53462 Mean :14.44 Mean :11.97
## 3rd Qu.: 21514 3rd Qu.: 59868 3rd Qu.:17.40 3rd Qu.:15.10
## Max. :3177577 Max. :142299 Max. :47.70 Max. :35.80
## Pop_Old
## Min. : 149
## 1st Qu.: 8301
## Median : 19890
## Mean : 85717
## 3rd Qu.: 54592
## Max. :8464644
Grabbing County spatial infomation
spatial <- counties(year = 2019, cb = TRUE)
sts<- states(cb=T)%>%
st_transform(crs= 2163)%>%
st_boundary()%>%
subset(!STATEFP%in%c("02", "15", "60", "66", "69", "72", "78"))
Merging counties and scaling my x variables.
df.geom <- spatial %>%
dplyr::select(GEOID) %>%
right_join(df.subset, by= c("GEOID" = "FIPS")) %>%
mutate_at(vars(Inpatient_Days, Pop_Adult, Median_Income, Poverty_Percent, Old_No_Health, Poverty_Percent, Pop_Old), scale)
Looking at our outcome variable:
hist(df.geom$Total_MD)
We can see that it is very right skewed.
Let’s plot our outcome variable
df.geom %>%
mutate(MD_group = quantile_variable(Total_MD, 5)) %>%
ggplot() +
geom_sf(aes(fill=MD_group, color = NA)) +
geom_sf(data=sts, color="black", alpha = .5)+
coord_sf(xlim = c(-125, -65), ylim = c(50,25)) +
scale_color_brewer(palette = "Blues") +
scale_fill_brewer(palette = "Blues",na.value = "grey50") +
labs(title = "Number of Active MD's by County in United States") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
)
We can see that most doctors are located in the eastern and western coasts of the United States and that the midwestern part of the United States has less Active Doctors.
Fitting a poisson with an offset of the total population in 2019. This does not consider spatial effects.
fit_pois<-glm(Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days + Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health + Poverty_Percent + Pop_Old,
family=poisson,
data=df.geom)
summary(fit_pois)
##
## Call:
## glm(formula = Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days +
## Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health +
## Poverty_Percent + Pop_Old, family = poisson, data = df.geom)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -94.318 -6.433 -4.382 -2.153 97.919
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.1487708 0.0015020 -4093.77 <2e-16 ***
## Inpatient_Days 0.1760316 0.0005048 348.70 <2e-16 ***
## Pop_Adult 0.0794142 0.0040010 19.85 <2e-16 ***
## Median_Income 0.2096329 0.0012350 169.74 <2e-16 ***
## Poverty_Percent 0.1794683 0.0022042 81.42 <2e-16 ***
## Old_No_Health -0.0909379 0.0013178 -69.01 <2e-16 ***
## Pop_Old -0.2089367 0.0041012 -50.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 449980 on 3112 degrees of freedom
## Residual deviance: 252712 on 3106 degrees of freedom
## AIC: 268035
##
## Number of Fisher Scoring iterations: 5
We see that all of our variables are significant. We see that Population old has a negative relationship. Is goes against our initial hypothesis of the elderly people attracting doctors. It is possible that there is omitted variable bias in the model for example perhaps elder people live in an area that is less urban which that is the factor that is negative with total MD’s.
Examining overdispersion
sqrt(fit_pois$deviance/fit_pois$df.residual)
## [1] 9.020117
The data is very overdispersed the variance is 9 times greater then as the mean. Since the response is overdispersed we cannot trust our p value estimates. Since our response is overdispersed we will use the negative binomial model.
To eventually account for the spatial effects in the model we will be using INLA framework for our analysis.
formula.nb.simple <- Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days + Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health + Poverty_Percent + Pop_Old
mod1 = inla(formula = formula.nb.simple,
data = df.geom,
family = "nbinomial",
control.family=list(link='log'),
control.compute = list(waic=T),
control.predictor = list(link=1),
num.threads = 3)
summary(mod1)
##
## Call:
## c("inla(formula = formula.nb.simple, family = \"nbinomial\", data =
## df.geom, ", " control.compute = list(waic = T), control.predictor =
## list(link = 1), ", " control.family = list(link = \"log\"), num.threads
## = 3)" )
## Time used:
## Pre = 0.695, Running = 2.34, Post = 0.315, Total = 3.35
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) -6.838 0.013 -6.864 -6.838 -6.811 -6.838 0
## Inpatient_Days 0.873 0.040 0.795 0.872 0.951 0.872 0
## Pop_Adult 0.179 0.213 -0.239 0.180 0.596 0.180 0
## Median_Income 0.342 0.024 0.295 0.342 0.389 0.341 0
## Poverty_Percent 0.077 0.023 0.032 0.077 0.122 0.077 0
## Old_No_Health -0.138 0.015 -0.168 -0.138 -0.109 -0.138 0
## Pop_Old -0.683 0.217 -1.106 -0.684 -0.255 -0.686 0
##
## Model hyperparameters:
## mean sd 0.025quant
## size for the nbinomial observations (1/overdispersion) 2.06 0.058 1.95
## 0.5quant 0.975quant mode
## size for the nbinomial observations (1/overdispersion) 2.06 2.18 2.06
##
## Watanabe-Akaike information criterion (WAIC) ...: 27243.95
## Effective number of parameters .................: 15.52
##
## Marginal log-Likelihood: -13661.95
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
Compared to the poisson model we see that Pop_Adult is not significant and that the rest of the variables have higher value in their respective directions.
df.geom <- df.geom %>%
mutate(Geom_ID = 1:n()) #ID number for INLA to work with
nbs <- knearneigh(st_centroid(df.geom), k = 5, longlat = T)
## Warning in st_centroid.sf(df.geom): st_centroid assumes attributes are constant
## over geometries of x
## Warning in knearneigh(st_centroid(df.geom), k = 5, longlat = T): dnearneigh:
## longlat argument overrides object
nbs<-knn2nb(nbs, row.names = df.geom$Geom_ID, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat)
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])
nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")
formula.nb.spatial <- Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days + Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health + Poverty_Percent + Pop_Old + f(Geom_ID, model = "bym", scale.model = T, constr = T, graph = H)
mod2 = inla(formula = formula.nb.spatial,
data = df.geom,
family = "nbinomial",
control.family=list(link='log'),
control.compute = list(waic=T),
control.predictor = list(link=1),
num.threads = 3)
summary(mod2)
##
## Call:
## c("inla(formula = formula.nb.spatial, family = \"nbinomial\", data =
## df.geom, ", " control.compute = list(waic = T), control.predictor =
## list(link = 1), ", " control.family = list(link = \"log\"), num.threads
## = 3)" )
## Time used:
## Pre = 0.956, Running = 17.8, Post = 0.602, Total = 19.4
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## (Intercept) -6.900 0.014 -6.926 -6.900 -6.873 -6.900 0
## Inpatient_Days 0.798 0.039 0.722 0.798 0.875 0.798 0
## Pop_Adult -0.237 0.227 -0.684 -0.236 0.207 -0.236 0
## Median_Income 0.343 0.027 0.291 0.343 0.397 0.343 0
## Poverty_Percent 0.065 0.026 0.013 0.065 0.116 0.064 0
## Old_No_Health -0.196 0.025 -0.244 -0.195 -0.147 -0.195 0
## Pop_Old -0.242 0.233 -0.697 -0.243 0.217 -0.245 0
##
## Random effects:
## Name Model
## Geom_ID BYM model
##
## Model hyperparameters:
## mean sd 0.025quant
## size for the nbinomial observations (1/overdispersion) 2.68 0.094 2.49
## Precision for Geom_ID (iid component) 29.03 6.155 18.72
## Precision for Geom_ID (spatial component) 11.99 2.323 8.14
## 0.5quant 0.975quant
## size for the nbinomial observations (1/overdispersion) 2.68 2.86
## Precision for Geom_ID (iid component) 28.40 43.13
## Precision for Geom_ID (spatial component) 11.73 17.24
## mode
## size for the nbinomial observations (1/overdispersion) 2.69
## Precision for Geom_ID (iid component) 27.21
## Precision for Geom_ID (spatial component) 11.22
##
## Watanabe-Akaike information criterion (WAIC) ...: 26989.26
## Effective number of parameters .................: 439.24
##
## Marginal log-Likelihood: -12525.88
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
Adding spatial effects we don’t see any large changes expect that Pop_Old is no longer significant.
df.geom$fitted_values <- mod2$summary.fitted.values$mean
df.geom %>%
mutate(MD_Predict = quantile_variable(fitted_values, 5)) %>%
ggplot() +
geom_sf(aes(fill=MD_Predict, color = NA)) +
geom_sf(data=sts, color="black", alpha = .5)+
coord_sf(xlim = c(-125, -65), ylim = c(50,25)) +
scale_color_brewer(palette = "Blues") +
scale_fill_brewer(palette = "Blues",na.value = "grey50") +
labs(title = "Number of Predicted Active MD's by County in United States") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
)
We some smoothing from the model let’s expore what areas where not predicted well.
df.geom <- df.geom %>%
mutate(Error = Total_MD - fitted_values) #Negative is overpredicted and postive is underpredictve
df.geom %>%
mutate(Error = quantile_variable(Error, 5)) %>%
ggplot() +
geom_sf(aes(fill=Error, color = NA)) +
geom_sf(data=sts, color="black", alpha = .5)+
coord_sf(xlim = c(-125, -65), ylim = c(50,25)) +
scale_color_brewer(palette = "RdBu") +
scale_fill_brewer(palette = "RdBu",na.value = "grey50") +
labs(title = "Predicted Active MD's Error by County in United States") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
)
Interestingly it seems that the midwestern counties are accurately predicted where as most other places are being under or over predicted. The negative binomial model might not be a good fit for modeling places with high levels of Total_MD. I would need to explore other distributions or find other variables that might explain the variance at those dense places.
ggplot(df.geom, aes(x = log(Total_MD) , y = log(abs(Error)))) +
geom_hex()
## Warning: Removed 167 rows containing non-finite values (stat_binhex).
We can see our suspicion confirmed as the Total_MD increases our model does not predict those values very well at all. This might indicate that there are other variables in those areas with lots of doctors that can explain the outcome better.