df<- haven::read_sas("C:/Users/victo/OneDrive - University of Texas at San Antonio/Classes/Fall 2021/Spatial Anaylsis/HW 4/AHRF_2020-2021_SAS/AHRF2021.sas7bdat")
labels <- sapply(1:ncol(df), function(x){attr(df[[x]], "label")}) %>% 
  cbind("Variable_Name"=names(df),"Label"=.)

I want to see if the number of doctors in a county are influenced by the number of old people and other variables. We can examine the labels in the data to explore what variables are in the dataset.

I have selected the following variables to model Active M.Ds

Dependent Variable Label
f1212919 Tot Active M.D.s Non-Fed and Fed 2019
Independent Variables Label
f1318219 Population (Persons) 2019
f0954519 Inpatient Days Incl Nurs Home;Total Hosp 2019
f1547119 Persons <65 Yrs 2019
f1551919 Persons 40-64 Yrs 2019
f1434515 Median Household Income 2015-19
f1332119 Percent Persons in Poverty 2019
f1547419 % <65 without Health Insurance 2019
f00002 Header - FIPS St and Cty Code
df.subset <- df %>% 
  dplyr::select(f00002, f1212919, f1318219, f0954519, f1318219, f1551919, f1434515, f1332119, f1547419, f1547119) %>% 
  rename(FIPS = f00002,
         Total_MD = f1212919,
         Pop_2019 = f1318219,
         Inpatient_Days = f0954519,
         Pop_Old = f1547119,
         Pop_Adult = f1551919,
         Median_Income = f1434515,
         Poverty_Percent = f1332119,
         Old_No_Health = f1547419) %>% 
  filter(complete.cases(.))
summary(df.subset)
##      FIPS              Total_MD          Pop_2019        Inpatient_Days   
##  Length:3113        Min.   :    0.0   Min.   :     169   Min.   :      0  
##  Class :character   1st Qu.:    5.0   1st Qu.:   10907   1st Qu.:    843  
##  Mode  :character   Median :   18.0   Median :   26108   Median :   6089  
##                     Mean   :  309.4   Mean   :  105442   Mean   :  71190  
##                     3rd Qu.:   95.0   3rd Qu.:   69830   3rd Qu.:  36017  
##                     Max.   :31377.0   Max.   :10039107   Max.   :5825855  
##    Pop_Adult       Median_Income    Poverty_Percent Old_No_Health  
##  Min.   :     41   Min.   : 21504   Min.   : 2.70   Min.   : 2.40  
##  1st Qu.:   3384   1st Qu.: 44226   1st Qu.:10.40   1st Qu.: 8.00  
##  Median :   8097   Median : 51775   Median :13.40   Median :11.10  
##  Mean   :  32861   Mean   : 53462   Mean   :14.44   Mean   :11.97  
##  3rd Qu.:  21514   3rd Qu.: 59868   3rd Qu.:17.40   3rd Qu.:15.10  
##  Max.   :3177577   Max.   :142299   Max.   :47.70   Max.   :35.80  
##     Pop_Old       
##  Min.   :    149  
##  1st Qu.:   8301  
##  Median :  19890  
##  Mean   :  85717  
##  3rd Qu.:  54592  
##  Max.   :8464644

Spatial Info

Grabbing County spatial infomation

spatial <- counties(year = 2019, cb = TRUE)

sts<- states(cb=T)%>%
  st_transform(crs= 2163)%>%
  st_boundary()%>%
  subset(!STATEFP%in%c("02", "15", "60", "66", "69", "72", "78"))

Merging counties and scaling my x variables.

df.geom <- spatial %>% 
  dplyr::select(GEOID) %>% 
  right_join(df.subset, by= c("GEOID" = "FIPS")) %>% 
  mutate_at(vars(Inpatient_Days, Pop_Adult, Median_Income, Poverty_Percent, Old_No_Health, Poverty_Percent, Pop_Old), scale)

Looking at our outcome variable:

hist(df.geom$Total_MD)

We can see that it is very right skewed.

Let’s plot our outcome variable

df.geom %>%
  mutate(MD_group = quantile_variable(Total_MD, 5)) %>% 
  ggplot() +
  geom_sf(aes(fill=MD_group, color = NA)) +
  geom_sf(data=sts, color="black", alpha = .5)+
  coord_sf(xlim = c(-125, -65), ylim = c(50,25)) +
  scale_color_brewer(palette = "Blues") +
  scale_fill_brewer(palette = "Blues",na.value = "grey50") +
  labs(title = "Number of Active MD's by County in United States") +
  theme(axis.title=element_blank(),
        axis.text=element_blank(),
        axis.ticks=element_blank(),
        )

We can see that most doctors are located in the eastern and western coasts of the United States and that the midwestern part of the United States has less Active Doctors.

Modeling Total_MD

Fitting a poisson with an offset of the total population in 2019. This does not consider spatial effects.

fit_pois<-glm(Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days + Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health + Poverty_Percent + Pop_Old,
               family=poisson, 
               data=df.geom)


summary(fit_pois)
## 
## Call:
## glm(formula = Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days + 
##     Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health + 
##     Poverty_Percent + Pop_Old, family = poisson, data = df.geom)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -94.318   -6.433   -4.382   -2.153   97.919  
## 
## Coefficients:
##                   Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)     -6.1487708  0.0015020 -4093.77   <2e-16 ***
## Inpatient_Days   0.1760316  0.0005048   348.70   <2e-16 ***
## Pop_Adult        0.0794142  0.0040010    19.85   <2e-16 ***
## Median_Income    0.2096329  0.0012350   169.74   <2e-16 ***
## Poverty_Percent  0.1794683  0.0022042    81.42   <2e-16 ***
## Old_No_Health   -0.0909379  0.0013178   -69.01   <2e-16 ***
## Pop_Old         -0.2089367  0.0041012   -50.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 449980  on 3112  degrees of freedom
## Residual deviance: 252712  on 3106  degrees of freedom
## AIC: 268035
## 
## Number of Fisher Scoring iterations: 5

We see that all of our variables are significant. We see that Population old has a negative relationship. Is goes against our initial hypothesis of the elderly people attracting doctors. It is possible that there is omitted variable bias in the model for example perhaps elder people live in an area that is less urban which that is the factor that is negative with total MD’s.

Examining overdispersion

sqrt(fit_pois$deviance/fit_pois$df.residual)
## [1] 9.020117

The data is very overdispersed the variance is 9 times greater then as the mean. Since the response is overdispersed we cannot trust our p value estimates. Since our response is overdispersed we will use the negative binomial model.

To eventually account for the spatial effects in the model we will be using INLA framework for our analysis.

INLA

Simple INLA Negative Binomial Model

formula.nb.simple <- Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days + Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health + Poverty_Percent + Pop_Old

mod1 = inla(formula = formula.nb.simple,
            data = df.geom,
            family = "nbinomial",
            control.family=list(link='log'),
            control.compute = list(waic=T),
            control.predictor = list(link=1),
            num.threads = 3)
summary(mod1)
## 
## Call:
##    c("inla(formula = formula.nb.simple, family = \"nbinomial\", data = 
##    df.geom, ", " control.compute = list(waic = T), control.predictor = 
##    list(link = 1), ", " control.family = list(link = \"log\"), num.threads 
##    = 3)" ) 
## Time used:
##     Pre = 0.695, Running = 2.34, Post = 0.315, Total = 3.35 
## Fixed effects:
##                   mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept)     -6.838 0.013     -6.864   -6.838     -6.811 -6.838   0
## Inpatient_Days   0.873 0.040      0.795    0.872      0.951  0.872   0
## Pop_Adult        0.179 0.213     -0.239    0.180      0.596  0.180   0
## Median_Income    0.342 0.024      0.295    0.342      0.389  0.341   0
## Poverty_Percent  0.077 0.023      0.032    0.077      0.122  0.077   0
## Old_No_Health   -0.138 0.015     -0.168   -0.138     -0.109 -0.138   0
## Pop_Old         -0.683 0.217     -1.106   -0.684     -0.255 -0.686   0
## 
## Model hyperparameters:
##                                                        mean    sd 0.025quant
## size for the nbinomial observations (1/overdispersion) 2.06 0.058       1.95
##                                                        0.5quant 0.975quant mode
## size for the nbinomial observations (1/overdispersion)     2.06       2.18 2.06
## 
## Watanabe-Akaike information criterion (WAIC) ...: 27243.95
## Effective number of parameters .................: 15.52
## 
## Marginal log-Likelihood:  -13661.95 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Compared to the poisson model we see that Pop_Adult is not significant and that the rest of the variables have higher value in their respective directions.

Spatial INLA Negative Binomial Model

Creating Spatial relationship
df.geom <- df.geom %>% 
  mutate(Geom_ID = 1:n()) #ID number for INLA to work with

nbs <- knearneigh(st_centroid(df.geom), k = 5, longlat = T)
## Warning in st_centroid.sf(df.geom): st_centroid assumes attributes are constant
## over geometries of x
## Warning in knearneigh(st_centroid(df.geom), k = 5, longlat = T): dnearneigh:
## longlat argument overrides object
nbs<-knn2nb(nbs, row.names = df.geom$Geom_ID, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat) 
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])


nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")
Modeling Random Spatial Effects
formula.nb.spatial <- Total_MD ~ offset(log(Pop_2019)) + Inpatient_Days + Pop_Adult + Median_Income + Poverty_Percent + Old_No_Health + Poverty_Percent + Pop_Old + f(Geom_ID, model = "bym", scale.model =  T, constr = T, graph = H)

mod2 = inla(formula = formula.nb.spatial,
            data = df.geom,
            family = "nbinomial",
            control.family=list(link='log'),
            control.compute = list(waic=T),
            control.predictor = list(link=1),
            num.threads = 3)
summary(mod2)
## 
## Call:
##    c("inla(formula = formula.nb.spatial, family = \"nbinomial\", data = 
##    df.geom, ", " control.compute = list(waic = T), control.predictor = 
##    list(link = 1), ", " control.family = list(link = \"log\"), num.threads 
##    = 3)" ) 
## Time used:
##     Pre = 0.956, Running = 17.8, Post = 0.602, Total = 19.4 
## Fixed effects:
##                   mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept)     -6.900 0.014     -6.926   -6.900     -6.873 -6.900   0
## Inpatient_Days   0.798 0.039      0.722    0.798      0.875  0.798   0
## Pop_Adult       -0.237 0.227     -0.684   -0.236      0.207 -0.236   0
## Median_Income    0.343 0.027      0.291    0.343      0.397  0.343   0
## Poverty_Percent  0.065 0.026      0.013    0.065      0.116  0.064   0
## Old_No_Health   -0.196 0.025     -0.244   -0.195     -0.147 -0.195   0
## Pop_Old         -0.242 0.233     -0.697   -0.243      0.217 -0.245   0
## 
## Random effects:
##   Name     Model
##     Geom_ID BYM model
## 
## Model hyperparameters:
##                                                         mean    sd 0.025quant
## size for the nbinomial observations (1/overdispersion)  2.68 0.094       2.49
## Precision for Geom_ID (iid component)                  29.03 6.155      18.72
## Precision for Geom_ID (spatial component)              11.99 2.323       8.14
##                                                        0.5quant 0.975quant
## size for the nbinomial observations (1/overdispersion)     2.68       2.86
## Precision for Geom_ID (iid component)                     28.40      43.13
## Precision for Geom_ID (spatial component)                 11.73      17.24
##                                                         mode
## size for the nbinomial observations (1/overdispersion)  2.69
## Precision for Geom_ID (iid component)                  27.21
## Precision for Geom_ID (spatial component)              11.22
## 
## Watanabe-Akaike information criterion (WAIC) ...: 26989.26
## Effective number of parameters .................: 439.24
## 
## Marginal log-Likelihood:  -12525.88 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Adding spatial effects we don’t see any large changes expect that Pop_Old is no longer significant.

Mapping Spatial Effects
df.geom$fitted_values <-  mod2$summary.fitted.values$mean

df.geom %>%
  mutate(MD_Predict = quantile_variable(fitted_values, 5)) %>% 
  ggplot() +
  geom_sf(aes(fill=MD_Predict, color = NA)) +
  geom_sf(data=sts, color="black", alpha = .5)+
  coord_sf(xlim = c(-125, -65), ylim = c(50,25)) +
  scale_color_brewer(palette = "Blues") +
  scale_fill_brewer(palette = "Blues",na.value = "grey50") +
  labs(title = "Number of Predicted Active MD's by County in United States") +
  theme(axis.title=element_blank(),
        axis.text=element_blank(),
        axis.ticks=element_blank(),
        )

We some smoothing from the model let’s expore what areas where not predicted well.

df.geom <- df.geom %>% 
  mutate(Error = Total_MD - fitted_values) #Negative is overpredicted and postive is underpredictve 

df.geom %>%
  mutate(Error = quantile_variable(Error, 5)) %>% 
  ggplot() +
  geom_sf(aes(fill=Error, color = NA)) +
  geom_sf(data=sts, color="black", alpha = .5)+
  coord_sf(xlim = c(-125, -65), ylim = c(50,25)) +
  scale_color_brewer(palette = "RdBu") +
  scale_fill_brewer(palette = "RdBu",na.value = "grey50") +
  labs(title = "Predicted Active MD's Error by County in United States") +
  theme(axis.title=element_blank(),
        axis.text=element_blank(),
        axis.ticks=element_blank(),
        )

Interestingly it seems that the midwestern counties are accurately predicted where as most other places are being under or over predicted. The negative binomial model might not be a good fit for modeling places with high levels of Total_MD. I would need to explore other distributions or find other variables that might explain the variance at those dense places.

ggplot(df.geom, aes(x = log(Total_MD) , y = log(abs(Error)))) +
  geom_hex()
## Warning: Removed 167 rows containing non-finite values (stat_binhex).

We can see our suspicion confirmed as the Total_MD increases our model does not predict those values very well at all. This might indicate that there are other variables in those areas with lots of doctors that can explain the outcome better.