Instructions:

Use a data set of your choice.
Consider any of the models discussed in the lecture for the outcome. You MUST use: Either of the Binomial, Poisson or Negative Binomial models
What is your denominator/offset variable, or if you wish to construct a SMR/SIR/Expected Value, describe how you do this.
Construct at least 4 predictors for your outcome, I would z-score these so they are on common scale.
Interpret your models.
Is your outcome over-dispersed?
Use INLA to estimate a spatially correlated (besag or bym model) for your count outcome.

Data and Variables Chosen:

Data: 2015-2019 5 Year ACS

For this assignment I am attempting a Poisson regression with my count data as the number of mobile homes per census tract.
My offset variable is the total number of households per tract.
Predictor variables:
- Percent white homeowners
- Percent renters
- MHI - Median Household Income
- Percent of children under 18

Download ACS Data

(I didn’t feel like sorting out which variables were needed and which were extra so I downloaded too many)

#Download ACS data for the year 2015 for Los Angeles County, California census tracts in R using the tidycensus library
#In this extract, request both the proportion of the population that is Non-Hispanic Black and the proportion of the population that is Hispanic

#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE) 
#ec2019 <- load_variables(2019 , "census of governments", cache = TRUE) 

#demographic profile tables

#Use this to search for variables
#acs2019 <- load_variables(2019 , "acs5", cache = TRUE) #demographic profile tables

tracts<-get_acs(geography = "tract",
                state="TX",
                year = 2019,
                variables=c("B25024_010E",
                            "B01003_001E",
                            "DP03_0062E",
                            "B25003_003E",
                            "B25003H_003E",
                            "B25003I_003E",
                            "B25003B_003E",
                            "B25003_002E",
                            "B25003H_002E",
                            "B25003I_002E",
                            "B25003B_002E",
                            "B05003_003E",
                            "B05003_014E") ,
                geometry = T, output = "wide")

## Getting data from the 2015-2019 5-year ACS

## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.

## Fetching data by table type ("B/C", "S", "DP") and combining the result.

#rename variables and filter missing cases
tracts<-tracts%>%
  mutate(mobilehomes=B25024_010E,
         totpop=B01003_001E,
         MHI=DP03_0062E, 
         totRent = B25003_003E,
         whiteRent = B25003H_003E,
         hispRent = B25003I_003E,
         blackRent = B25003B_003E,
         totOwn = B25003_002E,
         whiteOwn = B25003H_002E,
         hispOwn = B25003I_002E,
         blackOwn = B25003B_002E,
         totRentOwn = totRent + totOwn,
         pRent = totRent/totRentOwn,
         pOwn = totOwn/totRentOwn,
         pwhiteOwn = whiteOwn/totRentOwn,
         phispOwn = hispOwn/totRentOwn,
         pblackOwn = blackOwn/totRentOwn,
         pwhiteRent = whiteRent/totRentOwn,
         phispRent = hispRent/totRentOwn,
         pblackRent = blackRent/totRentOwn,
         totUnder18 = B05003_014E + B05003_003E,
         pchildren = totUnder18 / totpop)
tracts<-na.omit(tracts)


#pull texas shapefile for context
state<-get_acs(geography = "state", variables=c("DP05_0001E"), state="TX", year=2019, geometry = T, output = "wide")

## Getting data from the 2015-2019 5-year ACS

## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.

## Using the ACS Data Profile

library(tigris)
MSAs<-core_based_statistical_areas(cb=T,year=2019)
SANB<-MSAs[MSAs$GEOID=="41700",]
DFW<-MSAs[MSAs$GEOID=="19100",]
HOU<-MSAs[MSAs$GEOID=="26420",]
AUS<-MSAs[MSAs$GEOID=="12420",]

#Merge MSAs into new shapefile (for some reason couldn't query them all together)
TX_MSAs <- rbind(SANB, DFW, HOU, AUS)


#% Mobile Homes
  tm_shape(tracts)+
  tm_polygons("mobilehomes", 
              title="Mobile Homes by Tract, 2019",
              palette="YlGnBu", 
              border.alpha = 0,  
              style="pretty",
              legend.hist=T )+
    tm_shape(state)+
      tm_polygons(alpha=0, border.col="black")+
    tm_shape(TX_MSAs)+
      tm_polygons(alpha=0, border.col = "black")+
  tm_format("World", title="Mobile Homes by Tract, Texas", legend.outside=T)+
  tm_scale_bar()+
  tm_compass()

Poisson Model

I believe this model is showing a positive association between the percentages of white homeowners, renters, and children and the number of mobile homes in an area, while the median household income has a negative association.

fit_pois<- glm(mobilehomes ~ offset(scale(totpop)) + scale(pwhiteOwn) + scale(pRent) + scale(MHI) + scale(pchildren), 
               family=poisson, 
               data=tracts)
summary(fit_pois)

## 
## Call:
## glm(formula = mobilehomes ~ offset(scale(totpop)) + scale(pwhiteOwn) + 
##     scale(pRent) + scale(MHI) + scale(pchildren), family = poisson, 
##     data = tracts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -458.87    -4.71    -0.14    15.74   137.00  
## 
## Coefficients:
##                   Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)       2.014894   0.001548  1301.59   <2e-16 ***
## scale(pwhiteOwn)  2.039050   0.001807  1128.56   <2e-16 ***
## scale(pRent)      0.101005   0.002156    46.85   <2e-16 ***
## scale(MHI)       -4.364476   0.001452 -3005.24   <2e-16 ***
## scale(pchildren)  0.110216   0.001457    75.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18040876  on 5161  degrees of freedom
## Residual deviance:  2978632  on 5157  degrees of freedom
## AIC: 2999524
## 
## Number of Fisher Scoring iterations: 15

Check for overdispersion

Since this measure calculated below is 24, this suggests the variance is much larger than the mean, suggesting high overdispersion.

scale<-sqrt(fit_pois$deviance/fit_pois$df.residual)
scale

## [1] 24.0331

INLA

From the readings, the results can be interpreted as “relative risk” - so I believe that would mean that the percent of white homeowners and the percent of children have positive associations with the number of mobile homes in an area, while the percent of renters and the median household income have negative associaitons with the number of mobile homes in an area.

# Numeric Identifiers for tracts
tracts$struct<-1:dim(tracts)[1]  
nbs<-knearneigh(st_centroid(tracts), k = 5, longlat = T) #k=5 nearest neighbors

## Warning in st_centroid.sf(tracts): st_centroid assumes attributes are constant
## over geometries of x

## Warning in knearneigh(st_centroid(tracts), k = 5, longlat = T): dnearneigh:
## longlat argument overrides object

nbs<-knn2nb(nbs, row.names = tracts$struct, sym = T) #force symmetry!!
mat <- nb2mat(nbs, style="B",zero.policy=TRUE)
colnames(mat) <- rownames(mat) 
mat <- as.matrix(mat[1:dim(mat)[1], 1:dim(mat)[1]])
nb2INLA("cl_graph",nbs)
am_adj <-paste(getwd(),"/cl_graph",sep="")
H<-inla.read.graph(filename="cl_graph")


f3<-mobilehomes ~ offset(scale(totpop)) + scale(pwhiteOwn) + scale(pRent) + scale(MHI) + scale(pchildren)+ f(struct, model = "bym", scale.model = T, constr = T, graph = H)

mod3<-inla(formula = f3,data = tracts,
           family = "poisson",
           control.compute = list(waic=T,return.marginals.predictor=TRUE), 
           control.predictor = list(link=1), 
           num.threads = 3, 
               verbose = F)

#total model summary
summary(mod3)

## 
## Call:
##    c("inla(formula = f3, family = \"poisson\", data = tracts, verbose = F, 
##    ", " control.compute = list(waic = T, return.marginals.predictor = 
##    TRUE), ", " control.predictor = list(link = 1), num.threads = 3)") 
## Time used:
##     Pre = 3.39, Running = 28.2, Post = 0.606, Total = 32.2 
## Fixed effects:
##                    mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept)       2.000 0.037      1.925    2.000      2.073  2.000   0
## scale(pwhiteOwn)  0.891 0.089      0.716    0.891      1.067  0.891   0
## scale(pRent)     -0.739 0.075     -0.886   -0.739     -0.593 -0.739   0
## scale(MHI)       -1.577 0.081     -1.737   -1.577     -1.417 -1.576   0
## scale(pchildren)  0.267 0.050      0.169    0.267      0.365  0.267   0
## 
## Random effects:
##   Name     Model
##     struct BYM model
## 
## Model hyperparameters:
##                                           mean    sd 0.025quant 0.5quant
## Precision for struct (iid component)     0.221 0.007      0.208    0.221
## Precision for struct (spatial component) 0.141 0.012      0.121    0.141
##                                          0.975quant  mode
## Precision for struct (iid component)          0.236 0.220
## Precision for struct (spatial component)      0.166 0.139
## 
## Watanabe-Akaike information criterion (WAIC) ...: 77409.05
## Effective number of parameters .................: 26297.81
## 
## Marginal log-Likelihood:  -23229.67 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Spatial Dem - Homework 6 - INLA

Sarah Serpas

11/29/2021