Spatially Adjusted Regression Modelling

## load libraries:

# plotting and wrangling
library(ggplot2)
library(tidyverse)

#spatial and regression modeling
library(raster)
library(ModelMetrics)
library(spaMM)
library(leaflet)
library(rgdal)
library(sp)

# tuberculosis vaccination data, Uganda
bcg_vax <- read.csv("BCG_vaccination_UGA.csv")

# shapefile for Uganda.
uga_adm_0 <- raster::getData("GADM", country="UGA", level=0) 

# https://www.worldpop.org/geodata/summary?id=18762
# uganda night time lights data, 2016,
ug_lights_import <- raster("uga_viirs_100m_2016.tif")

# https://forobs.jrc.ec.europa.eu/products/gam/download.php
# travel time to major cities. 
dist <- raster("acc_50k.tif")

# create prevalence vector of vaccinated persons in bcg data
bcg_vax <- bcg_vax %>% 
  mutate(vax_prev = (number_positive / number_examined))

# create spatial data points data frame
bcg_spdf <- SpatialPointsDataFrame(coords = bcg_vax[,c("lng", "lat")],
                                   data =bcg_vax[,c("number_examined", 
                                                    "number_positive", 
                                                    "vax_prev")],
                                   proj4string = CRS("+init=epsg:4326"))

# crop and mask distance to major city with country shape. 
dist_uga <- raster::crop(x = dist, y = uga_adm_0)
dist_uga_mask <- raster::mask(x = dist_uga, mask = uga_adm_0)

# aggregate to match resolution with distance layer.
ug_lights <- raster::aggregate(ug_lights_import, fact= 10)

# resample 
ug_lights_resamp <- resample(ug_lights, dist_uga_mask, method="ngb") 

# evaluate raster resolution and extent for equivalency.
ug_lights_resamp

## class      : RasterLayer 
## dimensions : 686, 651, 446586  (nrow, ncol, ncell)
## resolution : 0.008333333, 0.008333333  (x, y)
## extent     : 29.575, 35, -1.483333, 4.233333  (xmin, xmax, ymin, ymax)
## crs        : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
## source     : memory
## names      : uga_viirs_100m_2016 
## values     : -0.08896015, 76.44197  (min, max)

dist_uga_mask

## class      : RasterLayer 
## dimensions : 686, 651, 446586  (nrow, ncol, ncell)
## resolution : 0.008333333, 0.008333333  (x, y)
## extent     : 29.575, 35, -1.483333, 4.233333  (xmin, xmax, ymin, ymax)
## crs        : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
## source     : memory
## names      : acc_50k 
## values     : 0, 1901  (min, max)

# plot to evaluate.
par(mfrow=c(1,3))
plot(dist_uga_mask)
plot(bcg_spdf)
plot(ug_lights_resamp)

#extract distance to major city variable on each spatial observation into new vector of bcg_vax data frame. 
bcg_vax$distance_to <- 
  raster::extract(dist_uga_mask, bcg_vax[,c("lng", "lat")])

#extract nightim lights variable on each spatial observation into new vector of bcg_vax data frame.
bcg_vax$lights <- 
  raster::extract(ug_lights_resamp, bcg_vax[,c("lng", "lat")])

Regression Modeling:

# fit a generalized linear model (GLM) to the prevelance data, including lights and distance to major city. 
glm_mod_1 <- 
  glm(cbind(number_positive, number_examined - number_positive) ~ lights + distance_to, data=bcg_vax, family=binomial())

summary(glm_mod_1)

## 
## Call:
## glm(formula = cbind(number_positive, number_examined - number_positive) ~ 
##     lights + distance_to, family = binomial(), data = bcg_vax)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7947  -0.5499   0.7350   1.3405   2.6313  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.8917330  0.0842928  22.442  < 2e-16 ***
## lights       0.0739176  0.0238907   3.094  0.00197 ** 
## distance_to -0.0004798  0.0003618  -1.326  0.18478    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 848.76  on 393  degrees of freedom
## Residual deviance: 829.87  on 391  degrees of freedom
## AIC: 1271
## 
## Number of Fisher Scoring iterations: 5

Looking at the output ofglm_mod_1, median Deviance Residuals: are less than 2. The p-value associated with the distance_to co-efficient .18478, is greater than .05 leading us to determine there is no evidence that distance to major cities is related to the prevalence of BCG tuberculosis vaccination.

That being said, we remove the distance_to variable and assemble new model with the vaccinated prevalance and nightime lights data alone.

# fit GLM without distance_to variable.
glm_mod_2 <- 
  glm(cbind(number_positive, number_examined - number_positive) ~ lights,
      data=bcg_vax, family=binomial())

summary(glm_mod_2)

## 
## Call:
## glm(formula = cbind(number_positive, number_examined - number_positive) ~ 
##     lights, family = binomial(), data = bcg_vax)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.8516  -0.5769   0.7191   1.3505   2.6438  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.80554    0.05227  34.546  < 2e-16 ***
## lights       0.08342    0.02306   3.617 0.000298 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 848.76  on 393  degrees of freedom
## Residual deviance: 831.58  on 392  degrees of freedom
## AIC: 1270.7
## 
## Number of Fisher Scoring iterations: 5

Much of our summary output from the revised glm_mod_2 model is similar to glm_mod_1. Deviance Residuals: and the AIC: have diminshed by a small margin. Note the p-value for the lights coefficient has become smaller, from Pr(>|z|) = 0.00197 with distance_to included to Pr(>|z|) = 0.000298.

# exponentiate co-efficient value for an odds ratio measurement. 
exp(.08342)

## [1] 1.086998

An odds ratio of 1.086998 can be interpreted as around a 9% odds of increased vaccinated person prevalence per unit of observed radiance value (nanoWatts/cm2/sr).

# plot to evaluate.
ggplot() +
  geom_point(aes(glm_mod_2$fitted, bcg_vax$vax_prev))

Values do not align well with observed data. This suggests that nighttime lights data alone do not explain prevalance well.

Prediction over a raster object:

# align column names with variable of interest 
pred_raster <- ug_lights_resamp
names(pred_raster) <- 'lights'

# predict values using GLM model 
predicted_prevs <- predict(pred_raster,
                          glm_mod_2,
                          type='response')

# plot to evaluate
plot(predicted_prevs)

# compute correlogram of glm_mod_2 residuals. 
nbc <- 10
cor_r <- pgirmess::correlog(coords=bcg_vax[,c("lng", "lat")],
                            z=glm_mod_2$residuals,
                            method="Moran", nbclass=nbc)

cor_r

## Moran I statistic 
##       dist.class        coef      p.value     n
##  [1,]  0.3425803  0.23913081 3.740957e-73 16794
##  [2,]  1.0222255  0.17217003 2.134150e-77 23212
##  [3,]  1.7018702  0.08469678 3.477336e-31 30356
##  [4,]  2.3815149  0.02677636 7.368693e-05 28546
##  [5,]  3.0611596 -0.02384968 9.963204e-01 27558
##  [6,]  3.7408042 -0.11009238 1.000000e+00 16770
##  [7,]  4.4204489 -0.33007437 1.000000e+00  7064
##  [8,]  5.1000936 -0.40714848 1.000000e+00  3110
##  [9,]  5.7797383 -0.35651222 1.000000e+00  1166
## [10,]  6.4593830 -0.64824446 1.000000e+00   268

# structure output as data frame.
correlogram <- as.data.frame(cor_r)
correlogram$variable <- "residuals_glm" 

# plot correlogram
ggplot(subset(correlogram, variable=="residuals_glm"), aes(dist.class, coef)) + 
  geom_hline(yintercept = 0, col="grey") +
  geom_line(col="steelblue") + 
  geom_point(col="steelblue") +
  xlab("distance") + 
  ylab("Moran's coefficient")+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        panel.background = element_blank(), axis.line = element_line(colour = "black"))

Looking at the output of the plotted corellogram, dist.class 1 through 4 fall above a 0 threshold, with coresponding p-values indicating statistical significance at: 3.740957e-73, 2.134149e-77, 3.477336e-31, and 7.368693e-05. From these spatially oriented measurements we can determine the model violates a GLM assumption of independence.

In our next model, we incorporate a spatially correlated random effect.

# regression model with matern covariance assumption. 
glm_mod_2_spatial <- spaMM::fitme(cbind(number_positive, number_examined - number_positive) ~ lights + Matern(1|lat+lng), data=bcg_vax, family=binomial())

summary(glm_mod_2_spatial)

## formula: cbind(number_positive, number_examined - number_positive) ~ lights + 
##     Matern(1 | lat + lng)
## Estimation of lambda and corrPars by Laplace ML approximation (p_v).
## Estimation of fixed effects by Laplace ML approximation (p_v).
## family: binomial( link = logit ) 
##  ------------ Fixed effects (beta) ------------
##             Estimate Cond. SE t-value
## (Intercept)   2.2682  0.37775   6.005
## lights        0.1809  0.05013   3.608
##  --------------- Random effects ---------------
## Family: gaussian( link = identity ) 
##                    --- Correlation parameters:
##      1.nu     1.rho 
## 0.2443545 0.8694605 
##            --- Variance parameters ('lambda'):
## lambda = var(u) for u ~ Gaussian; 
##    lat + lng  :  1.438  
##              --- Coefficients for log(lambda):
##      Group        Term Estimate Cond.SE
##  lat + lng (Intercept)   0.3635  0.1341
## # of obs: 394; # of groups: lat + lng, 394 
##  ------------- Likelihood values  -------------
##                        logLik
## p_v(h) (marginal L): -504.451

The summary does not include a p-value, but we can extract a .95 confidence interval.

# create dataframe from spatial model beta table
coefs <- as.data.frame(summary(glm_mod_2_spatial)$beta_table)

## formula: cbind(number_positive, number_examined - number_positive) ~ lights + 
##     Matern(1 | lat + lng)
## Estimation of lambda and corrPars by Laplace ML approximation (p_v).
## Estimation of fixed effects by Laplace ML approximation (p_v).
## family: binomial( link = logit ) 
##  ------------ Fixed effects (beta) ------------
##             Estimate Cond. SE t-value
## (Intercept)   2.2682  0.37775   6.005
## lights        0.1809  0.05013   3.608
##  --------------- Random effects ---------------
## Family: gaussian( link = identity ) 
##                    --- Correlation parameters:
##      1.nu     1.rho 
## 0.2443545 0.8694605 
##            --- Variance parameters ('lambda'):
## lambda = var(u) for u ~ Gaussian; 
##    lat + lng  :  1.438  
##              --- Coefficients for log(lambda):
##      Group        Term Estimate Cond.SE
##  lat + lng (Intercept)   0.3635  0.1341
## # of obs: 394; # of groups: lat + lng, 394 
##  ------------- Likelihood values  -------------
##                        logLik
## p_v(h) (marginal L): -504.451

# add name to row. 
row <- row.names(coefs) %in% c('lights')

# multiply standard error by confidence interval limits 
lower <- coefs[row,'Estimate'] - 1.96*coefs[row, 'Cond. SE']
upper <- coefs[row,'Estimate'] + 1.96*coefs[row, 'Cond. SE']

# concacenate lower and upper bound values and exponentiate
OR <- exp(c(lower, upper))
OR

## [1] 1.086121 1.321994

Here we have evidence that after accounting for spatial autocorrelation with a Matern covariance assumption, there is an observable relationship between night time light values and vaccinated person prevalence.

Let’s evaluate our adjusted model for evidence of post-hoc spatial autocorellation.

# Compute correlogram of glm_mod_2_spatial residuals
nbc <- 10
cor_r_spatial <- pgirmess::correlog(coords = bcg_vax[,c("lng", "lat")],
                            z = residuals(glm_mod_2_spatial),
                            method="Moran", nbclass=nbc)
cor_r_spatial

## Moran I statistic 
##       dist.class          coef   p.value     n
##  [1,]  0.3425803 -0.0052640078 0.5800007 16794
##  [2,]  1.0222255  0.0088238229 0.1146851 23212
##  [3,]  1.7018702  0.0001381516 0.3619456 30356
##  [4,]  2.3815149  0.0015067763 0.3010885 28546
##  [5,]  3.0611596 -0.0011701668 0.4317740 27558
##  [6,]  3.7408042  0.0011846690 0.3739705 16770
##  [7,]  4.4204489 -0.0346545313 0.9346268  7064
##  [8,]  5.1000936 -0.0340748699 0.8322097  3110
##  [9,]  5.7797383 -0.0155582910 0.5273283  1166
## [10,]  6.4593830 -0.1521469737 0.9273575   268

None of the dist.class bins appear with p-values that are statsitically significant. Each bin class exceeds Pr(>|z|) = .05.

correlogram_sp <- as.data.frame(cor_r_spatial)
correlogram_sp$variable <- "residuals_glm_sp"


# plot correlogram
 ggplot(subset(correlogram_sp, variable=="residuals_glm_sp"), aes(dist.class, coef)) + 
  geom_hline(yintercept = 0, col="grey") +
  geom_line(col="steelblue") + 
  geom_point(col="steelblue") +
  ylim(-.5, .5) +
  xlab("distance") + 
  ylab("Moran's coefficient")+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), 
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  ggtitle("Residual Spatial Autocorrelation Post-adjustment")

Here we can observe a plot of the dist.class bins, which appear to traverse the 0 margin at most levels of distance.

Predict prevalence within region:

# create empty raster with the same extent and resolution as the light layer.
latitude_raster <- longitude_raster <- 
  raster(nrows = nrow(ug_lights_resamp), 
         ncols = ncol(ug_lights_resamp),
         ext = extent(ug_lights_resamp))

# change the values to be latitude and longitude respectively
longitude_raster[] <- coordinates(longitude_raster)[,1]
latitude_raster[] <- coordinates(latitude_raster)[,2]

# create a prediction stack with needed variables. 
pred_stack <- stack(ug_lights_resamp,
                    longitude_raster,
                    latitude_raster)

# rename columns 
names(pred_stack) <- c("lights", "lng", "lat")

# plot
plot(pred_stack)

# generate predictions using covariates and raster stack
predicted_prevalence_raster <- predict(pred_stack, glm_mod_2_spatial)
plot(predicted_prevalence_raster)
lines(uga_adm_0)

Model Validation:

# hold 20% of prevalence data as validation set
set.seed(1)
validation_rows <- sample(1:nrow(bcg_vax), round((.2  * 394),1))
bcg_vax_data_train <- bcg_vax[-validation_rows,]
bcg_vax_data_valid <- bcg_vax[validation_rows,]

# fit model using remaining 80% training data.
glm_mod_2_spatial_validation <- spaMM::fitme(cbind(number_positive,  number_examined - number_positive) ~ lights + Matern(1|lat+lng), data=bcg_vax_data_train, family=binomial())


# predict to validation rows of spatially adjusted model and compare
predictions_validation_sp <- predict(glm_mod_2_spatial_validation, bcg_vax_data_valid)

# plot to evaluate
ggplot() + geom_point(aes(as.vector(predictions_validation_sp), bcg_vax_data_valid$vax_prev)) +
  ggtitle("Spatially adjusted model precitions")

# fit model using remaining 80% training data.
glm_mod_2_validation <- 
  glm(cbind(number_positive, number_examined - number_positive) ~ lights,
      data=bcg_vax_data_train, family=binomial())

# predict to validation rows of spatially adjusted model and compare
predictions_validation <- predict(glm_mod_2_validation, bcg_vax_data_valid)

# plot
ggplot() + geom_point(aes(as.vector(predictions_validation), bcg_vax_data_valid$vax_prev)) +
  ggtitle("Un-adjusted model precitions")

# calculate a rounded mean squared error with unadjusted model. 
mse_unadj <- paste("MSE, unadjusted model         =",
      round(mse(predictions_validation, bcg_vax$vax_prev),4))

# calculate a rounded mean squared error with spatial model. 
mse_adj <- paste("MSE, spatially adjusted model =", 
      round(mse(predictions_validation_sp, bcg_vax$vax_prev),4))

mse_unadj

## [1] "MSE, unadjusted model         = 1.1393"

mse_adj

## [1] "MSE, spatially adjusted model = 0.0274"

Here we can observe a smaller margin of Mean Standard Error in our cross validation testing of the spatially adjusted model when compared with the unajusted GLM.

~fin

Spatially Adjusted Regression Modelling

Avery Richards

April, 2021

Regression Modeling:

Prediction over a raster object:

Predict prevalence within region:

Model Validation: