Shape files in R

Reading and exporting? the shape file

zim_shp <- maptools::readShapePoly("zim_shape/ZWE_adm2.shp", IDvar = "ID_2")

Warning: readShapePoly is deprecated; use rgdal::readOGR or sf::st_read

plot(zim_shp, border = "red", axes = TRUE, las = 1)

What R is reccommending

zim_OGR <- rgdal::readOGR("zim_shape/ZWE_adm2.shp")

OGR data source with driver: ESRI Shapefile 
Source: "C:\Users\kmwai\OneDrive - Kemri Wellcome Trust\H_Drive\PhD_work\courses\WIts\CAR_model\zim_shape\ZWE_adm2.shp", layer: "ZWE_adm2"
with 60 features
It has 11 fields
Integer64 fields read as strings:  ID_0 ID_1 ID_2

plot(zim_OGR, border = "blue", axes = TRUE, las = 1)

We need to check our shape file to see if the spatial geometry is valid and the gIsValid function looks at each polygon, and makes sure it doesn’t break any topological rules. Ideally, all of the geometry is valid and the total number of polygons that fail the validity tests is 0.

zim_valid <- data.table(valid = gIsValid(zim_OGR, byid = TRUE))
sum(zim_valid$valid == F)

[1] 0

Zimbabwe HAZ (Stunting) CAR Model

Load the data

The data is extracted from the Zimbabwean DHS. Our outcome variable is Stunting predicted by (Variable selection done earlier);

Employed - Employment status of the parent
b19 - Age of the child
Education - Education level of the parent
b4 - Sex of the child
v025 - Place of residence Urban or Rural
BMI - BMI calculated earlier

# zim_child_data <- haven::read_dta('data/Data2.dta')
zim_child_data <- readstata13::read.dta13("data/Data2.dta")

Warning in readstata13::read.dta13("data/Data2.dta"): 
   Missing factor labels for variables

   v445

   No labels have beend assigned.
   Set option 'generate.factors=TRUE' to generate labels.

# dplyr::glimpse(zim_child_data , width = 5)

Explolatory Data Analysis

## EDA
eda_data <- zim_child_data %>%
    select(Stunting, Employed, b19, Education, BMI)
PerformanceAnalytics::chart.Correlation(eda_data, histogram = TRUE, pch = 19)

Regression model

Select the variables of interest

What approaches can you use to select the variables of interest. - Stepwise regression 1^ - Regularization methods 2^

Here we have already selected the variables to use and we use complete case data.

zim_child_model <- zim_child_data %>%
    select(id_2, name_1, name_2, Stunting, Employed, b19, Education, b4, v025, BMI)
zim_child_model <- zim_child_model[complete.cases(zim_child_model), ]

glimpse(zim_child_model)

Rows: 5,962
Columns: 10
$ id_2      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1~
$ name_1    <chr> "Bulawayo", "Harare", "Manicaland", "Manicaland", "Manicalan~
$ name_2    <chr> "Bulawayo", "Harare", "Buhera", "Chimanimani", "Chipinge", "~
$ Stunting  <dbl> -1.93, -0.86, -1.34, 0.80, 1.86, -0.32, 1.01, -1.06, -0.79, ~
$ Employed  <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, ~
$ b19       <int> 13, 44, 18, 27, 1, 28, 5, 16, 8, 32, 59, 36, 23, 48, 50, 29,~
$ Education <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ b4        <fct> male, male, male, male, male, male, female, female, female, ~
$ v025      <fct> rural, rural, urban, rural, urban, rural, urban, rural, rura~
$ BMI       <dbl> 21.15, 37.29, 28.26, 21.70, 19.64, 19.56, 30.28, 28.82, 23.7~

Check whether the ID_2 on the shape file corresponds to what you have in your data

table(zim_shp@data$ID_2 %in% zim_child_model$id_2)


TRUE 
  60

table(zim_shp@data$NAME_2 %in% zim_child_model$name_2)


TRUE 
  60

Linear regression model

## Define the formula of the model
form_fit <- Stunting ~ Employed + b19 + as.factor(Education) + b4 + v025 + BMI

## fit the model
lin_mod <- glm(form_fit, data = zim_child_data, family = gaussian(link = "identity"))

Extract the summary statistics from the model. What do the estimates mean?

summary(lin_mod)$coefficient

                          Estimate   Std. Error    t value     Pr(>|t|)
(Intercept)           -0.708127745 0.0906876251 -7.8084275 6.795954e-15
Employed               0.013977182 0.0275116631  0.5080457 6.114401e-01
b19                    0.001503499 0.0007767325  1.9356713 5.295526e-02
as.factor(Education)1  0.258544943 0.0406933905  6.3534874 2.260783e-10
as.factor(Education)2  0.487947342 0.0673593288  7.2439460 4.910341e-13
b4female               0.001661043 0.0266641938  0.0622949 9.503301e-01
v025rural             -0.063267906 0.0299378549 -2.1133079 3.461591e-02
BMI                   -0.004092509 0.0030319814 -1.3497803 1.771378e-01

AIC(lin_mod)  # AIC => 17272

[1] 17272.36

BIC(lin_mod)  # BIC => 17333

[1] 17332.6

Check for the assumptions

We focus on the Residuals vs Fitted values and the Normal Q-Q. This will inform our decision on whether the outcome has met the assumptions to be used in further spatial models

par(mfrow = c(2, 2))  # display a unique layout for all graphs
plot(lin_mod)

par(mfrow = c(1, 1))

CAR model in INLA

INLA is a nice “fast” alternative to MCMC for fitting Bayesian models ¹ .

It uses the Integrated Nested Laplace Approximation, a deterministic Bayesian method ² and a very nice book here by Marta Blangiardo [^3].

Normal linear model in INLA

In the formula below 1 means that the model includes the intercept and

form_fit <-  Stunting ~1+ Employed +b19  + 
  as.factor(Education) + b4+ v025+ BMI

model_linear <- inla(form_fit,
                     family="gaussian",
                     data=zim_child_model,
                    control.compute=list(dic=TRUE, waic=TRUE))

Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"numericVector" of class "Mnumeric"; definition not updated

Extracting the model summaries

Here we compare our glm model with the linear model from INLA, are they similar?

summary(model_linear)


Call:
   c("inla(formula = form_fit, family = \"gaussian\", data = 
   zim_child_model, ", " control.compute = list(dic = TRUE, waic = 
   TRUE))") 
Time used:
    Pre = 0.854, Running = 7.89, Post = 0.817, Total = 9.56 
Fixed effects:
                        mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)           -0.708 0.091     -0.886   -0.708     -0.530 -0.708   0
Employed               0.014 0.028     -0.040    0.014      0.068  0.014   0
b19                    0.002 0.001      0.000    0.002      0.003  0.002   0
as.factor(Education)1  0.259 0.041      0.179    0.259      0.338  0.259   0
as.factor(Education)2  0.488 0.067      0.356    0.488      0.620  0.488   0
b4female               0.002 0.027     -0.051    0.002      0.054  0.002   0
v025rural             -0.063 0.030     -0.122   -0.063     -0.005 -0.063   0
BMI                   -0.004 0.003     -0.010   -0.004      0.002 -0.004   0

Model hyperparameters:
                                         mean    sd 0.025quant 0.5quant
Precision for the Gaussian observations 0.944 0.017      0.911    0.944
                                        0.975quant  mode
Precision for the Gaussian observations      0.978 0.944

Expected number of effective parameters(stdev): 8.04(0.001)
Number of equivalent replicates : 742.05 

Deviance Information Criterion (DIC) ...............: 17272.44
Deviance Information Criterion (DIC, saturated) ....: 5973.37
Effective number of parameters .....................: 9.07

Watanabe-Akaike information criterion (WAIC) ...: 17272.83
Effective number of parameters .................: 9.45

Marginal log-Likelihood:  -8697.69 
Posterior marginals for the linear predictor and
 the fitted values are computed

summary(lin_mod)


Call:
glm(formula = form_fit, family = gaussian(link = "identity"), 
    data = zim_child_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.7788  -0.6882  -0.0303   0.6685   6.2951  

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)           -0.7081277  0.0906876  -7.808 6.80e-15 ***
Employed               0.0139772  0.0275117   0.508   0.6114    
b19                    0.0015035  0.0007767   1.936   0.0530 .  
as.factor(Education)1  0.2585449  0.0406934   6.353 2.26e-10 ***
as.factor(Education)2  0.4879473  0.0673593   7.244 4.91e-13 ***
b4female               0.0016610  0.0266642   0.062   0.9503    
v025rural             -0.0632679  0.0299379  -2.113   0.0346 *  
BMI                   -0.0040925  0.0030320  -1.350   0.1771    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 1.059208)

    Null deviance: 6399.5  on 5961  degrees of freedom
Residual deviance: 6306.5  on 5954  degrees of freedom
AIC: 17272

Number of Fisher Scoring iterations: 2

Extracting the summary of the fixed effects

round(model_linear$summary.fixed[, 1:5], 3)

                        mean    sd 0.025quant 0.5quant 0.975quant
(Intercept)           -0.708 0.091     -0.886   -0.708     -0.530
Employed               0.014 0.028     -0.040    0.014      0.068
b19                    0.002 0.001      0.000    0.002      0.003
as.factor(Education)1  0.259 0.041      0.179    0.259      0.338
as.factor(Education)2  0.488 0.067      0.356    0.488      0.620
b4female               0.002 0.027     -0.051    0.002      0.054
v025rural             -0.063 0.030     -0.122   -0.063     -0.005
BMI                   -0.004 0.003     -0.010   -0.004      0.002

par(mfrow = c(2, 2))  # display a unique layout for all graphs
## Posterior density plot for the intercepts
plot(model_linear$marginals.fixed[[1]], type = "l", main = "", ylab = "", xlab = expression(beta[0]))
plot(model_linear$marginals.fixed[[2]], type = "l", main = "", ylab = "", xlab = expression(beta[1]))
par(mfrow = c(1, 1))

Spatial Model in INLA

Remember the importance of CAR models , to adjust for the neighbouring effect. Sharing a common border within the map. We extract the matrix to represent the structure of the neighbours. First we convert our neighbour map to an inla integrated map , a map that INLA can use to fit its model

zim_nb <- spdep::poly2nb(zim_shp)

nb2INLA("zim_shape/zim_inla.graph", zim_nb)
zim_adj <- paste(getwd(), "/zim_shape/zim_inla.graph", sep = "")

Then we generate the adjacency matrix for the ZIM example: in the plot below rows and columns identify areas; squares identify neighbors

H <- inla.read.graph(filename = "zim_shape/zim_inla.graph")
image(inla.graph2matrix(H), xlab = "", ylab = "")

Fitting CAR in INLA

After having defined our neighbourhood structure, then we need to specify the formula for the model, through

formula_inla <- Stunting ~ 1 + as.factor(Employed) +
  b19  + as.factor(Education) + b4+ v025+ BMI+
  
  ##our CAR specification
  f(id_2, model="bym",graph=zim_adj, scale.model=TRUE,
    ## spefiying the priors for the unstri and str 
                     hyper=list(prec.unstruct=list(prior="loggamma",param=c(1,0.001)), 
                                prec.spatial=list(prior="loggamma",param=c(1,0.001))))

where id_2 represents the identifiers for the locations and through the graph option we include the name of the object containing the neighborhood structure. Note that with f(ID, model=“bym”,…) R-INLA parameterizes

\[{{\xi }_{i}}={{u}_{i}}+{{\upsilon }_{i}}\]

By default, minimally informative priors are specified on the log of the unstructured effect precision \(\log \left( {{\tau }_{\upsilon }} \right)\sim \log Gamma\left( 1,0.0005 \right)\) and the log of the structured effect precision \(\log \left( {{u}_{\upsilon }} \right)\sim \log Gamma\left( 1,0.0005 \right)\)

model_inla <- inla(formula_inla,family="gaussian",
                     data=zim_child_model,
                     control.compute=list(dic=TRUE))

For model comparison and best fit, we use the Deviance Information Criterion (DIC), where the small DIC was considered better. DIC is defined as \(\overset{-}{\mathop{D}}\,\left( \theta \right)+pD\) where \(\overset{-}{\mathop{D}}\,\left( \theta \right)=E\left[ D(\theta )|y \right]\) which is the posterior mean of the deviance, \(D(\theta )\). \(pD\) is the difference in the posterior mean deviance and the deviance evaluated at the posterior mean of the parameters, \(pD = \mathop D\limits^ - \left( \theta \right) - D\left( {E\left( {\theta |y} \right)} \right)\).

RInla Model –> DIC 17218.91(pd=15.09)

Linear model–> DIC=17272.83(pd=9.444)

summary(model_inla)  #inla --> DIC 17218.91(pd=15.09) ##linear model--> DIC=17272.83(pd=9.444)


Call:
   c("inla(formula = formula_inla, family = \"gaussian\", data = 
   zim_child_model, ", " control.compute = list(dic = TRUE))") 
Time used:
    Pre = 0.776, Running = 58.7, Post = 0.691, Total = 60.2 
Fixed effects:
                        mean     sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)           -0.458 40.537    -80.046   -0.459     79.064 -0.458   0
as.factor(Employed)1   0.010  0.028     -0.045    0.010      0.064  0.010   0
b19                    0.001  0.001      0.000    0.001      0.003  0.001   0
as.factor(Education)1  0.252  0.041      0.172    0.252      0.333  0.252   0
as.factor(Education)2  0.476  0.067      0.344    0.476      0.608  0.476   0
b4female               0.002  0.026     -0.050    0.002      0.054  0.002   0
v025rural             -0.054  0.032     -0.117   -0.054      0.009 -0.054   0
BMI                   -0.003  0.003     -0.009   -0.003      0.003 -0.003   0

Random effects:
  Name    Model
    id_2 BYM model

Model hyperparameters:
                                        mean    sd 0.025quant 0.5quant
Precision for the Gaussian observations 0.97 0.004      0.963     0.97
Precision for id_2 (iid component)      0.00 0.000      0.000     0.00
Precision for id_2 (spatial component)  0.00 0.000      0.000     0.00
                                        0.975quant mode
Precision for the Gaussian observations      0.977 0.97
Precision for id_2 (iid component)           0.000 0.00
Precision for id_2 (spatial component)       0.000 0.00

Expected number of effective parameters(stdev): 68.81(6.96)
Number of equivalent replicates : 86.65 

Deviance Information Criterion (DIC) ...............: 17264.38
Deviance Information Criterion (DIC, saturated) ....: 6123.43
Effective number of parameters .....................: 67.12

Marginal log-Likelihood:  -10591.89 
Posterior marginals for the linear predictor and
 the fitted values are computed

Summary of the fixed effect

round(model_inla$summary.fixed, 3)

                        mean     sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)           -0.458 40.537    -80.046   -0.459     79.064 -0.458   0
as.factor(Employed)1   0.010  0.028     -0.045    0.010      0.064  0.010   0
b19                    0.001  0.001      0.000    0.001      0.003  0.001   0
as.factor(Education)1  0.252  0.041      0.172    0.252      0.333  0.252   0
as.factor(Education)2  0.476  0.067      0.344    0.476      0.608  0.476   0
b4female               0.002  0.026     -0.050    0.002      0.054  0.002   0
v025rural             -0.054  0.032     -0.117   -0.054      0.009 -0.054   0
BMI                   -0.003  0.003     -0.009   -0.003      0.003 -0.003   0

Summary of the random effects

round(head(model_inla$summary.random$id_2), 3)

  ID   mean     sd 0.025quant 0.5quant 0.975quant   mode kld
1  1 -0.511 40.537    -80.099   -0.512     79.011 -0.511   0
2  2 -0.313 40.537    -79.901   -0.314     79.208 -0.313   0
3  3 -1.073 40.549    -80.685   -1.074     78.472 -1.073   0
4  4  1.086 40.549    -78.526    1.085     80.632  1.086   0
5  5  2.120 40.549    -77.492    2.119     81.666  2.120   0
6  6 -0.032 40.549    -79.644   -0.034     79.513 -0.032   0

Plotting the estimates on map

For this exercise, we plot the probability of having high HAZ in ZIM to compare with out BUGs model. First extract the marginals of the random effects

csi <- model_inla$marginals.random$id_2[1:60]

Then create the posterior probability using inla.pmarginal function.

# *** Code for posterior probablity
a <- 0
prob_csi <- lapply(csi, function(x) {
    1 - inla.pmarginal(a, x)
})
## for each location estimate the probability in continous
cont_prob_cs1 <- data.frame(maps_cont_prob_csi = unlist(prob_csi)) %>%
    tibble::rownames_to_column("ID_2") %>%
    mutate(ID_2 = gsub("index.", "", ID_2))

maps_cont_prob_csi <- cont_prob_cs1

Or create categories if you wish

# for each location estimate the probability in groups
prob_csi_cutoff <- c(0,0.2,0.4,0.6,0.8,1) ## can change accordingly
cat_prob_csi <- cut(unlist(prob_csi),
                    breaks=prob_csi_cutoff, 
                    include.lowest=TRUE)


maps_cat_prob_csi <- data.frame(ID_2=unique(zim_child_model$id_2), ## check whether it joins well
                                cat_prob_csi=cat_prob_csi)
maps_cat_prob_csi$ID_2 <- as.character(maps_cat_prob_csi$ID_2)

Eventually, we join our posterior probabilities to the data

zim_shp_df <- broom::tidy(zim_shp, region = "ID_2")
zim_shp_df <- zim_shp_df %>% 
 # left_join(maps_cat_prob_csi, by=c("id"="ID_2")) %>% 
  left_join(maps_cont_prob_csi, by=c("id"="ID_2"))

glimpse(zim_shp_df)

Rows: 30,130
Columns: 8
$ long               <dbl> 28.61305, 28.60440, 28.57862, 28.55219, 28.54004, 2~
$ lat                <dbl> -20.23587, -20.22540, -20.22598, -20.22138, -20.259~
$ order              <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ~
$ hole               <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA~
$ piece              <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ group              <fct> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1~
$ id                 <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "~
$ maps_cont_prob_csi <dbl> 0.4938038, 0.4938038, 0.4938038, 0.4938038, 0.49380~

We have our plot, is it similar

p2 <- ggplot() + geom_polygon(data = zim_shp_df, aes(x = long, y = lat, group = group,
    fill = maps_cont_prob_csi), colour = "white") + theme_void() + ggtitle("RINLA Fit") +
    labs(fill = "P of  high HAZ") + scale_fill_continuous(high = "#fff7ec", low = "#7F0000")
p2

CAR Models

using Zimbabwe DHS data

KM Wambui