Shape files in R
Reading and exporting? the shape file
zim_shp <- maptools::readShapePoly("zim_shape/ZWE_adm2.shp", IDvar = "ID_2")Warning: readShapePoly is deprecated; use rgdal::readOGR or sf::st_read
plot(zim_shp, border = "red", axes = TRUE, las = 1)What R is reccommending
zim_OGR <- rgdal::readOGR("zim_shape/ZWE_adm2.shp")OGR data source with driver: ESRI Shapefile
Source: "C:\Users\kmwai\OneDrive - Kemri Wellcome Trust\H_Drive\PhD_work\courses\WIts\CAR_model\zim_shape\ZWE_adm2.shp", layer: "ZWE_adm2"
with 60 features
It has 11 fields
Integer64 fields read as strings: ID_0 ID_1 ID_2
plot(zim_OGR, border = "blue", axes = TRUE, las = 1)We need to check our shape file to see if the spatial geometry is valid and the gIsValid function looks at each polygon, and makes sure it doesn’t break any topological rules. Ideally, all of the geometry is valid and the total number of polygons that fail the validity tests is 0.
zim_valid <- data.table(valid = gIsValid(zim_OGR, byid = TRUE))
sum(zim_valid$valid == F)[1] 0
Zimbabwe HAZ (Stunting) CAR Model
Load the data
The data is extracted from the Zimbabwean DHS. Our outcome variable is Stunting predicted by (Variable selection done earlier);
- Employed - Employment status of the parent
- b19 - Age of the child
- Education - Education level of the parent
- b4 - Sex of the child
- v025 - Place of residence Urban or Rural
- BMI - BMI calculated earlier
# zim_child_data <- haven::read_dta('data/Data2.dta')
zim_child_data <- readstata13::read.dta13("data/Data2.dta")Warning in readstata13::read.dta13("data/Data2.dta"):
Missing factor labels for variables
v445
No labels have beend assigned.
Set option 'generate.factors=TRUE' to generate labels.
# dplyr::glimpse(zim_child_data , width = 5)Explolatory Data Analysis
## EDA
eda_data <- zim_child_data %>%
select(Stunting, Employed, b19, Education, BMI)
PerformanceAnalytics::chart.Correlation(eda_data, histogram = TRUE, pch = 19)Regression model
Select the variables of interest
What approaches can you use to select the variables of interest. - Stepwise regression 1^ - Regularization methods 2^
Here we have already selected the variables to use and we use complete case data.
zim_child_model <- zim_child_data %>%
select(id_2, name_1, name_2, Stunting, Employed, b19, Education, b4, v025, BMI)
zim_child_model <- zim_child_model[complete.cases(zim_child_model), ]
glimpse(zim_child_model)Rows: 5,962
Columns: 10
$ id_2 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1~
$ name_1 <chr> "Bulawayo", "Harare", "Manicaland", "Manicaland", "Manicalan~
$ name_2 <chr> "Bulawayo", "Harare", "Buhera", "Chimanimani", "Chipinge", "~
$ Stunting <dbl> -1.93, -0.86, -1.34, 0.80, 1.86, -0.32, 1.01, -1.06, -0.79, ~
$ Employed <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, ~
$ b19 <int> 13, 44, 18, 27, 1, 28, 5, 16, 8, 32, 59, 36, 23, 48, 50, 29,~
$ Education <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ b4 <fct> male, male, male, male, male, male, female, female, female, ~
$ v025 <fct> rural, rural, urban, rural, urban, rural, urban, rural, rura~
$ BMI <dbl> 21.15, 37.29, 28.26, 21.70, 19.64, 19.56, 30.28, 28.82, 23.7~
Check whether the ID_2 on the shape file corresponds to what you have in your data
table(zim_shp@data$ID_2 %in% zim_child_model$id_2)
TRUE
60
table(zim_shp@data$NAME_2 %in% zim_child_model$name_2)
TRUE
60
Linear regression model
## Define the formula of the model
form_fit <- Stunting ~ Employed + b19 + as.factor(Education) + b4 + v025 + BMI
## fit the model
lin_mod <- glm(form_fit, data = zim_child_data, family = gaussian(link = "identity"))Extract the summary statistics from the model. What do the estimates mean?
summary(lin_mod)$coefficient Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.708127745 0.0906876251 -7.8084275 6.795954e-15
Employed 0.013977182 0.0275116631 0.5080457 6.114401e-01
b19 0.001503499 0.0007767325 1.9356713 5.295526e-02
as.factor(Education)1 0.258544943 0.0406933905 6.3534874 2.260783e-10
as.factor(Education)2 0.487947342 0.0673593288 7.2439460 4.910341e-13
b4female 0.001661043 0.0266641938 0.0622949 9.503301e-01
v025rural -0.063267906 0.0299378549 -2.1133079 3.461591e-02
BMI -0.004092509 0.0030319814 -1.3497803 1.771378e-01
AIC(lin_mod) # AIC => 17272[1] 17272.36
BIC(lin_mod) # BIC => 17333[1] 17332.6
Check for the assumptions
We focus on the Residuals vs Fitted values and the Normal Q-Q. This will inform our decision on whether the outcome has met the assumptions to be used in further spatial models
par(mfrow = c(2, 2)) # display a unique layout for all graphs
plot(lin_mod)par(mfrow = c(1, 1))CAR model in INLA
- INLA is a nice “fast” alternative to MCMC for fitting Bayesian models 1 .
It uses the Integrated Nested Laplace Approximation, a deterministic Bayesian method 2 and a very nice book here by Marta Blangiardo [^3].
Normal linear model in INLA
In the formula below 1 means that the model includes the intercept and
form_fit <- Stunting ~1+ Employed +b19 +
as.factor(Education) + b4+ v025+ BMI
model_linear <- inla(form_fit,
family="gaussian",
data=zim_child_model,
control.compute=list(dic=TRUE, waic=TRUE))Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"numericVector" of class "Mnumeric"; definition not updated
Extracting the model summaries
Here we compare our glm model with the linear model from INLA, are they similar?
summary(model_linear)
Call:
c("inla(formula = form_fit, family = \"gaussian\", data =
zim_child_model, ", " control.compute = list(dic = TRUE, waic =
TRUE))")
Time used:
Pre = 0.854, Running = 7.89, Post = 0.817, Total = 9.56
Fixed effects:
mean sd 0.025quant 0.5quant 0.975quant mode kld
(Intercept) -0.708 0.091 -0.886 -0.708 -0.530 -0.708 0
Employed 0.014 0.028 -0.040 0.014 0.068 0.014 0
b19 0.002 0.001 0.000 0.002 0.003 0.002 0
as.factor(Education)1 0.259 0.041 0.179 0.259 0.338 0.259 0
as.factor(Education)2 0.488 0.067 0.356 0.488 0.620 0.488 0
b4female 0.002 0.027 -0.051 0.002 0.054 0.002 0
v025rural -0.063 0.030 -0.122 -0.063 -0.005 -0.063 0
BMI -0.004 0.003 -0.010 -0.004 0.002 -0.004 0
Model hyperparameters:
mean sd 0.025quant 0.5quant
Precision for the Gaussian observations 0.944 0.017 0.911 0.944
0.975quant mode
Precision for the Gaussian observations 0.978 0.944
Expected number of effective parameters(stdev): 8.04(0.001)
Number of equivalent replicates : 742.05
Deviance Information Criterion (DIC) ...............: 17272.44
Deviance Information Criterion (DIC, saturated) ....: 5973.37
Effective number of parameters .....................: 9.07
Watanabe-Akaike information criterion (WAIC) ...: 17272.83
Effective number of parameters .................: 9.45
Marginal log-Likelihood: -8697.69
Posterior marginals for the linear predictor and
the fitted values are computed
summary(lin_mod)
Call:
glm(formula = form_fit, family = gaussian(link = "identity"),
data = zim_child_data)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.7788 -0.6882 -0.0303 0.6685 6.2951
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.7081277 0.0906876 -7.808 6.80e-15 ***
Employed 0.0139772 0.0275117 0.508 0.6114
b19 0.0015035 0.0007767 1.936 0.0530 .
as.factor(Education)1 0.2585449 0.0406934 6.353 2.26e-10 ***
as.factor(Education)2 0.4879473 0.0673593 7.244 4.91e-13 ***
b4female 0.0016610 0.0266642 0.062 0.9503
v025rural -0.0632679 0.0299379 -2.113 0.0346 *
BMI -0.0040925 0.0030320 -1.350 0.1771
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 1.059208)
Null deviance: 6399.5 on 5961 degrees of freedom
Residual deviance: 6306.5 on 5954 degrees of freedom
AIC: 17272
Number of Fisher Scoring iterations: 2
Extracting the summary of the fixed effects
round(model_linear$summary.fixed[, 1:5], 3) mean sd 0.025quant 0.5quant 0.975quant
(Intercept) -0.708 0.091 -0.886 -0.708 -0.530
Employed 0.014 0.028 -0.040 0.014 0.068
b19 0.002 0.001 0.000 0.002 0.003
as.factor(Education)1 0.259 0.041 0.179 0.259 0.338
as.factor(Education)2 0.488 0.067 0.356 0.488 0.620
b4female 0.002 0.027 -0.051 0.002 0.054
v025rural -0.063 0.030 -0.122 -0.063 -0.005
BMI -0.004 0.003 -0.010 -0.004 0.002
par(mfrow = c(2, 2)) # display a unique layout for all graphs
## Posterior density plot for the intercepts
plot(model_linear$marginals.fixed[[1]], type = "l", main = "", ylab = "", xlab = expression(beta[0]))
plot(model_linear$marginals.fixed[[2]], type = "l", main = "", ylab = "", xlab = expression(beta[1]))
par(mfrow = c(1, 1))Spatial Model in INLA
Remember the importance of CAR models , to adjust for the neighbouring effect. Sharing a common border within the map. We extract the matrix to represent the structure of the neighbours. First we convert our neighbour map to an inla integrated map , a map that INLA can use to fit its model
zim_nb <- spdep::poly2nb(zim_shp)
nb2INLA("zim_shape/zim_inla.graph", zim_nb)
zim_adj <- paste(getwd(), "/zim_shape/zim_inla.graph", sep = "")Then we generate the adjacency matrix for the ZIM example: in the plot below rows and columns identify areas; squares identify neighbors
H <- inla.read.graph(filename = "zim_shape/zim_inla.graph")
image(inla.graph2matrix(H), xlab = "", ylab = "")Fitting CAR in INLA
After having defined our neighbourhood structure, then we need to specify the formula for the model, through
formula_inla <- Stunting ~ 1 + as.factor(Employed) +
b19 + as.factor(Education) + b4+ v025+ BMI+
##our CAR specification
f(id_2, model="bym",graph=zim_adj, scale.model=TRUE,
## spefiying the priors for the unstri and str
hyper=list(prec.unstruct=list(prior="loggamma",param=c(1,0.001)),
prec.spatial=list(prior="loggamma",param=c(1,0.001))))where id_2 represents the identifiers for the locations and through the graph option we include the name of the object containing the neighborhood structure. Note that with f(ID, model=“bym”,…) R-INLA parameterizes
\[{{\xi }_{i}}={{u}_{i}}+{{\upsilon }_{i}}\]
By default, minimally informative priors are specified on the log of the unstructured effect precision \(\log \left( {{\tau }_{\upsilon }} \right)\sim \log Gamma\left( 1,0.0005 \right)\) and the log of the structured effect precision \(\log \left( {{u}_{\upsilon }} \right)\sim \log Gamma\left( 1,0.0005 \right)\)
model_inla <- inla(formula_inla,family="gaussian",
data=zim_child_model,
control.compute=list(dic=TRUE))For model comparison and best fit, we use the Deviance Information Criterion (DIC), where the small DIC was considered better. DIC is defined as \(\overset{-}{\mathop{D}}\,\left( \theta \right)+pD\) where \(\overset{-}{\mathop{D}}\,\left( \theta \right)=E\left[ D(\theta )|y \right]\) which is the posterior mean of the deviance, \(D(\theta )\). \(pD\) is the difference in the posterior mean deviance and the deviance evaluated at the posterior mean of the parameters, \(pD = \mathop D\limits^ - \left( \theta \right) - D\left( {E\left( {\theta |y} \right)} \right)\).
RInla Model –> DIC 17218.91(pd=15.09)
Linear model–> DIC=17272.83(pd=9.444)
summary(model_inla) #inla --> DIC 17218.91(pd=15.09) ##linear model--> DIC=17272.83(pd=9.444)
Call:
c("inla(formula = formula_inla, family = \"gaussian\", data =
zim_child_model, ", " control.compute = list(dic = TRUE))")
Time used:
Pre = 0.776, Running = 58.7, Post = 0.691, Total = 60.2
Fixed effects:
mean sd 0.025quant 0.5quant 0.975quant mode kld
(Intercept) -0.458 40.537 -80.046 -0.459 79.064 -0.458 0
as.factor(Employed)1 0.010 0.028 -0.045 0.010 0.064 0.010 0
b19 0.001 0.001 0.000 0.001 0.003 0.001 0
as.factor(Education)1 0.252 0.041 0.172 0.252 0.333 0.252 0
as.factor(Education)2 0.476 0.067 0.344 0.476 0.608 0.476 0
b4female 0.002 0.026 -0.050 0.002 0.054 0.002 0
v025rural -0.054 0.032 -0.117 -0.054 0.009 -0.054 0
BMI -0.003 0.003 -0.009 -0.003 0.003 -0.003 0
Random effects:
Name Model
id_2 BYM model
Model hyperparameters:
mean sd 0.025quant 0.5quant
Precision for the Gaussian observations 0.97 0.004 0.963 0.97
Precision for id_2 (iid component) 0.00 0.000 0.000 0.00
Precision for id_2 (spatial component) 0.00 0.000 0.000 0.00
0.975quant mode
Precision for the Gaussian observations 0.977 0.97
Precision for id_2 (iid component) 0.000 0.00
Precision for id_2 (spatial component) 0.000 0.00
Expected number of effective parameters(stdev): 68.81(6.96)
Number of equivalent replicates : 86.65
Deviance Information Criterion (DIC) ...............: 17264.38
Deviance Information Criterion (DIC, saturated) ....: 6123.43
Effective number of parameters .....................: 67.12
Marginal log-Likelihood: -10591.89
Posterior marginals for the linear predictor and
the fitted values are computed
Summary of the fixed effect
round(model_inla$summary.fixed, 3) mean sd 0.025quant 0.5quant 0.975quant mode kld
(Intercept) -0.458 40.537 -80.046 -0.459 79.064 -0.458 0
as.factor(Employed)1 0.010 0.028 -0.045 0.010 0.064 0.010 0
b19 0.001 0.001 0.000 0.001 0.003 0.001 0
as.factor(Education)1 0.252 0.041 0.172 0.252 0.333 0.252 0
as.factor(Education)2 0.476 0.067 0.344 0.476 0.608 0.476 0
b4female 0.002 0.026 -0.050 0.002 0.054 0.002 0
v025rural -0.054 0.032 -0.117 -0.054 0.009 -0.054 0
BMI -0.003 0.003 -0.009 -0.003 0.003 -0.003 0
Summary of the random effects
round(head(model_inla$summary.random$id_2), 3) ID mean sd 0.025quant 0.5quant 0.975quant mode kld
1 1 -0.511 40.537 -80.099 -0.512 79.011 -0.511 0
2 2 -0.313 40.537 -79.901 -0.314 79.208 -0.313 0
3 3 -1.073 40.549 -80.685 -1.074 78.472 -1.073 0
4 4 1.086 40.549 -78.526 1.085 80.632 1.086 0
5 5 2.120 40.549 -77.492 2.119 81.666 2.120 0
6 6 -0.032 40.549 -79.644 -0.034 79.513 -0.032 0
Plotting the estimates on map
For this exercise, we plot the probability of having high HAZ in ZIM to compare with out BUGs model. First extract the marginals of the random effects
csi <- model_inla$marginals.random$id_2[1:60]Then create the posterior probability using inla.pmarginal function.
# *** Code for posterior probablity
a <- 0
prob_csi <- lapply(csi, function(x) {
1 - inla.pmarginal(a, x)
})
## for each location estimate the probability in continous
cont_prob_cs1 <- data.frame(maps_cont_prob_csi = unlist(prob_csi)) %>%
tibble::rownames_to_column("ID_2") %>%
mutate(ID_2 = gsub("index.", "", ID_2))
maps_cont_prob_csi <- cont_prob_cs1Or create categories if you wish
# for each location estimate the probability in groups
prob_csi_cutoff <- c(0,0.2,0.4,0.6,0.8,1) ## can change accordingly
cat_prob_csi <- cut(unlist(prob_csi),
breaks=prob_csi_cutoff,
include.lowest=TRUE)
maps_cat_prob_csi <- data.frame(ID_2=unique(zim_child_model$id_2), ## check whether it joins well
cat_prob_csi=cat_prob_csi)
maps_cat_prob_csi$ID_2 <- as.character(maps_cat_prob_csi$ID_2)Eventually, we join our posterior probabilities to the data
zim_shp_df <- broom::tidy(zim_shp, region = "ID_2")
zim_shp_df <- zim_shp_df %>%
# left_join(maps_cat_prob_csi, by=c("id"="ID_2")) %>%
left_join(maps_cont_prob_csi, by=c("id"="ID_2"))
glimpse(zim_shp_df)Rows: 30,130
Columns: 8
$ long <dbl> 28.61305, 28.60440, 28.57862, 28.55219, 28.54004, 2~
$ lat <dbl> -20.23587, -20.22540, -20.22598, -20.22138, -20.259~
$ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ~
$ hole <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA~
$ piece <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
$ group <fct> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1~
$ id <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "~
$ maps_cont_prob_csi <dbl> 0.4938038, 0.4938038, 0.4938038, 0.4938038, 0.49380~
We have our plot, is it similar
p2 <- ggplot() + geom_polygon(data = zim_shp_df, aes(x = long, y = lat, group = group,
fill = maps_cont_prob_csi), colour = "white") + theme_void() + ggtitle("RINLA Fit") +
labs(fill = "P of high HAZ") + scale_fill_continuous(high = "#fff7ec", low = "#7F0000")
p2