Data Analysis

library(magrittr)
library(utils)
library(ggplot2)
library(sf)
library(sp)
library(rgdal)
library(dplyr)
library(kableExtra)

### Prepare data
load("cases.RData")

### Observed/expected table (for knitr)
kable(
  cases %>%
    group_by(Year) %>%
    summarise(`Observed` = format(sum(Observed), big.mark=","), `Expected` = format(sum(Expected), big.mark=",")),
  booktabs = TRUE,
  caption = "Aggregated Admissions"
) %>% kable_styling(., full_width = TRUE)

Aggregated Admissions
Year	Observed	Expected
2003	NA	NA
2004	NA	NA
2005	NA	NA
2006	NA	NA
2007	NA	NA
2008	NA	NA
2009	NA	NA
2010	NA	NA
2011	NA	NA
2012	NA	NA
2013	42,399	44,258.5
2014	45,018	43,732.9
2015	49,738	43,690.5
2016	52,879	43,984.1
2017	50,710	43,316.6
2018	56,345	42,555
2019	56,505	41,570

missing <- data.frame()
missing_LA <- list()
for (i in 2003:2019) {
  o <- cases[cases$Year == i, ]$Observed %>%
    is.na() %>%
    which() %>%
    length()
  e <- cases[cases$Year == i, ]$Expected %>%
    is.na() %>%
    which() %>%
    length()
  s <- cases[cases$Year == i, ]$SAR %>%
    is.na() %>%
    which() %>%
    length()
  p <- cases[cases$Year == i, ]$Population %>%
    is.na() %>%
    which() %>%
    length()
  row <- data.frame(Year = i, Observed = o, Expected = e, SAR = s, Population = p)
  missing <- rbind(missing, row)
  
  which <- cases[cases$Year == i, ]$SAR %>%
    is.na() %>%
    which()
  missing_LA <- c(missing_LA, list(cases[cases$Year == i, ][which, ]$Level.description))
}
kable(missing, caption = "Missing Observations") %>% kable_styling(., full_width = TRUE)

Missing Observations
Year	Observed	Expected	SAR
2003	120	120	120
2004	99	99	99
2005	114	114	114
2006	127	127	127
2007	108	108	108
2008	77	77	77
2009	105	105	105
2010	129	129	129
2011	115	115	115
2012	73	73	73
2013	0	0	0
2014	0	0	0
2015	0	0	0
2016	0	0	0
2017	0	0	0
2018	0	0	0
2019	0	0	0

cases <- cases[cases$Year >= 2013, ]

temp<-cases %>% group_by(Year) %>% summarise(`Observed` = sum(Observed), `Expected` = sum(Expected))
ggplot(temp, aes(x=Year)) + 
  geom_line(aes(y = Observed), color = "darkred") + 
  geom_line(aes(y = Expected), color="steelblue")  + 
  scale_color_manual(name = "Admissions", values = c("Observed" = "darkred", "Expected" = "steelblue"))

Exploratory Analysis

load("pollution.RData")
P25<- P25[P25$Year >= 2013,]

mu = mean(NO2$NO2)
sd = sd(NO2$NO2)

hist(P25$PM25,
     main="Concentration of PM2.5",
     xlab="µg/m3",
     col="yellow")
abline(v=mean(P25$PM25), col="blue")

hist(NO2$NO2,
     main="Concentration of NO2",
     xlab="µg/m3",
     col="yellow")
abline(v=mu, col="red")

#correlation
# Calculate the correlation matrix
temp<-left_join(P25, NO2, by=c("lad19nm", "Year")) %>% select(lad19nm, Year, PM25, NO2)
corr_matrix <- cor(temp[,3:4])
print(corr_matrix)

##           PM25       NO2
## PM25 1.0000000 0.7452461
## NO2  0.7452461 1.0000000

Maps

Maps of SAR by LTLA by year

# Create colour pallette
library(RColorBrewer)
library(spdep)

colors <- c("#D0F0C0", brewer.pal(4, "Reds"))

#I think it's not divided by 100
cases$SAR.cut <- cut(
  cases$SAR,
  breaks = c(min(cases$SAR[!is.na(cases$SAR)]), 1, 2.5, 5, max(cases$SAR[!is.na(cases$SAR)])),
)

# Read shapefile
shapefile_path <- "../Boundaries (2019)/England/Local_Authority_Districts_December_2019_England.shp"
map <- st_read(shapefile_path)

## Reading layer `Local_Authority_Districts_December_2019_England' from data source `/Users/cam/Library/CloudStorage/OneDrive-ImperialCollegeLondon/LRTI project/Boundaries (2019)/England/Local_Authority_Districts_December_2019_England.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 317 features and 11 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -714517.9 ymin: 6422874 xmax: 196334.8 ymax: 7520900
## Projected CRS: WGS 84 / Pseudo-Mercator

map <- map[map$lad19nm != "Isles of Scilly",]

map_nb <- poly2nb(map)
summary(map_nb)

## Neighbour list object:
## Number of regions: 316 
## Number of nonzero links: 1610 
## Percentage nonzero weights: 1.612322 
## Average number of links: 5.094937 
## 1 region with no links:
## 44
## Link number distribution:
## 
##  0  1  2  3  4  5  6  7  8  9 10 11 
##  1  8 25 39 54 58 46 49 21 10  3  2 
## 8 least connected regions:
## 10 26 61 67 88 89 115 263 with 1 link
## 2 most connected regions:
## 51 227 with 11 links

# Convert to INLA format
nb2INLA("map.graph", map_nb)
map.adj <- paste(getwd(), "/map.graph", sep = "")

# Prep data (do I need to add PM2.5 here? - depends if I uses map_i later)
for (i in 2013:2019) {
  d <- cases[cases$Year == i, ]
  map_i <- left_join(map, d, by = join_by(lad19cd == Level))
  
  p <- ggplot() +
    geom_sf(data = map_i) +
    aes(fill = SAR.cut) +
    theme_bw() +
    scale_fill_manual(values = colors) +
    guides(fill = guide_legend(title = "SAR")) +
    labs(title = paste0("SAR in ", i))
  
  print(p)
}

Analysis

Now we build the models

# Aggregate observed and expected cases by LA (independent of year)
cases_agg <- cases %>%
  group_by(Level) %>%
  summarise(Observed = sum(Observed), Expected = sum(Expected), Population = sum(Population)) %>%
  mutate(SAR = Observed / Expected) %>%
  mutate(log.Population = log(Population)) %>%
  dplyr::rename(O = Observed, E = Expected)

#round up/down
cases_agg[cases_agg$Level == "E09000001",]$O <- cases_agg[cases_agg$Level == "E09000012",]$O %>% floor()
cases_agg[cases_agg$Level == "E09000012",]$O <- cases_agg[cases_agg$Level == "E09000012",]$O %>% ceiling()

pol <- P25[P25$Year >= 2013,] %>% group_by(lad19cd) %>% summarise(PM2.5 = mean(PM25))

#merge; 'data' is cases_agg combined with pm2.5
data<-merge(cases_agg, pol, by.x="Level", by.y="lad19cd")

Starting with the fixed environmental effects (with and without intercept)

library(INLA)

formula <- O ~ 1 + PM2.5
fixed.model1 <- inla(formula, family="poisson", data=data)

fixed.model1$summary.fixed

##                   mean          sd  0.025quant   0.5quant  0.975quant
## (Intercept)  7.3272231 0.010352538  7.30693254  7.3272231  7.34751375
## PM2.5       -0.0319657 0.001067797 -0.03405855 -0.0319657 -0.02987286
##                   mode          kld
## (Intercept)  7.3272231 2.431918e-11
## PM2.5       -0.0319657 6.114588e-14

formula2 <- O ~ PM2.5
fixed.model2 <- inla(formula2, family="poisson", data=data)

fixed.model2$summary.fixed

##                   mean          sd  0.025quant   0.5quant  0.975quant
## (Intercept)  7.3272231 0.010352538  7.30693254  7.3272231  7.34751375
## PM2.5       -0.0319657 0.001067797 -0.03405855 -0.0319657 -0.02987286
##                   mode          kld
## (Intercept)  7.3272231 2.431918e-11
## PM2.5       -0.0319657 6.114588e-14

Now map spatial effects

# Create colour pallette
colors <- c("#D0F0C0", brewer.pal(4, "Reds"))

# Create buckets for SAR
cases_agg$SAR.cut <- cut(
  cases_agg$SAR,
  breaks = c(min(cases_agg$SAR[!is.na(cases_agg$SAR)]), 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6, max(cases_agg$SAR[!is.na(cases_agg$SAR)])),
)

# Produce spatial map of aggregatged SAR
# map_agg is map joined with cases_agg
map_agg <- left_join(map, cases_agg, by = join_by(lad19cd == Level))

# Plot SAR using ggplot
ggplot() +
  geom_sf(data = map_agg, aes(fill = SAR.cut)) +
  theme_bw() +
  scale_fill_viridis_d() +
  theme(legend.position = "bottom")

# Fit hierarchical Poisson log-linear model in INLA
LA <- seq(1, 316)
formula <- O ~ f(LA, model = "bym2", graph = map.adj, hyper = list(
  prec = list(
    prior = "pc.prec",
    param = c(0.5 / 0.31, 0.01)
  ),
  phi = list(
    prior = "pc",
    param = c(0.5, 2 / 3)
  )
))

sBYM.model <- inla(formula = formula, family = "poisson", data = data, E = E, control.compute = list(waic = TRUE))

# Obtain the posterior summary statistics
summary(sBYM.model, summary.stat = c("mean", "p0.975", "p0.025", "p0.5in", "p0.5out", "waic"))

## 
## Call:
##    c("inla.core(formula = formula, family = family, contrasts = contrasts, 
##    ", " data = data, quantiles = quantiles, E = E, offset = offset, ", " 
##    scale = scale, weights = weights, Ntrials = Ntrials, strata = strata, 
##    ", " lp.scale = lp.scale, link.covariates = link.covariates, verbose = 
##    verbose, ", " lincomb = lincomb, selection = selection, control.compute 
##    = control.compute, ", " control.predictor = control.predictor, 
##    control.family = control.family, ", " control.inla = control.inla, 
##    control.fixed = control.fixed, ", " control.mode = control.mode, 
##    control.expert = control.expert, ", " control.hazard = control.hazard, 
##    control.lincomb = control.lincomb, ", " control.update = 
##    control.update, control.lp.scale = control.lp.scale, ", " 
##    control.pardiso = control.pardiso, only.hyperparam = only.hyperparam, 
##    ", " inla.call = inla.call, inla.arg = inla.arg, num.threads = 
##    num.threads, ", " keep = keep, working.directory = working.directory, 
##    silent = silent, ", " inla.mode = inla.mode, safe = FALSE, debug = 
##    debug, .parent.frame = .parent.frame)" ) 
## Time used:
##     Pre = 4.61, Running = 0.247, Post = 0.0142, Total = 4.87 
## Fixed effects:
##              mean    sd 0.025quant 0.5quant 0.975quant  mode kld
## (Intercept) 0.131 0.004      0.123    0.131       0.14 0.131   0
## 
## Random effects:
##   Name     Model
##     LA BYM2 model
## 
## Model hyperparameters:
##                   mean    sd 0.025quant 0.5quant 0.975quant mode
## Precision for LA 9.801 0.952      7.999    9.776     11.742 8.95
## Phi for LA       0.957 0.036      0.862    0.967      0.995 0.96
## 
## Watanabe-Akaike information criterion (WAIC) ...: 3262.58
## Effective number of parameters .................: 158.88
## 
## Marginal log-Likelihood:  -1968.79 
##  is computed 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Now combine fixed with random spatial effects

LA <- seq(1, 316)
formula <- O ~ PM2.5 + f(LA, model = "bym2", graph = map.adj, hyper = list(
  prec = list(
    prior = "pc.prec",
    param = c(0.6 / 0.31, 0.01)
  ),
  phi = list(
    prior = "pc",
    param = c(0.6, 2 / 3)
  )
))

fixed.sBYM.model <- inla(formula = formula, family = "poisson", data = data, E = E, control.compute = list(waic = TRUE))
summary(fixed.sBYM.model, summary.stat = c("mean", "p0.975", "p0.025", "p0.5in", "p0.5out", "waic"))

## 
## Call:
##    c("inla.core(formula = formula, family = family, contrasts = contrasts, 
##    ", " data = data, quantiles = quantiles, E = E, offset = offset, ", " 
##    scale = scale, weights = weights, Ntrials = Ntrials, strata = strata, 
##    ", " lp.scale = lp.scale, link.covariates = link.covariates, verbose = 
##    verbose, ", " lincomb = lincomb, selection = selection, control.compute 
##    = control.compute, ", " control.predictor = control.predictor, 
##    control.family = control.family, ", " control.inla = control.inla, 
##    control.fixed = control.fixed, ", " control.mode = control.mode, 
##    control.expert = control.expert, ", " control.hazard = control.hazard, 
##    control.lincomb = control.lincomb, ", " control.update = 
##    control.update, control.lp.scale = control.lp.scale, ", " 
##    control.pardiso = control.pardiso, only.hyperparam = only.hyperparam, 
##    ", " inla.call = inla.call, inla.arg = inla.arg, num.threads = 
##    num.threads, ", " keep = keep, working.directory = working.directory, 
##    silent = silent, ", " inla.mode = inla.mode, safe = FALSE, debug = 
##    debug, .parent.frame = .parent.frame)" ) 
## Time used:
##     Pre = 5.58, Running = 0.288, Post = 0.015, Total = 5.88 
## Fixed effects:
##               mean    sd 0.025quant 0.5quant 0.975quant   mode kld
## (Intercept)  0.528 0.221      0.093    0.529      0.960  0.529   0
## PM2.5       -0.041 0.023     -0.086   -0.041      0.004 -0.041   0
## 
## Random effects:
##   Name     Model
##     LA BYM2 model
## 
## Model hyperparameters:
##                    mean    sd 0.025quant 0.5quant 0.975quant  mode
## Precision for LA 10.048 1.055      8.024   10.027     12.170 8.990
## Phi for LA        0.942 0.044      0.829    0.954      0.993 0.943
## 
## Watanabe-Akaike information criterion (WAIC) ...: 3263.55
## Effective number of parameters .................: 159.30
## 
## Marginal log-Likelihood:  -1974.31 
##  is computed 
## Posterior summaries for the linear predictor and the fitted values are computed
## (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Lower Respiratory Tract Infections in England: A Bayesian spatio-temporal analysis

2023-07-05

Data Analysis

Exploratory Analysis

Maps

Analysis