library("ggplot2"); theme_set(theme_bw())
library("dplyr")
library("tidyr")
library("reshape2")
library("fingertipsR")
library("RSQLite")
library("lme4")
library("lmerTest")
library("bbmle")
library("forcats")
library("rgdal")
library("broom")
library("PerformanceAnalytics") # for scatterplot matrix
library("spdep") # for poly2nb
lsoa_demographics <- read.csv(file="data\\lsoa_demographics.csv", stringsAsFactors=F)
lsoa_demographics <- melt(lsoa_demographics, id=1:3, measure=4:7, variable.name="Age_Band", value.name="Female_Population")
# Create age splits for each lsoa
# could be done in more detail, keeping simple for now
lsoa_age_demographics <- lsoa_demographics %>%
group_by(Age_Band, LSOA) %>%
summarise("Female_Population"=sum(Female_Population)) %>%
spread(Age_Band, Female_Population)
# Create ethnicity splits for each lsoa
lsoa_ethnic_demographics <- lsoa_demographics %>%
group_by(Ethnic_Cat, LSOA) %>%
summarise("Female_Population"=sum(Female_Population)) %>%
spread(Ethnic_Cat, Female_Population)
# GP catchment female population for Jan 2017
# From: http://digital.nhs.uk/catalogue/PUB23139
gp_lsoa_catch <- read.csv(file="data\\gp-reg-patients-LSOA-alt-tall.csv", stringsAsFactors=F)
# calculate the total registered female population for each LSOA
LSOA_POP <- gp_lsoa_catch %>%
group_by(LSOA_CODE) %>%
summarise("TOTAL_FEMALE"=sum(FEMALE_PATIENTS))
# Calc the proportion of each lsoa's female pop registered with each practice
gp_lsoa_catch <- gp_lsoa_catch %>%
left_join(LSOA_POP) %>%
mutate("PERCENTAGE_OF_LSOA"=FEMALE_PATIENTS/TOTAL_FEMALE)
## Joining, by = "LSOA_CODE"
# Calc weighted ethnicity
gp_lsoa_catch <- gp_lsoa_catch %>%
left_join(lsoa_ethnic_demographics, by=c("LSOA_CODE"="LSOA"))
gp_lsoa_catch[,9:13] <- (gp_lsoa_catch[,9:13] * gp_lsoa_catch$PERCENTAGE_OF_LSOA) # cols 9:13 are the joined ethnic group counts
# Calc weighted age bands
gp_lsoa_catch <- gp_lsoa_catch %>%
left_join(lsoa_age_demographics, by=c("LSOA_CODE"="LSOA"))
gp_lsoa_catch[,14:17] <- (gp_lsoa_catch[,14:17] * gp_lsoa_catch$PERCENTAGE_OF_LSOA) # cols 14:17 are the joined age band counts
# now group up to practice level
gp_demographics <- gp_lsoa_catch %>%
group_by(PRACTICE_CODE) %>%
summarise_at(.vars=c(8:16), .funs=funs(sum(., na.rm=TRUE))) %>% # positions count the non-grouping columns
mutate("Total_Pop"=`Asian/Asian British` +
`Black/African/Caribbean/Black British` +
`Mixed/multiple ethnic group` +
`Other ethnic group` +
`White`) %>%
mutate(
"Ethn_Asian"=`Asian/Asian British` / Total_Pop * 100,
"Ethn_Black"=`Black/African/Caribbean/Black British` / Total_Pop * 100,
"Ethn_Mixed"=`Mixed/multiple ethnic group` / Total_Pop * 100,
"Ethn_Other"=`Other ethnic group` / Total_Pop * 100,
"Ethn_White"=`White` / Total_Pop * 100) %>%
mutate(
"Age_0_to_24"=Age_0_to_24 / Total_Pop * 100,
"Age_25_to_49"=Age_25_to_49 / Total_Pop * 100,
"Age_50_to_64"=Age_50_to_64 / Total_Pop * 100,
"Age_65_and_over"=Age_65_and_over / Total_Pop * 100
) %>%
select(PRACTICE_CODE, starts_with("Ethn_"),starts_with("Age_"))
# load fingertips data
# IMD 2015
imd_2015 <- fingertips_data(IndicatorID=91872,
AreaTypeID = 7,
stringsAsFactors=F)
imd_2015 <- imd_2015 %>% filter(AreaType=="GP") %>%
select(AreaCode, "IMD_2015"=Value)
# Patient satisfaction with opening hours
satisfied_opening_hours <- fingertips_data(IndicatorID=1942,
AreaTypeID = 7,
stringsAsFactors=F)
satisfied_opening_hours <- satisfied_opening_hours %>%
filter(AreaType=="GP", Timeperiod=="2016/17") %>%
select(AreaCode, "Satisfied_Opening_Hours"=Value)
# Load coverage data
coverage_data <- read.csv(file="data\\Cerv_Cov_MachRead_GP_Q1_1718.csv", stringsAsFactors=F)
coverage_data <- coverage_data %>%
filter(Age=="25_49", Year=="2017/18", Quarter=="Q1") %>%
spread(DataType, Value) %>%
mutate("Coverage"=Screened/Eligible*100)
# drop suppressed values
nrow(coverage_data)
## [1] 7670
# How many records will be dropped?
nrow(coverage_data %>%
filter(is.na(Coverage)))
## [1] 367
coverage_data <- coverage_data %>%
filter(!is.na(Coverage))
nrow(coverage_data)
## [1] 7303
# gp descriptive data
# gp (individual practitioners, egpcur)
gp_info <- read.csv(file="data\\egpcur.csv", stringsAsFactors=FALSE, header=FALSE)
# Count the number of GPs per practice
gp_info <- gp_info %>%
filter(V13=="A") %>% # active GPs only
group_by(V15) %>%
summarise("gp_number"=n()) %>%
select("practice_code"=V15, gp_number)
# gpp (practices, epraccur)
gpp_info <- read.csv(file="data\\epraccur.csv", stringsAsFactors=FALSE, header=FALSE)
gpp_info <- gpp_info %>%
mutate("pcd_spaceless"=gsub(" ","", V10)) %>%
select("practice_code"=V1, pcd_spaceless, "ccg_code" = V15)
# load the spatial lookups
# postcode data
con <- dbConnect(SQLite(), dbname="C:\\Users\\User1\\Documents\\Rstudio_Projects\\ONS_Lookup_Database\\ons_lkp_db")
# dbListTables(con)
pcd <- dbGetQuery(con, "SELECT pcd_spaceless, oseast1m, osnrth1m, lsoa11 FROM ONS_PD")
## Warning in rsqlite_fetch(res@ptr, n = n): Column `oseast1m`: mixed type,
## first seen values of type integer, coercing other values of type string
## Warning in rsqlite_fetch(res@ptr, n = n): Column `osnrth1m`: mixed type,
## first seen values of type integer, coercing other values of type string
# limit the size of the postcode dataframe to only those postcodes in the gp practice data
pcd <- pcd %>% filter(pcd_spaceless %in% unique(gpp_info$pcd_spaceless))
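# The mixed-type warnings above mean some coordinate values may have been
# read in as strings. A minimal precaution (a sketch, not part of the original
# analysis): coerce the coordinate columns to numeric before they are used
# for plotting and distance calculations later on
pcd <- pcd %>%
mutate(oseast1m = as.numeric(oseast1m),
osnrth1m = as.numeric(osnrth1m))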
# rural/urban lsoa classification
# https://ons.maps.arcgis.com/home/item.html?id=9855221596994bde8363a685cb3dd58a
urban_rural <- read.csv(file="data\\RUC11_LSOA11_EW.csv", stringsAsFactors=FALSE)
# combine it all
nrow(coverage_data)
## [1] 7303
coverage_data_supplemented <- coverage_data %>%
inner_join(gp_demographics, by=c("OrganisationCode"="PRACTICE_CODE"))
nrow(coverage_data_supplemented)
## [1] 7296
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(imd_2015, by=c("OrganisationCode"="AreaCode"))
nrow(coverage_data_supplemented)
## [1] 7197
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(satisfied_opening_hours, by=c("OrganisationCode"="AreaCode"))
nrow(coverage_data_supplemented)
## [1] 7196
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(gp_info, by=c("OrganisationCode"="practice_code"))
nrow(coverage_data_supplemented)
## [1] 7190
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(gpp_info, by=c("OrganisationCode"="practice_code"))
nrow(coverage_data_supplemented)
## [1] 7190
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(pcd, by=c("pcd_spaceless"="pcd_spaceless"))
nrow(coverage_data_supplemented)
## [1] 7187
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(urban_rural, by=c("lsoa11"="LSOA11CD"))
nrow(coverage_data_supplemented)
## [1] 7187
coverage_data_supplemented <- coverage_data_supplemented %>%
mutate("osnrth100km"=osnrth1m/100000,
"IMD_2015_Rank"=dense_rank(IMD_2015),
"IMD_2015_Quintile"=ntile(IMD_2015, 5),
"gp_per_1k_eligible_women"=gp_number/Eligible*1000,
"Urban_Rural"=ifelse(substring(RUC11,1,3)=="Urb", "Urban", "Rural"))
# sort out ccg codes!
# add the ons ccg codes based on lsoa to prevent pain and heartache later on
# from https://ons.maps.arcgis.com/home/item.html?id=19e5c35c6a504a7b9e1b74bed1b6225f
ccg_lkp <- read.csv(file="data\\LSOA11_CCG16_LAD16_EN_LU.csv", stringsAsFactors=F)
coverage_data_supplemented <- coverage_data_supplemented %>%
left_join(ccg_lkp, by=c("lsoa11"="LSOA11CD"))
Exploration
# scatterplot matrix
PerformanceAnalytics::chart.Correlation(coverage_data_supplemented[,c(9:18,20,30,31,33)], method="pearson")

ggplot(coverage_data_supplemented, aes(Eligible, Screened, col=Coverage)) +
geom_point(alpha=0.1) +
geom_smooth(method="lm", se=F, col="red") +
#scale_colour_gradient2(mid="#aaaaaa", high="#0571b0", low="#ca0020", midpoint=0.4) +
labs(title="Screened vs ELigible",
x="Eligible",
y="Screened")

ggplot(coverage_data_supplemented, aes(Urban_Rural, Coverage)) +
geom_boxplot(notch=T, fill="light blue") +
coord_flip() +
labs(title="Coverage Compared Across Urban and Rural Practices",
x="Urban or Rural Practice",
y="Coverage (%)")

ggplot(coverage_data_supplemented, aes(IMD_2015_Rank, Coverage)) +
geom_point(alpha=0.1) +
geom_smooth(method="lm", se=F, col="red") +
# facet_wrap(~RUC11) +
labs(title="Coverage by IMD 2015 Rank",
x="IMD 2015 Rank (1 = least deprived)",
y="Coverage (%)")

ggplot(coverage_data_supplemented, aes(osnrth100km, Coverage)) +
geom_point(alpha=0.25)+
facet_wrap(~Urban_Rural) +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Distance North",
x="Distance North from Origin of OSGB36 Grid (100km units)",
y="Coverage (%)")

ggplot(coverage_data_supplemented, aes(Satisfied_Opening_Hours, Coverage)) +
geom_point(alpha=0.3) +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Satisfaction With Opening Hours",
x="Satisfaction With Opening Hours (%)",
y="Coverage (%)")

ethn_plot_data <- gather(coverage_data_supplemented[,c(9:14)], key="Ethnic_Group", value="Percentage", -Coverage)
ggplot(ethn_plot_data, aes(Percentage, Coverage)) +
geom_point(alpha=0.3) +
facet_wrap(~Ethnic_Group, scale="free") +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Proportion of Ethnic Group Registrants (modelled)",
subtitle="Note that axes may vary across sub-plots",
x="Percentage of Registrants of Ethnic Group (%)",
y="Coverage (%)")

age_plot_data <- gather(coverage_data_supplemented[,c(9,15:18)], key="Age_Group", value="Percentage", -Coverage)
ggplot(age_plot_data, aes(Percentage, Coverage)) +
geom_point(alpha=0.3) +
facet_wrap(~Age_Group, scale="free") +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Proportion of Age Group Registrants (modelled)",
subtitle="Note that axes may vary across sub-plots",
x="Percentage of Registrants of Age Group (%)",
y="Coverage (%)")

ccg_sample <- sample(unique(coverage_data_supplemented$CCG16CD),20)
coverage_data_supplemented %>%
filter(CCG16CD %in% ccg_sample) %>%
ggplot(data=., aes(CCG16NM, Coverage)) +
geom_boxplot(fill="light blue") +
coord_flip() +
labs(title="Coverage Distribution by CCG",
subtitle="Based on practices from a random sample of 20 CCGs",
x="CCG Name",
y="Coverage (%)")

Modelling
# take a copy of the main dataset to use for modelling (some variables will be altered)
coverage_data_supplemented2 <- coverage_data_supplemented
# convert percentage variables to proportions by dividing by 100
coverage_data_supplemented2[,9:18] <- coverage_data_supplemented2[,9:18]/100
coverage_data_supplemented2[,10:18] <- as.vector(coverage_data_supplemented2[,10:18])
coverage_data_supplemented2$Satisfied_Opening_Hours <- as.vector(coverage_data_supplemented2$Satisfied_Opening_Hours/100)
# No longer used, make any transformations or scalings here
# coverage_data_supplemented2$IMD_2015_Rank <- as.vector(coverage_data_supplemented2$IMD_2015_Rank)
# coverage_data_supplemented2$osnrth100km <- as.vector(coverage_data_supplemented2$osnrth100km)
# coverage_data_supplemented2$gp_number <- as.vector(coverage_data_supplemented2$gp_number)
# coverage_data_supplemented2$gp_per_1k_eligible_women <- as.vector(coverage_data_supplemented2$gp_per_1k_eligible_women)
# check for multilevel structure
# null multilevel model
fit_null_multi <- glmer(Coverage ~ (1 | CCG16CD),
family=binomial, weights=Eligible,
data=coverage_data_supplemented2)
# null single level model
fit_null_single <- glm(Coverage ~ 1,
family=binomial, weights=Eligible,
data=coverage_data_supplemented2)
# compare AIC
bbmle::AICtab(fit_null_single, fit_null_multi)
## dAIC df
## fit_null_multi 0.0 2
## fit_null_single 134178.9 1
AIC(fit_null_single, fit_null_multi)
## df AIC
## fit_null_single 1 366789.9
## fit_null_multi 2 232610.9
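# A sketch (not in the original analysis): the variance partition coefficient /
# intra-class correlation of the null model can be approximated on the latent
# scale, taking the level-1 variance of a logistic model to be pi^2 / 3
ccg_variance <- as.numeric(VarCorr(fit_null_multi)$CCG16CD)
ccg_variance / (ccg_variance + pi^2 / 3)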
# Likelihood Ratio Testing
# create function to perform likelihood ratio test
# https://stat.ethz.ch/pipermail/r-sig-mixed-models/2008q3/001175.html
lrt <- function (obj1, obj2) {
L0 <- logLik(obj1)
L1 <- logLik(obj2)
L01 <- as.vector(- 2 * (L0 - L1))
df <- attr(L1, "df") - attr(L0, "df")
list(L01 = L01, df = df,
"p-value" = pchisq(L01, df, lower.tail = FALSE))
}
# perform a likelihood ratio test
lrt(fit_null_single,fit_null_multi)
## $L01
## [1] 134180.9
##
## $df
## [1] 1
##
## $`p-value`
## [1] 0
# note that anova() gives identical results
anova(fit_null_multi, fit_null_single)
## Data: coverage_data_supplemented2
## Models:
## fit_null_single: Coverage ~ 1
## fit_null_multi: Coverage ~ (1 | CCG16CD)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit_null_single 1 366790 366797 -183394 366788
## fit_null_multi 2 232611 232625 -116303 232607 134181 1 < 2.2e-16
##
## fit_null_single
## fit_null_multi ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create the final model
fit_2 <- glmer(Coverage ~
Ethn_Asian +
Ethn_Black +
Ethn_Mixed +
Ethn_Other +
Age_25_to_49 +
Age_65_and_over +
IMD_2015_Rank +
Satisfied_Opening_Hours +
gp_per_1k_eligible_women +
osnrth100km +
Urban_Rural +
(1 | CCG16CD),
family=binomial(link=logit),
weights=Eligible,
data=coverage_data_supplemented2)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 1.51356 (tol =
## 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
summary(fit_2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Coverage ~ Ethn_Asian + Ethn_Black + Ethn_Mixed + Ethn_Other +
## Age_25_to_49 + Age_65_and_over + IMD_2015_Rank + Satisfied_Opening_Hours +
## gp_per_1k_eligible_women + osnrth100km + Urban_Rural + (1 |
## CCG16CD)
## Data: coverage_data_supplemented2
## Weights: Eligible
##
## AIC BIC logLik deviance df.resid
## 158775.3 158864.8 -79374.7 158749.3 7174
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -38.524 -2.008 0.177 2.290 19.598
##
## Random effects:
## Groups Name Variance Std.Dev.
## CCG16CD (Intercept) 0.01783 0.1335
## Number of obs: 7187, groups: CCG16CD, 209
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.872e-01 2.540e-02 -11.31 <2e-16 ***
## Ethn_Asian -7.902e-01 1.040e-02 -75.99 <2e-16 ***
## Ethn_Black 1.920e+00 2.631e-02 72.98 <2e-16 ***
## Ethn_Mixed -9.408e+00 1.328e-01 -70.86 <2e-16 ***
## Ethn_Other -3.551e+00 1.079e-01 -32.90 <2e-16 ***
## Age_25_to_49 2.467e+00 3.060e-02 80.63 <2e-16 ***
## Age_65_and_over 2.845e+00 3.361e-02 84.66 <2e-16 ***
## IMD_2015_Rank -5.623e-05 6.394e-07 -87.93 <2e-16 ***
## Satisfied_Opening_Hours 1.258e-01 4.305e-03 29.21 <2e-16 ***
## gp_per_1k_eligible_women 6.884e-04 2.752e-04 2.50 0.0124 *
## osnrth100km 7.492e-02 5.896e-03 12.71 <2e-16 ***
## Urban_RuralUrban -6.430e-02 2.818e-03 -22.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Ethn_A Ethn_B Ethn_M Ethn_O A_25__ A_65__ IMD_20 St_O_H
## Ethn_Asian -0.190
## Ethn_Black -0.084 0.012
## Ethn_Mixed -0.060 0.150 -0.432
## Ethn_Other -0.075 -0.298 -0.154 -0.208
## Age_25_t_49 -0.568 0.171 0.173 -0.191 0.091
## Ag_65_nd_vr -0.505 0.300 0.099 0.187 0.100 0.510
## IMD_2015_Rn -0.065 -0.099 -0.245 -0.127 0.040 0.047 0.184
## Stsfd_Opn_H -0.183 0.124 0.049 -0.051 -0.029 0.066 0.051 0.149
## gp_pr_1k_l_ -0.111 0.036 -0.018 -0.023 -0.018 0.138 -0.101 -0.030 -0.024
## osnrth100km -0.679 0.005 0.021 0.044 0.023 0.034 0.007 -0.091 0.007
## Urbn_RrlUrb -0.092 -0.020 0.110 -0.108 0.004 -0.070 0.011 -0.143 0.113
## g__1__ osn100
## Ethn_Asian
## Ethn_Black
## Ethn_Mixed
## Ethn_Other
## Age_25_t_49
## Ag_65_nd_vr
## IMD_2015_Rn
## Stsfd_Opn_H
## gp_pr_1k_l_
## osnrth100km 0.007
## Urbn_RrlUrb 0.108 0.031
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## Model failed to converge with max|grad| = 1.51356 (tol = 0.001, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
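# The warnings above recommend rescaling. A sketch of one way to do this
# (assuming the column names created earlier; not run as part of the original
# analysis): standardise the continuous predictors with scale() and refit
# fit_2 with the same formula, interpreting each coefficient per standard
# deviation of its predictor
coverage_data_scaled <- coverage_data_supplemented2 %>%
mutate(IMD_2015_Rank = as.numeric(scale(IMD_2015_Rank)),
osnrth100km = as.numeric(scale(osnrth100km)),
gp_per_1k_eligible_women = as.numeric(scale(gp_per_1k_eligible_women)))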
# Use the DHARMa package to examine residuals of the model
library("DHARMa")
## Warning: package 'DHARMa' was built under R version 3.4.3
# https://cran.r-project.org/web/packages/DHARMa/vignettes/DHARMa.html#installing-loading-and-citing-the-package
simulationOutput <- simulateResiduals(fittedModel = fit_2, n = 250)
# slow!
plotSimulatedResiduals(simulationOutput = simulationOutput)

# These lines should be straight, horizontal, and at y-values of 0.25, 0.5 and 0.75.
# Note, however, that some deviations from this are to be expected by chance,
# even for a perfect model, especially if the sample size is small.
# test spatial autocorrelation
# note coordinates are jittered by 1m to account for practices sharing premises
# without jitter the Moran's I function cannot calculate a distance matrix
testSpatialAutocorrelation(simulationOutput = simulationOutput,
x = jitter(coverage_data_supplemented2$oseast1m, amount=1),
y = jitter(coverage_data_supplemented2$osnrth1m, amount=1),
plot=TRUE)

##
## DHARMa Moran's I test for spatial autocorrelation
##
## data: simulationOutput
## observed = 0.01367800, expected = -0.00013916, sd = 0.00583540,
## p-value = 0.01789
## alternative hypothesis: Spatial autocorrelation
# The significant Moran's I above suggests some residual spatial autocorrelation,
# which is explored by mapping the residuals below
# Add the residuals and the fitted values back to the main dataframe
fitted_values <- fitted(fit_2)
residual <- residuals(fit_2) # deviance residuals, the default for glmerMod
coverage_data_supplemented2$fitted_values <- fitted_values
coverage_data_supplemented2$residual <- residual
# Add residual quintiles
coverage_data_supplemented2 <- coverage_data_supplemented2 %>%
mutate("residual_quintile"=ntile(residual,5))
# residual vs fitted plot
ggplot(coverage_data_supplemented2, aes(fitted_values, residual)) +
geom_point(alpha=0.2) +
geom_smooth(method="lm", col="red", size=1, se=FALSE)

# Map of practices, coloured by coverage above/below the median
ggplot(coverage_data_supplemented2, aes(oseast1m, osnrth1m, col=factor(ntile(Coverage,2)))) +
geom_point(alpha=0.25) +
coord_fixed() +
theme_void()

# Residuals map
ggplot(coverage_data_supplemented2, aes(oseast1m, osnrth1m, col=residual)) +
geom_point(alpha=0.75) +
coord_fixed() +
facet_wrap(~residual_quintile) +
scale_colour_gradient2(midpoint=0, mid="grey") +
theme_void()

# caterpillar plot
random_effects <- ranef(fit_2)
intercept_variances <- attr(ranef(fit_2, postVar = TRUE)[[1]], "postVar")
## Warning in ranef.merMod(fit_2, postVar = TRUE): 'postVar' is deprecated:
## please use 'condVar' instead
random_intercept <- random_effects$CCG16CD
caterpillar_data <- data.frame(
"intercepts"=random_effects$CCG16CD[,1],
"sd_intercept"=2*sqrt(intercept_variances[,,1:length(intercept_variances)]),
"CCG16CD"=rownames(random_intercept)
)
# add in ccg names
ccg_names <- ccg_lkp %>% group_by(CCG16CD, CCG16NM) %>% summarise()
# add in categorisation for colouring ggplot
caterpillar_data <- caterpillar_data %>%
inner_join(ccg_names, by=c("CCG16CD"="CCG16CD")) %>%
mutate(
"lower_limit"=intercepts-sd_intercept,
"upper_limit"=intercepts+sd_intercept,
"performance_cat"=ifelse(lower_limit>0,"high",
ifelse(upper_limit<0, "low", "average")),
"CCG16NM"=factor(CCG16NM))
## Warning: Column `CCG16CD` joining factor and character vector, coercing
## into character vector
# reorder the ccg names factor
caterpillar_data$CCG16NM <- fct_reorder(caterpillar_data$CCG16NM, caterpillar_data$intercepts)
# Caterpillar plot
ggplot(caterpillar_data,
aes(CCG16NM,
intercepts)) +
geom_hline(yintercept=0) +
geom_errorbar(aes(ymin=lower_limit,
ymax=upper_limit),
width=0,
color="black") +
geom_point(aes(col=performance_cat),
size=2) +
scale_colour_manual(values=c("#aaaaaa", "#0571b0", "#ca0020")) +
guides(size=FALSE,
shape=FALSE) +
xlab("Levels") +
ylab("") +
theme(axis.text.x=element_text(size=rel(1.2)),
axis.title.x=element_text(size=rel(1.3)),
axis.text.y=element_text(size=rel(1.2)),
panel.grid.minor=element_blank(),
panel.grid.major.x=element_blank()) +
coord_flip()

# Histogram of random effects
ggplot(caterpillar_data,
aes(intercepts)) +
geom_histogram(bins=60)

{qqnorm(caterpillar_data$intercepts);qqline(caterpillar_data$intercepts)}

# Mapping the random effects
path <- ".\\shp\\CCG_April_2016_UGC_V4"
ccg_shp <- readOGR(dsn=path,
layer="Clinical_Commissioning_Groups_April_2016_Generalised_Clipped_Boundaries_in_England",
stringsAsFactors=FALSE)
## OGR data source with driver: ESRI Shapefile
## Source: ".\shp\CCG_April_2016_UGC_V4", layer: "Clinical_Commissioning_Groups_April_2016_Generalised_Clipped_Boundaries_in_England"
## with 209 features
## It has 5 fields
## Integer64 fields read as strings: objectid
ccg_shp <- sp::merge(ccg_shp,
caterpillar_data,
by.x="ccg16cd",
by.y="CCG16CD",
sort = FALSE)
# define a function to handle tidying (which used to be called fortifying) and then joining the data items back in
clean <- function(shape){
shape@data$id <- rownames(shape@data)
shape.points <- tidy(shape, region="id")
shape.df <- inner_join(shape.points, shape@data, by="id")
shape.df
}
ccg_shp_tidy <- clean(ccg_shp)
ggplot(ccg_shp_tidy, aes(long, lat, fill=intercepts, group=group)) +
geom_polygon(col="black", size=0.05) +
coord_fixed() +
scale_fill_gradient2(midpoint=0, low="#ca0020", high="#0571b0") +
facet_wrap(~performance_cat)

ggplot(ccg_shp_tidy, aes(long, lat, fill=performance_cat, group=group)) +
geom_polygon(col="white", size=0.05) +
coord_fixed() +
scale_fill_manual(values=c("#aaaaaa", "#0571b0", "#ca0020")) +
theme_void()

# check for clustering of random effects
neighbourhood <- spdep::poly2nb(ccg_shp, queen=TRUE)
{
par(mar=c(0,0,0,0))
plot(ccg_shp,
border="grey")
plot(neighbourhood,
coords=coordinates(ccg_shp),
col="red",
add=T)
}

# Now create a neighbourhood weights matrix
neighbourhood_weights_list <- nb2listw(neighbourhood, style="W", zero.policy=TRUE)
# and run the Moran's I test
moran.test(ccg_shp$intercepts,neighbourhood_weights_list, zero.policy=T)
##
## Moran I test under randomisation
##
## data: ccg_shp$intercepts
## weights: neighbourhood_weights_list
##
## Moran I statistic standard deviate = 9.7897, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.448372814 -0.004830918 0.002143131
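# A Moran's I of 0.45 (p < 2.2e-16) indicates strong positive spatial
# clustering of the CCG-level random effects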