library("ggplot2"); theme_set(theme_bw())
library("dplyr")
library("tidyr")
library("reshape2")
library("fingertipsR")
library("RSQLite")
library("lme4")
library("lmerTest")
library("bbmle")
library("forcats")
library("rgdal")
library("broom")
library("PerformanceAnalytics") # for scatterplot matrix
library("spdep") # for poly2nb
lsoa_demographics <- read.csv(file="data\\lsoa_demographics.csv", stringsAsFactors=F)
lsoa_demographics <- melt(lsoa_demographics, id=1:3, measure=4:7, variable.name="Age_Band", value.name="Female_Population")
# Create age splits for each lsoa
# could be done in more detail, keeping simple for now
lsoa_age_demographics <- lsoa_demographics %>%
group_by(Age_Band, LSOA) %>%
summarise("Female_Population"=sum(Female_Population)) %>%
spread(Age_Band, Female_Population)
# Create ethnicity splits for each lsoa
lsoa_ethnic_demographics <- lsoa_demographics %>%
group_by(Ethnic_Cat, LSOA) %>%
summarise("Female_Population"=sum(Female_Population)) %>%
spread(Ethnic_Cat, Female_Population)
# GP catchment female population for Jan 2017
# From: http://digital.nhs.uk/catalogue/PUB23139
gp_lsoa_catch <- read.csv(file="data\\gp-reg-patients-LSOA-alt-tall.csv", stringsAsFactors=F)
# calculate the total registered female population for each LSOA
LSOA_POP <- gp_lsoa_catch %>%
group_by(LSOA_CODE) %>%
summarise("TOTAL_FEMALE"=sum(FEMALE_PATIENTS))
# Calc the proportion of each lsoa's female pop registered with each practice
gp_lsoa_catch <- gp_lsoa_catch %>%
left_join(LSOA_POP) %>%
mutate("PERCENTAGE_OF_LSOA"=FEMALE_PATIENTS/TOTAL_FEMALE)
## Joining, by = "LSOA_CODE"
# Calc weighted ethnicity
gp_lsoa_catch <- gp_lsoa_catch %>%
left_join(lsoa_ethnic_demographics, by=c("LSOA_CODE"="LSOA"))
gp_lsoa_catch[,9:13] <- (gp_lsoa_catch[,9:13] * gp_lsoa_catch$PERCENTAGE_OF_LSOA) # cols 9:13 are the joined ethnic group counts
# Calc weighted age bands
gp_lsoa_catch <- gp_lsoa_catch %>%
left_join(lsoa_age_demographics, by=c("LSOA_CODE"="LSOA"))
gp_lsoa_catch[,14:17] <- (gp_lsoa_catch[,14:17] * gp_lsoa_catch$PERCENTAGE_OF_LSOA) # cols 14:17 are the joined age band counts
# now group up to practice level
gp_demographics <- gp_lsoa_catch %>%
group_by(PRACTICE_CODE) %>%
summarise_at(.vars=c(8:16), .funs=funs(sum(., na.rm=TRUE))) %>% # positions count the non-grouping columns
mutate("Total_Pop"=`Asian/Asian British` +
`Black/African/Caribbean/Black British` +
`Mixed/multiple ethnic group` +
`Other ethnic group` +
`White`) %>%
mutate(
"Ethn_Asian"=`Asian/Asian British` / Total_Pop * 100,
"Ethn_Black"=`Black/African/Caribbean/Black British` / Total_Pop * 100,
"Ethn_Mixed"=`Mixed/multiple ethnic group` / Total_Pop * 100,
"Ethn_Other"=`Other ethnic group` / Total_Pop * 100,
"Ethn_White"=`White` / Total_Pop * 100) %>%
mutate(
"Age_0_to_24"=Age_0_to_24 / Total_Pop * 100,
"Age_25_to_49"=Age_25_to_49 / Total_Pop * 100,
"Age_50_to_64"=Age_50_to_64 / Total_Pop * 100,
"Age_65_and_over"=Age_65_and_over / Total_Pop * 100
) %>%
select(PRACTICE_CODE, starts_with("Ethn_"),starts_with("Age_"))
# load fingertips data
# IMD 2015
imd_2015 <- fingertips_data(IndicatorID=91872,
AreaTypeID = 7,
stringsAsFactors=F)
imd_2015 <- imd_2015 %>% filter(AreaType=="GP") %>%
select(AreaCode, "IMD_2015"=Value)
# Patient satisfaction with opening hours
satisfied_opening_hours <- fingertips_data(IndicatorID=1942,
AreaTypeID = 7,
stringsAsFactors=F)
satisfied_opening_hours <- satisfied_opening_hours %>%
filter(AreaType=="GP", Timeperiod=="2016/17") %>%
select(AreaCode, "Satisfied_Opening_Hours"=Value)
# Load coverage data
coverage_data <- read.csv(file="data\\Cerv_Cov_MachRead_GP_Q1_1718.csv", stringsAsFactors=F)
coverage_data <- coverage_data %>%
filter(Age=="25_49", Year=="2017/18", Quarter=="Q1") %>%
spread(DataType, Value) %>%
mutate("Coverage"=Screened/Eligible*100)
# drop suppressed values
nrow(coverage_data)
## [1] 7670
# How many records will be dropped?
nrow(coverage_data %>%
filter(is.na(Coverage)))
## [1] 367
coverage_data <- coverage_data %>%
filter(!is.na(Coverage))
nrow(coverage_data)
## [1] 7303
# gp descriptive data
# gp (individual practitioners, egpcur)
gp_info <- read.csv(file="data\\egpcur.csv", stringsAsFactors=FALSE, header=FALSE)
# Count the number of GPs per practice
gp_info <- gp_info %>%
filter(V13=="A") %>% # active GPs only
group_by(V15) %>%
summarise("gp_number"=n()) %>%
select("practice_code"=V15, gp_number)
# gpp (practices, epraccur)
gpp_info <- read.csv(file="data\\epraccur.csv", stringsAsFactors=FALSE, header=FALSE)
gpp_info <- gpp_info %>%
mutate("pcd_spaceless"=gsub(" ","", V10)) %>%
select("practice_code"=V1, pcd_spaceless, "ccg_code" = V15)
# load the spatial lookups
# postcode data
con <- dbConnect(SQLite(), dbname="C:\\Users\\User1\\Documents\\Rstudio_Projects\\ONS_Lookup_Database\\ons_lkp_db")
# dbListTables(con)
pcd <- dbGetQuery(con, "SELECT pcd_spaceless, oseast1m, osnrth1m, lsoa11 FROM ONS_PD")
## Warning in rsqlite_fetch(res@ptr, n = n): Column `oseast1m`: mixed type,
## first seen values of type integer, coercing other values of type string
## Warning in rsqlite_fetch(res@ptr, n = n): Column `osnrth1m`: mixed type,
## first seen values of type integer, coercing other values of type string
# limit the size of the postcode dataframe to only those postcodes in the gp practice data
pcd <- pcd %>% filter(pcd_spaceless %in% unique(gpp_info$pcd_spaceless))
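# The mixed-type warnings above mean some coordinate values may have been
# read in as strings. A minimal precaution (a sketch, not part of the original
# analysis): coerce the coordinate columns to numeric before they are used
# for plotting and distance calculations later on
pcd <- pcd %>%
mutate(oseast1m = as.numeric(oseast1m),
osnrth1m = as.numeric(osnrth1m))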
# rural/urban lsoa classification
# https://ons.maps.arcgis.com/home/item.html?id=9855221596994bde8363a685cb3dd58a
urban_rural <- read.csv(file="data\\RUC11_LSOA11_EW.csv", stringsAsFactors=FALSE)
# combine it all
nrow(coverage_data)
## [1] 7303
coverage_data_supplemented <- coverage_data %>%
inner_join(gp_demographics, by=c("OrganisationCode"="PRACTICE_CODE"))
nrow(coverage_data_supplemented)
## [1] 7296
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(imd_2015, by=c("OrganisationCode"="AreaCode"))
nrow(coverage_data_supplemented)
## [1] 7197
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(satisfied_opening_hours, by=c("OrganisationCode"="AreaCode"))
nrow(coverage_data_supplemented)
## [1] 7196
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(gp_info, by=c("OrganisationCode"="practice_code"))
nrow(coverage_data_supplemented)
## [1] 7190
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(gpp_info, by=c("OrganisationCode"="practice_code"))
nrow(coverage_data_supplemented)
## [1] 7190
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(pcd, by=c("pcd_spaceless"="pcd_spaceless"))
nrow(coverage_data_supplemented)
## [1] 7187
coverage_data_supplemented <- coverage_data_supplemented %>%
inner_join(urban_rural, by=c("lsoa11"="LSOA11CD"))
nrow(coverage_data_supplemented)
## [1] 7187
coverage_data_supplemented <- coverage_data_supplemented %>%
mutate("osnrth100km"=osnrth1m/100000,
"IMD_2015_Rank"=dense_rank(IMD_2015),
"IMD_2015_Quintile"=ntile(IMD_2015, 5),
"gp_per_1k_eligible_women"=gp_number/Eligible*1000,
"Urban_Rural"=ifelse(substring(RUC11,1,3)=="Urb", "Urban", "Rural"))
# sort out ccg codes!
# add the ons ccg codes based on lsoa to prevent pain and heartache later on
# from https://ons.maps.arcgis.com/home/item.html?id=19e5c35c6a504a7b9e1b74bed1b6225f
ccg_lkp <- read.csv(file="data\\LSOA11_CCG16_LAD16_EN_LU.csv", stringsAsFactors=F)
coverage_data_supplemented <- coverage_data_supplemented %>%
left_join(ccg_lkp, by=c("lsoa11"="LSOA11CD"))
Exploration
# scatterplot matrix
PerformanceAnalytics::chart.Correlation(coverage_data_supplemented[,c(9:18,20,30,31,33)], method="pearson")

ggplot(coverage_data_supplemented, aes(Eligible, Screened, col=Coverage)) +
geom_point(alpha=0.1) +
geom_smooth(method="lm", se=F, col="red") +
#scale_colour_gradient2(mid="#aaaaaa", high="#0571b0", low="#ca0020", midpoint=0.4) +
labs(title="Screened vs ELigible",
x="Eligible",
y="Screened")

ggplot(coverage_data_supplemented, aes(Urban_Rural, Coverage)) +
geom_boxplot(notch=T, fill="light blue") +
coord_flip() +
labs(title="Coverage Compared Across Urban and Rural Practices",
x="Urban or Rural Practice",
y="Coverage (%)")

ggplot(coverage_data_supplemented, aes(IMD_2015_Rank, Coverage)) +
geom_point(alpha=0.1) +
geom_smooth(method="lm", se=F, col="red") +
# facet_wrap(~RUC11) +
labs(title="Coverage by IMD 2015 Rank",
x="IMD 2015 Rank (1 = least deprived)",
y="Coverage (%)")

ggplot(coverage_data_supplemented, aes(osnrth100km, Coverage)) +
geom_point(alpha=0.25)+
facet_wrap(~Urban_Rural) +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Distance North",
x="Distance North from Origin of OSGB36 Grid (100km units)",
y="Coverage (%)")

ggplot(coverage_data_supplemented, aes(Satisfied_Opening_Hours, Coverage)) +
geom_point(alpha=0.3) +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Satisfaction With Opening Hours",
x="Satisfaction With Opening Hours (%)",
y="Coverage (%)")

ethn_plot_data <- gather(coverage_data_supplemented[,c(9:14)], key="Ethnic_Group", value="Percentage", -Coverage)
ggplot(ethn_plot_data, aes(Percentage, Coverage)) +
geom_point(alpha=0.3) +
facet_wrap(~Ethnic_Group, scale="free") +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Proportion of Ethnic Group Registrants (modelled)",
subtitle="Note that axes may vary across sub-plots",
x="Percentage of Registrants of Ethnic Group (%)",
y="Coverage (%)")

age_plot_data <- gather(coverage_data_supplemented[,c(9,15:18)], key="Age_Group", value="Percentage", -Coverage)
ggplot(age_plot_data, aes(Percentage, Coverage)) +
geom_point(alpha=0.3) +
facet_wrap(~Age_Group, scale="free") +
geom_smooth(method="lm", se=F, col="red") +
labs(title="Coverage by Proportion of Age Group Registrants (modelled)",
subtitle="Note that axes may vary across sub-plots",
x="Percentage of Registrants of Age Group (%)",
y="Coverage (%)")

ccg_sample <- sample(unique(coverage_data_supplemented$CCG16CD),20)
coverage_data_supplemented %>%
filter(CCG16CD %in% ccg_sample) %>%
ggplot(data=., aes(CCG16NM, Coverage)) +
geom_boxplot(fill="light blue") +
coord_flip() +
labs(title="Coverage Distribution by CCG",
subtitle="Based on practices from a random sample of 20 CCGs",
x="CCG Name",
y="Coverage (%)")

Modelling
# take a copy of the main dataset to use for modelling (some variables will be altered)
coverage_data_supplemented2 <- coverage_data_supplemented
# convert percentage variables to proportions by dividing by 100
coverage_data_supplemented2[,9:18] <- coverage_data_supplemented2[,9:18]/100
coverage_data_supplemented2[,10:18] <- as.vector(coverage_data_supplemented2[,10:18])
coverage_data_supplemented2$Satisfied_Opening_Hours <- as.vector(coverage_data_supplemented2$Satisfied_Opening_Hours/100)
# No longer used, make any transformations or scalings here
# coverage_data_supplemented2$IMD_2015_Rank <- as.vector(coverage_data_supplemented2$IMD_2015_Rank)
# coverage_data_supplemented2$osnrth100km <- as.vector(coverage_data_supplemented2$osnrth100km)
# coverage_data_supplemented2$gp_number <- as.vector(coverage_data_supplemented2$gp_number)
# coverage_data_supplemented2$gp_per_1k_eligible_women <- as.vector(coverage_data_supplemented2$gp_per_1k_eligible_women)
# check for multilevel structure
# null multilevel model
fit_null_multi <- glmer(Coverage ~ (1 | CCG16CD),
family=binomial, weights=Eligible,
data=coverage_data_supplemented2)
# null single level model
fit_null_single <- glm(Coverage ~ 1,
family=binomial, weights=Eligible,
data=coverage_data_supplemented2)
# compare AIC
bbmle::AICtab(fit_null_single, fit_null_multi)
## dAIC df
## fit_null_multi 0.0 2
## fit_null_single 134178.9 1
AIC(fit_null_single, fit_null_multi)
## df AIC
## fit_null_single 1 366789.9
## fit_null_multi 2 232610.9
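# A sketch (not in the original analysis): the variance partition coefficient /
# intra-class correlation of the null model can be approximated on the latent
# scale, taking the level-1 variance of a logistic model to be pi^2 / 3
ccg_variance <- as.numeric(VarCorr(fit_null_multi)$CCG16CD)
ccg_variance / (ccg_variance + pi^2 / 3)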
# Likelihood Ratio Testing
# create function to perform likelihood ratio test
# https://stat.ethz.ch/pipermail/r-sig-mixed-models/2008q3/001175.html
lrt <- function (obj1, obj2) {
L0 <- logLik(obj1)
L1 <- logLik(obj2)
L01 <- as.vector(- 2 * (L0 - L1))
df <- attr(L1, "df") - attr(L0, "df")
list(L01 = L01, df = df,
"p-value" = pchisq(L01, df, lower.tail = FALSE))
}
# perform a likelihood ratio test
lrt(fit_null_single,fit_null_multi)
## $L01
## [1] 134180.9
##
## $df
## [1] 1
##
## $`p-value`
## [1] 0
# note that anova() gives identical results
anova(fit_null_multi, fit_null_single)
## Data: coverage_data_supplemented2
## Models:
## fit_null_single: Coverage ~ 1
## fit_null_multi: Coverage ~ (1 | CCG16CD)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit_null_single 1 366790 366797 -183394 366788
## fit_null_multi 2 232611 232625 -116303 232607 134181 1 < 2.2e-16
##
## fit_null_single
## fit_null_multi ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create the final model
fit_2 <- glmer(Coverage ~
Ethn_Asian +
Ethn_Black +
Ethn_Mixed +
Ethn_Other +
Age_25_to_49 +
Age_65_and_over +
IMD_2015_Rank +
Satisfied_Opening_Hours +
gp_per_1k_eligible_women +
osnrth100km +
Urban_Rural +
(1 | CCG16CD),
family=binomial(link=logit),
weights=Eligible,
data=coverage_data_supplemented2)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 1.51356 (tol =
## 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
summary(fit_2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: Coverage ~ Ethn_Asian + Ethn_Black + Ethn_Mixed + Ethn_Other +
## Age_25_to_49 + Age_65_and_over + IMD_2015_Rank + Satisfied_Opening_Hours +
## gp_per_1k_eligible_women + osnrth100km + Urban_Rural + (1 |
## CCG16CD)
## Data: coverage_data_supplemented2
## Weights: Eligible
##
## AIC BIC logLik deviance df.resid
## 158775.3 158864.8 -79374.7 158749.3 7174
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -38.524 -2.008 0.177 2.290 19.598
##
## Random effects:
## Groups Name Variance Std.Dev.
## CCG16CD (Intercept) 0.01783 0.1335
## Number of obs: 7187, groups: CCG16CD, 209
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.872e-01 2.540e-02 -11.31 <2e-16 ***
## Ethn_Asian -7.902e-01 1.040e-02 -75.99 <2e-16 ***
## Ethn_Black 1.920e+00 2.631e-02 72.98 <2e-16 ***
## Ethn_Mixed -9.408e+00 1.328e-01 -70.86 <2e-16 ***
## Ethn_Other -3.551e+00 1.079e-01 -32.90 <2e-16 ***
## Age_25_to_49 2.467e+00 3.060e-02 80.63 <2e-16 ***
## Age_65_and_over 2.845e+00 3.361e-02 84.66 <2e-16 ***
## IMD_2015_Rank -5.623e-05 6.394e-07 -87.93 <2e-16 ***
## Satisfied_Opening_Hours 1.258e-01 4.305e-03 29.21 <2e-16 ***
## gp_per_1k_eligible_women 6.884e-04 2.752e-04 2.50 0.0124 *
## osnrth100km 7.492e-02 5.896e-03 12.71 <2e-16 ***
## Urban_RuralUrban -6.430e-02 2.818e-03 -22.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Ethn_A Ethn_B Ethn_M Ethn_O A_25__ A_65__ IMD_20 St_O_H
## Ethn_Asian -0.190
## Ethn_Black -0.084 0.012
## Ethn_Mixed -0.060 0.150 -0.432
## Ethn_Other -0.075 -0.298 -0.154 -0.208
## Age_25_t_49 -0.568 0.171 0.173 -0.191 0.091
## Ag_65_nd_vr -0.505 0.300 0.099 0.187 0.100 0.510
## IMD_2015_Rn -0.065 -0.099 -0.245 -0.127 0.040 0.047 0.184
## Stsfd_Opn_H -0.183 0.124 0.049 -0.051 -0.029 0.066 0.051 0.149
## gp_pr_1k_l_ -0.111 0.036 -0.018 -0.023 -0.018 0.138 -0.101 -0.030 -0.024
## osnrth100km -0.679 0.005 0.021 0.044 0.023 0.034 0.007 -0.091 0.007
## Urbn_RrlUrb -0.092 -0.020 0.110 -0.108 0.004 -0.070 0.011 -0.143 0.113
## g__1__ osn100
## Ethn_Asian
## Ethn_Black
## Ethn_Mixed
## Ethn_Other
## Age_25_t_49
## Ag_65_nd_vr
## IMD_2015_Rn
## Stsfd_Opn_H
## gp_pr_1k_l_
## osnrth100km 0.007
## Urbn_RrlUrb 0.108 0.031
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## Model failed to converge with max|grad| = 1.51356 (tol = 0.001, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
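# The warnings above recommend rescaling. A sketch of one way to do this
# (assuming the column names created earlier; not run as part of the original
# analysis): standardise the continuous predictors with scale() and refit
# fit_2 with the same formula, interpreting each coefficient per standard
# deviation of its predictor
coverage_data_scaled <- coverage_data_supplemented2 %>%
mutate(IMD_2015_Rank = as.numeric(scale(IMD_2015_Rank)),
osnrth100km = as.numeric(scale(osnrth100km)),
gp_per_1k_eligible_women = as.numeric(scale(gp_per_1k_eligible_women)))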
# Use the DHARMa package to examine residuals of the model
library("DHARMa")
## Warning: package 'DHARMa' was built under R version 3.4.3
# https://cran.r-project.org/web/packages/DHARMa/vignettes/DHARMa.html#installing-loading-and-citing-the-package
simulationOutput <- simulateResiduals(fittedModel = fit_2, n = 250)
# slow!
plotSimulatedResiduals(simulationOutput = simulationOutput)

# These lines should be straight, horizontal, and at y-values of 0.25, 0.5 and 0.75.
# Note, however, that some deviations from this are to be expected by chance,
# even for a perfect model, especially if the sample size is small.
# test spatial autocorrelation
# note coordinates are jittered by 1m to account for practices sharing premises
# without jitter the Moran's I function cannot calculate a distance matrix
testSpatialAutocorrelation(simulationOutput = simulationOutput,
x = jitter(coverage_data_supplemented2$oseast1m, amount=1),
y = jitter(coverage_data_supplemented2$osnrth1m, amount=1),
plot=TRUE)

##
## DHARMa Moran's I test for spatial autocorrelation
##
## data: simulationOutput
## observed = 0.01367800, expected = -0.00013916, sd = 0.00583540,
## p-value = 0.01789
## alternative hypothesis: Spatial autocorrelation
# The significant Moran's I above suggests some residual spatial autocorrelation,
# which is explored by mapping the residuals below
# Add the residuals and the fitted values back to the main dataframe
fitted_values <- fitted(fit_2)
residual <- residuals(fit_2) # deviance residuals, the default for glmerMod
coverage_data_supplemented2$fitted_values <- fitted_values
coverage_data_supplemented2$residual <- residual
# Add residual quintiles
coverage_data_supplemented2 <- coverage_data_supplemented2 %>%
mutate("residual_quintile"=ntile(residual,5))
# residual vs fitted plot
ggplot(coverage_data_supplemented2, aes(fitted_values, residual)) +
geom_point(alpha=0.2) +
geom_smooth(method="lm", col="red", size=1, se=FALSE)

# Map of practices, coloured by coverage above/below the median
ggplot(coverage_data_supplemented2, aes(oseast1m, osnrth1m, col=factor(ntile(Coverage,2)))) +
geom_point(alpha=0.25) +
coord_fixed() +
theme_void()

# Residuals map
ggplot(coverage_data_supplemented2, aes(oseast1m, osnrth1m, col=residual)) +
geom_point(alpha=0.75) +
coord_fixed() +
facet_wrap(~residual_quintile) +
scale_colour_gradient2(midpoint=0, mid="grey") +
theme_void()

# caterpillar plot
random_effects <- ranef(fit_2)
intercept_variances <- attr(ranef(fit_2, postVar = TRUE)[[1]], "postVar")
## Warning in ranef.merMod(fit_2, postVar = TRUE): 'postVar' is deprecated:
## please use 'condVar' instead
random_intercept <- random_effects$CCG16CD
caterpillar_data <- data.frame(
"intercepts"=random_effects$CCG16CD[,1],
"sd_intercept"=2*sqrt(intercept_variances[,,1:length(intercept_variances)]),
"CCG16CD"=rownames(random_intercept)
)
# add in ccg names
ccg_names <- ccg_lkp %>% group_by(CCG16CD, CCG16NM) %>% summarise()
# add in categorisation for colouring ggplot
caterpillar_data <- caterpillar_data %>%
inner_join(ccg_names, by=c("CCG16CD"="CCG16CD")) %>%
mutate(
"lower_limit"=intercepts-sd_intercept,
"upper_limit"=intercepts+sd_intercept,
"performance_cat"=ifelse(lower_limit>0,"high",
ifelse(upper_limit<0, "low", "average")),
"CCG16NM"=factor(CCG16NM))
## Warning: Column `CCG16CD` joining factor and character vector, coercing
## into character vector
# reorder the ccg names factor
caterpillar_data$CCG16NM <- fct_reorder(caterpillar_data$CCG16NM, caterpillar_data$intercepts)
# Caterpillar plot
ggplot(caterpillar_data,
aes(CCG16NM,
intercepts)) +
geom_hline(yintercept=0) +
geom_errorbar(aes(ymin=lower_limit,
ymax=upper_limit),
width=0,
color="black") +
geom_point(aes(col=performance_cat),
size=2) +
scale_colour_manual(values=c("#aaaaaa", "#0571b0", "#ca0020")) +
guides(size=FALSE,
shape=FALSE) +
xlab("Levels") +
ylab("") +
theme(axis.text.x=element_text(size=rel(1.2)),
axis.title.x=element_text(size=rel(1.3)),
axis.text.y=element_text(size=rel(1.2)),
panel.grid.minor=element_blank(),
panel.grid.major.x=element_blank()) +
coord_flip()

# Histogram of random effects
ggplot(caterpillar_data,
aes(intercepts)) +
geom_histogram(bins=60)

{qqnorm(caterpillar_data$intercepts);qqline(caterpillar_data$intercepts)}

# Mapping the random effects
path <- ".\\shp\\CCG_April_2016_UGC_V4"
ccg_shp <- readOGR(dsn=path,
layer="Clinical_Commissioning_Groups_April_2016_Generalised_Clipped_Boundaries_in_England",
stringsAsFactors=FALSE)
## OGR data source with driver: ESRI Shapefile
## Source: ".\shp\CCG_April_2016_UGC_V4", layer: "Clinical_Commissioning_Groups_April_2016_Generalised_Clipped_Boundaries_in_England"
## with 209 features
## It has 5 fields
## Integer64 fields read as strings: objectid
ccg_shp <- sp::merge(ccg_shp,
caterpillar_data,
by.x="ccg16cd",
by.y="CCG16CD",
sort = FALSE)
# define a function to handle tidying (which used to be called fortifying) and then joining the data items back in
clean <- function(shape){
shape@data$id <- rownames(shape@data)
shape.points <- tidy(shape, region="id")
shape.df <- inner_join(shape.points, shape@data, by="id")
shape.df
}
ccg_shp_tidy <- clean(ccg_shp)
ggplot(ccg_shp_tidy, aes(long, lat, fill=intercepts, group=group)) +
geom_polygon(col="black", size=0.05) +
coord_fixed() +
scale_fill_gradient2(midpoint=0, low="#ca0020", high="#0571b0") +
facet_wrap(~performance_cat)

ggplot(ccg_shp_tidy, aes(long, lat, fill=performance_cat, group=group)) +
geom_polygon(col="white", size=0.05) +
coord_fixed() +
scale_fill_manual(values=c("#aaaaaa", "#0571b0", "#ca0020")) +
theme_void()

# check for clustering of random effects
neighbourhood <- spdep::poly2nb(ccg_shp, queen=TRUE)
{
par(mar=c(0,0,0,0))
plot(ccg_shp,
border="grey")
plot(neighbourhood,
coords=coordinates(ccg_shp),
col="red",
add=T)
}

# Now create a neighbourhood weights matrix
neighbourhood_weights_list <- nb2listw(neighbourhood, style="W", zero.policy=TRUE)
# and run the Moran's I test
moran.test(ccg_shp$intercepts,neighbourhood_weights_list, zero.policy=T)
##
## Moran I test under randomisation
##
## data: ccg_shp$intercepts
## weights: neighbourhood_weights_list
##
## Moran I statistic standard deviate = 9.7897, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.448372814 -0.004830918 0.002143131
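# A Moran's I of 0.45 (p < 2.2e-16) indicates strong positive spatial
# clustering of the CCG-level random effects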