1 Set up
2 Get data
- 2.1 Join data
- 2.2 Fix CCG coding
3 Exploration
4 Modelling
- 4.1 Prep data
- 4.2 Summary stats table
5 Sumary statistics table
- 5.1 Check multilevel structure
6 Build GLMM model
- 6.1 Map residuals too
- 6.2 Extract model information
7 Inspect random effects
8 Inspect coefficients
9 Spatial analysis
- 9.1 Local Moran’s I

1 Set up

library("knitr")
library("ggplot2"); theme_set(theme_bw())
library("dplyr")
library("tidyr")
library("reshape2")
library("fingertipsR")
library("RSQLite")
library("glmmTMB")
library("lme4")
library("lmerTest")
library("bbmle")
library("forcats")
library("rgdal")
library("rgeos")
library("maptools")
library("broom")
library("PerformanceAnalytics") # for scatterplot matrix
library("spdep") # for poly2nb
library("ape") # for moran's I
library("arm") # to get se.ranef function
library("kableExtra") # to make tables nicer

2 Get data

# Downloaded from NOMIS
lsoa_demographics <- read.csv(file="data\\lsoa_demographics.csv", stringsAsFactors=F)


lsoa_demographics <- melt(lsoa_demographics, id=1:3, measure=4:7, variable.name="Age_Band", value.name="Female_Population")


# Create age splits for each lsoa
lsoa_age_demographics <- lsoa_demographics %>%
  group_by(Age_Band, LSOA) %>%
  summarise("Female_Population"=sum(Female_Population)) %>% 
  spread(Age_Band, Female_Population)

  
# Create ethnicity splits for each lsoa
# could be done in more detail, keeping simple for now
lsoa_ethnic_demographics <- lsoa_demographics %>%
  group_by(Ethnic_Cat, LSOA) %>%
  summarise("Female_Population"=sum(Female_Population)) %>% 
  spread(Ethnic_Cat, Female_Population)

# tidy up!
# use sort(sapply(ls(),function(x){object.size(get(x))}))
rm(lsoa_demographics)

# GP catchment female population for Jan 2017
# From: http://digital.nhs.uk/catalogue/PUB23139

gp_lsoa_catch <- read.csv(file="data\\gp-reg-patients-LSOA-alt-tall.csv", stringsAsFactors=F)


# calculate total LSOA population
LSOA_POP <- gp_lsoa_catch %>% 
  group_by(LSOA_CODE) %>% 
  summarise("TOTAL_FEMALE"=sum(FEMALE_PATIENTS))


# Calc percentage of total lsoa pop registered with each practice
gp_lsoa_catch <- gp_lsoa_catch %>% 
  left_join(LSOA_POP) %>% 
  mutate("PERCENTAGE_OF_LSOA"=FEMALE_PATIENTS/TOTAL_FEMALE)

## Joining, by = "LSOA_CODE"

# Calc weighted ethnicity
gp_lsoa_catch <- gp_lsoa_catch %>%  
  left_join(lsoa_ethnic_demographics, by=c("LSOA_CODE"="LSOA"))


gp_lsoa_catch[,9:13] <- (gp_lsoa_catch[,9:13] * gp_lsoa_catch$PERCENTAGE_OF_LSOA)



# Calc weighted age bands
gp_lsoa_catch <- gp_lsoa_catch %>%  
  left_join(lsoa_age_demographics, by=c("LSOA_CODE"="LSOA"))


gp_lsoa_catch[,14:17] <- (gp_lsoa_catch[,14:17] * gp_lsoa_catch$PERCENTAGE_OF_LSOA)

# now group up to practice level
gp_demographics <- gp_lsoa_catch %>% 
  group_by(PRACTICE_CODE) %>% 
  summarise_at(funs(sum(.,na.rm = TRUE)) , .vars=c(8:16)) %>% 
  mutate("Total_Pop"=`Asian/Asian British` +
                     `Black/African/Caribbean/Black British` +
                     `Mixed/multiple ethnic group` +
                     `Other ethnic group` +
                     `White`) %>% 
  mutate(
  "Ethn_Asian"=`Asian/Asian British` / Total_Pop * 100, 
  "Ethn_Black"=`Black/African/Caribbean/Black British` / Total_Pop * 100,
  "Ethn_Mixed"=`Mixed/multiple ethnic group` / Total_Pop * 100,
  "Ethn_Other"=`Other ethnic group` / Total_Pop * 100,
  "Ethn_White"=`White` / Total_Pop * 100) %>% 
mutate(
  "Age_0_to_24"=Age_0_to_24 / Total_Pop * 100, 
  "Age_25_to_49"=Age_25_to_49 / Total_Pop * 100, 
  "Age_50_to_64"=Age_50_to_64 / Total_Pop * 100, 
  "Age_65_and_over"=Age_65_and_over / Total_Pop * 100
  ) %>% 
  dplyr::select(PRACTICE_CODE, starts_with("Ethn_"),starts_with("Age_"))


# tidy up!
# use sort(sapply(ls(),function(x){object.size(get(x))}))
rm(gp_lsoa_catch)

# load fingertips data


# IMD 2015
imd_2015 <- fingertips_data(IndicatorID=91872,
                        AreaTypeID = 7)

imd_2015 <- imd_2015 %>% filter(AreaType=="GP") %>%
  dplyr::select(AreaCode, "IMD_2015"=Value)


# Patient satisfaction with opening hours
satisfied_opening_hours <- fingertips_data(IndicatorID=1942,
                        AreaTypeID = 7)

satisfied_opening_hours <- satisfied_opening_hours %>%
  filter(AreaType=="GP", Timeperiod=="2016/17") %>%
  dplyr::select(AreaCode, "Satisfied_Opening_Hours"=Value)

# Load coverage data
coverage_data <- read.csv(file="data\\Cerv_Cov_MachRead_GP_Q1_1718.csv", stringsAsFactors=F)
coverage_data <- coverage_data %>%
  filter(Age=="25_49", Year=="2017/18", Quarter=="Q1") %>%
  spread(DataType, Value) %>% 
  mutate("Coverage"=Screened/Eligible*100)


# drop supressed values
nrow(coverage_data)

## [1] 7670

# How many records are dropped
nrow(coverage_data %>% 
  filter(is.na(Coverage)==TRUE))

## [1] 367

coverage_data <- coverage_data %>% 
  filter(is.na(Coverage)==FALSE)

nrow(coverage_data)

## [1] 7303

# gp descriptive data

# # gp
# gp_info <- read.csv(file="data\\egpcur.csv", stringsAsFactors=FALSE, header=FALSE)
# 
# # Create number of GPs per practice
# gp_info <- gp_info %>% 
#   filter(V13=="A") %>% # active GPs only 
#   group_by(V15) %>% 
#   summarise("gp_number"=n()) %>% 
#   dplyr::select("practice_code"=V15, gp_number)


# gpp
gpp_info <- read.csv(file="data\\epraccur.csv", stringsAsFactors=FALSE, header=FALSE)

gpp_info <- gpp_info %>%
  mutate("pcd_spaceless"=gsub(" ","", V10)) %>%
  dplyr::select("practice_code"=V1, pcd_spaceless, "ccg_code" = V15)



gp_staffing <- read.csv("data\\General_Practice_March_2016_Practice_Level.csv", stringsAsFactors=FALSE)

# only select columns we need
gp_staffing <- gp_staffing %>% dplyr::select(PRAC_CODE,
                              TOTAL_PATIENTS,
                              TOTAL_GP_HC,
                              TOTAL_GP_FTE,
                              MALE_GP_FTE,
                              FEMALE_GP_FTE,
                              TOTAL_NURSES_FTE,
                              TOTAL_GP_HC_COQ_UK
                              ) %>% 
  filter_at(vars(-PRAC_CODE),all_vars(. != "NS")) %>%
  gather(-PRAC_CODE, key="COL", value="VALUE") %>% 
  mutate("VALUE"=as.numeric(VALUE)) %>% 
  spread(COL, VALUE) %>% 
  mutate(
    "FEMALE_GP_PROPORTION"=FEMALE_GP_FTE/TOTAL_GP_FTE,
    "NON_UKQ_GP_PROPORTION"=1-(TOTAL_GP_HC_COQ_UK/TOTAL_GP_HC)
         ) %>% 
  mutate(
    "FEMALE_GP_PROPORTION"=ifelse(is.na(FEMALE_GP_PROPORTION),0,FEMALE_GP_PROPORTION),
    "NON_UKQ_GP_PROPORTION"=ifelse(is.na(NON_UKQ_GP_PROPORTION),0,NON_UKQ_GP_PROPORTION)
         )

# load the spatial lookups
# postcode data

con = dbConnect(SQLite(), dbname="C:\\Users\\User1\\Documents\\Rstudio_Projects\\ONS_Lookup_Database\\ons_lkp_db")
# dbListTables(con)
pcd <- dbGetQuery(con, "SELECT pcd_spaceless,  oseast1m, osnrth1m, lsoa11 FROM ONS_PD")

## Warning in result_fetch(res@ptr, n = n): Column `oseast1m`: mixed type,
## first seen values of type integer, coercing other values of type string

## Warning in result_fetch(res@ptr, n = n): Column `osnrth1m`: mixed type,
## first seen values of type integer, coercing other values of type string

# limit the size of the postcode dataframe to only those postcodes in the gp practice data
pcd <- pcd %>% filter(pcd_spaceless %in% unique(gpp_info$pcd_spaceless))



# rural/urban lsoa classification
# https://ons.maps.arcgis.com/home/item.html?id=9855221596994bde8363a685cb3dd58a
urban_rural <- read.csv(file="data\\RUC11_LSOA11_EW.csv", stringsAsFactors=FALSE)

2.1 Join data

# combine it all

nrow(coverage_data)

## [1] 7303

coverage_data_supplemented <- coverage_data %>%
  inner_join(gp_demographics, by=c("OrganisationCode"="PRACTICE_CODE"))

# ----------
nrow(coverage_data_supplemented)

## [1] 7296

# ----------

coverage_data_supplemented <- coverage_data_supplemented %>%
  inner_join(imd_2015, by=c("OrganisationCode"="AreaCode"))

# ----------
nrow(coverage_data_supplemented)

## [1] 7197

# ----------

coverage_data_supplemented <- coverage_data_supplemented %>%
  inner_join(satisfied_opening_hours, by=c("OrganisationCode"="AreaCode"))

# ----------
nrow(coverage_data_supplemented)

## [1] 7196

# ----------

#coverage_data_supplemented <- coverage_data_supplemented %>%
#  inner_join(gp_info, by=c("OrganisationCode"="practice_code"))

coverage_data_supplemented <- coverage_data_supplemented %>%
  inner_join(gp_staffing, by=c("OrganisationCode"="PRAC_CODE"))

# ----------
nrow(coverage_data_supplemented)

## [1] 6727

# ----------

coverage_data_supplemented <- coverage_data_supplemented %>%
  inner_join(gpp_info, by=c("OrganisationCode"="practice_code"))

# ----------
nrow(coverage_data_supplemented)

## [1] 6727

# ----------

coverage_data_supplemented <- coverage_data_supplemented %>%
  inner_join(pcd, by=c("pcd_spaceless"="pcd_spaceless"))

# ----------
nrow(coverage_data_supplemented)

## [1] 6724

# ----------

coverage_data_supplemented <- coverage_data_supplemented %>%
  inner_join(urban_rural, by=c("lsoa11"="LSOA11CD"))

# ----------
nrow(coverage_data_supplemented)

## [1] 6724

# ----------

# minor adjustments
coverage_data_supplemented <- coverage_data_supplemented %>%
  mutate("osnrth100km"=osnrth1m/100000,
         "IMD_2015_Rank"=dense_rank(IMD_2015),
         "IMD_2015_Quintile"=ntile(IMD_2015, 5),
         "gp_per_1k_eligible_women"=TOTAL_GP_FTE/Eligible*1000,
         "Urban_Rural"=ifelse(substring(RUC11,1,3)=="Urb", "Urban", "Rural"),
         "nurses_per_1k_eligible_women"=TOTAL_NURSES_FTE/Eligible*1000)

2.2 Fix CCG coding

# Add the ons ccg codes based on lsoa to prevent pain and heartache later on
# from https://ons.maps.arcgis.com/home/item.html?id=19e5c35c6a504a7b9e1b74bed1b6225f
ccg_lkp <- read.csv(file="data\\LSOA11_CCG16_LAD16_EN_LU.csv", stringsAsFactors=F)

coverage_data_supplemented <- coverage_data_supplemented %>%
  left_join(ccg_lkp, by=c("lsoa11"="LSOA11CD"))

3 Exploration

# scatterplot matrix 1
PerformanceAnalytics::chart.Correlation(coverage_data_supplemented[,c(9:19)], method="pearson")

# scatterplot matrix 2
PerformanceAnalytics::chart.Correlation(coverage_data_supplemented[,c(9,20,23,26:29, 41, 43)], method="pearson")

  ggplot(coverage_data_supplemented, aes(Eligible, Screened, col=Coverage)) +
  geom_point(alpha=0.1) +
  geom_smooth(method="lm", se=F, col="red") +
  #scale_colour_gradient2(mid="#aaaaaa", high="#0571b0", low="#ca0020", midpoint=0.4) +
  labs(title="Screened vs ELigible",
       x="Eligible",
       y="Screened")

ggplot(coverage_data_supplemented, aes(Urban_Rural, Coverage)) +
  geom_boxplot(notch=T, fill="light blue") +
  coord_flip() +
  labs(title="Coverage Compared Across Urban and Rural Practices",
       x="Urban or Rural Practice",
       y="Coverage (%)")

  ggplot(coverage_data_supplemented, aes(IMD_2015_Rank, Coverage)) +
  geom_point(alpha=0.1) +
  geom_smooth(method="lm", se=F, col="red") +
  # facet_wrap(~RUC11) +
  labs(title="Coverage by IMD 2015 Rank",
       x="IMD 2015 Rank (1 = least deprived)",
       y="Coverage (%)")

  ggplot(coverage_data_supplemented, aes(osnrth100km, Coverage)) +
  geom_point(alpha=0.25)+
  facet_wrap(~Urban_Rural) +
  geom_smooth(method="lm", se=F, col="red") +
  labs(title="Coverage by Distance North",
       x="Distance North from Origin of OSGB36 Grid (100km units)",
       y="Coverage (%)")

ggplot(coverage_data_supplemented, aes(Satisfied_Opening_Hours, Coverage)) +
  geom_point(alpha=0.3) +
  geom_smooth(method="lm", se=F, col="red") +
    labs(title="Coverage by Satisfaction With Opening Hours",
       x="Satisfaction With Opening Hours (%)",
       y="Coverage (%)")

ethn_plot_data <- gather(coverage_data_supplemented[,c(9:14)], key="Ethnic_Group", value="Percentage", -Coverage)

ggplot(ethn_plot_data, aes(Percentage, Coverage)) +
  geom_point(alpha=0.3) +
  facet_wrap(~Ethnic_Group, scale="free") +
  geom_smooth(method="lm", se=F, col="red") +
  labs(title="Coverage by Proportion of Ethnic Group Registrants (modelled)",
       subtitle="Note that axes may vary across sub-plots",
       x="Percentage of Registrants of Ethnic Group (%)",
       y="Coverage (%)")

age_plot_data <- gather(coverage_data_supplemented[,c(9,15:18)], key="Age_Group", value="Percentage", -Coverage)

ggplot(age_plot_data, aes(Percentage, Coverage)) +
  geom_point(alpha=0.3) +
  facet_wrap(~Age_Group, scale="free") +
  geom_smooth(method="lm", se=F, col="red") +
  labs(title="Coverage by Proportion of Age Group Registrants (modelled)",
       subtitle="Note that axes may vary across sub-plots",
       x="Percentage of Registrants of Age Group (%)",
       y="Coverage (%)")

ccg_sample <- sample(unique(coverage_data_supplemented$CCG16CD),20)

coverage_data_supplemented %>% 
  filter(CCG16CD %in% ccg_sample) %>% 
ggplot(data=., aes(CCG16NM, Coverage)) +
  geom_boxplot(fill="light blue") +
  coord_flip() +
  labs(title="Coverage Distribution by CCG",
       subtitle="Based on practices from a random sample of 20 CCGs",
       x="CCG Name",
       y="Coverage (%)")

4 Modelling

4.1 Prep data

# take a copy of the main dataset to use for modelling (some variables will be altered)
coverage_data_supplemented2 <- coverage_data_supplemented
# convert percentage variables by dividing by 100
coverage_data_supplemented2[,9:18] <- coverage_data_supplemented2[,9:18]/100
coverage_data_supplemented2[,10:18] <- as.vector(coverage_data_supplemented2[,10:18])
coverage_data_supplemented2$Satisfied_Opening_Hours <- as.vector(coverage_data_supplemented2$Satisfied_Opening_Hours/100)

coverage_data_supplemented2$CCG16CD <- factor(coverage_data_supplemented2$CCG16CD)


# No longer used, make any transformations or scalings here
# coverage_data_supplemented2$IMD_2015_Rank <- as.vector(coverage_data_supplemented2$IMD_2015_Rank)
# coverage_data_supplemented2$osnrth100km <- as.vector(coverage_data_supplemented2$osnrth100km)
# coverage_data_supplemented2$gp_number <- as.vector(coverage_data_supplemented2$gp_number)
# coverage_data_supplemented2$gp_per_1k_eligible_women <- as.vector(coverage_data_supplemented2$gp_per_1k_eligible_women)

4.2 Summary stats table

5 Sumary statistics table

summary_stats <- coverage_data_supplemented2 %>% 
  dplyr::select(OrganisationCode,
                Eligible,
                Screened,
                Coverage,
                starts_with("Ethn_"),
                starts_with("Age_"),
                IMD_2015,
                IMD_2015_Rank,
                Satisfied_Opening_Hours,
                osnrth100km,
                gp_per_1k_eligible_women,
                nurses_per_1k_eligible_women,
                FEMALE_GP_PROPORTION,
                NON_UKQ_GP_PROPORTION,
                TOTAL_PATIENTS,
                TOTAL_NURSES_FTE) %>%
  gather(key="Variable", value="Value",-OrganisationCode) %>% 
  group_by(Variable) %>% 
  summarise(
    "n"=n(),
    "mean"=mean(Value),
    "sd"=sd(Value),
    "min"=min(Value),
    "percentile_25"=quantile(Value, probs=0.25),
    "median"=median(Value),
    "percentile_75"=quantile(Value, probs=0.75),
    "max"=max(Value)
  )

summary_stats %>%
      kable(digits=2) %>%
      kable_styling()

Variable	n	mean	sd	min	percentile_25	median	percentile_75	max
Age_0_to_24	6724	0.30	0.05	0.13	0.27	0.29	0.32	0.70
Age_25_to_49	6724	0.35	0.05	0.18	0.32	0.34	0.37	0.64
Age_50_to_64	6724	0.18	0.04	0.04	0.16	0.18	0.20	0.29
Age_65_and_over	6724	0.18	0.05	0.03	0.14	0.18	0.21	0.46
Coverage	6724	0.70	0.08	0.22	0.66	0.72	0.76	0.93
Eligible	6724	1378.10	889.90	14.00	749.00	1199.00	1796.00	13595.00
Ethn_Asian	6724	0.09	0.13	0.00	0.01	0.03	0.10	0.76
Ethn_Black	6724	0.04	0.07	0.00	0.00	0.01	0.04	0.46
Ethn_Mixed	6724	0.02	0.02	0.00	0.01	0.02	0.03	0.10
Ethn_Other	6724	0.01	0.02	0.00	0.00	0.00	0.01	0.17
Ethn_White	6724	0.84	0.19	0.09	0.78	0.93	0.97	1.00
FEMALE_GP_PROPORTION	6724	0.42	0.26	0.00	0.26	0.44	0.60	1.00
gp_per_1k_eligible_women	6724	3.27	2.60	0.00	2.22	3.06	4.04	141.43
IMD_2015	6724	23.45	11.87	3.23	13.76	21.40	31.40	66.49
IMD_2015_Rank	6724	3362.50	1941.20	1.00	1681.75	3362.50	5043.25	6724.00
NON_UKQ_GP_PROPORTION	6724	0.32	0.35	0.00	0.00	0.20	0.50	1.00
nurses_per_1k_eligible_women	6724	1.62	1.36	0.00	0.92	1.43	2.07	51.73
osnrth100km	6724	2.82	1.27	0.10	1.79	2.73	3.93	6.53
Satisfied_Opening_Hours	6724	0.74	0.18	0.10	0.63	0.78	0.89	1.00
Screened	6724	964.39	612.57	11.00	522.00	845.00	1266.00	8271.00
TOTAL_NURSES_FTE	6724	2.08	1.78	0.00	0.91	1.67	2.76	30.59
TOTAL_PATIENTS	6724	7829.41	4577.87	0.00	4416.75	7009.00	10351.50	60329.00

5.1 Check multilevel structure

# check for multilevel structure

# null multilevel model
fit_null_multi <- glmer(Coverage ~ (1 | CCG16CD),
               family=binomial, weights=Eligible,
               data=coverage_data_supplemented2)

# null  single level model
fit_null_single <- glm(Coverage ~ 1,
               family=binomial, weights=Eligible,
               data=coverage_data_supplemented2)

# compare AIC
bbmle::AICtab(fit_null_single, fit_null_multi)

##                 dAIC     df
## fit_null_multi       0.0 2 
## fit_null_single 123326.7 1

AIC(fit_null_single, fit_null_multi)

##                 df      AIC
## fit_null_single  1 338896.9
## fit_null_multi   2 215570.3

# Likelihood Ratio Testing
# create function to perform likelihood ratio test
# https://stat.ethz.ch/pipermail/r-sig-mixed-models/2008q3/001175.html
lrt <- function (obj1, obj2) {
    L0 <- logLik(obj1)
    L1 <- logLik(obj2)
    L01 <- as.vector(- 2 * (L0 - L1))
    df <- attr(L1, "df") - attr(L0, "df")
    list(L01 = L01, df = df,
        "p-value" = pchisq(L01, df, lower.tail = FALSE))
 }

# perform a likelihood ratio test
lrt(fit_null_single,fit_null_multi)

## $L01
## [1] 123328.7
## 
## $df
## [1] 1
## 
## $`p-value`
## [1] 0

# note that ANOVA gives identical results
anova(fit_null_multi, fit_null_single)

## Data: coverage_data_supplemented2
## Models:
## fit_null_single: Coverage ~ 1
## fit_null_multi: Coverage ~ (1 | CCG16CD)
##                 Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## fit_null_single  1 338897 338904 -169447   338895                         
## fit_null_multi   2 215570 215584 -107783   215566 123329      1  < 2.2e-16
##                    
## fit_null_single    
## fit_null_multi  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6 Build GLMM model

# Create the  final model

fit_2 <- glmmTMB(Coverage ~
                      Ethn_Asian +
                      Ethn_Black +
                      Ethn_Mixed +
                      Ethn_Other +
                      Age_25_to_49 +
                      Age_65_and_over +
                      IMD_2015_Rank +
                      Satisfied_Opening_Hours +
                      gp_per_1k_eligible_women +
                      osnrth100km +
                      Urban_Rural +
                      FEMALE_GP_PROPORTION +
                      NON_UKQ_GP_PROPORTION +
                      nurses_per_1k_eligible_women +
                      Eligible +

                      (1 | CCG16CD),

               family=binomial(link=logit),
               weights=Eligible,
               data=coverage_data_supplemented2)

summary(fit_2)

##  Family: binomial  ( logit )
## Formula:          
## Coverage ~ Ethn_Asian + Ethn_Black + Ethn_Mixed + Ethn_Other +  
##     Age_25_to_49 + Age_65_and_over + IMD_2015_Rank + Satisfied_Opening_Hours +  
##     gp_per_1k_eligible_women + osnrth100km + Urban_Rural + FEMALE_GP_PROPORTION +  
##     NON_UKQ_GP_PROPORTION + nurses_per_1k_eligible_women + Eligible +  
##     (1 | CCG16CD)
## Data: coverage_data_supplemented2
## Weights: Eligible
## 
##      AIC      BIC   logLik deviance df.resid 
## 145893.1 146009.0 -72929.6 145859.1     6707 
## 
## Random effects:
## 
## Conditional model:
##  Groups  Name        Variance Std.Dev.
##  CCG16CD (Intercept) 0.01746  0.1321  
## Number of obs: 6724, groups:  CCG16CD, 209
## 
## Conditional model:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -3.373e-01  2.586e-02  -13.04  < 2e-16 ***
## Ethn_Asian                   -7.805e-01  1.100e-02  -70.94  < 2e-16 ***
## Ethn_Black                    1.946e+00  2.847e-02   68.34  < 2e-16 ***
## Ethn_Mixed                   -9.484e+00  1.390e-01  -68.25  < 2e-16 ***
## Ethn_Other                   -3.716e+00  1.130e-01  -32.87  < 2e-16 ***
## Age_25_to_49                  2.691e+00  3.202e-02   84.03  < 2e-16 ***
## Age_65_and_over               2.733e+00  3.507e-02   77.93  < 2e-16 ***
## IMD_2015_Rank                -5.914e-05  9.600e-07  -61.61  < 2e-16 ***
## Satisfied_Opening_Hours       8.969e-02  4.711e-03   19.04  < 2e-16 ***
## gp_per_1k_eligible_women      8.806e-03  6.127e-04   14.37  < 2e-16 ***
## osnrth100km                   7.615e-02  6.106e-03   12.47  < 2e-16 ***
## Urban_RuralUrban             -5.033e-02  2.928e-03  -17.19  < 2e-16 ***
## FEMALE_GP_PROPORTION          3.163e-02  3.309e-03    9.56  < 2e-16 ***
## NON_UKQ_GP_PROPORTION        -4.334e-02  2.751e-03  -15.75  < 2e-16 ***
## nurses_per_1k_eligible_women  7.677e-03  1.013e-03    7.58 3.51e-14 ***
## Eligible                     -1.668e-05  9.120e-07  -18.29  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# perform a likelihood ratio test
lrt(fit_null_multi, fit_2)

## $L01
## [1] 69707.12
## 
## $df
## [1] 15
## 
## $`p-value`
## [1] 0

plot(fitted(fit_2,type = "response"), residuals(fit_2))

6.1 Map residuals too

coverage_data_supplemented2$residual <- as.vector(residuals(fit_2))
x <- jitter(coverage_data_supplemented2$oseast1m, amount=0.01)
y <- jitter(coverage_data_supplemented2$osnrth1m, amount=0.01)
distMat <- as.matrix(dist(cbind(x, y)))
invDistMat <- 1/distMat
diag(invDistMat) <- 0
MI = ape::Moran.I(coverage_data_supplemented2$residual, weight = invDistMat)
MI

## $observed
## [1] 0.1358679
## 
## $expected
## [1] -0.0001487431
## 
## $sd
## [1] 0.006466252
## 
## $p.value
## [1] 0

# tidy up!
# use sort(sapply(ls(),function(x){object.size(get(x))}))
rm(invDistMat)
rm(distMat)

6.2 Extract model information

# Add the fitted values back to the main dataframe
coverage_data_supplemented2$fitted_values <- fitted(fit_2,type = "response")
coverage_data_supplemented2$fitted_values_logit <- fitted(fit_2,type = "logit")


#predict(fit_2, coverage_data_supplemented3, type = "response") - fitted_values


# Add residual quintiles
coverage_data_supplemented2 <- coverage_data_supplemented2 %>% 
  mutate("residual_quintile"=ntile(coverage_data_supplemented2$residual,5))

7 Inspect random effects

# caterpillar plot

# get random effects
random_effects <- ranef(fit_2)
random_intercept <- random_effects$cond


# get variances
random_effect_var <- TMB::sdreport(fit_2$obj, getJointPrecision=TRUE)
random_effect_sd <- sqrt(random_effect_var$diag.cov.random)




caterpillar_data <- data.frame(
                                "intercepts"=random_intercept$CCG16CD$`(Intercept)`,
                                "sd"=random_effect_sd,
                                "CCG16CD"=factor(row.names(random_effects$cond$CCG16CD))
                                )

# calc confidence interval
caterpillar_data$ucl <- caterpillar_data$intercepts + (caterpillar_data$sd * 1.96)
caterpillar_data$lcl <- caterpillar_data$intercepts - (caterpillar_data$sd * 1.96)

# categorise for colour coding in plot

caterpillar_data$category <- ifelse(caterpillar_data$lcl > 0, "High",
                                    ifelse(caterpillar_data$ucl < 0, "Low",
                                    "Average"))


# add in ccg names
ccg_names <- ccg_lkp %>% group_by(CCG16CD, CCG16NM) %>% summarise()
ccg_names$CCG16NM <- factor(ccg_names$CCG16NM)

caterpillar_data <- caterpillar_data %>% left_join(ccg_names)

## Joining, by = "CCG16CD"

## Warning: Column `CCG16CD` joining factor and character vector, coercing
## into character vector

# reorder the ccg names factor
caterpillar_data$CCG16NM_B <- fct_reorder(caterpillar_data$CCG16NM, caterpillar_data$intercepts)

# add quintiles
caterpillar_data$intercepts_quintile <- fct_rev(factor(ntile(caterpillar_data$intercepts, 5)))

# Caterpillar plot
ggplot(caterpillar_data,
       aes(CCG16NM_B,
           intercepts,
           colour=category)) +
geom_hline(yintercept=0) +
geom_point(size=4)  +
  geom_errorbar(aes(ymin=lcl, ymax=ucl)) +
scale_colour_manual(values=c("grey", "#0571b0","#ca0020")) +
guides(size=FALSE,
       shape=FALSE) +
xlab("Levels") +
ylab("") +
theme(axis.text.x=element_text(size=rel(1.2)),
               axis.title.x=element_text(size=rel(1.3)),
               axis.text.y=element_text(size=rel(1.2)),
               panel.grid.minor=element_blank(),
               panel.grid.major.x=element_blank()) +
  coord_flip()

# plot a histogram of the ccg residuals

ggplot(caterpillar_data, aes(intercepts)) +
  geom_histogram(col="black",fill="grey", bins=20) +
  geom_vline(xintercept=0, col="red", lty=2, size=1) +
  theme_bw() +
  labs(title="Histogram of CCG Level Random Effects",
       x="Random Effects",
       y="Count")

qqnorm(caterpillar_data$intercepts); qqline(caterpillar_data$intercepts, col = 2,lwd=2,lty=2)

8 Inspect coefficients

# here we add some extra columns to quantify how much each coefficient affects the overall coverage

# get model coefficients:
model_coefs <- data.frame(summary(fit_2)$coef$cond)
model_coefs$Coef <- row.names(model_coefs)

# join these to summary_stats to generate value for 'average' practice as an example

# look at how changing practice characteristics affects the coverage
summary_stats_2 <- summary_stats %>%
  left_join(model_coefs, by=c("Variable"="Coef")) %>% 
  dplyr::select(Variable,mean,median,percentile_25,percentile_75, "Coefficient"=Estimate) %>% 
  filter(is.na(Coefficient)==F) %>% 
  mutate("Mean_Value"=Coefficient*mean,
         "Percentile_25_Value"=Coefficient*percentile_25,
         "Percentile_75_Value"=Coefficient*percentile_75) %>% 
  arrange(desc(abs(Mean_Value)))

summary_stats_2 %>%
      kable(digits=2) %>%
      kable_styling()

Variable	mean	median	percentile_25	percentile_75	Coefficient	Mean_Value	Percentile_25_Value	Percentile_75_Value
Age_25_to_49	0.35	0.34	0.32	0.37	2.69	0.93	0.86	0.98
Age_65_and_over	0.18	0.18	0.14	0.21	2.73	0.48	0.39	0.57
Ethn_Mixed	0.02	0.02	0.01	0.03	-9.48	-0.22	-0.09	-0.31
osnrth100km	2.82	2.73	1.79	3.93	0.08	0.21	0.14	0.30
IMD_2015_Rank	3362.50	3362.50	1681.75	5043.25	0.00	-0.20	-0.10	-0.30
Ethn_Black	0.04	0.01	0.00	0.04	1.95	0.08	0.01	0.09
Ethn_Asian	0.09	0.03	0.01	0.10	-0.78	-0.07	-0.01	-0.08
Satisfied_Opening_Hours	0.74	0.78	0.63	0.89	0.09	0.07	0.06	0.08
Ethn_Other	0.01	0.00	0.00	0.01	-3.72	-0.04	0.00	-0.04
gp_per_1k_eligible_women	3.27	3.06	2.22	4.04	0.01	0.03	0.02	0.04
Eligible	1378.10	1199.00	749.00	1796.00	0.00	-0.02	-0.01	-0.03
NON_UKQ_GP_PROPORTION	0.32	0.20	0.00	0.50	-0.04	-0.01	0.00	-0.02
FEMALE_GP_PROPORTION	0.42	0.44	0.26	0.60	0.03	0.01	0.01	0.02
nurses_per_1k_eligible_women	1.62	1.43	0.92	2.07	0.01	0.01	0.01	0.02

# pull out the coefficients in a format ready to go into the main dataframe
model_coefs <- model_coefs %>% 
  dplyr::select(Coef, Estimate) %>% 
  mutate("Coef"=ifelse(Coef=="(Intercept)","Intercept",Coef)) %>%
  mutate("Coef"=paste0("coef_",Coef)) %>% 
  spread(Coef, Estimate)

# get the CCG intercepts in a format to go into the main dataframe
ccg_intercepts <- caterpillar_data %>% dplyr::select(CCG16CD, "CCG_intercept"=intercepts)

# add CCG intercepts to main dataframe
coverage_data_supplemented3 <- coverage_data_supplemented2 %>% 
  left_join(ccg_intercepts, by="CCG16CD")

## Warning: Column `CCG16CD` joining factor and character vector, coercing
## into character vector

# add the model coefficients to the main dataframe
coverage_data_supplemented3 <- cbind(coverage_data_supplemented3, model_coefs)  


# now calculate the fitted values
coverage_data_supplemented3 <- coverage_data_supplemented3 %>% 
  mutate(
    "Urban_Rural_Coded"=ifelse(Urban_Rural=="Urban",1,0),
    
           "Result_Age_25_49"=(Age_25_to_49*coef_Age_25_to_49),
           "Result_Age_65_and_over"=(Age_65_and_over*coef_Age_65_and_over),
           "Result_Ethn_Asian"=(Ethn_Asian*coef_Ethn_Asian),
           "Result_Ethn_Black"=(Ethn_Black*coef_Ethn_Black),
           "Result_Ethn_Mixed"=(Ethn_Mixed*coef_Ethn_Mixed),
           "Result_Ethn_Other"=(Ethn_Other*coef_Ethn_Other),
           "Result_gp_per_1k"=(gp_per_1k_eligible_women*coef_gp_per_1k_eligible_women),
           "Result_IMD_2015_Rank"=(IMD_2015_Rank*coef_IMD_2015_Rank),
           "Result_osnrth100km"=(osnrth100km*coef_osnrth100km),
           "Result_Satisfied_Opening"=(Satisfied_Opening_Hours*coef_Satisfied_Opening_Hours),
           "Result_Urban_Rural"=(Urban_Rural_Coded*coef_Urban_RuralUrban),
           "Result_FEMALE_GP_PROPORTION"=(FEMALE_GP_PROPORTION*coef_FEMALE_GP_PROPORTION),
           "Result_NON_UKQ_GP_PROPORTION"=(NON_UKQ_GP_PROPORTION*coef_NON_UKQ_GP_PROPORTION), 
           "Result_nurses_per_1k"=(nurses_per_1k_eligible_women*coef_nurses_per_1k_eligible_women),
           "Result_Eligible"=(Eligible*coef_Eligible),
    
    "test_log_odds"= CCG_intercept +
                         coef_Intercept +
                         (Age_25_to_49*coef_Age_25_to_49) +
                         (Age_65_and_over*coef_Age_65_and_over) +
                         (Ethn_Asian*coef_Ethn_Asian) +
                         (Ethn_Black*coef_Ethn_Black) +
                         (Ethn_Mixed*coef_Ethn_Mixed) +
                         (Ethn_Other*coef_Ethn_Other) +
                         (gp_per_1k_eligible_women*coef_gp_per_1k_eligible_women) +
                         (IMD_2015_Rank*coef_IMD_2015_Rank) +
                         (osnrth100km*coef_osnrth100km) +
                         (Satisfied_Opening_Hours*coef_Satisfied_Opening_Hours) +
                         (Urban_Rural_Coded*coef_Urban_RuralUrban) +
                         (FEMALE_GP_PROPORTION*coef_FEMALE_GP_PROPORTION) +
                         (NON_UKQ_GP_PROPORTION*coef_NON_UKQ_GP_PROPORTION) +
                         (nurses_per_1k_eligible_women*coef_nurses_per_1k_eligible_women) +
                         (Eligible*coef_Eligible),
    "test_odds"=exp(test_log_odds),
    "test_prob"=test_odds/(1+test_odds))

9 Spatial analysis

# Mapping the random effects
path <- ".\\shp\\CCG_April_2016_UGC_V4"

ccg_shp <- readOGR(dsn=path,
        layer="Clinical_Commissioning_Groups_April_2016_Generalised_Clipped_Boundaries_in_England",
        stringsAsFactors=FALSE)

## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\User1\Documents\Rstudio_Projects\Multilevel_model_screening\shp\CCG_April_2016_UGC_V4", layer: "Clinical_Commissioning_Groups_April_2016_Generalised_Clipped_Boundaries_in_England"
## with 209 features
## It has 5 fields
## Integer64 fields read as strings:  objectid

ccg_shp <- sp::merge(ccg_shp,
                     caterpillar_data,
                     by.x="ccg16cd",
                     by.y="CCG16CD",
                     sort = FALSE)



# define a function to handle tidying (which used to be called fortifying) and then joining the data items back in
clean <- function(shape){
                          shape@data$id = rownames(shape@data)
                          shape.points = tidy(shape, region="id")
                          shape.df = inner_join(shape.points, shape@data, by="id")
}

ccg_shp_tidy <- clean(ccg_shp)



ggplot(ccg_shp_tidy, aes(long, lat, fill=category, group=group)) +
  geom_polygon(col="white", size=0.005) +
  coord_fixed() +
  scale_fill_manual(values=c("grey", "#0571b0","#ca0020")) +
  theme_void()

# check for clustering of random effects
# remember to reset oa_shp_data as it's filtered in the previous script
neighbourhood <- spdep::poly2nb(ccg_shp, queen=TRUE)

{
  par(mar=c(0,0,0,0))
  plot(ccg_shp,
      border="grey")
  plot(neighbourhood,
     coords=coordinates(ccg_shp),
     col="red",
     add=T)
  }

# Now create a neighbourhood weights matrix
neighbourhood_weights_list <- nb2listw(neighbourhood, style="W", zero.policy=TRUE)

# and run the moran's I test
moran.test(ccg_shp$intercepts,neighbourhood_weights_list, zero.policy=T)

## 
##  Moran I test under randomisation
## 
## data:  ccg_shp$intercepts  
## weights: neighbourhood_weights_list  n reduced by no-neighbour observations
##   
## 
## Moran I statistic standard deviate = 9.9459, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.455767883      -0.004830918       0.002144665

9.1 Local Moran’s I

# Local Moran
LM_Results <- localmoran(ccg_shp$intercepts,
                         neighbourhood_weights_list,
           p.adjust.method="bonferroni",
           na.action=na.exclude,
           zero.policy=TRUE)

summary(LM_Results)

##        Ii                 E.Ii               Var.Ii      
##  Min.   :-0.732318   Min.   :-0.004808   Min.   :0.0000  
##  1st Qu.: 0.006079   1st Qu.:-0.004808   1st Qu.:0.1608  
##  Median : 0.145113   Median :-0.004808   Median :0.1939  
##  Mean   : 0.455768   Mean   :-0.004785   Mean   :0.2505  
##  3rd Qu.: 0.577668   3rd Qu.:-0.004808   3rd Qu.:0.2435  
##  Max.   : 6.230396   Max.   : 0.000000   Max.   :0.9882  
##                                                          
##       Z.Ii            Pr(z > 0)     
##  Min.   :-1.21630   Min.   :0.0000  
##  1st Qu.: 0.03054   1st Qu.:0.5343  
##  Median : 0.32569   Median :1.0000  
##  Mean   : 0.92273   Mean   :0.7615  
##  3rd Qu.: 1.22377   3rd Qu.:1.0000  
##  Max.   :10.91639   Max.   :1.0000  
##  NA's   :1          NA's   :1

# add moran's I results back to the shapefile
ccg_shp@data$lmoran_i <- LM_Results[,1]
ccg_shp@data$lmoran_p <- LM_Results[,5]
ccg_shp@data$lmoran_sig <- LM_Results[,5]<0.01


# manually make a moran plot based on standardised variables
# standardise variables and save to a new column
ccg_shp$SCALED_INTERCEPT <- scale(ccg_shp$intercepts)

# create a lagged variable
ccg_shp$LAGGED_SCALED_INTERCEPT <- lag.listw(neighbourhood_weights_list, ccg_shp$SCALED_INTERCEPT)

## Warning in lag.listw(neighbourhood_weights_list, ccg_shp$SCALED_INTERCEPT):
## NAs in lagged values

ccg_shp$SPATIAL_LAG_CAT <- factor(ifelse(ccg_shp$SCALED_INTERCEPT>0 & ccg_shp$LAGGED_SCALED_INTERCEPT>0, "High-High",
       ifelse(ccg_shp$SCALED_INTERCEPT>0 & ccg_shp$LAGGED_SCALED_INTERCEPT<0, "High-Low",
              ifelse(ccg_shp$SCALED_INTERCEPT<0 & ccg_shp$LAGGED_SCALED_INTERCEPT<0, "Low-Low",
                     ifelse(ccg_shp$SCALED_INTERCEPT<0 & ccg_shp$LAGGED_SCALED_INTERCEPT>0, "Low-High",
       "Equivalent")))))

ggplot(ccg_shp@data, aes(SCALED_INTERCEPT, LAGGED_SCALED_INTERCEPT, colour=lmoran_p)) +
  geom_point(alpha=0.5, size=3) +
  geom_smooth(method="lm", se=F, col="red") +
  geom_hline(yintercept=0, lty=2) +
  geom_vline(xintercept=0, lty=2) +
  theme_bw() +
  labs(title="Scaled Spatial Lag Comparison",
       x="Scaled Value",
       y="Lagged Scaled Value")

## Warning: Removed 1 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

# set id columns to merge local moran's results back to the shapefile
ccg_shp@data$id <- row.names(ccg_shp@data)

# tidy the shapefile
ccg_shp_tidy <- tidy(ccg_shp,region="id")
ccg_shp_tidy <- merge(ccg_shp_tidy,ccg_shp@data,by="id")


ccg_shp_tidy_sig_high_high <- ccg_shp_tidy[ccg_shp_tidy$lmoran_sig==T & ccg_shp_tidy$SPATIAL_LAG_CAT=="High-High",]
ccg_shp_tidy_sig_low_low <- ccg_shp_tidy[ccg_shp_tidy$lmoran_sig==T & ccg_shp_tidy$SPATIAL_LAG_CAT=="Low-Low",]

ggplot() +
  geom_polygon(data=ccg_shp_tidy, aes(long, lat, fill=lmoran_sig, group=group),fill="grey",col="white") +
  geom_polygon(data=ccg_shp_tidy_sig_high_high, aes(long, lat, fill=lmoran_sig, group=group),fill="#0571b0",col="white") +
  geom_polygon(data=ccg_shp_tidy_sig_low_low, aes(long, lat, fill=lmoran_sig, group=group),fill="#ca0020",col="white") +
  coord_fixed() +
  theme_void()

# extra row is the Isle of White
ccg_shp_tidy_sig_high_high %>% ungroup() %>% group_by(CCG16NM) %>% summarise(n())

## # A tibble: 13 x 2
##    CCG16NM                              `n()`
##    <fct>                                <int>
##  1 NHS Ashford CCG                        742
##  2 NHS Birmingham South and Central CCG   600
##  3 NHS Bradford City CCG                  352
##  4 NHS Bradford Districts CCG             877
##  5 NHS Calderdale CCG                     477
##  6 NHS Medway CCG                        1138
##  7 NHS Nottingham City CCG                370
##  8 NHS Nottingham North and East CCG      426
##  9 NHS Nottingham West CCG                458
## 10 NHS Rushcliffe CCG                     658
## 11 NHS South Kent Coast CCG               829
## 12 NHS Swale CCG                         1218
## 13 <NA>                                  1231

ccg_shp_tidy_sig_low_low %>% group_by(CCG16NM) %>% summarise(n())

## # A tibble: 10 x 2
##    CCG16NM                                     `n()`
##    <fct>                                       <int>
##  1 NHS Brent CCG                                 314
##  2 NHS Camden CCG                                245
##  3 NHS Central London (Westminster) CCG          255
##  4 NHS Fylde & Wyre CCG                         1057
##  5 NHS Hambleton, Richmondshire and Whitby CCG  2126
##  6 NHS Hammersmith and Fulham CCG                196
##  7 NHS Newcastle Gateshead CCG                   831
##  8 NHS Southport and Formby CCG                  365
##  9 NHS West London CCG                           246
## 10 <NA>                                         1231

Multilevel Modelling Screening Uptake

Simon Hailstone

20 May 2018

1 Set up

2 Get data

2.1 Join data

2.2 Fix CCG coding

3 Exploration

4 Modelling

4.1 Prep data

4.2 Summary stats table

5 Sumary statistics table

5.1 Check multilevel structure

6 Build GLMM model

6.1 Map residuals too

6.2 Extract model information

7 Inspect random effects

8 Inspect coefficients

9 Spatial analysis

9.1 Local Moran’s I