Introduction

This project looks at the following:

  • How are specific ethnic groups distributed within a local authority?
  • Is there evidence of clustering of ethnic groups?
  • How does GP practice coverage relate to these clusters?

Why look at this information? Health inequalities are a central concern in public health, and they are driven by a wide variety of factors. One such factor is cultural attitudes towards health and wellbeing services, which can lead to poorer uptake of, or lower engagement with, programmes.

This work will hopefully form a foundation upon which a more fully-featured tool might be developed, allowing local public health teams to better understand their resident populations and design services which are both acceptable to and appropriate for those groups.

A more detailed discussion of the issue of health and ethnicity in the UK can be read here: http://www.parliament.uk/documents/post/postpn276.pdf

Load Packages

library("tidyverse")
library("dplyr")
library("rgdal")
library("extrafont")
library("sp")
library("maptools")
library("rgeos")
library("MASS")
library("raster")
library("broom") # contains the tidy function which now replaces the fortify function for ggplot
library("viridis") # For nicer ggplot colours
library("spdep")
library("gridExtra")
library("Cairo")
library("RSQLite")

# To connect to a postgis database
library("RPostgreSQL")
library("postGIStools")

Load Ethnicity Data

Census data can be downloaded from the NOMIS website: https://www.nomisweb.co.uk/

In this instance I use the API to pull the data I need directly at Output Area level. Currently I am only downloading data for Slough (ONS code E06000039); eventually I would like to download data for the entire country at this level of detail, as this would allow identification of clusters which span authority boundaries (i.e. it would avoid edge effects).

For now we will look at a single BME ethnic group. The NOMIS census code for this group is 132 (the NOMIS detailed ethnicity codelist is available from https://www.nomisweb.co.uk/api/v01/dataset/NM_575_1/cell.def.htm).

api_string <- "http://www.nomisweb.co.uk/api/v01/dataset/NM_575_1.data.csv?date=latest&geography=E06000039TYPE299&rural_urban=0&cell=132&measures=20100&select=date_name,geography_name,geography_code,rural_urban_name,cell_name,measures_name,obs_value,obs_status_name"

ethnicity_detailed <- read.csv(api_string, stringsAsFactors=F)
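
Since the eventual aim is to repeat this for other areas and groups, the query can be parameterised — a minimal sketch against the same NM_575_1 endpoint (build_nomis_url is my own hypothetical helper, not part of any package):

# hypothetical helper: substitute geography and cell codes into the NOMIS query string
build_nomis_url <- function(geography, cell) {
  paste0("http://www.nomisweb.co.uk/api/v01/dataset/NM_575_1.data.csv",
         "?date=latest&geography=", geography, "TYPE299",
         "&rural_urban=0&cell=", cell, "&measures=20100",
         "&select=date_name,geography_name,geography_code,rural_urban_name,",
         "cell_name,measures_name,obs_value,obs_status_name")
}

# equivalent to the call above
ethnicity_detailed <- read.csv(build_nomis_url("E06000039", 132), stringsAsFactors=FALSE)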

Load Resident Population Numbers

We will also pull population information from NOMIS to use as a denominator when calculating a proportion for each output area.

api_string <- "http://www.nomisweb.co.uk/api/v01/dataset/NM_144_1.data.csv?date=latest&geography=E06000039TYPE299&rural_urban=0&cell=0&measures=20100&select=date_name,geography_name,geography_code,rural_urban_name,cell_name,measures_name,obs_value,obs_status_name"

pop_detailed <- read.csv(api_string, stringsAsFactors=F)

pop_detailed <- pop_detailed %>%
  dplyr::select(GEOGRAPHY_CODE, "POPULATION"=OBS_VALUE)

Download and Unzip the Output Area Lookup Data

I wanted R to download the file directly, but it seems to be a bit more complex than just pointing at a URL, so I downloaded it manually and use R only to unzip and read in the CSV file. This file allows us to assign Output Areas to the correct Local Authority.
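
For the record, a direct download can sometimes be scripted with download.file() — a sketch, assuming a stable hosting URL (the URL below is a placeholder, not the real location of the lookup file):

# untested sketch: download the zip directly; mode="wb" is required on Windows for binary files
lookup_url <- "https://example.com/OA11_WD11_LAD11_EW_LU.zip" # placeholder URL
download.file(lookup_url, destfile="data\\OA11_WD11_LAD11_EW_LU.zip", mode="wb")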

# Unzip
unzip(zipfile="data\\OA11_WD11_LAD11_EW_LU.zip", exdir="data")

# Import
oa_lad_lkp <- read.csv("data\\OA11_WD11_LAD11_EW_LU.csv", stringsAsFactors=F)

# Trim out the fields we don't need in the LAD lookup
oa_lad_lkp <- oa_lad_lkp %>% dplyr::select(OA11CD,LAD11CD,LAD11NM)

Now the Local Authority Code can be joined to the census ethnicity table.

ethnicity_detailed <- ethnicity_detailed %>% 
  left_join(pop_detailed, by="GEOGRAPHY_CODE") %>% 
  left_join(oa_lad_lkp, by=c("GEOGRAPHY_CODE"="OA11CD")) %>% 
  filter(!is.na(LAD11CD)) %>% 
  mutate(
    "CELL_NAME"=gsub("[.]"," ",CELL_NAME),
    "POP_PROPORTION"=OBS_VALUE/POPULATION)

Obtain Data From a PostGIS PostgreSQL System

In another post I discuss setting up a spatial database using PostgreSQL with PostGIS. This system can now be connected to in order to download the output areas for Slough.

# It should be possible to pull data from my postgres database using the rgdal package, but that doesn't want to work for some reason, so the postGIStools package is used instead
# dsn <- "PG:dbname=spatial_data_store host=localhost port=5432 user=postgres password=postgres"
# ogrListLayers(dsn)

# Create connection to the postgis database where all my shapefiles are stored
con <- dbConnect(PostgreSQL(), dbname = "spatial_data_store", user = "postgres",
                 host = "localhost",
                 password = "postgres")

# Pull all the output areas from the postgis database for a specific local authority
oa_shp <- get_postgis_query(con,
                            "SELECT *
                             FROM output_area_december_2011_generalised_clipped_boundaries_in_eng
                             WHERE lad11cd='E06000039'",
                            geom_name = "geom")

Map of Ethnic Group by Output Area

Now we can see how the population of this particular ethnic group is distributed across Slough, both as a proportion of each output area's population and as a raw count per output area.

oa_shp_tidy <- tidy(oa_shp,region="oa11cd")


# Join the ethnicity and population figures to the tidied shapefile
oa_shp_tidy <- oa_shp_tidy %>% left_join(ethnicity_detailed,by=c("id"="GEOGRAPHY_CODE"))


plot_proportion <- ggplot(oa_shp_tidy, aes(long, lat, fill=POP_PROPORTION, group=id)) +
  geom_polygon(col="grey") +
  scale_fill_gradient(low="white", high="red", labels=scales::percent) +
  labs(fill="Proportion") +
  coord_fixed() +
  theme_void() +
  theme(legend.position="bottom")

plot_numbers <- ggplot(oa_shp_tidy, aes(long, lat, fill=OBS_VALUE, group=id)) +
  geom_polygon(col="grey") +
  scale_fill_gradient(low="white", high="blue") +
  labs(fill="Count") +
  coord_fixed() +
  theme_void() +
  theme(legend.position="bottom")

grid.arrange(plot_proportion, plot_numbers, ncol=2, nrow=1)

Dots in Polygon Mapping

Purely to make a more interesting presentation of the data, I use the dotsInPolys function from the maptools package. This generates dot density data (one randomly placed dot per person within each output area), which we can use to create a 2D kernel density estimate later on.

Generate the Dots

# merge the population data into the output area shapefile
oa_shp_data <-  merge(oa_shp, ethnicity_detailed, by.x="oa11cd",by.y="GEOGRAPHY_CODE")
oa_shp_data <-  oa_shp_data[!is.na(oa_shp_data$OBS_VALUE),]
oa_shp_data <-  oa_shp_data[oa_shp_data$OBS_VALUE>0,]

# create the dot density data: one randomly placed dot per person within each
# output area (f="random" means positions vary between runs unless set.seed() is called first)
dot_density <- dotsInPolys(oa_shp_data, x=as.integer(oa_shp_data$OBS_VALUE), f="random")


{
par(mar=c(0,0,0,0))
plot(dot_density, pch=16,  cex=0.4, col="#00000044")
plot(oa_shp_data, add=TRUE, border="black")
}

Create a 2d Kernel Density Raster

# Calculate a 500 metre margin around the shapefile to ensure the density hotspots aren't cut off at the edge of the map.
map_padding <- 500
map_padding <- c(-map_padding, map_padding,
                 -map_padding, map_padding)

shp_limits <- extent(oa_shp)[1:4]

kernel_density_estimate <- kde2d(dot_density$x,
                                 dot_density$y,
                                 h=400, # bandwidth in map units (metres here)
                                 n=500, # gridpoints in each direction
                                 lims=shp_limits+map_padding)


raster_kde2d <- raster(kernel_density_estimate)

Plot the 2d Kernel Density Estimate Raster

Now a basic plot can be produced before we do the full version using ggplot.

{
plot(raster_kde2d, col=magma(7))
plot(oa_shp_data, add=TRUE, border="#555555")
}

Some Additional Details

Now I pull some extra contextual information from my PostgreSQL spatial database: the overall outline of Slough, building outlines and roads.

# Pull the outline of Slough
slough_shp <- get_postgis_query(con,
                                  "SELECT *
                                    FROM local_authority_districts_december_2016_generalised_clipped_bou
                                    WHERE lad16cd = 'E06000039'",
                             geom_name = "geom")



# Pull Slough buildings
bld_slough_shp <- get_postgis_query(con,
                                 "SELECT *
                                   FROM bld_clip_slough",
                            geom_name = "geom")


# Pull Slough roads
rd_slough_shp <- get_postgis_query(con,
                                 "SELECT *
                                   FROM rd_clip_slough",
                            geom_name = "geom")

# Prepare the new spatial data for plotting in ggplot
slough_shp_tidy <- tidy(slough_shp,region="lad16cd")
bld_slough_shp_tidy <- tidy(bld_slough_shp,region="id")
rd_slough_shp_tidy <- tidy(rd_slough_shp,region="id")

Plot Kernel Density Map in ggplot

# set background and foreground colours
bckgrnd_col <- viridis(2, alpha = 1, begin = 0.1, end = 0.9, option="magma")[1]
frgrnd_col <- viridis(2, alpha = 1, begin = 0.1, end = 0.9, option="magma")[2]

windowsFonts(arial=windowsFont("Arial"))

# set themes
theme_map <- function(...) {
  theme_minimal() +
  theme(
    text = element_text(family = "arial", colour="#FFFFFF"), #, size=36),
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = bckgrnd_col, color = NA),
    panel.background = element_rect(fill = bckgrnd_col, color = NA),
    legend.background = element_rect(fill = bckgrnd_col, color = NA),
    panel.border = element_blank(),
    ...
  )
}



ggplot() +
  #stat_density_2d(data=as.data.frame(coordinates(x)), aes(x=x, y=y,fill = ..density..), geom = "raster", contour = FALSE) +
  geom_polygon(data=slough_shp_tidy,
               aes(long, lat, group=id),
               fill=NA,
               colour="white",
               size=1) +
  geom_raster(data=as.data.frame(raster_kde2d,
                                 xy=TRUE),
              aes(x,y,fill=layer)) +
  scale_fill_viridis(option="magma",
                     begin=0.1,
                     end=0.9,
                     guide = guide_colorbar(
                              direction = "horizontal",
                              #barheight = unit(30, units = "mm"),
                              #barwidth = unit(300, units = "mm"),
                              draw.ulim = FALSE,
                              title.position = 'top',
                              title.hjust = 0.5,
                              label.hjust = 0.5)) +
  geom_polygon(data=bld_slough_shp_tidy, aes(long, lat, group=id), fill="white", colour=NA, alpha=0.5) +
  geom_path(data=rd_slough_shp_tidy, aes(long, lat, group=id), colour="white", alpha=0.5) +
  geom_polygon(data=slough_shp_tidy, aes(long, lat, group=id), fill=NA, colour="white",size=0.25) +
  coord_fixed() +
  labs(x=NULL,
       y=NULL,
       fill="Density") +
  theme_void() +
  theme(legend.position = "bottom")

Clustering of Ethnic Group

Now we can test to see if there is significant clustering of this ethnic group and which output areas are part of this clustering. For this the Moran’s I statistic can be used to look at both global and local spatial autocorrelation. A useful tutorial on this can be found here: https://mgimond.github.io/Spatial/spatial-autocorrelation-in-r.html

Create a Neighbourhood

First we create a neighbourhood object using the poly2nb function and our output area shapefile. We do this using the ‘Queen’s case’ setting, meaning that adjacent areas which share either a border or a corner are counted as neighbours.

# note: oa_shp_data was filtered above (zero-count areas were dropped), so reset it from oa_shp if the full set of output areas is needed
neighbourhood <- poly2nb(oa_shp_data, queen=TRUE)

{
  par(mar=c(0,0,0,0))
  plot(oa_shp_data,
      border="grey")
  plot(neighbourhood,
     coords=coordinates(oa_shp_data),
     col="red",
     add=T)
  }
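
For comparison, a rook's-case neighbourhood (areas must share a border, not just a corner) only needs the queen argument switched off:

# rook's case: polygons touching only at a corner are not counted as neighbours
neighbourhood_rook <- poly2nb(oa_shp_data, queen=FALSE)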

Generate Neighbourhood Weights

# generate the list of spatial weights; style="W" row-standardises them so each
# area's neighbour weights sum to one, and zero.policy allows areas with no neighbours
neighbourhood_weights_list <- nb2listw(neighbourhood, style="W", zero.policy=TRUE)

Compute Global Moran’s I test

moran.test(oa_shp_data$POP_PROPORTION,neighbourhood_weights_list)
## 
##  Moran I test under randomisation
## 
## data:  oa_shp_data$POP_PROPORTION  
## weights: neighbourhood_weights_list  
## 
## Moran I statistic standard deviate = 23.441, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.7327179450     -0.0026041667      0.0009840443

Now a Monte Carlo based test of significance can be performed.

moran_i_monte_carlo <- moran.mc(oa_shp_data$POP_PROPORTION,
                                neighbourhood_weights_list,
                                nsim=599)

# View results (including p-value)
moran_i_monte_carlo
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  oa_shp_data$POP_PROPORTION 
## weights: neighbourhood_weights_list  
## number of simulations + 1: 600 
## 
## statistic = 0.73272, observed rank = 600, p-value = 0.001667
## alternative hypothesis: greater
# Plot the distribution (note that this is a density plot instead of a histogram)
plot(moran_i_monte_carlo)

With 599 simulations the observed statistic exceeds every simulated value (observed rank 600 of 600), giving the smallest obtainable p-value of 1/(599 + 1) ≈ 0.0017. A local Moran's I statistic can now be calculated for each output area.

# Local Moran's I for each output area
LM_Results <- localmoran(oa_shp_data$POP_PROPORTION,
                         neighbourhood_weights_list,
                         p.adjust.method="bonferroni",
                         na.action=na.exclude,
                         zero.policy=TRUE)

summary(LM_Results)
##        Ii               E.Ii               Var.Ii             Z.Ii        
##  Min.   :-0.9174   Min.   :-0.002604   Min.   :0.05603   Min.   :-2.1682  
##  1st Qu.: 0.1113   1st Qu.:-0.002604   1st Qu.:0.13976   1st Qu.: 0.2451  
##  Median : 0.4019   Median :-0.002604   Median :0.19670   Median : 0.9107  
##  Mean   : 0.7327   Mean   :-0.002604   Mean   :0.20317   Mean   : 1.7362  
##  3rd Qu.: 0.8386   3rd Qu.:-0.002604   3rd Qu.:0.24652   3rd Qu.: 1.9323  
##  Max.   : 5.0224   Max.   :-0.002604   Max.   :0.99382   Max.   :14.1130  
##    Pr(z > 0)     
##  Min.   :0.0000  
##  1st Qu.:0.1778  
##  Median :1.0000  
##  Mean   :0.6611  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

The results of the local Moran's I are merged back into the output area shapefile for plotting.

# add the local Moran's I results back to the shapefile
oa_shp_data@data$lmoran_i <- LM_Results[,1]        # Ii statistic
oa_shp_data@data$lmoran_p <- LM_Results[,5]        # Pr(z > 0)
oa_shp_data@data$lmoran_sig <- LM_Results[,5]<0.01 # significant at the 1% level


# manually make a moran plot based on standardised variables
# standardise variables and save to a new column
oa_shp_data$SCALED_POP_PROPORTION <- scale(oa_shp_data$POP_PROPORTION)

# create a lagged variable
oa_shp_data$LAGGED_SCALED_POP_PROPORTION <- lag.listw(neighbourhood_weights_list, oa_shp_data$SCALED_POP_PROPORTION)


oa_shp_data$SPATIAL_LAG_CAT <- factor(ifelse(oa_shp_data$SCALED_POP_PROPORTION>0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION>0, "High-High",
       ifelse(oa_shp_data$SCALED_POP_PROPORTION>0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION<0, "High-Low",
              ifelse(oa_shp_data$SCALED_POP_PROPORTION<0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION<0, "Low-Low",
                     ifelse(oa_shp_data$SCALED_POP_PROPORTION<0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION>0, "Low-High",
       "Equivalent")))))

First we look at the relationship between population proportion and the spatially lagged values.

ggplot(oa_shp_data@data, aes(SCALED_POP_PROPORTION, LAGGED_SCALED_POP_PROPORTION, colour=lmoran_p)) +
  geom_point(alpha=0.5, size=3) +
  geom_smooth(method="lm", se=F, col="red") +
  geom_hline(yintercept=0, lty=2) +
  geom_vline(xintercept=0, lty=2) +
  theme_bw() +
  labs(title="Scaled Spatial Lag Comparison",
       x="Scaled Value",
       y="Lagged Scaled Value")

Second, we can plot a comparison set of maps using ggplot to look at the clusters which are identified.

# set id columns to merge local moran's results back to the shapefile
oa_shp_data@data$id <- row.names(oa_shp_data@data)

# tidy the shapefile
oa_shp_data_tidy <- tidy(oa_shp_data,region="id")
oa_shp_data_tidy <- merge(oa_shp_data_tidy,oa_shp_data@data,by="id")



# ==================================================
# Comparison plot

gg1 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=lmoran_sig, group=id)) +
  geom_polygon(col="white") +
  scale_fill_manual(values=c("grey","red")) +
  coord_fixed() +
  theme_void()


gg2 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=POP_PROPORTION, group=id)) +
  geom_polygon(col="white") +
  scale_fill_gradient(low="white",high="red", labels=scales::percent) +
  coord_fixed() +
  theme_void()

gg3 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=lmoran_i, group=id)) +
  geom_polygon(col="white") +
  scale_fill_gradient(low="white",high="blue") +
  coord_fixed() +
  theme_void()

gg4 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=SPATIAL_LAG_CAT, group=id)) +
  geom_polygon(col="white") +
  scale_fill_manual(values=c("red","pink","light blue","blue")) +
  coord_fixed() +
  theme_void()

# grid.arrange(gg1, gg2, gg3, gg4, ncol=1, nrow=4)

gg1

gg2

gg3

gg4

Now we can create a plot which shows only the areas with a high proportion that are surrounded by high proportion areas and are statistically significant.

oa_shp_data_tidy_sig_high_high <- oa_shp_data_tidy[oa_shp_data_tidy$lmoran_sig==T & oa_shp_data_tidy$SPATIAL_LAG_CAT=="High-High",]

ggplot() +
  geom_polygon(data=oa_shp_data_tidy, aes(long, lat, group=id), fill="grey", col="white") +
  geom_polygon(data=oa_shp_data_tidy_sig_high_high, aes(long, lat, group=id), fill="red", col="white") +
  coord_fixed() +
  theme_void()

We then union the clusters to form cluster boundaries.

oa_shp_data_sig_high_high <- oa_shp_data[oa_shp_data$lmoran_sig==T & oa_shp_data$SPATIAL_LAG_CAT=="High-High",]
plot(oa_shp_data_sig_high_high)

cluster_boundary <- gUnaryUnion(oa_shp_data_sig_high_high)
plot(cluster_boundary)
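
If gUnaryUnion ever fails with a topology error (self-intersections are common in clipped boundary files), a zero-width buffer is a common workaround — a sketch:

# repair invalid geometries before dissolving (a no-op if the polygons are already valid)
oa_shp_data_sig_high_high <- gBuffer(oa_shp_data_sig_high_high, byid=TRUE, width=0)
cluster_boundary <- gUnaryUnion(oa_shp_data_sig_high_high)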

The cluster boundaries can then be plotted.

cluster_boundary_tidy <- tidy(cluster_boundary)

ggplot() +
  geom_polygon(data=slough_shp_tidy,
               aes(long, lat, group=id),
               fill=NA,
               colour="white",
               size=1) +
  geom_raster(data=as.data.frame(raster_kde2d,
                                 xy=TRUE),
              aes(x,y,fill=layer)) +
  scale_fill_viridis(option="magma",
                     begin=0.1,
                     end=0.9,
                     guide = guide_colorbar(
                              direction = "horizontal",
                              #barheight = unit(30, units = "mm"),
                              #barwidth = unit(300, units = "mm"),
                              draw.ulim = FALSE,
                              title.position = 'top',
                              title.hjust = 0.5,
                              label.hjust = 0.5)) +
  geom_polygon(data=bld_slough_shp_tidy, aes(long, lat, group=id), fill="white", colour=NA, alpha=0.5) +
  geom_path(data=rd_slough_shp_tidy, aes(long, lat, group=id), colour="white", alpha=0.5) +
  geom_polygon(data=slough_shp_tidy, aes(long, lat, group=id), fill=NA, colour="white",size=0.25) +
  geom_polygon(data=cluster_boundary_tidy, aes(long, lat, group=piece), fill=NA, colour="white",size=1, lty=1) +
  coord_fixed() +
  labs(x=NULL,
       y=NULL,
       fill="Density") +
  theme_void() +
  theme(legend.position = "bottom")

GP Registration Numbers

It would be useful to know whether areas with low GP registration are in any way related to our clusters. For now we will just examine this visually.

Load GP Practice Locations

Here I pull GP practice coordinates from a SQLite database I have set up which contains the ONS postcode directory data. I then filter them to include only those in our selected Local Authority, using a point-in-polygon selection.

# Connect to pre-existing SQLite database containing ONS postcode directory
db_connection <- dbConnect(SQLite(), dbname="C:\\Users\\User1\\Documents\\Rstudio_Projects\\ONS_Lookup_Database\\ons_lkp_db")


# SQL query to pull postcode lookup data from SQLite database
pcd <- dbGetQuery(db_connection,
                  "SELECT pcd_spaceless, oseast1m, osnrth1m
                   FROM ONS_PD
                   WHERE length(oseast1m)!=0")

# Load NHS GP reference data
gpp <- read.csv(file="data\\epraccur\\epraccur.csv", stringsAsFactors=F)

# Join postcode directory to GP reference data
gpp <- gpp %>% 
  mutate("pcd_spaceless"=gsub(" ","",gpp_pcd)) %>% 
  left_join(pcd, by="pcd_spaceless") # join explicitly on the spaceless postcode

# We lose a couple of hundred practices mainly from Jersey, Guernsey and Northern Ireland
# drop those practices with null coordinates
gpp <- gpp %>% filter(!is.na(oseast1m))

# remove the pcd table as it takes up a lot of memory
rm(pcd)

# Now convert the GP practice coordinates to a spatial points dataframe
xy <- gpp[, c("oseast1m", "osnrth1m")] # select the coordinate columns by name rather than position
gpp_shp <- SpatialPointsDataFrame(coords = xy, data = gpp,
                               proj4string = CRS("+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +datum=OSGB36 +units=m +no_defs +ellps=airy +towgs84=446.448,-125.157,542.060,0.1502,0.2470,0.8421,-20.4894"))
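
The long proj4 string above is the British National Grid; an equivalent, shorter construction references it by EPSG code, assuming the local PROJ installation resolves it:

# equivalent CRS specification for OSGB 1936 / British National Grid
gpp_shp <- SpatialPointsDataFrame(coords = xy, data = gpp,
                                  proj4string = CRS("+init=epsg:27700"))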


# filter our practices to those within our local authority area using the over function to perform a point-in-polygon selection
point_in_polygon <- sp::over(gpp_shp, slough_shp)
point_in_polygon$row_number <- row.names(point_in_polygon)
point_in_polygon <- point_in_polygon[is.na(point_in_polygon$gid)==F,]

gpp_shp_filtered <- gpp_shp[point_in_polygon$row_number,]
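
Incidentally, sp also supports spatial subsetting directly, which performs the same point-in-polygon selection in one line:

# equivalent one-line selection: keep the points that fall inside the Slough polygon
gpp_shp_filtered <- gpp_shp[slough_shp, ]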


{
par(mar=c(0,0,0,0))
plot(slough_shp)
plot(gpp_shp_filtered, add=TRUE)
}

Generate GP Catchment Areas

Note that we will generate catchment areas and then select all of the ones which overlap with our selected local authority.

# Read in the LSOA catchment information
gp_catch <- read.csv("data\\gp-reg-patients-LSOA-alt-tall.csv", stringsAsFactors=F)

# Get a list of LSOAs within our local authority
slough_lsoa <- dbGetQuery(db_connection, "SELECT DISTINCT lsoa11 FROM ONS_PD WHERE oslaua='E06000039'")


# Read in LSOA shapefile to attach to GP practices
lsoa_shp <- readOGR(dsn="shp", layer="Lower_Layer_Super_Output_Areas_December_2011_Generalised_Clipped__Boundaries_in_England_and_Wales", stringsAsFactors=FALSE)
## OGR data source with driver: ESRI Shapefile 
## Source: "shp", layer: "Lower_Layer_Super_Output_Areas_December_2011_Generalised_Clipped__Boundaries_in_England_and_Wales"
## with 34753 features
## It has 6 fields
## Integer64 fields read as strings:  objectid
# read in LSOA population figures
lsoa_pop_2016 <- read.csv("data\\ons_mid2016_lsoa_pop.csv", stringsAsFactors=F)
lsoa_pop_2016$All.Ages <- as.integer(gsub(",","",lsoa_pop_2016$All.Ages))

# View some summary information for each GP practice
gp_catch %>% 
  group_by(PRACTICE_CODE) %>% 
  summarise("n"=n(),
            "TOTAL_POP"=sum(ALL_PATIENTS),
            "MEAN_LSOA_POP"=mean(ALL_PATIENTS),
            "MEDIAN_LSOA_POP"=median(ALL_PATIENTS))
## # A tibble: 7,531 x 5
##    PRACTICE_CODE     n TOTAL_POP MEAN_LSOA_POP MEDIAN_LSOA_POP
##            <chr> <int>     <int>         <dbl>           <dbl>
##  1        A81001    50      4178      83.56000             6.5
##  2        A81002    91     19902     218.70330           220.0
##  3        A81004   134      9344      69.73134            42.0
##  4        A81005    23      7931     344.82609           167.0
##  5        A81006   106     13661     128.87736            95.0
##  6        A81007    68      9834     144.61765           160.5
##  7        A81008   129      3973      30.79845             4.0
##  8        A81009   128      9084      70.96875            49.5
##  9        A81011    70     11723     167.47143           182.0
## 10        A81012   132      4778      36.19697            25.0
## # ... with 7,521 more rows
lsoa_pop_var <- gp_catch %>% 
  filter(substring(LSOA_CODE,1,1)=="E") %>% # remove Wales
  group_by(LSOA_CODE) %>% 
  summarise(
    "Number_of_Practices"=n(),
    "All_Patients"=sum(ALL_PATIENTS)
            ) %>% 
  left_join(lsoa_pop_2016, by=c("LSOA_CODE"="LSOA_Code")) %>% 
  mutate("Difference"=All_Patients-All.Ages,
         "LA_Flag"=ifelse(LSOA_CODE %in% slough_lsoa$lsoa11,"LSOA_CODE",NA)) %>% 
  filter(is.na(LA_Flag)==F)



lsoa_shp_pop <-  sp::merge(lsoa_shp, lsoa_pop_var, by.x="lsoa11cd", by.y="LSOA_CODE", all.x=FALSE)

# get polygon centroids
xy <- coordinates(lsoa_shp_pop)
lsoa_shp_pop$x <- xy[,1]
lsoa_shp_pop$y <- xy[,2]

# define a function to handle tidying (which used to be called fortifying) and then joining the data items back in
clean <- function(shape){
  shape@data$id <- rownames(shape@data)
  shape.points <- tidy(shape, region="id")
  shape.df <- inner_join(shape.points, shape@data, by="id")
  shape.df
}


lsoa_shp_pop_tidy <- clean(lsoa_shp_pop)



ggplot(data=lsoa_shp_pop_tidy,
               aes(long,
                   lat,
                   group=id,
                   fill=Difference)) +
  geom_polygon(colour="white") +
  geom_text(aes(x,y,label=Number_of_Practices),color = "white", size=3) +
  scale_fill_gradient2(midpoint=0, low="red", mid="white",high="blue") +
  coord_fixed() +
  theme_void()

# ----------
median_ons_pop <- median(lsoa_pop_var$All.Ages, na.rm=T)

ggplot(lsoa_pop_var, aes(All.Ages)) +
  geom_histogram(bins=200) +
  geom_vline(xintercept=median_ons_pop, linetype=2, col="red") +
  theme_bw() +
  labs(x="LSOA Population (2016 Estimate)", y="Count", title="Distribution of LSOA Population Estimates (2016)")

# ----------
median_pat_pop_diff <- median(lsoa_pop_var$Difference, na.rm=T)

ggplot(lsoa_pop_var, aes(Difference)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept=median_pat_pop_diff, linetype=2, col="red") +
  theme_bw() +
  labs(x="Difference between registered and resident population", y="Count", title="Distribution of Difference Between LSOA Population Estimates (2016) and Registrants") +
  xlim(-2500,2500)

# ----------

ggplot(lsoa_pop_var, aes(All_Patients,All.Ages,colour=Difference)) +
  geom_point() +
  scale_color_gradient2(low="#2c7bb6", mid="#ffffbf", high="#d7191c") +
  geom_smooth(method="lm", se=T, colour="black", linetype=3) +
  #geom_abline(intercept=0, slope=1) +
  theme_bw() +
  labs(x="Population Estimate 2016",
       y="Registrants",
       title="LSOA Population Estimates vs Number of Registrants"
       ) +
  coord_equal()

# ----------
median_pat_num_prac <- median(lsoa_pop_var$Number_of_Practices, na.rm=T)

ggplot(lsoa_pop_var, aes(Number_of_Practices)) +
  geom_histogram(binwidth=1) +
  geom_vline(xintercept=median_pat_num_prac, linetype=2, col="red") +
  theme_bw() +
  labs(x="Number of GP Practices per LSOA", y="Count")
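
Finally, the database connections opened earlier can be released once all the data has been pulled:

# close the PostgreSQL and SQLite connections
dbDisconnect(con)
dbDisconnect(db_connection)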