This project looks at the following:
Why look at this information? Health inequalities are a major concern in public health, and they are driven by a wide variety of factors. One of these factors can be cultural attitudes towards health and wellbeing services, which lead to poorer uptake of, and lower engagement with, programmes.
This work will hopefully form a foundation upon which a more fully-featured tool might be developed, allowing local public health teams to better understand their resident populations and design services which are both acceptable to and appropriate for those groups.
A more detailed discussion of the issue of health and ethnicity in the UK can be read here: http://www.parliament.uk/documents/post/postpn276.pdf
library("tidyverse")
library("dplyr")
library("rgdal")
library("extrafont")
library("sp")
library("maptools")
library("rgeos")
library("MASS")
library("raster")
library("broom") # contains the tidy function which now replaces the fortify function for ggplot
library("viridis") # For nicer ggplot colours
library("spdep")
library("gridExtra")
library("Cairo")
library("RSQLite")
# To connect to a postgis database
library("RPostgreSQL")
library("postGIStools")
Census data can be downloaded from the NOMIS website: https://www.nomisweb.co.uk/
In this instance I use an API to pull the data I need directly at Output Area level. For now I am only downloading data for Slough (ONS code E06000039); eventually I would like to download data for the entire country at this level of detail, as this would allow identification of clusters which span authority boundaries (i.e. avoid edge effects).
For now we will look at a single ethnic group. The NOMIS census code for this group is 132 (the detailed NOMIS ethnicity codelist is available from https://www.nomisweb.co.uk/api/v01/dataset/NM_575_1/cell.def.htm).
api_string <- "http://www.nomisweb.co.uk/api/v01/dataset/NM_575_1.data.csv?date=latest&geography=E06000039TYPE299&rural_urban=0&cell=132&measures=20100&select=date_name,geography_name,geography_code,rural_urban_name,cell_name,measures_name,obs_value,obs_status_name"
ethnicity_detailed <- read.csv(api_string, stringsAsFactors=F)
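Since the longer-term aim is to repeat this for other areas, the query string could also be built parametrically instead of hard-coded. A minimal sketch, assuming the same "ONS code followed by TYPE299" geography convention used in the URL above (build_nomis_url is my own hypothetical helper, not part of the original workflow):
# Hypothetical helper: build the NOMIS NM_575_1 query for any local authority and cell code
build_nomis_url <- function(la_code, cell) {
  paste0("http://www.nomisweb.co.uk/api/v01/dataset/NM_575_1.data.csv?date=latest",
         "&geography=", la_code, "TYPE299",
         "&rural_urban=0&cell=", cell, "&measures=20100",
         "&select=date_name,geography_name,geography_code,rural_urban_name,",
         "cell_name,measures_name,obs_value,obs_status_name")
}
# e.g. the call above is equivalent to:
# ethnicity_detailed <- read.csv(build_nomis_url("E06000039", 132), stringsAsFactors=FALSE)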
We will also pull population information from NOMIS to use as a denominator when calculating a proportion for each output area.
api_string <- "http://www.nomisweb.co.uk/api/v01/dataset/NM_144_1.data.csv?date=latest&geography=E06000039TYPE299&rural_urban=0&cell=0&measures=20100&select=date_name,geography_name,geography_code,rural_urban_name,cell_name,measures_name,obs_value,obs_status_name"
pop_detailed <- read.csv(api_string, stringsAsFactors=F)
pop_detailed <- pop_detailed %>%
dplyr::select(GEOGRAPHY_CODE, "POPULATION"=OBS_VALUE)
I wanted R to download the file directly, but this turned out to be more complex than simply pointing at a URL, so I downloaded it manually and only use R to unzip and read in the CSV file. This lookup file allows us to assign Output Areas to the correct Local Authority.
# Unzip
unzip(zipfile="data\\OA11_WD11_LAD11_EW_LU.zip", exdir="data")
# Import
oa_lad_lkp <- read.csv("data\\OA11_WD11_LAD11_EW_LU.csv", stringsAsFactors=F)
# Trim out the fields we don't need in the LAD lookup
oa_lad_lkp <- oa_lad_lkp %>% dplyr::select(OA11CD,LAD11CD,LAD11NM)
Now the Local Authority Code can be joined to the census ethnicity table.
ethnicity_detailed <- ethnicity_detailed %>%
left_join(pop_detailed, by="GEOGRAPHY_CODE") %>%
left_join(oa_lad_lkp, by=c("GEOGRAPHY_CODE"="OA11CD")) %>%
filter(!is.na(LAD11CD)) %>%
mutate(
"CELL_NAME"=gsub("[.]"," ",CELL_NAME),
"POP_PROPORTION"=OBS_VALUE/POPULATION)
In another post I discuss setting up a spatial database using PostgreSQL with PostGIS. We can now connect to this system to download the output area boundaries for Slough.
# It should be possible to pull data from my postgres database using the rgdal package, but this doesn't work for some reason, so the postGIStools package is used instead
# dsn <- "PG:dbname=spatial_data_store host=localhost port=5432 user=postgres password=postgres"
# ogrListLayers(dsn)
# Create connection to the postgis database where all my shapefiles are stored
con <- dbConnect(PostgreSQL(), dbname = "spatial_data_store", user = "postgres",
host = "localhost",
password = "postgres")
# Pull all the output areas from the postgis database for a specific local authority
oa_shp <- get_postgis_query(con,
"SELECT *
FROM output_area_december_2011_generalised_clipped_boundaries_in_eng
WHERE lad11cd='E06000039'",
geom_name = "geom")
Now we can see how the population of this particular ethnic group is distributed across Slough, both as a rate per output area and also as actual numbers per output area.
oa_shp_tidy <- tidy(oa_shp,region="oa11cd")
# Merge population figures to shapefile
oa_shp_tidy <- oa_shp_tidy %>% left_join(ethnicity_detailed,by=c("id"="GEOGRAPHY_CODE"))
plot_proportion <- ggplot(oa_shp_tidy, aes(long, lat, fill=POP_PROPORTION, group=id)) +
geom_polygon(col="grey") +
scale_fill_gradient2(low="white",high="red", labels=scales::percent) +
labs(fill="Proportion") +
coord_fixed() +
theme_void() +
theme(legend.position="bottom")
plot_numbers <- ggplot(oa_shp_tidy, aes(long, lat, fill=OBS_VALUE, group=id)) +
geom_polygon(col="grey") +
scale_fill_gradient2(low="white",high="blue") +
labs(fill="Count") +
coord_fixed() +
theme_void() +
theme(legend.position="bottom")
grid.arrange(plot_proportion, plot_numbers, ncol=2, nrow=1)
Purely to make a more interesting presentation of this data, I use the dotsInPolys function from the maptools package. This creates dot density data (one randomly placed point per person in the group within each output area) which we can use to create a 2D kernel density estimate later on.
# merge the population data into the output area shapefile
oa_shp_data <- merge(oa_shp, ethnicity_detailed, by.x="oa11cd",by.y="GEOGRAPHY_CODE")
oa_shp_data <- oa_shp_data[!is.na(oa_shp_data$OBS_VALUE),]
oa_shp_data <- oa_shp_data[oa_shp_data$OBS_VALUE>0,]
# create the dot density data
dot_density <- dotsInPolys(oa_shp_data, x=as.integer(oa_shp_data$OBS_VALUE), f="random")
{
par(mar=c(0,0,0,0))
plot(dot_density, pch=16, cex=0.4, col="#00000044")
plot(oa_shp_data, add=TRUE, border="black")
}
# Calculate a 500 metre margin around the shapefile to ensure the density hotspots aren't cut off on the map plot.
map_padding <- 500
map_padding <- c(-map_padding, map_padding,
-map_padding, map_padding)
shp_limits <- extent(oa_shp)[1:4]
kernel_density_estimate <- kde2d(dot_density$x,
                                 dot_density$y,
                                 h=400, # bandwidth
                                 n=500, # gridpoints in each direction
                                 lims=shp_limits+map_padding)
raster_kde2d <- raster(kernel_density_estimate)
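The 400 metre bandwidth above is a manual choice; MASS also provides a normal reference rule of thumb that could be used instead. A minimal sketch (not used for the plots below):
# Data-driven bandwidth as an alternative to the fixed 400 m value
h_auto <- c(bandwidth.nrd(dot_density$x), bandwidth.nrd(dot_density$y))
kde_auto <- kde2d(dot_density$x, dot_density$y,
                  h=h_auto, n=500,
                  lims=shp_limits+map_padding)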
Now a basic plot can be produced before we do the full version using ggplot.
{
plot(raster_kde2d, col=magma(7))
plot(oa_shp_data, add=TRUE, border="#555555")
}
Now I pull some extra contextual information from my PostgreSQL spatial database: the overall outline of Slough, building outlines and roads.
# Pull the outline of Slough
slough_shp <- get_postgis_query(con,
"SELECT *
FROM local_authority_districts_december_2016_generalised_clipped_bou
WHERE lad16cd = 'E06000039'",
geom_name = "geom")
# Pull Slough buildings
bld_slough_shp <- get_postgis_query(con,
"SELECT *
FROM bld_clip_slough",
geom_name = "geom")
# Pull Slough roads
rd_slough_shp <- get_postgis_query(con,
"SELECT *
FROM rd_clip_slough",
geom_name = "geom")
# Prepare the new spatial data for plotting in ggplot
slough_shp_tidy <- tidy(slough_shp,region="lad16cd")
bld_slough_shp_tidy <- tidy(bld_slough_shp,region="id")
rd_slough_shp_tidy <- tidy(rd_slough_shp,region="id")
# set background and foreground colours
bckgrnd_col <- viridis(2, alpha = 1, begin = 0.1, end = 0.9, option="magma")[1]
frgrnd_col <- viridis(2, alpha = 1, begin = 0.1, end = 0.9, option="magma")[2]
windowsFonts(arial=windowsFont("Arial"))
# set themes
theme_map <- function(...) {
theme_minimal() +
theme(
text = element_text(family = "arial", colour="#FFFFFF"), #, size=36),
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = bckgrnd_col, color = NA),
panel.background = element_rect(fill = bckgrnd_col, color = NA),
legend.background = element_rect(fill = bckgrnd_col, color = NA),
panel.border = element_blank(),
...
)
}
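As a usage note, this custom theme can be dropped into any of the ggplot calls in place of theme_void(); for example, applied to the proportion map created earlier:
# Apply the custom dark map theme to the earlier proportion plot
plot_proportion + theme_map()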
ggplot() +
#stat_density_2d(data=as.data.frame(coordinates(x)), aes(x=x, y=y,fill = ..density..), geom = "raster", contour = FALSE) +
geom_polygon(data=slough_shp_tidy,
aes(long, lat, group=id),
fill=NA,
colour="white",
size=1) +
geom_raster(data=as.data.frame(raster_kde2d,
xy=TRUE),
aes(x,y,fill=layer)) +
scale_fill_viridis(option="magma",
begin=0.1,
end=0.9,
guide = guide_colorbar(
direction = "horizontal",
#barheight = unit(30, units = "mm"),
#barwidth = unit(300, units = "mm"),
draw.ulim = FALSE,
title.position = 'top',
title.hjust = 0.5,
label.hjust = 0.5)) +
geom_polygon(data=bld_slough_shp_tidy, aes(long, lat, group=id), fill="white", colour=NA, alpha=0.5) +
geom_path(data=rd_slough_shp_tidy, aes(long, lat, group=id), colour="white", alpha=0.5) +
geom_polygon(data=slough_shp_tidy, aes(long, lat, group=id), fill=NA, colour="white",size=0.25) +
coord_fixed() +
labs(x=NULL,
y=NULL,
fill="Density") +
theme_void() +
theme(legend.position = "bottom")
Now we can test whether there is significant clustering of this ethnic group and identify which output areas form part of that clustering. For this, the Moran's I statistic can be used to look at both global and local spatial autocorrelation. A useful tutorial on this can be found here: https://mgimond.github.io/Spatial/spatial-autocorrelation-in-r.html
First we create a neighbourhood object using the poly2nb function and our output area shapefile. We do this using the ‘Queen’s case’ setting, meaning that adjacent areas which share either a border or a corner are counted as neighbours.
# remember to reset oa_shp_data as it's filtered in the previous script
neighbourhood <- poly2nb(oa_shp_data, queen=TRUE)
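# For comparison, rook contiguity could be used instead: only areas sharing a border
# count as neighbours and corner-only contacts are excluded (illustrative, not used below)
neighbourhood_rook <- poly2nb(oa_shp_data, queen=FALSE)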
{
par(mar=c(0,0,0,0))
plot(oa_shp_data,
border="grey")
plot(neighbourhood,
coords=coordinates(oa_shp_data),
col="red",
add=T)
}
# first generate the list of spatial weights; style="W" row-standardises the weights so each
# area's neighbours sum to one, and zero.policy=TRUE allows areas with no neighbours
neighbourhood_weights_list <- nb2listw(neighbourhood, style="W", zero.policy=TRUE)
moran.test(oa_shp_data$POP_PROPORTION,neighbourhood_weights_list)
##
## Moran I test under randomisation
##
## data: oa_shp_data$POP_PROPORTION
## weights: neighbourhood_weights_list
##
## Moran I statistic standard deviate = 23.441, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.7327179450 -0.0026041667 0.0009840443
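To make the statistic above more concrete, it can also be computed directly from the weights. A minimal sketch, assuming every output area has at least one neighbour; spdep's moran.test() handles the inference, so this is purely illustrative:
# Global Moran's I "by hand": I = (n / S0) * sum_ij( w_ij * z_i * z_j ) / sum_i( z_i^2 )
x <- oa_shp_data$POP_PROPORTION
z <- x - mean(x)                                # deviations from the mean
w <- listw2mat(neighbourhood_weights_list)      # weights as a dense matrix
moran_i_manual <- (length(x) / sum(w)) * sum(w * outer(z, z)) / sum(z^2)
moran_i_manual                                  # should match the ~0.733 reported above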
Now a Monte Carlo based test of significance can be performed.
moran_i_monte_carlo <- moran.mc(oa_shp_data$POP_PROPORTION,
neighbourhood_weights_list,
nsim=599)
# View results (including p-value)
moran_i_monte_carlo
##
## Monte-Carlo simulation of Moran I
##
## data: oa_shp_data$POP_PROPORTION
## weights: neighbourhood_weights_list
## number of simulations + 1: 600
##
## statistic = 0.73272, observed rank = 600, p-value = 0.001667
## alternative hypothesis: greater
# Plot the distribution (note that this is a density plot instead of a histogram)
plot(moran_i_monte_carlo)
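The same permutation distribution can also be drawn with ggplot, which fits the style of the other figures. A minimal sketch using the simulated values stored in the result's res component:
# Illustrative ggplot version of the Monte Carlo null distribution
ggplot(data.frame(sim_i = moran_i_monte_carlo$res), aes(sim_i)) +
  geom_histogram(bins=50) +
  geom_vline(xintercept=moran_i_monte_carlo$statistic, linetype=2, col="red") +
  theme_bw() +
  labs(x="Simulated Moran's I", y="Count")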
A local Moran's I statistic can now be calculated for each output area.
# Local Moran
LM_Results <- localmoran(oa_shp_data$POP_PROPORTION,
neighbourhood_weights_list,
p.adjust.method="bonferroni",
na.action=na.exclude,
zero.policy=TRUE)
summary(LM_Results)
## Ii E.Ii Var.Ii Z.Ii
## Min. :-0.9174 Min. :-0.002604 Min. :0.05603 Min. :-2.1682
## 1st Qu.: 0.1113 1st Qu.:-0.002604 1st Qu.:0.13976 1st Qu.: 0.2451
## Median : 0.4019 Median :-0.002604 Median :0.19670 Median : 0.9107
## Mean : 0.7327 Mean :-0.002604 Mean :0.20317 Mean : 1.7362
## 3rd Qu.: 0.8386 3rd Qu.:-0.002604 3rd Qu.:0.24652 3rd Qu.: 1.9323
## Max. : 5.0224 Max. :-0.002604 Max. :0.99382 Max. :14.1130
## Pr(z > 0)
## Min. :0.0000
## 1st Qu.:0.1778
## Median :1.0000
## Mean :0.6611
## 3rd Qu.:1.0000
## Max. :1.0000
The results of the local Moran's I are merged back into the output area shapefile for plotting.
# add moran's I results back to the shapefile
oa_shp_data@data$lmoran_i <- LM_Results[,1]
oa_shp_data@data$lmoran_p <- LM_Results[,5]
oa_shp_data@data$lmoran_sig <- LM_Results[,5]<0.01
# manually make a moran plot based on standardised variables
# standardise variables and save to a new column
oa_shp_data$SCALED_POP_PROPORTION <- scale(oa_shp_data$POP_PROPORTION)
# create a lagged variable
oa_shp_data$LAGGED_SCALED_POP_PROPORTION <- lag.listw(neighbourhood_weights_list, oa_shp_data$SCALED_POP_PROPORTION)
oa_shp_data$SPATIAL_LAG_CAT <- factor(ifelse(oa_shp_data$SCALED_POP_PROPORTION>0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION>0, "High-High",
ifelse(oa_shp_data$SCALED_POP_PROPORTION>0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION<0, "High-Low",
ifelse(oa_shp_data$SCALED_POP_PROPORTION<0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION<0, "Low-Low",
ifelse(oa_shp_data$SCALED_POP_PROPORTION<0 & oa_shp_data$LAGGED_SCALED_POP_PROPORTION>0, "Low-High",
"Equivalent")))))
First we look at the relationship between population proportion and the spatially lagged values.
ggplot(oa_shp_data@data, aes(SCALED_POP_PROPORTION, LAGGED_SCALED_POP_PROPORTION, colour=lmoran_p)) +
geom_point(alpha=0.5, size=3) +
geom_smooth(method="lm", se=F, col="red") +
geom_hline(yintercept=0, lty=2) +
geom_vline(xintercept=0, lty=2) +
theme_bw() +
labs(title="Scaled Spatial Lag Comparison",
x="Scaled Value",
y="Lagged Scaled Value")
Secondly we can plot a comparison set of maps using ggplot to look at the clusters which are identified.
# set id columns to merge local moran's results back to the shapefile
oa_shp_data@data$id <- row.names(oa_shp_data@data)
# tidy the shapefile
oa_shp_data_tidy <- tidy(oa_shp_data,region="id")
oa_shp_data_tidy <- merge(oa_shp_data_tidy,oa_shp_data@data,by="id")
# ==================================================
# Comparison plot
gg1 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=lmoran_sig, group=id)) +
geom_polygon(col="white") +
scale_fill_manual(values=c("grey","red")) +
coord_fixed() +
theme_void()
gg2 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=POP_PROPORTION, group=id)) +
geom_polygon(col="white") +
scale_fill_gradient(low="white",high="red", labels=scales::percent) +
coord_fixed() +
theme_void()
gg3 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=lmoran_i, group=id)) +
geom_polygon(col="white") +
scale_fill_gradient(low="white",high="blue") +
coord_fixed() +
theme_void()
gg4 <- ggplot(oa_shp_data_tidy, aes(long, lat, fill=SPATIAL_LAG_CAT, group=id)) +
geom_polygon(col="white") +
scale_fill_manual(values=c("red","pink","light blue","blue")) +
coord_fixed() +
theme_void()
# grid.arrange(gg1, gg2, gg3, gg4, ncol=1, nrow=4)
gg1
gg2
gg3
gg4
Now we can create a plot which shows only the areas with a high proportion that are surrounded by high proportion areas and are statistically significant.
oa_shp_data_tidy_sig_high_high <- oa_shp_data_tidy[oa_shp_data_tidy$lmoran_sig==T & oa_shp_data_tidy$SPATIAL_LAG_CAT=="High-High",]
ggplot() +
geom_polygon(data=oa_shp_data_tidy, aes(long, lat, fill=lmoran_sig, group=id),fill="grey",col="white") +
geom_polygon(data=oa_shp_data_tidy_sig_high_high, aes(long, lat, fill=lmoran_sig, group=id),fill="red",col="white") +
coord_fixed() +
theme_void()
We then union the clusters to form cluster boundaries.
oa_shp_data_sig_high_high <- oa_shp_data[oa_shp_data$lmoran_sig==T & oa_shp_data$SPATIAL_LAG_CAT=="High-High",]
plot(oa_shp_data_sig_high_high)
cluster_boundary <- gUnaryUnion(oa_shp_data_sig_high_high)
plot(cluster_boundary)
The cluster boundaries can then be plotted.
cluster_boundary_tidy <- tidy(cluster_boundary)
ggplot() +
geom_polygon(data=slough_shp_tidy,
aes(long, lat, group=id),
fill=NA,
colour="white",
size=1) +
geom_raster(data=as.data.frame(raster_kde2d,
xy=TRUE),
aes(x,y,fill=layer)) +
scale_fill_viridis(option="magma",
begin=0.1,
end=0.9,
guide = guide_colorbar(
direction = "horizontal",
#barheight = unit(30, units = "mm"),
#barwidth = unit(300, units = "mm"),
draw.ulim = FALSE,
title.position = 'top',
title.hjust = 0.5,
label.hjust = 0.5)) +
geom_polygon(data=bld_slough_shp_tidy, aes(long, lat, group=id), fill="white", colour=NA, alpha=0.5) +
geom_path(data=rd_slough_shp_tidy, aes(long, lat, group=id), colour="white", alpha=0.5) +
geom_polygon(data=slough_shp_tidy, aes(long, lat, group=id), fill=NA, colour="white",size=0.25) +
geom_polygon(data=cluster_boundary_tidy, aes(long, lat, group=piece), fill=NA, colour="white",size=1, lty=1) +
coord_fixed() +
labs(x=NULL,
y=NULL,
fill="Density") +
theme_void() +
theme(legend.position = "bottom")
It would be useful to know if areas with low GP registration were in any way related to our clusters. For now we will just do this visually.
Here I pull GP practice coordinates from a SQLite database I have set up which contains the ONS postcode directory data. I then filter them to include only those within our selected local authority, using a point-in-polygon selection.
# Connect to pre-existing SQLite database containing ONS postcode directory
db_connection <- dbConnect(SQLite(), dbname="C:\\Users\\User1\\Documents\\Rstudio_Projects\\ONS_Lookup_Database\\ons_lkp_db")
# SQL query to pull postcode lookup data from SQLite database
pcd <- dbGetQuery(db_connection,
"SELECT pcd_spaceless, oseast1m, osnrth1m
FROM ONS_PD
WHERE length(oseast1m)!=0")
# Load NHS GP reference data
gpp <- read.csv(file="data\\epraccur\\epraccur.csv", stringsAsFactors=F)
# Join postcode directory to GP reference data
gpp <- gpp %>%
mutate("pcd_spaceless"=gsub(" ","",gpp_pcd)) %>%
left_join(pcd)
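# with no 'by' argument, left_join() matches on the shared column, which here is pcd_spaceless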
# We lose a couple of hundred practices mainly from Jersey, Guernsey and Northern Ireland
# drop those practices with null coordinates
gpp <- gpp %>% filter(!is.na(oseast1m))
# remove the pcd table as it takes up a lot of memory
rm(pcd)
# Now convert the GP practice coordinates to a spatial points dataframe
xy <- gpp[,5:6] # get the coordinates
gpp_shp <- SpatialPointsDataFrame(coords = xy, data = gpp,
proj4string = CRS("+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +datum=OSGB36 +units=m +no_defs +ellps=airy +towgs84=446.448,-125.157,542.060,0.1502,0.2470,0.8421,-20.4894"))
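# The proj4 string above defines the OSGB 1936 / British National Grid projection;
# CRS("+init=epsg:27700") should be an equivalent, more compact way to specify it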
# filter our practices to those within our local authority area using the over function to perform a point-in-polygon selection
point_in_polygon <- sp::over(gpp_shp, slough_shp)
point_in_polygon$row_number <- row.names(point_in_polygon)
point_in_polygon <- point_in_polygon[is.na(point_in_polygon$gid)==F,]
gpp_shp_filtered <- gpp_shp[point_in_polygon$row_number,]
{
par(mar=c(0,0,0,0))
plot(slough_shp)
plot(gpp_shp_filtered, add=TRUE)
}
Note that we will generate catchment areas and then select all of the ones which overlap with our selected local authority.
# Read in the LSOA catchment information
gp_catch <- read.csv("data\\gp-reg-patients-LSOA-alt-tall.csv", stringsAsFactors=F)
# Get a list of LSOAs within our local authority
slough_lsoa <- dbGetQuery(db_connection, "SELECT DISTINCT lsoa11 FROM ONS_PD WHERE oslaua='E06000039'")
# Read in LSOA shapefile to attach to GP practices
lsoa_shp <- readOGR(dsn="shp", layer="Lower_Layer_Super_Output_Areas_December_2011_Generalised_Clipped__Boundaries_in_England_and_Wales", stringsAsFactors=FALSE)
## OGR data source with driver: ESRI Shapefile
## Source: "shp", layer: "Lower_Layer_Super_Output_Areas_December_2011_Generalised_Clipped__Boundaries_in_England_and_Wales"
## with 34753 features
## It has 6 fields
## Integer64 fields read as strings: objectid
# read in LSOA population figures
lsoa_pop_2016 <- read.csv("data\\ons_mid2016_lsoa_pop.csv", stringsAsFactors=F)
lsoa_pop_2016$All.Ages <- as.integer(gsub(",","",lsoa_pop_2016$All.Ages))
# View some summary information for each GP practice
gp_catch %>%
group_by(PRACTICE_CODE) %>%
summarise("n"=n(),
"TOTAL_POP"=sum(ALL_PATIENTS),
"MEAN_LSOA_POP"=mean(ALL_PATIENTS),
"MEDIAN_LSOA_POP"=median(ALL_PATIENTS))
## # A tibble: 7,531 x 5
## PRACTICE_CODE n TOTAL_POP MEAN_LSOA_POP MEDIAN_LSOA_POP
## <chr> <int> <int> <dbl> <dbl>
## 1 A81001 50 4178 83.56000 6.5
## 2 A81002 91 19902 218.70330 220.0
## 3 A81004 134 9344 69.73134 42.0
## 4 A81005 23 7931 344.82609 167.0
## 5 A81006 106 13661 128.87736 95.0
## 6 A81007 68 9834 144.61765 160.5
## 7 A81008 129 3973 30.79845 4.0
## 8 A81009 128 9084 70.96875 49.5
## 9 A81011 70 11723 167.47143 182.0
## 10 A81012 132 4778 36.19697 25.0
## # ... with 7,521 more rows
lsoa_pop_var <- gp_catch %>%
filter(substring(LSOA_CODE,1,1)=="E") %>% # remove Wales
group_by(LSOA_CODE) %>%
summarise(
"Number_of_Practices"=n(),
"All_Patients"=sum(ALL_PATIENTS)
) %>%
left_join(lsoa_pop_2016, by=c("LSOA_CODE"="LSOA_Code")) %>%
mutate("Difference"=All_Patients-All.Ages,
"LA_Flag"=ifelse(LSOA_CODE %in% slough_lsoa$lsoa11,"LSOA_CODE",NA)) %>%
filter(is.na(LA_Flag)==F)
lsoa_shp_pop <- sp::merge(lsoa_shp, lsoa_pop_var, by.x="lsoa11cd", by.y="LSOA_CODE", all.x=FALSE)
# get polygon centroids
xy <- coordinates(lsoa_shp_pop)
lsoa_shp_pop$x <- xy[,1]
lsoa_shp_pop$y <- xy[,2]
# define a function to handle tidying (which used to be called fortifying) and then joining the data items back in
clean <- function(shape){
  shape@data$id <- rownames(shape@data)
  shape.points <- tidy(shape, region="id")
  shape.df <- inner_join(shape.points, shape@data, by="id")
  shape.df # return the joined data frame explicitly
}
lsoa_shp_pop_tidy <- clean(lsoa_shp_pop)
ggplot(data=lsoa_shp_pop_tidy,
aes(long,
lat,
group=id,
fill=Difference)) +
geom_polygon(colour="white") +
geom_text(aes(x,y,label=Number_of_Practices),color = "white", size=3) +
scale_fill_gradient2(midpoint=0, low="red", mid="white",high="blue") +
coord_fixed() +
theme_void()
# ----------
median_ons_pop <- median(lsoa_pop_var$All.Ages, na.rm=T)
ggplot(lsoa_pop_var, aes(All.Ages)) +
geom_histogram(bins=200) +
geom_vline(xintercept=median_ons_pop, linetype=2, col="red") +
theme_bw() +
labs(x="LSOA Population (2016 Estimate)", y="Count", title="Distribution of LSOA Population Estimates (2016)")
# ----------
median_pat_pop_diff <- median(lsoa_pop_var$Difference, na.rm=T)
ggplot(lsoa_pop_var, aes(Difference)) +
geom_histogram(bins=100) +
geom_vline(xintercept=median_pat_pop_diff, linetype=2, col="red") +
theme_bw() +
labs(x="Difference between registered and resident population", y="Count", title="Distribution of Difference Between LSOA Population Estimates (2016) and Registrants") +
xlim(-2500,2500)
# ----------
ggplot(lsoa_pop_var, aes(All_Patients,All.Ages,colour=Difference)) +
geom_point() +
scale_color_gradient2(low="#2c7bb6", mid="#ffffbf", high="#d7191c") +
geom_smooth(method="lm", se=T, colour="black", linetype=3) +
#geom_abline(intercept=0, slope=1) +
theme_bw() +
labs(x="Population Estimate 2016",
y="Registrants",
title="LSOA Population Estimates vs Number of Registrants"
) +
coord_equal()
# ----------
median_pat_num_prac <- median(lsoa_pop_var$Number_of_Practices, na.rm=T)
ggplot(lsoa_pop_var, aes(Number_of_Practices)) +
geom_histogram(binwidth=1) +
geom_vline(xintercept=median_pat_num_prac, linetype=2, col="red") +
theme_bw()