2 Get data

Get the data: https://datashare.ed.ac.uk/handle/10283/2501

load our packages

library(spatstat)
library(tidyverse)
library(tmap)
library(here)
library(sp)
library(sf)
library(stringr)
library(spdep)
library(RColorBrewer)

Load the data

Pharmacy <- st_read(here::here("Data", "PharmacyEW", "PharmacyEW.shp"))%>%
  #remove any duplicates
  distinct()
## Reading layer `PharmacyEW' from data source 
##   `C:\Users\Andy\OneDrive - University College London\Teaching\Guest\point_pattern_example\Data\PharmacyEW\PharmacyEW.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 10589 features and 11 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 137070 ymin: 19187 xmax: 655138 ymax: 653253
## Projected CRS: OSGB 1936 / British National Grid
#check CRS
#st_crs(Pharmacy)

#check with a plot
tm_shape(Pharmacy) +
  tm_dots(col = "blue")

London wards (or MSOA): https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london

LondonWards<- st_read(here::here("Data", 
                                        "statistical-gis-boundaries-london",
                                        "statistical-gis-boundaries-london",
                                        "ESRI",
                                        "London_Ward_CityMerged.shp"))%>%
  st_transform(.,27700)
## Reading layer `London_Ward_CityMerged' from data source 
##   `C:\Users\Andy\OneDrive - University College London\Teaching\Guest\point_pattern_example\Data\statistical-gis-boundaries-london\statistical-gis-boundaries-london\ESRI\London_Ward_CityMerged.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 625 features and 7 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB 1936 / British National Grid
#check with a plot
tm_shape(LondonWards) +
  tm_polygons(col = NA)

We need to spatially subset our points within our study area…

Pharmacysub <- Pharmacy[LondonWards, op=st_within]

#check with a plot
tm_shape(LondonWards) +
  tm_polygons(col = NA, alpha=0.5)+
tm_shape(Pharmacysub) +
  tm_dots(col="blue")

3 Point pattern

For point pattern analysis we need a point pattern object…within an observation window

#now set a window as the borough boundary
window <- as.owin(LondonWards)
plot(window)

#create a sp object
PharmacysubSP<- Pharmacysub %>%
  as(., 'Spatial')
#create a ppp object
PharmacysubSP.ppp <- ppp(x=PharmacysubSP@coords[,1],
                          y=PharmacysubSP@coords[,2],
                          window=window)

PharmacysubSP.ppp %>%
  plot(.,pch=16,cex=0.5, 
              main="Pharmacies in London")

3.1 Quadrat analysis

#First plot the points

plot(PharmacysubSP.ppp,
     pch=16,
     cex=0.5, 
     main="Pharmacies")

#now count the points in that fall in a 20 x 20
#must be run all together otherwise there is a plot issue

PharmacysubSP.ppp %>%
  quadratcount(.,nx = 20, ny = 20)%>%
    plot(., add=T, col="red")

#run the quadrat count
Qcount <- PharmacysubSP.ppp %>%
  quadratcount(.,nx = 20, ny = 20) %>%
  as.data.frame() %>%
  dplyr::count(Var1=Freq)%>%
  dplyr::rename(Freqquadratcount=n)

Next we need to calculate the probabilities…

sums <- Qcount %>%
  #calculate the total blue plaques (Var * Freq)
  mutate(total = Var1 * Freqquadratcount) %>%
  # then the sums
  dplyr::summarise(across(everything(), sum))%>%
  dplyr::select(-Var1) 

lambda<- Qcount%>%
  #calculate lambda - sum of freq count / sum of total plaques
  mutate(total = Var1 * Freqquadratcount)%>%
  dplyr::summarise(across(everything(), sum)) %>%
  mutate(lambda=total/Freqquadratcount) %>%
  dplyr::select(lambda)%>%
  pull(lambda)

Next is the expected values with the Poisson formula

QCountTable <- Qcount %>%
  #Probability of number of plaques in quadrant using the formula 
  mutate(Pr=((lambda^Var1)*exp(-lambda))/factorial(Var1))%>%
  #now calculate the expected counts based on our total number of plaques
  #and save them to the table
  mutate(Expected= (round(Pr * sums$Freqquadratcount, 0)))

Plot them

QCountTable_long <- QCountTable %>% 
  pivot_longer(c("Freqquadratcount", "Expected"), 
               names_to="countvs_expected", 
               values_to="value")

ggplot(QCountTable_long, aes(Var1, value)) +
  geom_line(aes(colour = countvs_expected ))

Check for association between two categorical variables

teststats <- quadrat.test(PharmacysubSP.ppp, nx = 20, ny = 20)

Chi square with a p value < 0.05 therefore some clustering…but from the plot, this was expected

3.2 Ripley K

K <- PharmacysubSP.ppp %>%
  Kest(., correction="border") %>%
  plot()

This was sort of expected too due to our previous analysis - suggesting that there is clustering throughout the points.

3.3 DBSCAN

We need to identify the best value for the min points and distance….try increasing the value of k in kNNdistplot you will notice as you increase it the knee becomes less obvious.

If we have it too low we have a massive cluster

If we have it too high we have a small single cluster

I started with an eps of 1600 and a minpts of 20..however…the large eps means that the city centre has a massive cluster…this isn’t exactly what i wanted to pull out. Instead i want to identify local clusters of pharmacies so try reducing the eps to 500 and the minpts to 5

OPTICS will let us remove the eps parameter but running every possible value, however, minpts is always meant to be domain knowledge - see https://stats.stackexchange.com/questions/88872/a-routine-to-choose-eps-and-minpts-for-dbscan

Depending on your points it might be possible to filter the values you aren’t interested in - this isn’t the case here, but for example stop and search data or flytipping could be filtered (well, depepdning on the extra data within the columns)

#first extract the points from the spatial points data frame
Pharmacysub_coords <- Pharmacysub %>%
  st_coordinates(.)%>%
  as.data.frame()

Pharmacysub_coords%>%
  dbscan::kNNdistplot(.,k=20)

#now run the dbscan analysis
db <- Pharmacysub_coords %>%
  fpc::dbscan(.,eps = 500, MinPts = 5)

#now plot the results
plot(db, Pharmacysub_coords, main = "DBSCAN Output", frame = F)
plot(LondonWards$geometry, add=T)

Pharmacysub_coords<- Pharmacysub_coords %>%
  mutate(dbcluster=db$cluster)
Pharmacysub_coordsgt0 <- Pharmacysub_coords %>%
  filter(dbcluster>0)

dbplot <- ggplot(data=LondonWards)+
  geom_sf()+
  geom_point(data=Pharmacysub_coordsgt0, 
                 aes(X,Y, colour=dbcluster, fill=dbcluster))
#add the points in

dbplot + theme_bw() + coord_sf()

Now, this identifies where we have clustering based on our criteria but it doesn’t show where we have similar densities of pharmacies.

4 Spatial Autocorrelation

We need to add all the points to the London ward data then compute a density per ward…we could also use population here too! or some other data that can give us meaningful comparisons for our variable of interest.

Ward level population estimates: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/wardlevelmidyearpopulationestimatesexperimental

points_sf_joined <- LondonWards%>%
  st_join(Pharmacysub)%>%
  add_count(GSS_CODE)%>%
  janitor::clean_names()%>%
  #calculate area
  mutate(area=st_area(.))%>%
  #then density of the points per ward
  mutate(density=n/area)

Now map density

points_sf_joined<- points_sf_joined %>%                    
  group_by(gss_code) %>%         
  summarise(density = first(density),
          name  = first(name))

tm_shape(points_sf_joined) +
    tm_polygons("density",
        style="jenks",
        palette="PuOr",
        midpoint=NA)

So, from the map, it looks as though we might have some clustering of pharmacies in the centre of London and a few other places, so let’s check this with Moran’s I and some other statistics.

As we saw in the session we need to create a spatial weight matrix…to do so we need centroid points first…to then compute the neighbours of each centroid…

coordsW <- points_sf_joined%>%
  st_centroid()%>%
  st_geometry()

#check, alpha is transparency 
tm_shape(points_sf_joined) +
    tm_polygons(alpha=0.1)+
tm_shape(coordsW) +
  tm_dots(col = "blue")

4.1 Weight matrix

Next create a list of the neighbours from the centroids…

#create a neighbours list
LWard_nb <- points_sf_joined %>%
  poly2nb(., queen=T)

Have a look at the summary of neighbours - average is 5.88 for Queens case

summary(LWard_nb)
## Neighbour list object:
## Number of regions: 625 
## Number of nonzero links: 3680 
## Percentage nonzero weights: 0.94208 
## Average number of links: 5.888 
## Link number distribution:
## 
##   1   2   3   4   5   6   7   8   9  10  11  12 
##   1   4  15  72 162 182 112  55  14   4   2   2 
## 1 least connected region:
## 380 with 1 link
## 2 most connected regions:
## 313 612 with 12 links

plot them - we can’t use tamp as it isn’t class of sf, starts, spatial raster not spatraster…

plot(LWard_nb, st_geometry(coordsW), col="red")
#add a map underneath
plot(points_sf_joined$geometry, add=T)

Next take our weight list and make it into a matrix …style here denotes the weight type - common ones are B for Binary, W for row and C for global.

#create a spatial weights matrix from these weights
Lward.lw <- LWard_nb %>%
  nb2mat(., style="W")

I have used row…based on the lecture what should the value of the weights sum to?

sum(Lward.lw)
## [1] 625

4.2 Moran’s I

Moran’s I requires a spatial weight list type object as opposed to matrix, this is simply…

Lward.lw <- LWard_nb %>%
  nb2listw(., style="W")

Now let’s run Moran’s I - test tells us whether the values at neighbouring sites are similar to the target site (giving a Moran’s I close to 1) or the value of the taget is different to the neighbours (close to -1)

I_LWard_Global_Density <- points_sf_joined %>%
  pull(density) %>%
  as.vector()%>%
  moran.test(., Lward.lw)

4.3 Geary’s C

Geary’s C - tells us whether similar values or dissimilar values are clustering - Geary’s C falls between 0 and 2; 1 means no spatial autocorrelation, <1 - positive spatial autocorrelation or similar values clustering, >1 - negative spatial autocorreation or dissimilar values clustering)

C_LWard_Global_Density <- 
  points_sf_joined %>%
  pull(density) %>%
  as.vector()%>%
  geary.test(., Lward.lw)

4.4 Getis Ord General G

Getis Ord General G…? This tells us whether high or low values are clustering. If G > Expected = High values clustering; if G < expected = low values clustering

G_LWard_Global_Density <- 
  points_sf_joined %>%
  pull(density) %>%
  as.vector()%>%
  globalG.test(., Lward.lw)

Based on the results write down what you can conclude here….

5 Local Indicies of Spatial Autocorrelation

5.1 Moran’s I

The only difference is that we now use the function localmoran(). And remember local Moran’s I gives a value for each spatial unit in relation to neigbhours as opposed to global that compares each spatial unit to the neighbours and then gives an average of all the differences identified

I_LWard_Local_Density <- points_sf_joined %>%
  pull(density) %>%
  as.vector()%>%
  localmoran(., Lward.lw)%>%
  as_tibble()

This outputs a table, so we need to append that back to our sf of the wards to make a map…

points_sf_joined <- points_sf_joined %>%
    mutate(density_I =as.numeric(I_LWard_Local_Density$Ii))%>%
    mutate(density_Iz =as.numeric(I_LWard_Local_Density$Z.Ii))%>%
    mutate(p =as.numeric(I_LWard_Local_Density$`Pr(z != E(Ii))`))

We’ll set the breaks manually based on the rule that data points:>2.58 or <-2.58 standard deviations away from the mean are significant at the 99% level (<1% chance that autocorrelation not present); >1.96 - <2.58 or <-1.96 to >-2.58 standard deviations are significant at the 95% level (<5% change that autocorrelation not present). >1.65 = 90% etc…like we saw in the lecture…

breaks1<-c(-1,-0.5,0,0.5,1)

breaks2<-c(-1000,-2.58,-1.96,-1.65,1.65,1.96,2.58,1000)

Now create a new diverging colour brewer palette and reverse the order using rev() (reverse) so higher values correspond to red - see https://www.r-graph-gallery.com/38-rcolorbrewers-palettes.html

MoranColours<- rev(brewer.pal(8, "RdGy"))

Remember Moran’s I - test tells us whether we have clustered values (close to 1) or dispersed values (close to -1) of similar values based on the spatial weight matrix that identifies the neighoburs

But… the z-score shows were this is unlikely because of complete spatial randomness and so we have spatial clustering (either clustering of similar or dissimilar values depending on the Moran’s I value) within these locations…

tm_shape(points_sf_joined) +
    tm_polygons("density_I",
        style="fixed",
        breaks=breaks1,
        palette=MoranColours,
        midpoint=NA,
        title="Local Moran's I")

tm_shape(points_sf_joined) +
    tm_polygons("density_Iz",
        style="fixed",
        breaks=breaks2,
        palette=MoranColours,
        midpoint=NA,
        title="Local Moran's I Z-score")

5.2 Getis Ord \(G_{i}^{*}\)

What about the Getis Ord \(G_{i}^{*}\) statistic for hot and cold spots? - this just contains the z-score (standardised value relating to whether high values or low values are clustering together)

Gi_LWard_Local_Density <- points_sf_joined %>%
  pull(density) %>%
  as.vector()%>%
  localG(., Lward.lw)

points_sf_joined <- points_sf_joined %>%
  mutate(density_G = as.numeric(Gi_LWard_Local_Density))

And map the outputs…

GIColours<- rev(brewer.pal(8, "RdBu"))

#now plot on an interactive map
tm_shape(points_sf_joined) +
    tm_polygons("density_G",
        style="fixed",
        breaks=breaks2,
        palette=GIColours,
        midpoint=NA,
        title="Gi* Z score")

5.3 Note

In the analysis you might see a Moran plot where the values of our variable (density of pharmacies) and plotted against (on the y axis) the spatially lagged version (the average value of the same attribute at neighboring locations). However this plot below shows the value of density in relation to the spatial weight matrix…

This is useful as we can express the level of spatial association of each observation with its neighboring ones. Points in the upper right (or high-high) and lower left (or low-low) quadrants indicate positive spatial association of values that are higher and lower than the sample mean, respectively. The lower right (or high-low) and upper left (or low-high) quadrants include observations that exhibit negative spatial association; that is, these observed values carry little similarity to their neighboring ones. Source: https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.4/statug/statug_variogram_details31.htm#:~:text=The%20Moran%20scatter%20plot%20(Anselin,known%20as%20the%20response%20axis.

Moran_plot_LWard_Global_Density <- points_sf_joined %>%
  pull(density)%>%
  as.vector()%>%
  moran.plot(., Lward.lw)

When you see Moran’s I out in the wild you will come across maps with:

  • high values surrounded by high values (HH)
  • low values nearby other low values (LL)
  • low values among high values (LH)
  • high values among low values (HL)

Here, we use the values we have, of density and Moran’s I, compared to the mean of density and Moran’s I (termed centering). Where the

  • value of density is greater than 0 and the value of Moran’s I is greater than 0 then high values (of density) are surrounded by other high values (from Moran’s I)= HH

  • value of density is lower than 0 and the value of Moran’s I is lower than 0 then low values (of density) are surrounded by other low values (from Moran’s I) = LL

  • value of density is lower than 0 and the value of Moran’s I is higher than 0 then low values (of density) are surrounded by high values (from Moran’s I) = LH

  • value of density is higher than 0 and the value of Moran’s I is lower than 0 then high values (of density) are surrounded by high values (from Moran’s I) =HL

signif <- 0.1


# centers the variable of interest around its mean
points_sf_joined2 <- points_sf_joined %>%
  mutate(mean_density = density- mean(density))%>%
  mutate(mean_density = as.vector(mean_density))%>%
  mutate(mean_densityI= density_I - mean(density_I))%>%
  mutate(quadrant = case_when(mean_density>0 & mean_densityI >0 ~ 4,
         mean_density<0 & mean_densityI <0 ~ 1,
         mean_density<0 & mean_densityI >0 ~ 2,
         mean_density>0 & mean_densityI <0 ~ 3))%>%
  mutate(quadrant=case_when(p > signif ~ 0, TRUE ~ quadrant))

brks <- c(0,1,2,3,4,5)
colors <- c("white","blue","skyblue","pink","red")


tm_shape(points_sf_joined2) +
    tm_polygons("quadrant",
        style="fixed",
        breaks=brks,
        labels = c("insignificant","low-low","low-high","high-low","high-high"),
        palette=colors,
        title="Moran's I HH etc")

Source: https://rpubs.com/quarcs-lab/spatial-autocorrelation

This might seem somewhat confusing as if we look in the South East we have low values of Getis Ord \(G_{i}^{*}\) yet we have shown that these are low (density) and high Moran’s I. But as Matthew Peeples concisely summarises remember…

  • Moran’s I is a measure of the degree to which the value at a target site is similar to values at adjacent sites. Moran’s I is large and positive when the value for a given target (or for all locations in the global case) is similar to adjacent values and negative when the value at a target is dissimilar to adjacent values.

  • Getis Ord \(G_{i}^{*}\) identifies areas where high or low values cluster in space. It is high where the sum of values within a neighborhood of a given radius or configuration is high relative to the global average and negative where the sum of values within a neighborhood are small relative to the global average and approaches 0 at intermediate values.

So here we have a high Moran’s I as the values around it are similar (probably all low) but a low Getis Ord as the values within the local area are low relative to the global average.

6 Consider what this all means

  • Do you think Pharmacies take into account the ward they are in when they open?
  • Do you think people only go to pharmacies within their ward?
  • Pharmacies don’t exhibit complete spatial randomness and there seems to be clustering in certain locations….
  • Another question we could move onto is now are pharmacies locating around a need (e.g. health outcomes) and does that mean some of the population in London have poor access or choice.
  • For example….we could now look to see if there is clustering of “Deaths from causes considered preventable, under 75 years, standardised mortality ratio” and if that aligns with (or can be explained by) access / clusters of pharmacies.
  • If you wanted to do this (e.g. see if the ratio was clustered), the data is in a somewhat unfriendly format so you’d need to:
    • Download the data for ward from here: https://fingertips.phe.org.uk/profile/local-health/data#page/9/gid/1938133184/ati/8/iid/93227/age/1/sex/4/cat/-1/ctp/-1/yrr/5/cid/4/tbm/1
    • Read the Indicator definitions for the last row, specifically column G which tells us how it’s created.
    • Filter the data based on the indicator value of 93480 (from the Indicator definitions)
    • Filter out the London wards - as we did here and join it to our spatial feature.
    • Run a Moran’s I or some other spatial autocorrelation
    • Join this back to our spatial feature that has the Moran’s I for pharmacy density
    • Decide what to plot - does dispersion of pharmacies occur in the same spatial units as clustering of deaths considered preventable? This could simply be two maps and a table.
  • Should it be a requirement to have access…well if we consult the “2022 Pharmacy Access Scheme: guidance” then yes it appears so but…“Pharmacies in areas with dense provision of pharmacies remain excluded from the scheme”

7 Extensions

  • The use of OPTICS
  • We have spatial autocorrelation now can we try and model local differences in density of pharmacies - i.e. what factors might (or might not) explain the difference here — https://andrewmaclachlan.github.io/CASA0005repo/explaining-spatial-patterns.html.
  • Or perhaps does the distance to pharmacies assist in explaining deaths from causes considered preventable, under 75 years. Similarly, we could even use this as a dummy variable (yes/no the areas is / is not in a cluster of pharmacies) in a regression model.