Get the data: https://datashare.ed.ac.uk/handle/10283/2501
Load our packages
library(spatstat)
library(tidyverse)
library(tmap)
library(here)
library(sp)
library(sf)
library(stringr)
library(spdep)
library(RColorBrewer)
Load the data
Pharmacy <- st_read(here::here("Data", "PharmacyEW", "PharmacyEW.shp"))%>%
#remove any duplicates
distinct()
## Reading layer `PharmacyEW' from data source
## `C:\Users\Andy\OneDrive - University College London\Teaching\Guest\point_pattern_example\Data\PharmacyEW\PharmacyEW.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 10589 features and 11 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 137070 ymin: 19187 xmax: 655138 ymax: 653253
## Projected CRS: OSGB 1936 / British National Grid
#check CRS
#st_crs(Pharmacy)
#check with a plot
tm_shape(Pharmacy) +
tm_dots(col = "blue")
London wards (or MSOA): https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london
LondonWards<- st_read(here::here("Data",
"statistical-gis-boundaries-london",
"statistical-gis-boundaries-london",
"ESRI",
"London_Ward_CityMerged.shp"))%>%
st_transform(.,27700)
## Reading layer `London_Ward_CityMerged' from data source
## `C:\Users\Andy\OneDrive - University College London\Teaching\Guest\point_pattern_example\Data\statistical-gis-boundaries-london\statistical-gis-boundaries-london\ESRI\London_Ward_CityMerged.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 625 features and 7 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB 1936 / British National Grid
#check with a plot
tm_shape(LondonWards) +
tm_polygons(col = NA)
We need to spatially subset our points within our study area…
Pharmacysub <- Pharmacy[LondonWards, op=st_within]
#check with a plot
tm_shape(LondonWards) +
tm_polygons(col = NA, alpha=0.5)+
tm_shape(Pharmacysub) +
tm_dots(col="blue")
For point pattern analysis we need a point pattern object…within an observation window
#now set a window as the borough boundary
window <- as.owin(LondonWards)
plot(window)
#create a sp object
PharmacysubSP<- Pharmacysub %>%
as(., 'Spatial')
#create a ppp object
PharmacysubSP.ppp <- ppp(x=PharmacysubSP@coords[,1],
y=PharmacysubSP@coords[,2],
window=window)
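As an aside, the sp conversion is optional - a sketch building the same ppp object straight from the sf object with st_coordinates():
#the same ppp object, direct from the sf coordinates
PharmacysubSP.ppp <- ppp(x=st_coordinates(Pharmacysub)[,1],
y=st_coordinates(Pharmacysub)[,2],
window=window)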
PharmacysubSP.ppp %>%
plot(.,pch=16,cex=0.5,
main="Pharmacies in London")
#First plot the points
plot(PharmacysubSP.ppp,
pch=16,
cex=0.5,
main="Pharmacies")
#now count the points that fall in a 20 x 20 grid
#must be run all together otherwise there is a plot issue
PharmacysubSP.ppp %>%
quadratcount(.,nx = 20, ny = 20)%>%
plot(., add=T, col="red")
#run the quadrat count
Qcount <- PharmacysubSP.ppp %>%
quadratcount(.,nx = 20, ny = 20) %>%
as.data.frame() %>%
dplyr::count(Var1=Freq)%>%
dplyr::rename(Freqquadratcount=n)
Next we need to calculate the probabilities…
sums <- Qcount %>%
#calculate the total pharmacies (Var1 * Freqquadratcount)
mutate(total = Var1 * Freqquadratcount) %>%
# then the sums
dplyr::summarise(across(everything(), sum))%>%
dplyr::select(-Var1)
lambda<- Qcount%>%
#calculate lambda - total number of points / total number of quadrats
mutate(total = Var1 * Freqquadratcount)%>%
dplyr::summarise(across(everything(), sum)) %>%
mutate(lambda=total/Freqquadratcount) %>%
dplyr::select(lambda)%>%
pull(lambda)
Next, calculate the expected values using the Poisson formula
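For reference, this is the Poisson probability the code below implements, where \(k\) is the observed count in a quadrat (Var1) and \(\lambda\) is the mean count per quadrat:
\[
\Pr(X = k) = \frac{\lambda^{k} e^{-\lambda}}{k!}
\]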
QCountTable <- Qcount %>%
#Probability of a given number of pharmacies in a quadrat using the formula
mutate(Pr=((lambda^Var1)*exp(-lambda))/factorial(Var1))%>%
#now calculate the expected counts based on our total number of quadrats
#and save them to the table
mutate(Expected= (round(Pr * sums$Freqquadratcount, 0)))
Plot them
QCountTable_long <- QCountTable %>%
pivot_longer(c("Freqquadratcount", "Expected"),
names_to="countvs_expected",
values_to="value")
ggplot(QCountTable_long, aes(Var1, value)) +
geom_line(aes(colour = countvs_expected ))
Now a Chi-squared test, comparing the observed quadrat counts with those expected under complete spatial randomness
teststats <- quadrat.test(PharmacysubSP.ppp, nx = 20, ny = 20)
The Chi-squared test gives a p-value < 0.05, so we reject complete spatial randomness - there is some clustering…but from the plot this was expected
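If you want to see where observed and expected counts diverge, the quadrat.test object has a plot method that writes the observed count, expected count and Pearson residual into each quadrat - a quick optional check:
#plot the points with the test results overlaid
plot(PharmacysubSP.ppp, pch=16, cex=0.5, main="Quadrat test")
plot(teststats, add=T, col="red")
Next, Ripley's K, which compares the observed number of point pairs within a range of distances against what a homogeneous Poisson process would give.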
K <- PharmacysubSP.ppp %>%
Kest(., correction="border") %>%
plot()
This was sort of expected too given our previous analysis - the plot suggests clustering across the range of distances considered.
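A minimal sketch to formalise this, using spatstat's envelope() to simulate the K function under complete spatial randomness (E is just an illustrative name, and 99 simulations can take a while with this many points):
#simulate 99 CSR patterns and plot the K function with its envelope
E <- envelope(PharmacysubSP.ppp, Kest, nsim=99, correction="border")
plot(E, main="K function with CSR envelope")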
Now for DBSCAN, which finds clusters based on a search distance (eps) and a minimum number of points (minPts). We need to identify the best values for these two parameters…try increasing the value of k in kNNdistplot: you will notice that as you increase it the knee becomes less obvious.
If minPts is too low everything merges into one massive cluster.
If minPts is too high we are left with a single small cluster.
I started with an eps of 1600 and a minPts of 20…however, the large eps means that the city centre becomes one massive cluster…this isn’t exactly what I wanted to pull out. Instead I want to identify local clusters of pharmacies, so try reducing the eps to 500 and the minPts to 5.
OPTICS will let us remove the eps parameter by running every possible value; however, minPts is always meant to come from domain knowledge - see https://stats.stackexchange.com/questions/88872/a-routine-to-choose-eps-and-minpts-for-dbscan
Depending on your points it might be possible to filter out the values you aren’t interested in - this isn’t the case here, but, for example, stop and search or fly-tipping data could be filtered (depending on the extra data within the columns)
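A minimal sketch of the OPTICS alternative, assuming the dbscan package is installed; opt, opt_db and eps_cl = 500 are illustrative choices, and Pharmacysub_coords is the coordinates data frame created in the next chunk:
#OPTICS orders points by reachability, fixing only minPts
opt <- dbscan::optics(Pharmacysub_coords, minPts = 5)
#reachability plot - valleys correspond to clusters
plot(opt)
#extract a DBSCAN-style clustering at a chosen reachability threshold
opt_db <- dbscan::extractDBSCAN(opt, eps_cl = 500)
plot(opt_db)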
#first extract the points from the spatial points data frame
Pharmacysub_coords <- Pharmacysub %>%
st_coordinates(.)%>%
as.data.frame()
Pharmacysub_coords%>%
dbscan::kNNdistplot(.,k=20)
#now run the dbscan analysis
db <- Pharmacysub_coords %>%
fpc::dbscan(.,eps = 500, MinPts = 5)
#now plot the results
plot(db, Pharmacysub_coords, main = "DBSCAN Output", frame = F)
plot(LondonWards$geometry, add=T)
Pharmacysub_coords<- Pharmacysub_coords %>%
mutate(dbcluster=db$cluster)
Pharmacysub_coordsgt0 <- Pharmacysub_coords %>%
filter(dbcluster>0)
dbplot <- ggplot(data=LondonWards)+
geom_sf()+
geom_point(data=Pharmacysub_coordsgt0,
aes(X,Y, colour=dbcluster, fill=dbcluster))
#add the points in
dbplot + theme_bw() + coord_sf()
Now, this identifies where we have clustering based on our criteria but it doesn’t show where we have similar densities of pharmacies.
We need to add all the points to the London ward data and then compute a density per ward…we could also use population here, or some other data that gives us a meaningful comparison for our variable of interest.
Ward level population estimates: https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/wardlevelmidyearpopulationestimatesexperimental
points_sf_joined <- LondonWards%>%
st_join(Pharmacysub)%>%
add_count(GSS_CODE)%>%
janitor::clean_names()%>%
#calculate area
mutate(area=st_area(.))%>%
#then density of the points per ward
mutate(density=n/area)
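One thing to watch: st_area() returns square metres (as a units object), so the raw densities are very small numbers. Optionally, a sketch to rescale to pharmacies per km² before mapping:
points_sf_joined <- points_sf_joined %>%
#strip the units class and convert from per m^2 to per km^2
mutate(density = as.numeric(density) * 1000000)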
Now map the density. The st_join above duplicated each ward (one row per joined pharmacy), so first collapse back to a single row per ward
points_sf_joined<- points_sf_joined %>%
group_by(gss_code) %>%
summarise(density = first(density),
name = first(name))
tm_shape(points_sf_joined) +
tm_polygons("density",
style="jenks",
palette="PuOr",
midpoint=NA)
So, from the map, it looks as though we might have some clustering of pharmacies in the centre of London and a few other places, so let’s check this with Moran’s I and some other statistics.
As we saw in the session we need to create a spatial weight matrix…to do so we need centroid points first…to then compute the neighbours of each centroid…
coordsW <- points_sf_joined%>%
st_centroid()%>%
st_geometry()
#check, alpha is transparency
tm_shape(points_sf_joined) +
tm_polygons(alpha=0.1)+
tm_shape(coordsW) +
tm_dots(col = "blue")
Next create a list of the neighbours from the centroids…
#create a neighbours list
LWard_nb <- points_sf_joined %>%
poly2nb(., queen=T)
Have a look at the summary of the neighbours - the average is 5.89 for the Queen’s case
summary(LWard_nb)
## Neighbour list object:
## Number of regions: 625
## Number of nonzero links: 3680
## Percentage nonzero weights: 0.94208
## Average number of links: 5.888
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 1 4 15 72 162 182 112 55 14 4 2 2
## 1 least connected region:
## 380 with 1 link
## 2 most connected regions:
## 313 612 with 12 links
Plot them - note we can’t use tmap here as the neighbour list isn’t one of the classes tmap accepts (sf, stars, SpatRaster, etc.)…
plot(LWard_nb, st_geometry(coordsW), col="red")
#add a map underneath
plot(points_sf_joined$geometry, add=T)
Next take our neighbour list and make it into a spatial weights matrix…style here denotes the weight type - common ones are B for binary, W for row standardised and C for globally standardised.
#create a spatial weights matrix from these weights
Lward.lw <- LWard_nb %>%
nb2mat(., style="W")
I have used row standardisation…based on the lecture, what should the value of the weights sum to?
sum(Lward.lw)
## [1] 625
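With row standardisation each ward’s weights sum to 1, so the total equals the number of wards (625). A quick check:
#every row of the matrix should sum to 1
summary(rowSums(Lward.lw))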
Moran’s I requires a spatial weights list type object as opposed to a matrix; the conversion is simply…
Lward.lw <- LWard_nb %>%
nb2listw(., style="W")
Now let’s run Moran’s I - the test tells us whether the values at neighbouring sites are similar to the target site (giving a Moran’s I close to 1) or whether the value at the target differs from its neighbours (close to -1)
I_LWard_Global_Density <- points_sf_joined %>%
pull(density) %>%
as.vector()%>%
moran.test(., Lward.lw)
Geary’s C - tells us whether similar or dissimilar values are clustering. Geary’s C falls between 0 and 2: 1 means no spatial autocorrelation; <1 indicates positive spatial autocorrelation (similar values clustering); >1 indicates negative spatial autocorrelation (dissimilar values clustering)
C_LWard_Global_Density <-
points_sf_joined %>%
pull(density) %>%
as.vector()%>%
geary.test(., Lward.lw)
Getis Ord General G…? This tells us whether high or low values are clustering. If G > expected, high values are clustering; if G < expected, low values are clustering
G_LWard_Global_Density <-
points_sf_joined %>%
pull(density) %>%
as.vector()%>%
globalG.test(., Lward.lw)
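Printing each object shows the observed statistic, its expectation under spatial randomness, and the p-value:
I_LWard_Global_Density
C_LWard_Global_Density
G_LWard_Global_Density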
Based on the results write down what you can conclude here….
Now for local Moran’s I. The only difference is that we use the function localmoran(). And remember, local Moran’s I gives a value for each spatial unit in relation to its neighbours, as opposed to the global version, which compares each spatial unit to its neighbours and then averages all the differences identified
I_LWard_Local_Density <- points_sf_joined %>%
pull(density) %>%
as.vector()%>%
localmoran(., Lward.lw)%>%
as_tibble()
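Note that spdep has renamed the p-value column across versions (newer releases use Pr(z != E(Ii)) rather than Pr(z > 0)), so it is worth checking the names before appending:
#check which p-value column name your spdep version returns
names(I_LWard_Local_Density)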
This outputs a table, so we need to append that back to our sf of the wards to make a map…
points_sf_joined <- points_sf_joined %>%
mutate(density_I =as.numeric(I_LWard_Local_Density$Ii))%>%
mutate(density_Iz =as.numeric(I_LWard_Local_Density$Z.Ii))%>%
mutate(p =as.numeric(I_LWard_Local_Density$`Pr(z > 0)`))
We’ll set the breaks manually based on the rule that data points >2.58 or <-2.58 standard deviations away from the mean are significant at the 99% level (<1% chance that the autocorrelation is not present); 1.96 to 2.58 or -1.96 to -2.58 standard deviations are significant at the 95% level (<5% chance); >1.65 = 90%, and so on…as we saw in the lecture…
breaks1<-c(-1,-0.5,0,0.5,1)
breaks2<-c(-1000,-2.58,-1.96,-1.65,1.65,1.96,2.58,1000)
Now create a new diverging colour brewer palette and reverse the order using rev() (reverse) so higher values correspond to red - see https://www.r-graph-gallery.com/38-rcolorbrewers-palettes.html
MoranColours<- rev(brewer.pal(8, "RdGy"))
Remember, the Moran’s I test tells us whether we have clustered values (close to 1) or dispersed values (close to -1) of similar values, based on the spatial weight matrix that identifies the neighbours
But… the z-score shows where this is unlikely to be the result of complete spatial randomness, and so we have spatial clustering (either clustering of similar or dissimilar values, depending on the Moran’s I value) within these locations…
tm_shape(points_sf_joined) +
tm_polygons("density_I",
style="fixed",
breaks=breaks1,
palette=MoranColours,
midpoint=NA,
title="Local Moran's I")
tm_shape(points_sf_joined) +
tm_polygons("density_Iz",
style="fixed",
breaks=breaks2,
palette=MoranColours,
midpoint=NA,
title="Local Moran's I Z-score")
What about the Getis Ord \(G_{i}^{*}\) statistic for hot and cold spots? This just contains the z-score (a standardised value indicating whether high or low values are clustering together)
Gi_LWard_Local_Density <- points_sf_joined %>%
pull(density) %>%
as.vector()%>%
localG(., Lward.lw)
points_sf_joined <- points_sf_joined %>%
mutate(density_G = as.numeric(Gi_LWard_Local_Density))
And map the outputs…
GIColours<- rev(brewer.pal(8, "RdBu"))
#now plot on an interactive map
tm_shape(points_sf_joined) +
tm_polygons("density_G",
style="fixed",
breaks=breaks2,
palette=GIColours,
midpoint=NA,
title="Gi* Z score")
In the analysis you might see a Moran scatterplot, where the values of our variable (density of pharmacies) are plotted against (on the y axis) the spatially lagged version (the average value of the same attribute at neighbouring locations). The plot below, produced with moran.plot(), does this using our spatial weights list…
This is useful as we can express the level of spatial association of each observation with its neighboring ones. Points in the upper right (or high-high) and lower left (or low-low) quadrants indicate positive spatial association of values that are higher and lower than the sample mean, respectively. The lower right (or high-low) and upper left (or low-high) quadrants include observations that exhibit negative spatial association; that is, these observed values carry little similarity to their neighboring ones. Source: https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.4/statug/statug_variogram_details31.htm#:~:text=The%20Moran%20scatter%20plot%20(Anselin,known%20as%20the%20response%20axis.
Moran_plot_LWard_Global_Density <- points_sf_joined %>%
pull(density)%>%
as.vector()%>%
moran.plot(., Lward.lw)
When you see Moran’s I out in the wild you will come across maps with HH, LL, LH and HL categories (often called LISA cluster maps).
Here, we centre both density and local Moran’s I on their means. Where the:
centred value of density is greater than 0 and the centred Moran’s I is greater than 0, high values (of density) are surrounded by other high values (from Moran’s I) = HH
centred value of density is less than 0 and the centred Moran’s I is less than 0, low values (of density) are surrounded by other low values (from Moran’s I) = LL
centred value of density is less than 0 and the centred Moran’s I is greater than 0, low values (of density) are surrounded by high values (from Moran’s I) = LH
centred value of density is greater than 0 and the centred Moran’s I is less than 0, high values (of density) are surrounded by low values (from Moran’s I) = HL
signif <- 0.1
# centers the variable of interest around its mean
points_sf_joined2 <- points_sf_joined %>%
mutate(mean_density = density- mean(density))%>%
mutate(mean_density = as.vector(mean_density))%>%
mutate(mean_densityI= density_I - mean(density_I))%>%
mutate(quadrant = case_when(mean_density>0 & mean_densityI >0 ~ 4,
mean_density<0 & mean_densityI <0 ~ 1,
mean_density<0 & mean_densityI >0 ~ 2,
mean_density>0 & mean_densityI <0 ~ 3))%>%
mutate(quadrant=case_when(p > signif ~ 0, TRUE ~ quadrant))
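A quick tabulation shows how many wards fall into each quadrant (0 = not significant at our 0.1 level):
table(points_sf_joined2$quadrant)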
brks <- c(0,1,2,3,4,5)
colors <- c("white","blue","skyblue","pink","red")
tm_shape(points_sf_joined2) +
tm_polygons("quadrant",
style="fixed",
breaks=brks,
labels = c("insignificant","low-low","low-high","high-low","high-high"),
palette=colors,
title="Moran's I HH etc")
Source: https://rpubs.com/quarcs-lab/spatial-autocorrelation
This might seem somewhat confusing: if we look in the South East we have low values of Getis Ord \(G_{i}^{*}\), yet we have shown that these wards have low density and high Moran’s I. But, as Matthew Peeples concisely summarises, remember…
Moran’s I is a measure of the degree to which the value at a target site is similar to values at adjacent sites. Moran’s I is large and positive when the value for a given target (or for all locations in the global case) is similar to adjacent values and negative when the value at a target is dissimilar to adjacent values.
Getis Ord \(G_{i}^{*}\) identifies areas where high or low values cluster in space. It is high where the sum of values within a neighborhood of a given radius or configuration is high relative to the global average, negative where the sum of values within a neighborhood is small relative to the global average, and approaches 0 at intermediate values.
So here we have a high Moran’s I as the values around it are similar (probably all low) but a low Getis Ord as the values within the local area are low relative to the global average.