knitr::opts_chunk$set(echo = TRUE)
require(ggplot2, quietly = T)
require(sf, quietly = T)
require(BRCmap, quietly = T)
require(dplyr, quietly = T)
require(here, quietly = T)

Introduction

Here we will investigate how well currently surveyed NPMS squares represent UK environmental gradients. To do this, we will require some data, both on our gradients and on NPMS data.

# CS Environmental Zones (https://catalogue.ceh.ac.uk/documents/0cfd454a-d035-416c-80dc-803c65470ea2)
csEZ <- st_read(here("data/CS_Environmental_Zones_2007.gdb"))
## Reading layer `CS_Environmental_Zones_2007' from data source 
##   `W:\PYWELL_SHARED\Pywell Projects\BRC\Oli Pescott\424 NPMS square prioritisation\npmsSquareRProj\data\CS_Environmental_Zones_2007.gdb' 
##   using driver `OpenFileGDB'
## Simple feature collection with 8 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 54000 ymin: 11000 xmax: 656000 ymax: 1218000
## Projected CRS: OSGB36 / British National Grid
plot(csEZ[1], border = NA)

# Add Northern Ireland
ni <- st_transform(st_read(here("data/Ireland_NAME__Northern Ireland.shp")), crs = 27700)
## Reading layer `Ireland_NAME__Northern Ireland' from data source 
##   `W:\PYWELL_SHARED\Pywell Projects\BRC\Oli Pescott\424 NPMS square prioritisation\npmsSquareRProj\data\Ireland_NAME__Northern Ireland.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -8.177089 ymin: 54.02272 xmax: -5.431648 ymax: 55.31297
## Geodetic CRS:  WGS 84
plot(ni[1])

# Get currently surveyed NPMS squares (https://catalogue.ceh.ac.uk/documents/eb135726-9039-441c-8335-1aab5f6dda21)
dat <- read.csv(here("data/sampleInfoWithLatLong_2015to2023.csv"), header = T, stringsAsFactors = F)
head(dat) # Quick glance
##        id survey_id date_start location_id entered_sref entered_sref_system
## 1 3951688        87 27/08/2018      202913  C5538609499                OSIE
## 2 3951687        87 25/05/2018      202913  C5538609499                OSIE
## 3 4000121        87 25/05/2018      202909  C5518309079                OSIE
## 4 4000150        87 27/08/2018      202909  C5518309079                OSIE
## 5 4000165        87 25/05/2018      202910  C5550009500                OSIE
## 6 4000167        87 27/08/2018      202910  C5550009500                OSIE
##   created_on created_by_id LATITUDE LONGITUDE monad monadCRS          country
## 1    20:40.0        155645 54.93000 -7.135914 C5509     OSIE Northern_Ireland
## 2    18:43.0        155645 54.93000 -7.135914 C5509     OSIE Northern_Ireland
## 3    33:42.0        155645 54.92625 -7.139161 C5509     OSIE Northern_Ireland
## 4    53:45.0        155645 54.92625 -7.139161 C5509     OSIE Northern_Ireland
## 5    57:04.0        155645 54.93000 -7.134136 C5509     OSIE Northern_Ireland
## 6    58:17.0        155645 54.93000 -7.134136 C5509     OSIE Northern_Ireland
dat <- dat[dat$monadCRS %in% c("OSIE", "OSGB"),] # Keep UK data only
uniDat <- data.frame(monad = unique(dat[,c("monad")])) # Only want unique 1 km squares
nrow(uniDat) # 1208 squares
## [1] 1208
# Convert squares to polygons
datSf <- BRCmap::gr2sf_poly(uniDat$monad, gr_atts = uniDat$location_id, crs = 27700)
# Quick look
ggplot() +
  geom_sf(data = csEZ) +
  geom_sf(data = ni) + 
  geom_sf(data = datSf, colour = "red") # surveyed squares

# We really need to combine Northern Ireland with the CS Zones too
names(csEZ)
## [1] "ENVZONE"      "Description"  "Shape_Length" "Shape_Area"   "Shape"
names(ni)
## [1] "NAME"      "AREA"      "PERIMETER" "HECTARES"  "geometry"
ni$ENVZONE <- 10
ni$Description <- "Northern Ireland"
ni <- ni[,c(6:7,1:5)]
ni <- ni[,c(1:2,5,4,7 )]
names(ni) <- names(csEZ)
st_geometry(ni) <- "Shape"
allZones <- rbind(csEZ, ni)
#plot(allZones[1], border = NA)
checkProps <- data.frame(zone = allZones$ENVZONE, description = allZones$Description, prop = allZones$Shape_Area/sum(allZones$Shape_Area))
head(checkProps)
##   zone                                description       prop
## 1    1                 Easterly Lowlands, England 0.25795190
## 2    2                 Westerly Lowlands, England 0.20424165
## 3    3                           Uplands, England 0.06203917
## 4    4                         Lowlands, Scotland 0.09099130
## 5    5 Intermediate Uplands and Islands, Scotland 0.11772423
## 6    6                     True Uplands, Scotland 0.12626994

Now we will create some summary metrics of actual coverage of zones by NPMS squares

# Calculate the area of each small square in datSf
datSf <- datSf %>% 
  mutate(area_km2 = st_area(.) %>% as.numeric() / 1e6)  # Convert area from m² to km²

# Calculate the area of each CS zone in allZones
allZones <- allZones %>% 
  mutate(zone_area_km2 = st_area(.) %>% as.numeric() / 1e6)  # Convert area from m² to km²

# Perform a spatial join to combine datSf with allZones
joined_sf <- st_join(datSf, allZones, join = st_intersects)
table(joined_sf$Description)
## 
##                 Easterly Lowlands, England 
##                                        509 
## Intermediate Uplands and Islands, Scotland 
##                                         86 
##                         Lowlands, Scotland 
##                                         98 
##                            Lowlands, Wales 
##                                         70 
##                           Northern Ireland 
##                                         73 
##                     True Uplands, Scotland 
##                                         84 
##                           Uplands, England 
##                                        130 
##                             Uplands, Wales 
##                                         62 
##                 Westerly Lowlands, England 
##                                        445
# Summarize the total area of squares in datSf per zone in allZones
#rm(result)
result <- joined_sf %>%
  group_by(ENVZONE, Description) %>%
  summarize(
    total_sq_area_km2 = sum(area_km2, na.rm = TRUE), # summed areas of NPMS squares by CS Zone
    zone_area_km2 = first(zone_area_km2)  # Use 'first' to get the zone area, assuming all rows in a group have the same value
  )
## `summarise()` has grouped output by 'ENVZONE'. You can override using the
## `.groups` argument.
# Calculate the proportion of the zone area covered by squares
result <- result %>%
  mutate(zone_proportion_covered_by_sq = total_sq_area_km2 / zone_area_km2) # By CS Zone, what proportion of the whole zone is covered by surveyed squares?

result <- result[c(1:9),] # crop off last rows (very small number of squares not intersected, e.g. Isle of Man)
print(as.data.frame(result))
##   ENVZONE                                Description total_sq_area_km2
## 1       1                 Easterly Lowlands, England           509.000
## 2       2                 Westerly Lowlands, England           445.000
## 3       3                           Uplands, England           130.000
## 4       4                         Lowlands, Scotland            98.000
## 5       5 Intermediate Uplands and Islands, Scotland            86.000
## 6       6                     True Uplands, Scotland            84.000
## 7       8                            Lowlands, Wales            70.000
## 8       9                             Uplands, Wales            62.000
## 9      10                           Northern Ireland            73.061
##   zone_area_km2                       geometry zone_proportion_covered_by_sq
## 1      65441.00 MULTIPOLYGON (((360000 8600...                   0.007777999
## 2      51815.00 MULTIPOLYGON (((165000 2000...                   0.008588247
## 3      15739.00 MULTIPOLYGON (((261000 6300...                   0.008259737
## 4      23084.00 MULTIPOLYGON (((209000 5830...                   0.004245365
## 5      29866.00 MULTIPOLYGON (((202000 6380...                   0.002879529
## 6      32034.00 MULTIPOLYGON (((262000 6090...                   0.002622214
## 7      11309.00 MULTIPOLYGON (((247000 1850...                   0.006189760
## 8      10272.00 MULTIPOLYGON (((272000 2070...                   0.006035826
## 9      14156.17 MULTIPOLYGON (((56092.42 49...                   0.005161069

Random surveying comparison

Ideally, and especially for national applications such as indicators of various types, we want even coverage nationally. Random sampling would achieve this. So, one way of prioritising environmental zones, and thus coverage of important ecological variation, is to compare our actual sample of surveyed squares to that which we would expect if we had the same number randomly distributed.

set.seed(232680) # for reproducibility
britmon <- read.csv(file = here("data/Britain_Monads_VC_biggest.csv"), header = T, strings = F) # all British land-containing monads
britmon <- britmon[!(britmon$VC %in% c(9999,113)),] # remove CI and those without land
britmon <- britmon[,c("Monad", "VC")] # reduce to essential info
nimon <- read.csv(file = here("data/niMonMaxArea.csv"), header = T, strings = F) # Northern Irish monads
nimon <- nimon[,c("Monad","VC")] # ditto
allMons <- rbind(britmon, nimon) # combine
allMons <- unique(allMons) # remove duplicates
randIndex <- sample(nrow(allMons), size = nrow(uniDat)) # random sample of same size as NPMS data
allMonsSamp <- allMons[randIndex,] # select random sample
allMonsSampSf <- BRCmap::gr2sf_poly(allMonsSamp$Monad, crs = 27700) # plot random sample

# Plot random sample
ggplot() +
  geom_sf(data = csEZ) +
  geom_sf(data = ni) + 
  geom_sf(data = allMonsSampSf, colour = "red")

## repeat metrics above for this random sample
allMonsSampSf <- allMonsSampSf %>% 
  mutate(area_km2 = st_area(.) %>% as.numeric() / 1e6)  # Convert area from m² to km²
joined_sfN <- st_join(allMonsSampSf, allZones, join = st_intersects)
table(joined_sfN$Description)
## 
##                 Easterly Lowlands, England 
##                                        379 
## Intermediate Uplands and Islands, Scotland 
##                                        178 
##                         Lowlands, Scotland 
##                                        138 
##                            Lowlands, Wales 
##                                         63 
##                           Northern Ireland 
##                                         75 
##                     True Uplands, Scotland 
##                                        170 
##                           Uplands, England 
##                                         94 
##                             Uplands, Wales 
##                                         50 
##                 Westerly Lowlands, England 
##                                        340
# Summarize the total area of squares in datSf per zone in allZones
#rm(resultRn)
resultRn <- joined_sfN %>%
  group_by(ENVZONE, Description) %>%
  summarize(
    total_sq_area_km2 = sum(area_km2, na.rm = TRUE), # summed areas of NPMS squares by CS Zone
    zone_area_km2 = first(zone_area_km2)  # Use 'first' to get the zone area, assuming all rows in a group have the same value
  )
## `summarise()` has grouped output by 'ENVZONE'. You can override using the
## `.groups` argument.
# Calculate the proportion of the zone area covered by squares
resultRn <- resultRn %>%
  mutate(zone_proportion_covered_by_sq = total_sq_area_km2 / zone_area_km2) # By CS Zone, what proportion of the whole zone is covered by surveyed squares?

# what is the actual proportional cover of zones
resultRn <- resultRn[c(1:9),]
print(as.data.frame(resultRn))
##   ENVZONE                                Description total_sq_area_km2
## 1       1                 Easterly Lowlands, England         379.00000
## 2       2                 Westerly Lowlands, England         340.00000
## 3       3                           Uplands, England          94.00000
## 4       4                         Lowlands, Scotland         138.00000
## 5       5 Intermediate Uplands and Islands, Scotland         178.00000
## 6       6                     True Uplands, Scotland         170.00000
## 7       8                            Lowlands, Wales          63.00000
## 8       9                             Uplands, Wales          50.00000
## 9      10                           Northern Ireland          75.08784
##   zone_area_km2                       geometry zone_proportion_covered_by_sq
## 1      65441.00 MULTIPOLYGON (((376000 8300...                   0.005791476
## 2      51815.00 MULTIPOLYGON (((170000 2500...                   0.006561806
## 3      15739.00 MULTIPOLYGON (((269000 6900...                   0.005972425
## 4      23084.00 MULTIPOLYGON (((211000 5410...                   0.005978167
## 5      29866.00 MULTIPOLYGON (((126000 6600...                   0.005959954
## 6      32034.00 MULTIPOLYGON (((205000 6950...                   0.005306861
## 7      11309.00 MULTIPOLYGON (((198000 1960...                   0.005570784
## 8      10272.00 MULTIPOLYGON (((270000 2210...                   0.004867601
## 9      14156.17 MULTIPOLYGON (((85608.26 49...                   0.005304247
# Now, we can compare the empirical areal coverage of CS Zones by NPMS squares with a random sample of the same size
# #compare
empirical <- result[,c("ENVZONE", "zone_proportion_covered_by_sq")]
empirical$type <- "empirical"
random <- resultRn[,c("ENVZONE", "zone_proportion_covered_by_sq")]
random$type <- "random"
empirical <- as.data.frame(st_drop_geometry(empirical))
random <- as.data.frame(st_drop_geometry(random))
comparison <- merge(random, empirical, by = "ENVZONE")
comparison$discrepancy <- abs(comparison$zone_proportion_covered_by_sq.x - comparison$zone_proportion_covered_by_sq.y)
comparison <- merge(comparison, checkProps[,c(1:2)], by.x = "ENVZONE", by.y = "zone") # add zone names
comparison <- comparison[order(comparison$discrepancy, decreasing = T),]
print(comparison) # Looks like English Uplands, and Intermediate Uplands and Islands, Scotland are top, but note that discrepancies are in different directions
##   ENVZONE zone_proportion_covered_by_sq.x type.x
## 5       5                     0.005959954 random
## 6       6                     0.005306861 random
## 3       3                     0.005972425 random
## 2       2                     0.006561806 random
## 1       1                     0.005791476 random
## 4       4                     0.005978167 random
## 8       9                     0.004867601 random
## 7       8                     0.005570784 random
## 9      10                     0.005304247 random
##   zone_proportion_covered_by_sq.y    type.y  discrepancy
## 5                     0.002879529 empirical 0.0030804259
## 6                     0.002622214 empirical 0.0026846476
## 3                     0.008259737 empirical 0.0022873118
## 2                     0.008588247 empirical 0.0020264402
## 1                     0.007777999 empirical 0.0019865222
## 4                     0.004245365 empirical 0.0017328019
## 8                     0.006035826 empirical 0.0011682243
## 7                     0.006189760 empirical 0.0006189760
## 9                     0.005161069 empirical 0.0001431772
##                                  description
## 5 Intermediate Uplands and Islands, Scotland
## 6                     True Uplands, Scotland
## 3                           Uplands, England
## 2                 Westerly Lowlands, England
## 1                 Easterly Lowlands, England
## 4                         Lowlands, Scotland
## 8                             Uplands, Wales
## 7                            Lowlands, Wales
## 9                           Northern Ireland

Bootstrap to evaluate uncertainty

This is all fine, but one random sample may be misleading, especially for the smaller CS Zones. To overcome this, we can bootstrap the whole process (i.e. repeat the random selection of squares repeatedly, and quantify the uncertainty).

Because the below code takes some time to execute, we use existing results here, and the R code below is set to not run as a part of this Markdown report.

#########################
## Bootstrap the above ##
#########################
n_bootstrap <- 1000
bootstrap_emp_props <- list()
bootstrap_emp_props <- replicate(n_bootstrap, {
  
  randIndex <- sample(nrow(allMons), size = nrow(uniDat)) # random sample of same size as NPMS data
  allMonsSamp <- allMons[randIndex,]
  allMonsSampSf <- BRCmap::gr2sf_poly(allMonsSamp$Monad, crs = 27700)
  
  allMonsSampSf <- allMonsSampSf %>% 
    mutate(area_km2 = st_area(.) %>% as.numeric() / 1e6)  # Convert area from m² to km²
  joined_sfN <- st_join(allMonsSampSf, allZones, join = st_intersects)
  
  # Summarize the total area of squares in datSf per zone in allZones
  #rm(result)
  temp <- joined_sfN %>%
    group_by(ENVZONE, Description) %>%
    summarize(
      total_sq_area_km2 = sum(area_km2, na.rm = TRUE), # summed areas of NPMS squares by CS Zone
      zone_area_km2 = first(zone_area_km2)  # Use 'first' to get the zone area, assuming all rows in a group have the same value
    )  %>%
    mutate(
      zone_proportion_covered_by_sq = total_sq_area_km2 / zone_area_km2
    ) # By CS Zone, what proportion of the whole zone is covered by surveyed squares?
  
  return(temp$zone_proportion_covered_by_sq)
})

Inspect statistics

We now look at whether the uncertainty from the bootstrapping changes anything.

load(file = here("outputs/bootstrap_emp_props_25072024.Rdata")) # load 1000 pre-saved bootstraps of size 1208 from above

# Calculate confidence intervals and mean
ci_lower <- apply(bootstrap_emp_props, 1, quantile, 0.025, na.rm = T)
ci_upper <- apply(bootstrap_emp_props, 1, quantile, 0.975, na.rm = T)
mean <- apply(bootstrap_emp_props, 1, mean, na.rm = T)

comparison <- comparison %>%
  mutate(
    ci_lower = ci_lower[1:9],
    ci_upper = ci_upper[1:9],
    mean = mean[1:9]
  )

# We can calculate the standard deviation using a standard formula
getSD <- function(x,m,p) {(x-m)/qnorm(p)} # m = mean, x = CI value, p = corresponding percentile of CI value
sd_values <- apply(comparison, 1, function(row) {
  getSD(m = as.numeric(row["mean"]), x = as.numeric(row["ci_lower"]), p = 0.025)
})
comparison$sd <- sd_values

# Calculate the z-value for the empirical result
comparison$z_value <- apply(comparison, 1, function(row) {
  (as.numeric(row["zone_proportion_covered_by_sq.y"]) - as.numeric(row["mean"])) / as.numeric(row["sd"])
})

# Calculate the p-value for the z-value
# For a two-tailed test
comparison$p_value <- apply(comparison, 1, function(row) {
  2 * (1 - pnorm(abs(as.numeric(row["z_value"]))))
})

# Convert p-value to surprisal value
comparison$surprisal_value <- apply(comparison, 1, function(row) {
  -log2(as.numeric(row["p_value"]))
})

comparison <- merge(comparison, checkProps[,c(1:2)], by.x = "ENVZONE", by.y = "zone", all.x = TRUE) # for names
# Negative z-values are under-surveyed, positive z-values over-surveyed
comparison <- comparison[order(comparison$z_value, decreasing = F),]

# discrepancy better from mean of nulls
comparison$discrepancy <- comparison$zone_proportion_covered_by_sq.y - comparison$mean
comparison$RANK <- 1:nrow(comparison) # rank
names(comparison)[which(names(comparison)=="description.x")] <- "CSzone" # rename for clarity
print(comparison[,c("CSzone", "discrepancy", "z_value", "RANK", "p_value", "surprisal_value")])
##                                       CSzone   discrepancy     z_value RANK
## 6                     True Uplands, Scotland -0.0037411172 -13.2306543    1
## 5 Intermediate Uplands and Islands, Scotland -0.0029931354 -11.8808389    2
## 4                         Lowlands, Scotland -0.0012610347  -3.3846892    3
## 9                           Northern Ireland  0.0003624789   0.6766531    4
## 8                             Uplands, Wales  0.0005032409   0.8221144    5
## 7                            Lowlands, Wales  0.0007000797   1.0526147    6
## 1                 Easterly Lowlands, England  0.0013369284   3.2081173    7
## 3                           Uplands, England  0.0023953237   4.0377396    8
## 2                 Westerly Lowlands, England  0.0023845991   5.3396822    9
##        p_value surprisal_value
## 6 0.000000e+00             Inf
## 5 0.000000e+00             Inf
## 4 7.125889e-04       10.454642
## 9 4.986261e-01        1.003970
## 8 4.110118e-01        1.282748
## 7 2.925176e-01        1.773405
## 1 1.336070e-03        9.547789
## 3 5.396871e-05       14.177517
## 2 9.310965e-08       23.356494

Conclusion

You can see here that the z-value of the empirical result relative to the normal distribution implied by the bootstrap results is the most useful metric for ordering the zones by their departure from random sampling. The P-values, and their Shannon information transformation, are less useful because the z-values are so extreme in some cases that they are too small to be quantified by the computer (0 and Inf respectively). Note that the z-values characterise the departure from random in both directions: a positive value is a sign of over-sampling, a negative one undersampling. This can be confirmed by comparing the two tables of square counts by zone displayed above (i.e. compare table(joined_sf$Description) to table(joined_sfN$Description)).

So, the result seems to hold that the most deficient zone, relative to random sampling, is the “True Uplands” of Scotland, with the “Intermediate” Scottish zone second. The “Westerly Lowlands” of England are the most over-surveyed relative to the random sampling scenario.

If prioritisation were to proceed on this basis, then there would need to be some monitoring to check the point at which the addition of surveyed squares to the first zone (True Uplands, Scotland) led to the second ranked zone becoming first, and so on (once zones were a tie, the new squares would presumably be added to each alternately, until the point that a tie with a another zone was created, etc. etc.)