if (!"foreach" %in% installed.packages()) install.packages('foreach')
if (!"doParallel" %in% installed.packages()) install.packages('doParallel')
if (!"here" %in% installed.packages()) install.packages('here')
require(foreach, quietly=T)
require(doParallel, quietly=T)
require(here, quietly=T)
source(here('scripts/frescalo_functions.R')) # source required functions
In this tutorial we demonstrate the use of Dr Jon Yearsley’s parallelised R implementation of FreScaLO (Frequency Scaling using Local Occupancy; “frescalo” hereafter). The original paper of Hill (2012) is here. Dr Yearsley’s original code is here, and all the code and functions relating to this tutorial can be found here (a forked version of Dr Yearsley’s repo with additional code and functionality). The paper for which Dr Yearsley created his version of frescalo is here. Note that an alternative R translation of the original fortran is also available here; this loop-based implementation is much slower than the current parallelised version and was written for educational purposes rather than for actual use.
The data in this example are dummy data derived from the “unicorns” dataset in the sparta package, filtered and arranged into two time periods. The neighbourhood weights used are those created for Great Britain at the 10 x 10 km grid scale using UKCEH Land Cover Map information, also packaged within sparta. These data have already been prepared for use with frescalo; guidance on these steps can be found within the sparta vignette. Custom weights could be created using the sparta::createWeights() function, or other approaches (e.g. see here and here; there is also code for an R Shiny neighbourhood weights visualisation tool here). In our example the sites are 10 x 10 km squares of the British National Grid (EPSG:27700) in lettered format, the species are represented by arbitrary names, and the time bins are of unspecified lengths (frescalo imposes no particular restriction that all time periods in the analysis be the same length).
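Before loading the real data, it is worth being clear about the shape frescalo expects: a long-format table with one row per record and three columns (site, species, time bin). A minimal illustrative sketch (entirely hypothetical values, using the column names assumed below):
s_example <- data.frame(
  location = c("NZ15", "NZ15", "NT28"), # site identifiers (here, hypothetical grid squares)
  species = c("Species 1", "Species 2", "Species 1"), # species names
  time = c(1, 1, 2) # time-period bins
)
s_example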
s <- read.csv(file = here("data/clusterTestDat.csv"), stringsAsFactors = FALSE, header = TRUE)
s <- s[, c(4, 2, 3)] # keep just the location, species and time columns used by frescalo
head(s)
d <- read.delim(file = here("data/GB_LC_Wts.txt"), header = FALSE, sep = "")
d <- d[, c("V1", "V2", "V3")]
# location1 = focal site, location2 = sites in the focal site's neighbourhood, w = weight of site in neighbourhood
names(d) <- c("location1", "location2", "w")
head(d)
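As an aside, if you needed to build neighbourhood weights yourself rather than using the pre-packaged GB weights, the general idea is that each focal site is linked to nearby sites with weights that decay with distance (Hill’s own scheme also incorporates ranked environmental/floristic similarity). A heavily simplified, distance-only sketch (hypothetical coordinates; not the sparta::createWeights() method):
# Hypothetical, distance-only sketch of building a long-format weights table
siteXY <- data.frame(location = c("A", "B", "C"),
                     x = c(0, 10, 20), y = c(0, 0, 10))
distMat <- as.matrix(dist(siteXY[, c("x", "y")])) # pairwise inter-site distances
dimnames(distMat) <- list(siteXY$location, siteXY$location)
radius <- 25 # neighbourhood radius beyond which weights are zero
wMat <- (1 - pmin(distMat/radius, 1)^2)^4 # quadratic decay to zero at the radius
wLong <- as.data.frame(as.table(wMat), stringsAsFactors = FALSE)
names(wLong) <- c("location1", "location2", "w") # focal site, neighbour, weight
wLong <- wLong[wLong$w > 0, ]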
We now establish a computing cluster using the doParallel library (here just on a local machine, though this could equally be done on an HPC). We then set a number of parameters and options for the frescalo algorithm (R_star and Phi).
# Set the size of the cluster (number of nodes).
# cores = NULL will automatically pick the number of cores per node
# (by default this is half the number of available cores)
cl <- makeCluster(3)
registerDoParallel(cl, cores = NULL)
R_star = 0.2703 # Proportion of species used as benchmarks within neighbourhoods
Phi = 0.74 # Target value of the neighbourhood frequency-weighted mean frequency, used to standardise cross-time-period sampling intensity across neighbourhoods
trend_analysis = TRUE # Perform trend analysis
outputPrefix = 'outputs/testApr24_' # relative directory path and prefix for all output files
chunkSize = 250 # Number of sites or other data segments to pass to each compute node
We now prepare the various objects required for the first part of the frescalo algorithm (subroutine “fresca” in the original fortran code). Broadly speaking, this part of the analysis equalises the all-time (i.e. across all time bins in the analysis) sampling intensity of each local neighbourhood, making neighbourhoods comparable for the subsequent trend assessment.
spLocations = unique(s$location) # unique list of sites
d <- d[d$location1 %in% spLocations,] # just keep relevant neighbourhoods in weights file
speciesNames = as.character(unique(s$species)) # create list of unique species
# For each site, record the presence/absence of each species
locationGroups = as.factor(rep(c(1:ceiling(length(spLocations)/chunkSize)), each = chunkSize))
sSplit = split(s, locationGroups[match(s$location, spLocations)]) # Split species data into site chunks for the cluster nodes
speciesList <- foreach(i = 1:length(sSplit), .inorder=TRUE, .combine='c') %dopar% {
speciesListFun(spList = sSplit[[i]], species = speciesNames) # which species are in each location?
}
# For each focal region calculate the sampling intensity multiplier alpha_i
dSub = d[,1:3] # not strictly needed if d already has exactly 3 columns
location1List = as.character(unique(dSub$location1)) # Create list of unique focal regions
location1Groups = as.factor(rep(c(1:ceiling(length(location1List)/chunkSize)), each = chunkSize))
dSplit = split(dSub, location1Groups[match(dSub$location1, location1List)]) # Split neighbourhood data into chunks of focal regions
# Run the frescalo algorithm across the chunks of neighbourhood data
output <- foreach(i = 1:length(dSplit), .inorder=TRUE, .combine='cfun', .multicombine=TRUE) %dopar% {
frescalo(data_in = dSplit[[i]], speciesList, spLocations, speciesNames, Phi, R_star=R_star, missing.data = 2) # leave missing.data as 2 for fortran equivalence
}
We now have our two output tables. output$frescalo.out is the equivalent of the “location report” samples.txt from the original fortran program. It provides site- and neighbourhood-level information relating to the rescaling of the frequency-weighted mean frequency (\(\phi_{i}\), i.e. phi_i) across neighbourhoods (e.g. \(\alpha_{i}\), the multiplier used to adjust species’ local frequencies in a neighbourhood so that the neighbourhood’s \(\phi_{i} = \Phi\)), and derived quantities.
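To make \(\phi_{i}\) and \(\alpha_{i}\) concrete: \(\phi_{i}\) is the frequency-weighted mean frequency \(\sum_{j} f_{ij}^{2} / \sum_{j} f_{ij}\) of a neighbourhood’s local frequencies \(f_{ij}\), and Hill (2012) rescales those frequencies as \(f'_{ij} = 1 - (1 - f_{ij})^{\alpha_{i}}\). A minimal sketch with toy (hypothetical) frequencies:
f <- c(0.9, 0.6, 0.4, 0.2, 0.05) # hypothetical local frequencies of 5 species
phi <- sum(f^2)/sum(f) # frequency-weighted mean frequency, phi_i
phi
alpha <- 1.5 # an example sampling-intensity multiplier
f_rescaled <- 1 - (1 - f)^alpha # Hill's (2012) rescaling
sum(f_rescaled^2)/sum(f_rescaled) # phi_i after rescaling, shifted towards a larger target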
The other table output$freq.out is the equivalent of the
“listing of rescaled species frequencies” table
frequencies.out produced by the original fortran program.
This is a table of sites x species, providing such information as the
rescaled local frequencies of species across neighbourhoods, averaged
across all time periods in the assessment. This can be considered as a
“smoothed” or “idealised” average local frequency, from which time
period-specific deviations (i.e. species’ true frequencies within
specific time periods) will subsequently be estimated.
The rescaled frequencies of species in neighbourhoods are also used to select the benchmark species for each neighbourhood using a single value of R_star (\(R^{*}\) of Hill 2012). Once all neighbourhoods have been rescaled to have the same “shape” (i.e. the same value of phi, the frequency-weighted local mean frequency), then, after rank normalisation, the same proportion (R_star) of each rescaled and normalised frequency curve can be selected from each neighbourhood as benchmarks.
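A minimal sketch of this selection step (hypothetical rescaled frequencies; for simplicity we normalise ranks by the raw species count, whereas frescalo normalises by an estimated neighbourhood species richness):
f_rescaled <- c(SpA = 0.95, SpB = 0.80, SpC = 0.55, SpD = 0.30, SpE = 0.10) # hypothetical
ranks <- rank(-f_rescaled, ties.method = "first") # rank 1 = most frequent species
rank_norm <- ranks/length(f_rescaled) # normalised ranks in (0, 1]
names(f_rescaled)[rank_norm <= R_star] # benchmark species for this neighbourhood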
head(output$frescalo.out)
head(output$freq.out)
# And we could write the outputs to text files using our specified path and prefix
# write.table(format(output$frescalo.out[order(output$frescalo.out$location),], digits=4, zero.print=T, width=10),
# file=paste(outputPrefix,'frescalo_out.txt',sep=''), col.names=T, row.names=F, quote=F, sep=' ')
# write.table(format(output$freq.out[order(output$freq.out$location, output$freq.out$rank),], digits=4, zero.print=T, width=10, scientific=F, justify='left'),
# file=paste(outputPrefix,'frescalo_freq.txt',sep=''), col.names=T, row.names=F, quote=F, sep=' ')
The following animation provides a theoretical example of how this aspect of frescalo is intended to work at the individual neighbourhood level. It was generated using an underlying multivariate Poisson point process (representing a community or species pool within a neighbourhood), combined with thinning (i.e. less-than-“perfect” sampling intensity relative to retrieving the true frequencies of species in the neighbourhood) and an iterative, successive-approximation approach to then achieving a target value of the frequency-weighted mean frequency (i.e. moving \(\phi_{i}\) to the specified \(\Phi\) (Phi in the code) value). This can be thought of as the multiplier (\(\alpha_{i}\)) required to re-“fatten” the Poisson point process for each species. The code behind this simulation can be found here. The red dot shown for each curve is its empirical \(\phi\) value (i.e. its frequency-weighted mean frequency).
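In code, the iterative targeting shown in the animation amounts to finding the \(\alpha_{i}\) at which the rescaled \(\phi_{i}\) equals \(\Phi\). Extending the earlier toy sketch (frescalo itself uses a successive-approximation loop rather than uniroot()):
f <- c(0.9, 0.6, 0.4, 0.2, 0.05) # hypothetical local frequencies
phiOf <- function(alpha, f) {
  fr <- 1 - (1 - f)^alpha # Hill's (2012) rescaling
  sum(fr^2)/sum(fr) # frequency-weighted mean frequency after rescaling
}
alphaHat <- uniroot(function(a) phiOf(a, f) - Phi, interval = c(0.1, 10))$root
alphaHat # the multiplier that achieves phi_i = Phi
phiOf(alphaHat, f) # check: equals the target Phi (0.74 here)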
Note that a key assumption of frescalo is that neighbourhood local frequency curves contain accurate information on species’ relative frequencies, even when sampling effort varies across space and time. This assumption will not always be satisfied, but it tends to hold at larger spatio-temporal scales, especially when the relevant spatial domain and time periods have experienced a reasonable level of recording. This is particularly likely when there have been national recording campaigns of some sort; indeed, frescalo was originally designed by Hill to deal with the types of data collected to produce species distribution atlases.
require(knitr, quietly=TRUE)
include_graphics(here("outputs/species_frequency_animation.gif"))
Animation demonstrating the idea of rescaling a neighbourhood species frequency curve
Finally, we perform the frescalo trend analysis. In the original fortran code, this was subroutine “tfcalc”. The trend.out object was simply described in the original frescalo README as the “listing of time factors for species”, although the standard deviations of these are also included. We have amended this function to also return the site x time period recording effort information used to make site/time period effort adjustments. This comprises two important things: (i) information on \(s_{it}\) (s_it in the code), the proportion of benchmarks recorded for a site within a time period, and (ii) w, an additional weight used by Hill to reduce the influence of site x time period combinations which have very few benchmarks and which are therefore considered to have received no systematic sampling effort in a time period. This addition was not described in the main paper of Hill (2012), but it is present in the fortran code and in the original Frescalo README file packaged with the fortran executable. To quote Hill’s README:
“In FRESCALO, data from times at a location in which the proportion of benchmark species (recording effort) is less than 0.1 are given low weight, ranging linearly from a minimum of 0.05 to 1.0 if sampling effort is 0.095. The logic of this is that where no systematic sample is taken, observations are isolated and unsupported, and are rather unsuitable for calculating time factors.”
The relevant line of code in the current implementation is
w[s_it<0.0995] = 10*s_it[s_it<0.0995]+0.005,
following the original fortran.
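To see what this rule does, we can write it as a standalone function of \(s_{it}\) and plot it (illustration only):
effortWeight <- function(s_it) ifelse(s_it < 0.0995, 10*s_it + 0.005, 1)
curve(effortWeight, from = 0, to = 0.3, xlab = expression(s[it]), ylab = "weight w")
abline(v = 0.0995, lty = 2, col = "grey") # below this threshold the weight falls away linearly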
# Trend analysis
if (trend_analysis) {
# Do the frescalo trend analysis if there is more than one time bin (use the same location groups as sSplit)
sSplit2 = split(s, as.factor(s$time)) # Split species data into time bins
trend.out <- foreach(i = 1:length(sSplit2), .inorder=TRUE, .combine='cfunTrend', .multicombine=TRUE) %dopar% {
trend(s_data = sSplit2[[i]], output$freq.out)
}
}
# View outputs
head(trend.out$trend.out, n = 10) # Trend information
site.time.out <- trend.out$site.time.out # Site x time period sampling effort information
head(site.time.out, n = 10)
# If we were saving trend output:
# if (trend_analysis) {
# write.table(format(trend.out$trend.out[order(trend.out$trend.out$species, trend.out$trend.out$time),], digits=4, zero.print=T, width=10, scientific=F, justify='left'),
# file=paste(outputPrefix,'frescalo_trend.txt',sep=''), col.names=T, row.names=F, quote=F, sep=' ')
# }
stopCluster(cl) # stop cluster
There are various things that we can do with these outputs, the most obvious being to plot the time trend for any given species along with its standard deviations. First, though, let us check that we get the same results that the original fortran program produces for this dataset.
## Check against fortran outputs
load(file = here("data/unicorn_TF.rda"))
fortranTrend <- unicorn_TF$trend
names(fortranTrend)[1:2] <- c("species","time")
para.trend <- trend.out$trend.out
para.trend$time <- ifelse(para.trend$time == 1, 1984.5, 1994.5)
tfCompare <- merge(fortranTrend, para.trend, by = c("time", "species"))
# Time factors
cor(tfCompare$tFactor, tfCompare$TFactor) # 0.998
## [1] 0.9984765
# Time factors SDs
cor(tfCompare$StDev.x, tfCompare$StDev.y) # 0.999
## [1] 0.9990594
par(mfrow=c(1,2))
plot(tfCompare$tFactor, tfCompare$TFactor, main = "Time factors")
abline(a = 0, b = 1)
plot(tfCompare$StDev.x, tfCompare$StDev.y, main = "Time factor SDs")
abline(a = 0, b = 1)
Apart from a couple of small discrepancies (likely due to the small, sparse dataset used for this example), the outputs are extremely close. The same pattern has been found with a real-world dataset on vascular plants in the Île-de-France area around Paris (Dr J. Vallet, June 2024, pers. comm.). Dr Vallet found correlations of 0.99921 for her time factors and 0.99997 for the time factor standard deviations. It is also worth noting that the correlations found here between the parallel R coding and the original fortran (see this script for other comparisons) were identical to the correlations between an independent loop-based coding in R and the original fortran (see here for these numbers), further confirming that all programs implement essentially the same algorithm. The main divergences between implementations relate to the rankings of species within neighbourhoods: ties between species’ frequencies within neighbourhoods are resolved slightly differently between the programs, but this typically only occurs for rare (non-benchmark) species and has no impact on estimated time trends. See L178-L182 here for an example.
Returning to general visualisation of the results, we can plot the trend for a species, along with the time factor standard deviations, as follows.
para.trend$time <- ifelse(para.trend$time == 1984.5, 1, 2) # recode back to original time bins
sppToPlot <- para.trend[para.trend$species=="Species 1",]
plot(1,type='n',
ylim = c(0,1.0),
xlim = c(0.75,2.25), xlab = 'Time', ylab ='Relative frequency',
main = paste0(unique(sppToPlot$species)), cex.main = 0.7, cex.lab = 0.7, cex.axis = 0.5)
abline(h = 0, col = "grey5", lty = 1, lwd = 0.7)
points(x = sppToPlot$time, y = sppToPlot$tFactor, pch = 21, col = "black", cex = 1, bg = "white")
arrows(x0 = sppToPlot$time, x1 = sppToPlot$time, y0 = sppToPlot$tFactor - sppToPlot$StDev,
y1 = sppToPlot$tFactor + sppToPlot$StDev, length = 0, col = "black")
We can then fit a model for the linear trend, for example, ideally respecting the uncertainty around the point estimates. Bootstrapped line ensembles (see here for a frescalo example) and Bayesian meta-regression are both potential options, depending on the aim and audience.
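As a rough sketch of the idea behind the helpers used below (the actual fitLinModSp() and extractDf() functions are sourced from frescalo_functions.R and may differ in detail), one could draw repeatedly from a normal distribution centred on each time factor with its standard deviation, and refit a linear model to each draw:
# Hypothetical sketch only; not the sourced helper functions
bootLinFits <- function(sp_df, n_draws = 100) {
  t(replicate(n_draws, {
    y_draw <- rnorm(nrow(sp_df), mean = sp_df$tFactor, sd = sp_df$StDev) # one draw per time point
    coef(lm(y_draw ~ sp_df$time)) # intercept and slope for this draw
  }))
}
# e.g. fits <- bootLinFits(sppToPlot) # each row is c(intercept, slope)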
# create 100 random draws for each species at each time point and fit a linear model to each draw
spLs <- lapply(unique(sppToPlot$species), function(x) sppToPlot[sppToPlot$species == x,])
names(spLs) <- unique(sppToPlot$species) # add names to list elements
out <- lapply(spLs, function(x) fitLinModSp(x)) # fit bootstrapped linear models using the sourced helper function
out2 <- lapply(out, FUN = extractDf) # extract the coefficient draws
### Create line ensemble plots from lists of bootstrapped linear model fits
par(mfrow=c(1,1))
for (i in seq_along(out2)[1]){ # simplified from code that loops over many species; here only the first is plotted
plot(1,type='n',
ylim = c(0.2,0.85),
xlim = c(1,2), xlab = 'Time', ylab ="",
main = "", cex.lab = 1.2, cex.axis = 1.0)
title(ylab = "Relative frequency", line = 2.2, cex.lab = 1.2)
## add 100 lines to ensemble plot
for (j in 1:100) {
abline(a = out2[[i]][j,1], b = out2[[i]][j,2], col = adjustcolor("blue", alpha = 0.05), lwd = 2)
}
## add original Frescalo (Hill, 2012) time-period specific means and standard deviations
abline(a = 0, b = 0, col = "grey")
arrows(x0 = spLs[[i]]$time, x1 = spLs[[i]]$time, y0 = spLs[[i]]$tFactor - spLs[[i]]$StDev,
y1 = spLs[[i]]$tFactor + spLs[[i]]$StDev, length = 0, col = "black", lwd = 0.8)
points(x = spLs[[i]]$time, y = spLs[[i]]$tFactor, pch = 21, col = "black", cex = 0.8, bg = "white")
}
We can also produce the data required to make maps of predicted species occupancies for different time periods, as first suggested by Bijlsma (2013). This works by setting \(s_{it}\) to 1 and then using a species’ time factor for each time period to adjust that species’ all-time local frequency estimate (freq_1 from the freq.out object returned by the frescalo() function above, or \(f'_{ij}\) from the original paper of Hill 2012). Ideally these estimates would also take uncertainty in freq_1 and in the time factor estimates into account, although we do not pursue that development here.
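In Hill’s (2012) model, the probability of recording species j at site i in time period t is \(1 - (1 - f'_{ij})^{x_{jt} s_{it}}\), where \(x_{jt}\) is the time factor; setting \(s_{it} = 1\) therefore gives the predicted occupancy \(P_{ijt} = 1 - (1 - f'_{ij})^{x_{jt}}\). A minimal sketch of the calculation we assume createPijt() performs (the sourced version may differ in detail):
# Assumed form of the occupancy prediction (illustration only)
predictOcc <- function(freq_1, tFactor) 1 - (1 - freq_1)^tFactor
predictOcc(0.4, 1.2) # e.g. all-time local frequency 0.4, time factor 1.2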
head(output$freq.out[output$freq.out$species=="Species 1",])
head(para.trend)
# using createPijt function from frescalo_functions.R file
list_data2 <- by(data = output$freq.out, as.factor(output$freq.out$species), function(x) createPijt(x, trend = para.trend))
options(scipen = 999)
occ.out <- do.call(rbind, list_data2)
row.names(occ.out) <- 1:nrow(occ.out)
occ.out$P_ijt <- round(occ.out$P_ijt, digits = 3)
head(occ.out, n = 5)
hist(occ.out$P_ijt) # show distribution of predicted occupancies