Cleaning Locality Points

Hannah Owens
25 January, 2017

Cleaning Data Points

More points aren't always better.

  • Remove points with incomplete data
  • Remove identical points
  • Remove points with no environmental data
  • Reduce point resolution to match environmental data
  • Remove environmental outliers
    • Bad georeferences
    • Climate more heterogeneous than the resolution of the environmental data can capture

The libraries we'll use

library(spocc); #This one should look familiar
library(raster); #This one should, too
library(scrubr); #This one's new! It helps clean data!
library(spatstat); #Spatial statistics package with method for calculating nearest neighbor distance

Getting data

# Query every listed provider for Gadus morhua records that have coordinates.
spoccCod <- occ(
  query = "Gadus morhua",
  from = c("gbif", "bison", "inat", "ecoengine", "vertnet", "idigbio", "obis"),
  has_coords = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)
# Standardise the taxon name across providers to the query string.
spoccCod <- fixnames(spoccCod, how = "query")
# Combine the per-provider results into a single data frame.
allSpoccCod <- occ2df(spoccCod)
# Rewrite the dates as day / abbreviated month / year, e.g. "30Jan2018".
allSpoccCod <- date_standardize(allSpoccCod, "%d%b%Y")
head(allSpoccCod)
# A tibble: 6 x 6
  name         longitude latitude  prov  date      key       
  <chr>        <chr>     <chr>     <chr> <chr>     <chr>     
1 Gadus morhua 6.287291  62.475752 gbif  30Jan2018 1822719846
2 Gadus morhua 10.734329 59.251057 gbif  08Jan2018 1822575452
3 Gadus morhua 8.434516  63.308017 gbif  05Jan2018 1822524957
4 Gadus morhua 5.994916  62.456746 gbif  22Jan2018 1822306860
5 Gadus morhua 8.788955  58.429648 gbif  03Feb2018 1823839527
6 Gadus morhua 12.781221 55.825975 gbif  06Mar2018 1836288808

Using scrubr: removing incomplete coordinates

# Record count before scrubbing, for comparison with the count afterwards.
nrow(allSpoccCod);
[1] 2529
# Drop records with incomplete coordinates, then report how many remain.
scrubbedCod <- scrubr::coord_incomplete(allSpoccCod)
nrow(scrubbedCod)
[1] 2528

Using scrubr: removing unlikely coordinates

# Record count before the unlikely-coordinate filter is applied.
nrow(scrubbedCod);
[1] 2528
# Drop records that scrubr flags as having unlikely coordinates, then report
# how many remain.
scrubbedCod <- scrubr::coord_unlikely(scrubbedCod)
nrow(scrubbedCod)
[1] 2495

Removing duplicate occurrences

# Record count before duplicate occurrences are removed.
nrow(scrubbedCod);
[1] 2495
# Drop column 6 (the record key in the head() printout above), which would
# otherwise make every row distinct.
scrubbedCod <- scrubbedCod[, -6]
# Also ignore column 4 (the provider) when comparing rows, so the same record
# served by two providers counts once. Rows must still match on all remaining
# columns to be treated as duplicates.
withoutProvider <- scrubbedCod[, -4]
uniqueCod <- unique(withoutProvider)
nrow(uniqueCod)
[1] 1977

Removing points with no environmental data

# Record count before points without environmental data are removed.
nrow(uniqueCod)
[1] 1977
# Load the bathymetry raster that supplies the environmental data.
bathymetry <- raster("~/Dropbox/ENMSeminar/Labs:Homeworks/Lab4/marineBathymetry.asc")

# Build an explicit two-column numeric matrix from columns 2 and 3 (longitude
# and latitude in the head() printout above). Unlike sapply(), cbind() always
# yields a matrix, even for a single row, so the values are never mistaken for
# anything other than xy coordinates.
codCoords <- cbind(as.numeric(uniqueCod[[2]]), as.numeric(uniqueCod[[3]]))

# Namespaced so a later-loaded package cannot mask raster's extract().
codExtract <- raster::extract(bathymetry, codCoords)

# Append the extracted raster value as column 5 (named "codExtract").
codExtract <- cbind(uniqueCod, codExtract)

# Keep only the points for which the raster returned a value.
cleanCodExtract <- codExtract[!is.na(codExtract[[5]]), ]
nrow(cleanCodExtract)
[1] 1139

Reducing points to resolution of environmental data

# Record count before thinning the points to the raster resolution.
nrow(cleanCodExtract);
[1] 1139
# Thin the points until no retained point has a nearest neighbour closer than
# one raster cell (the coarser of the raster's x and y resolutions).
rasterResolution <- max(res(bathymetry))

# Coerce the coordinates (columns 2 and 3) to numeric once, up front, so the
# distance calculation never receives character data.
thinX <- as.numeric(cleanCodExtract[[2]])
thinY <- as.numeric(cleanCodExtract[[3]])
keepRows <- seq_len(nrow(cleanCodExtract))

# Each pass drops the first point that attains the smallest nearest-neighbour
# distance. The distances are computed once per pass and reused for both the
# stopping test and the removal.
while (length(keepRows) > 1) {
  nnD <- nndist(thinX[keepRows], thinY[keepRows])
  if (min(nnD) >= rasterResolution) {
    break
  }
  keepRows <- keepRows[-which.min(nnD)]
}

cleanCodExtract <- cleanCodExtract[keepRows, ]
# Renumber the rows consecutively; seq_len() also copes with an empty result.
row.names(cleanCodExtract) <- seq_len(nrow(cleanCodExtract))
nrow(cleanCodExtract)
[1] 101

The results

# Draw the bathymetry raster in greyscale as the base map, with the y-axis
# limited to 0-90.
plot(bathymetry, ylim = c(0, 90),
     col = gray.colors(10, start = 0.3, end = 0.9, gamma = 2.2, alpha = NULL))

# Overlay the raw records (default colour, black) and the cleaned records
# (red). The coordinates are passed as explicit numeric x/y vectors: handing
# points() a data frame with character columns would plot factor codes
# instead of the coordinate values.
points(as.numeric(allSpoccCod[[2]]), as.numeric(allSpoccCod[[3]]),
       pch = 16, cex = 2.5)
points(as.numeric(cleanCodExtract[[2]]), as.numeric(cleanCodExtract[[3]]),
       pch = 16, cex = 2.5, col = "red")

legend(-175, 20, c("Raw Cod", "Clean Cod"), pch = c(16, 16),
       cex = c(2.5, 2.5), col = c("black", "red"))

plot of chunk unnamed-chunk-8