Aplication exercise for search and cleaning occurrences from GBif
Search for a species of interest Scarus guacamaia
library(rgbif)
#obtain data from GBIF via rgbif
dat <- occ_search(scientificName = "Scarus guacamaia", limit = 100000,
return = "data", hasCoordinate = T)
#names(dat$data) #a lot of columns
#select columns of interest
dat <- dat$data %>%
dplyr::select(species, decimalLongitude, decimalLatitude, countryCode,
individualCount,gbifID, family, taxonRank,
coordinateUncertaintyInMeters, year,
basisOfRecord, institutionCode, datasetName)
names(dat)
## [1] "species" "decimalLongitude"
## [3] "decimalLatitude" "countryCode"
## [5] "individualCount" "gbifID"
## [7] "family" "taxonRank"
## [9] "coordinateUncertaintyInMeters" "year"
## [11] "basisOfRecord" "institutionCode"
## [13] "datasetName"
dim(dat)
## [1] 21876 13
First let’s look the data
-remove records without coordinates
dat <- dat %>%
filter(!is.na(decimalLongitude))%>%
filter(!is.na(decimalLatitude))
dim(dat)
## [1] 21876 13
-Visualize the data on a map
library(ggplot2)
#plot data to get an overview
wm <- borders("world", colour="gray70", fill="gray80")
ggplot()+
coord_fixed()+
wm +
geom_point(data = dat,
aes(x = decimalLongitude, y = decimalLatitude),
colour = "red", size = 0.9)+
theme_bw()+
labs(title = paste("Spatial distribution of records",dat$species))
Then we will mark all suspicious records
#Use CoordinateCleaner to automatically flag problematic records
library(countrycode)
#convert country code from ISO2c to ISO3c
dat$countryCode <- countrycode(dat$countryCode,
origin = 'iso2c',
destination = 'iso3c')
library(CoordinateCleaner)
#flag problems
dat <- data.frame(dat)
flags <- clean_coordinates(x = dat,
lon = "decimalLongitude", lat = "decimalLatitude",
countries = "countryCode",
species = "species",
tests = c("capitals", "centroids", "equal",
"gbif", "institutions",
"zeros")) # most test are on by default
summary(flags)
## .val .equ .zer .cap .cen .gbf .inst .summary
## 0 0 0 23 2 0 1 26
plot(flags, lon = "decimalLongitude", lat = "decimalLatitude")
#Exclude problematic records
dat_cl <- dat[flags$.summary,]
dim(dat_cl)
## [1] 21850 13
#The flagged records
dat_fl <- dat[!flags$.summary,]
dim(dat_fl)
## [1] 26 13
-Remove records with low coordinate precision
hist(dat_cl$coordinateUncertaintyInMeters / 1000, breaks = 20)
dat_cl <- dat_cl %>%
dplyr::filter(coordinateUncertaintyInMeters / 1000 <= 100 | is.na(coordinateUncertaintyInMeters))
hist(dat_cl$coordinateUncertaintyInMeters / 1000, breaks = 20)
table(dat$basisOfRecord)
##
## HUMAN_OBSERVATION PRESERVED_SPECIMEN UNKNOWN
## 21703 169 4
dat_cl <- filter(dat_cl, basisOfRecord == "HUMAN_OBSERVATION")
table(dat_cl$basisOfRecord)
##
## HUMAN_OBSERVATION
## 21692
#suspicious individual counts removal
#Individual count
table(dat_cl$individualCount)
##
## 1 2 3 4 5 8 10
## 43 8 5 3 2 3 2
dat_cl <- dat_cl%>%
filter(individualCount > 0 | is.na(individualCount))%>%
filter(individualCount < 99 | is.na(individualCount)) # high counts are not a problem
#remove very old records
#Age of records
table(dat_cl$year)
##
## 1983 1988 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
## 1 1 142 331 169 486 566 920 1050 939 783 509 951 627 1288 871
## 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021
## 1678 1354 1673 1042 1941 133 1931 132 1689 128 132 142 13 65
dat_cl %>%
group_by(year) %>%
summarise(records=n()) %>%
ggplot(aes(year, records))+
geom_col(fill="steelblue")+
labs(title=paste("Number of records of:",dat_cl$species, "by year"),
y="",x="")+
theme_bw()
library("rnaturalearth")
library(sf)
library(ggspatial)
world <- ne_countries(scale = "medium", returnclass = "sf")
coords<-dat_cl %>%
dplyr::select(species,decimalLongitude, decimalLatitude ) %>%
rename(Longitude=decimalLongitude, Latitude=decimalLatitude)
coords_sf<- coords %>%
st_as_sf(coords=(2:3))
bbox_new <- st_bbox(coords_sf)
ggplot()+
geom_sf(data = world, fill= NA) +
coord_sf(xlim=c(bbox_new[1],bbox_new[3]),
ylim=c(bbox_new[2],bbox_new[4]))+
geom_point(data = dat_cl, aes(x = decimalLongitude, y = decimalLatitude),
colour = "cyan4", size = 1.2, alpha=0.6)+
annotation_scale(location = "br", width_hint = 0.1,line_width = 0.5) +
annotation_north_arrow(location = "tr", which_north = "true",
pad_x = unit(1.5, "cm"),
pad_y = unit(0.5, "cm"),
height = unit(1, "cm"),
width = unit(1, "cm"), # 0.2 # 0.3
style = north_arrow_fancy_orienteering)+
labs(x = "", y = "",
title =paste("Record distribution of:", dat_cl$species),
caption = paste("Source: GBif\nAuthor: Julián Avila- Jiménez\nDate:",
format(Sys.time(), '%d %B, %Y')))+
theme(plot.title=element_text(size=19),
plot.caption = element_text(size = 7, color="grey60"),
panel.background= element_rect(fill = "grey96"))
library(leaflet)
leaflet (dat_cl) %>%
addProviderTiles("Stamen.Terrain") %>%
addCircleMarkers(lng=~decimalLongitude,
lat=~decimalLatitude,
fillColor = "red",
color = "red",
radius = 5,
opacity = 0.7,
weight = 1)