# if (!require(devtools)) install.packages("devtools")
# devtools::install_github("yanlinlin82/ggvenn")
require(data.table)
library(knitr)
library(dplyr)
library(DT)
library(ggplot2)
library(ggvenn)
library(tidyverse)
library(readxl)
library(googledrive)
library(writexl)
# v1
stv1 <- fread('v1/References.csv')
names(stv1)[1:2] <- c('paperID', 'v1citation')
# Fix v1 ref ID
stv1id <- fread('v1/Table_reference_finalized.csv')
stv1 <- merge(stv1, stv1id, by.x = "paperID", by.y = "Mendeley_ref.see_word_document.", all.y = T)
stv1 <- stv1[,c("Article_ID","v1citation")]
stv1$Article_ID <- as.numeric(gsub("[^0-9.-]", "", stv1$Article_ID))
# v2
stv2 <- fread('v2/paper.ids.csv', encoding='Latin-1')
stv2[, citation := paste(author, yr, title, journal)]
## Use agrep to match on title
stv2[, v1citation := NA_character_] # the match citation from v2
stv2[, v1paperid := NA_integer_] # the match paperid from v2
stv2[, v2matches := NA_integer_] # N matches in v1
Check if papers in v2 are present in v1.
for(i in 1:nrow(stv2)){
cat(i)
matches <- agrep(stv2$title[i], stv1$v1citation, costs = list(ins = 0, del = 2, subs =1))
if(length(matches) == 1) {
if(is.na(stv2$v1citation[i])){
stv2$v1citation[i] <- stv1$v1citation[matches]
stv2$v1paperid[i] <- stv1$Article_ID[matches]
stv2$v2matches[i] <- 1
}
else{ # same reference matched - keep the closest match
matches_ <- c(stv2$v1citation[i], stv1$v1citation[matches])
distances <- adist(stv2$title[i], matches_)
keep <- which.min(distances)
cat(paste0('\nChose v1 citation=', matches_[keep], '\n'))
j <- which(stv1$v1citation %in% matches_[keep])
stv2$v1citation[i] <- stv1$v1citation[j]
stv2$v1paperid[i] <- stv1$Article_ID[j]
stv2$v2matches[i] <- 1
}
}
if(length(matches) > 1) {
if(is.na(stv2$v1citation[i])){
cat(paste0('\n\nFound more than 1 match for i=',i, '. v2 title=', stv2$title[i]))
distances <- adist(stv2$title[i], stv1$v1citation[matches])
keep <- which.min(distances)
cat(paste0('\nChose v1 citation=', stv1$v1citation[matches][keep], '\n'))
j <- which(stv1$v1citation %in% stv1$v1citation[matches][keep])
stv2$v1citation[i] <- stv1$v1citation[j]
stv2$v1paperid[i] <- stv1$Article_ID[j]
stv2$v2matches[i] <- as.numeric(length(matches))
}
else{ # same reference matched - keep the closest match
matches_ <- c(stv2$v1citation[i], stv1$v1citation[matches])
distances <- adist(stv2$title[i], matches_)
keep <- which.min(distances)
cat(paste0('\nChose v1 citation=', matches_[keep], '\n'))
j <- which(stv1$v1citation %in% matches_[keep])
stv2$v1citation[i] <- stv1$v1citation[j]
stv2$v1paperid[i] <- stv1$Article_ID[j]
stv2$v2matches[i] <- as.numeric(length(matches))
}
}
}
Some v2 citations with multiple matches in v1:
i = 2, 5, 11, 22, 38, 66, 78, 124, 173, 178, 179, 198, 208, 256, 266
Some citations in v2 were correctly identified v1, but others were not.
# write.csv(stv2, file = 'output/v1_v2_match_2check.csv', row.names=FALSE)
#
# v1v2 <- read.csv('output/v1_v2_match_checked.csv')
#
# # Fix matches
# v1v2$v1paperid[which(v1v2$match==0)] <- NA
# v1v2$v1citation[which(v1v2$match==0)] <- NA
#
# # Any ref in v2 with multiple matches in v1?
# names(which(table(v1v2$v1paperid)>1)) # fix these
#
# write.csv(v1v2, file = 'output/v1_v2_match_2check.csv', row.names=FALSE)
v1v2 <- read.csv('output/v1_v2_match_checked.csv')
# Fix matches
v1v2$v1paperid[which(v1v2$match==0)] <- NA
v1v2$v1citation[which(v1v2$match==0)] <- NA
# Any ref in v2 with multiple matches in v1?
names(which(table(v1v2$v1paperid)>1)) # fix these
## character(0)
From the 0 v2 citations with multiple matches in v1, 0 of these were correct matches.
RefFull <- v1v2[,c(1,8,9,10)]
names(RefFull) <- c("ID_v2","Ref_v2","Ref_v1", "ID_v1")
# add non-matching references from v1
RefFull <- merge(RefFull,stv1,
by.x = "ID_v1",by.y = "Article_ID",
all = T)
RefFull <- RefFull[,-4]
names(RefFull) <- c("ID_v1","ID_v2", "Ref_v2","Ref_v1")
RefFull$Refstd <- RefFull$Ref_v2
RefFull$Refstd[which(!is.na(RefFull$Ref_v1))] <- RefFull$Ref_v1[which(!is.na(RefFull$Ref_v1))]
# check
all(stv1$Article_ID %in% RefFull$ID_v1)
## [1] TRUE
all(stv2$paperid %in% RefFull$ID_v2)
## [1] TRUE
# check for duplicates
any(duplicated(na.omit(RefFull$ID_v1)))
## [1] FALSE
any(duplicated(na.omit(RefFull$ID_v2)))
## [1] FALSE
RefFull <- RefFull[,c(1,4,2,3,5)]
DT::datatable(RefFull,
rownames = FALSE,
extensions = 'Buttons',
options = list(pageLength = 100,
scrollY = "400px",
scrollX = T,
dom = 'Blfrtip',
buttons = c('csv', 'excel'),
lengthMenu = list(c(10,25,50,-1),
c(10,25,50,"All")))) %>%
DT::formatStyle(columns = colnames(RefFull), `fontSize` = '80%')
# create database with matches (ref included in both databases) between databases
RefMatch <- na.omit(RefFull)
# v1 refs
Refv1 = RefFull[which(!is.na(RefFull$Ref_v1)),c("ID_v1","ID_v2","Refstd")]
# v2 refs
Refv2 = RefFull[which(!is.na(RefFull$Ref_v2)),c("ID_v1","ID_v2","Refstd")]
# v1 only
Refv1only <- Refv1[which(!Refv1$ID_v1 %in% Refv2$ID_v1),]
# v2 only
Refv2only <- Refv2[which(!Refv2$ID_v2 %in% Refv1$ID_v2),]
# Number of matches and mismatches
x <- list(v1 = Refv1$Refstd,
v2 = Refv2$Refstd)
names(x) = c(paste0("Bioshifts v1 (", length(x$v1),")"),
paste0("Bioshifts v2 (", length(x$v2),")"))
ggvenn(
x,
fill_color = c("#0073C2FF", "#CD534CFF"),
stroke_size = 0.5, set_name_size = 4
)
Missing refs from v2 in v1 can be i) papers published after 2018 or ii) papers that register qualitative shifts, while missing papers from v1 in v2 can be due to papers looking at shifts in organisms that are not animals or plants (i.e., v2 did not included papers looking at shifts in algae, lichens…). Open the range shift data and clean.
# Load databases
# Load v1
biov1_raw <- read.table("v1/Shifts2018_checkedtaxo.txt",header = T)
# Fix references in biov1
biov1_raw$Article_ID <- gsub("[^0-9.-]", "", biov1_raw$Article_ID)
biov1_raw$Article_ID <- as.numeric(biov1_raw$Article_ID)
biov1_raw <- biov1_raw[-which(biov1_raw$valid_species_name=="no"),]
# create a shift ID
biov1_raw$S_ID_v1 <- sapply(1:nrow(biov1_raw), function(x) paste("S",x,sep = "_"))
# Load v2
# biov2_raw <- read.csv("v2/Bioshifts.v2.final.csv")
biov2_raw <- read_excel("v2/Bioshifts.v2.final.corrected.xlsx", sheet = 2) # new file sent by Sarah with start/end dates for shifts
# create a shift ID
biov2_raw$S_ID_v2 <- sapply(1:nrow(biov2_raw), function(x) paste("S",x,sep = "_"))
biov1 = biov1_raw
biov2 = biov2_raw
# Check if there are any references from the reference list absent from the raw databases
all(Refv1$ID_v1 %in% biov1_raw$Article_ID)
## [1] FALSE
all(Refv2$ID_v2 %in% biov2_raw$paperid)
## [1] FALSE
mis <- Refv1$ID_v1[which(!Refv1$ID_v1 %in% biov1_raw$Article_ID)]
mis
## [1] 91 101
# 2 papers from v1 reference list is missing from the v1 database
# Remove this paper from the v2 reference list
Refv1 = Refv1[which(!Refv1$ID_v1 %in% mis),]
Refv1only = Refv1only[which(!Refv1only$ID_v1 %in% mis),]
RefFull = RefFull[which(!RefFull$ID_v1 %in% mis),]
# check again
all(Refv1$ID_v1 %in% biov1_raw$Article_ID)
## [1] TRUE
# OK
unique(biov1$Param)
## [1] "O" "LE" "TE"
unique(biov2$Parameter)
## [1] "leading edge" "mean" "trailing edge" "maximum/optimum"
## [5] "east" "west"
# We kept "maximum/optimum" because some studies considered as "O" in v1 are classified as "maximum/optimum" in v2.
# V1 used the direction and angle (azimuth) to convert HOR shifts to LAT shifts, but these were entered as HOR shifts in the raw database. We have to call these LAT shifts instead
table(biov1$Type)
##
## BAT ELE HOR LAT LON
## 566 13451 4031 12859 1032
biov1$Type[which(!is.na(biov1$Azimuth))] <- "LAT" # All obs type == HOR contain Azimuth value
# flag whether LAT comes from Azimuth conversion
biov1$Azi = 0
biov1$Azi[which(!is.na(biov1$Azimuth))] <- 1
unique(biov1$Type)
## [1] "ELE" "LAT" "BAT" "LON"
biov1 <- biov1[which(biov1$Type=="ELE" | biov1$Type=="LAT"),]
table(biov2$Dimension)
##
## depth elevation latitude longitude
## 739 13610 15898 2762
biov2 <- biov2[which(biov2$Dimension=="latitude" | biov2$Dimension=="elevation"),]
Standardize shift measures to km/year
# v1
unique(biov1$UNIT)
## [1] "m/year" "km/year"
# from m/year to km/year
biov1 <- biov1 %>%
mutate(
kmyr = case_when(
UNIT == "m/year" ~ SHIFT/1000,
UNIT == "km/year" ~ SHIFT,
TRUE ~ NA_real_))
biov2$`Total Years Studied` <- as.numeric(biov2$`Total Years Studied`)
biov2$`Numeric Change` <- as.numeric(biov2$`Numeric Change`)
# v2
biov2 <- biov2 %>%
mutate(`Total Years Studied`=as.numeric(`Total Years Studied`)) %>%
mutate(
kmyr = case_when(
Unit == "ft" ~ `Numeric Change`/3280.8 /`Total Years Studied`,
Unit == 'degrees lat' ~ `Numeric Change`*111/`Total Years Studied`,
Unit =='degrees lat/year' ~ `Numeric Change`*111,
Unit=='degree latitude per decade'~ `Numeric Change`*111/10,
Unit=='km' ~ `Numeric Change`/`Total Years Studied`,
Unit=='km/decade' ~ `Numeric Change`/10,
Unit=='km/year' ~ `Numeric Change`,
Unit=='m' ~ `Numeric Change`/1000/`Total Years Studied`,
Unit=='m/year' ~ `Numeric Change`/1000,
Unit=='m/decade'~ `Numeric Change`/1000/10,
TRUE ~ NA_real_
)
)
# In v1 duration is (END - START) + 1
plot(biov1$DUR,biov1$END-biov1$START +1)
biov1$DUR[1]==biov1$END[1]-biov1$START[1]+1
## [1] TRUE
# In v2 duration should be the same, but years are not provided in their data
biov2$`Start Year` <- as.numeric(biov2$`Start Year`)
## Warning: NAs introduced by coercion
biov2$`End Year` <- as.numeric(biov2$`End Year`)
## Warning: NAs introduced by coercion
plot(biov2$`Total Years Studied`, biov2$`End Year`-biov2$`Start Year`+1)
biov2$`Total Years Studied`[1]==biov2$`End Year`[1]-biov2$`Start Year`[1]+1
## [1] TRUE
# get relevant columns
biov1 <- biov1[,c("Article_ID","Study_ID","Trend","Unit.trend","DUR","Type","Param","kmyr","Azi","New_name", "START", "END", "old_name","S_ID_v1")]
names(biov1) <- c("ID_v1","Study_ID_v1","Trend_v1","Unit.trend_v1","DUR_v1","Type_v1","Param_v1","SHIFT_v1","Azi","sp_name_v1", "START_v1", "END_v1","sp_reported_name_v1","S_ID_v1")
biov2 <- biov2[,c("Paper ID","Study Period","Numeric Change","Unit","Total Years Studied","Dimension","Parameter","kmyr","Scientific Name", "species", "Common Name", "Start Year", "End Year","Study Area", "Found on Table/Pg", "Notes","S_ID_v2")]
names(biov2) <- c("ID_v2","Study_ID_v2","Trend_v2","Unit.trend_v2","DUR_v2","Type_v2","Param_v2","SHIFT_v2","sp_name_v2","sp_reported_name_v2", "sp_common_name_v2", "START_v2", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2","S_ID_v2")
biov2$sp_name_v2 <- gsub(" ","_",biov2$sp_name_v2)
Use refs from clean databases:
Type: latitudinal or altitudinal
Param: LE, TE or O
unique(biov1$Type_v1)
## [1] "ELE" "LAT"
unique(biov2$Type_v2)
## [1] "latitude" "elevation"
biov2$Type_v2[which(biov2$Type_v2 == "latitude")] <- "LAT"
biov2$Type_v2[which(biov2$Type_v2 == "elevation")] <- "ELE"
unique(biov1$Param_v1)
## [1] "O" "LE" "TE"
unique(biov2$Param_v2)
## [1] "leading edge" "mean" "trailing edge" "maximum/optimum"
biov2$Param_v2[which(biov2$Param_v2 == "leading edge")] <- "LE"
biov2$Param_v2[which(biov2$Param_v2 == "mean")] <- "O"
biov2$Param_v2[which(biov2$Param_v2 == "maximum/optimum")] <- "O"
biov2$Param_v2[which(biov2$Param_v2 == "trailing edge")] <- "TE"
PS.: All shifts in v1 are quantitative, but v2 included qualitative shifts (e.g., yes/no).
# keep with categorical shifts for comparison
biov2cat <- biov2
biov2 <- biov2[-which(is.na(biov2$SHIFT_v2)),]
# save species list
splist <- data.frame(species = unique(c(biov1$sp_name_v1, biov2$sp_name_v2)))
splist$v1 <- 0
splist$v1[which(splist$species %in% biov1_raw$New_name)] <- 1
splist$v2 <- 0
splist$v2[which(splist$species %in% gsub(" ","_",biov2_raw$`Scientific Name`))] <- 1
# taxonomy
taxonomyv1 <- biov1_raw[,c("Kingdom", "Phylum", "Class", "Order", "Family", "New_name")]
taxonomyv2 <- biov2_raw[,c("kingdom", "phylum", "class", "order", "family", "Scientific Name")]
names(taxonomyv1) <- names(taxonomyv2)
taxonomy <- rbind(taxonomyv2,taxonomyv1)
taxonomy <- taxonomy[-which(duplicated(taxonomy$`Scientific Name`)),]
taxonomy$`Scientific Name` <- gsub(" ","_",taxonomy$`Scientific Name`)
splist <- merge(splist, taxonomy, by.x = "species", by.y = "Scientific Name", all.x = TRUE)
write.csv(splist, "output/splist.csv", row.names = F)
RefFullAF <- RefFull[which((RefFull$ID_v1 %in% biov1$ID_v1) | (RefFull$ID_v2 %in% biov2$ID_v2)),]
RefFullAFc <- RefFull[which((RefFull$ID_v1 %in% biov1$ID_v1) | (RefFull$ID_v2 %in% biov2cat$ID_v2)),]
# v1 refs
Refv1AF = filter(Refv1, ID_v1 %in% biov1$ID_v1)
# v2 refs excluding categorical shifts
Refv2AF = filter(Refv2, ID_v2 %in% biov2$ID_v2)
# v2 refs including categorical shifts
Refv2AFc = filter(Refv2, ID_v2 %in% biov2cat$ID_v2)
# v1 only
Refv1onlyAF <- filter(Refv1AF, !ID_v1 %in% Refv2AF$ID_v1)
# v2 only excluding categorical shifts
Refv2onlyAF <- filter(Refv2AF, !ID_v2 %in% Refv1AF$ID_v2)
# v2 only including categorical shifts
Refv2onlyAFc <- filter(Refv2AFc, !ID_v2 %in% Refv1AF$ID_v2)
# Number of matches and mismatches excluding categorical shifts from v2
x1 <- list(v1 = unique(Refv1AF$Refstd),
v2 = unique(Refv2AF$Refstd))
names(x1) = c(paste0("Bioshifts v1 (", length(x1$v1),")"),
paste0("Bioshifts v2 (", length(x1$v2),")"))
# Number of matches and mismatches including categorical shifts from v2
x1c <- list(v1 = unique(Refv1AF$Refstd),
v2 = unique(Refv2AFc$Refstd))
names(x1c) = c(paste0("Bioshifts v1 (", length(x1c$v1),")"),
paste0("Bioshifts v2 (", length(x1c$v2),")"))
# Matches excluding categorical shifts from v2
ggvenn(x1,
fill_color = c("#0073C2FF", "#CD534CFF"),
stroke_size = 0.5, set_name_size = 4)
# Matches including categorical shifts from v2
ggvenn(x1c,
fill_color = c("#0073C2FF", "#CD534CFF"),
stroke_size = 0.5, set_name_size = 4)
Greater number of uniques in v2 including qualitative shifts >> v1 did not include qualitative shifts. Increase in uniques in v1 after excluding qualitative shifts >> indicative that some quantitative shifts in v1 are included as qualitative in v2.
Merging the two databases could lead to a total of 402 studies. There are 8721 shift measures from v1 not included in v2, and 2453 shift measures from v2 not included in v1.
Papers absent in v1 but present in v2.
Possible reasons:
1) year of publication (v1 compiled data until 2018; v2 compiled data
until 2020)
2) data requested was not received in v1 but received in v2
3) qualitative shift
4) excluded from v1 based on criteria
v1missing <- Refv2onlyAFc[,c(2:3)]
# total v1missing
nrow(v1missing)
## [1] 144
v1missing$R_yr <- 0
v1missing$R_request <- 0
v1missing$R_quali <- 0
v1missing$R_excluded <- 0
# Reason: Year >> papers published after >2018
tmp. <- select(filter(stv2, yr > 2018), paperid)
v1missing$R_yr[which(v1missing$ID_v2 %in% tmp.$paperid)] <- 1
table(v1missing$R_yr) # articles
##
## 0 1
## 121 23
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% tmp.$paperid)])) # shifts
## [1] 1735
# Reason: Data requested but not received
## Load papers requested in v1
tmp. <- read_xlsx("v1/Base_contact.xlsx")
## List of papers from which data was not received
tmp. <- filter(tmp., is.na(Article_ID))
tmp.. <- strsplit(tmp.$References, "_")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp.$Year <- sapply(tmp.., function(x) x[3])
tmp. <- tmp.[,c("Author","Year","Title")]
## Are these papers in v2?
v2author <- sapply(strsplit(stv2$author, " "), function(x) x[1])
inv2 <- stv2[which(paste(v2author,stv2$yr) %in% paste(tmp.$Author,tmp.$Year)),]
## Feed
v1missing$R_request[which(v1missing$ID_v2 %in% inv2$paperid)] <- 1
table(v1missing$R_request) # articles
##
## 0 1
## 142 2
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% inv2$paperid)])) # shifts
## [1] 15
# Reason: Qualitative data in v2
tmp. <- biov2cat$ID_v2[which(is.na(biov2cat$SHIFT_v2))]
v1missing$R_quali[which(v1missing$ID_v2 %in% tmp.)] <- 1
table(v1missing$R_quali) # articles
##
## 0 1
## 35 109
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% tmp.)])) # shifts
## [1] 1265
# List of excluded papers
tmp. <- read_xlsx("v1/Base_Rebut.xlsx")
tmp.. <- strsplit(tmp.$References, "_")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp.$Year <- sapply(tmp.., function(x) x[3])
tmp. <- tmp.[,c("Author","Year")]
## Are these papers in v2?
v2author <- sapply(strsplit(stv2$author, " "), function(x) x[1])
inv2 <- stv2[which(paste(v2author,stv2$yr) %in% paste(tmp.$Author,tmp.$Year)),]
## Feed
v1missing$R_excluded[which(v1missing$ID_v2 %in% inv2$paperid)] <- 1
table(v1missing$R_excluded) # articles
##
## 0 1
## 126 18
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% inv2$paperid)])) # shifts
## [1] 27
DT::datatable(v1missing,
rownames = FALSE,
extensions = 'Buttons',
options = list(pageLength = 100,
scrollY = "400px",
scrollX = T,
dom = 'Blfrtip',
buttons = c('csv', 'excel'),
lengthMenu = list(c(10,25,50,-1),
c(10,25,50,"All")))) %>%
DT::formatStyle(columns = colnames(v1missing), `fontSize` = '80%')
Papers absent in v2 but present in v1.
Possible reasons:
1) Fail in extracting the data, fail in getting the data, fail because
no complete species list…
2) data requested but not received
3) LAT calculated estimated from Azimuth (v1 converted LONG to LAT using
Azimuth)
4) If paper included non-animal or plant organisms (e.g., fungi, virus,
algae, phytoplancton…)
5) Shift in v1 extracted from figure
v2missing <- Refv1onlyAF[,c(1,3)]
# total v2missing
nrow(v2missing)
## [1] 96
v2missing$R_request <- 0
v2missing$R_fail <- 0
v2missing$R_LatFromAz <- 0
v2missing$R_taxa <- 0
v2missing$R_fig <- 0
# Reason: Data requested but not received
## Load papers requested in v2
tmp. <- read_xlsx("v2/Data Contact Sheet.xlsx",skip = 1)
## List of papers from which data was not received
tmp. <- tmp.[which(grepl("No,",tmp.$`Data obtained?`) | grepl("No ",tmp.$`Data obtained?`) | is.na(tmp.$`Data obtained?`)),]
tmp.. <- strsplit(tmp.$Author, " ")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp. <- tmp.[,c("Author","Year","Title")]
## Are these papers in v1?
inv1 <- unlist(sapply(1:nrow(tmp.), function(x)
which((grepl(tmp.$Author[x], Refv1$Refstd) & grepl(tmp.$Year[x], Refv1$Refstd)) |
grepl(tmp.$Title[x], Refv1$Refstd))))
inv1 <- Refv1[inv1,]
## Feed
v2missing$R_request[which(v2missing$ID_v1 %in% inv1$ID_v1)] <- 1
table(v2missing$R_request) # articles
##
## 0 1
## 86 10
length(na.omit(biov1$SHIFT_v1[which(biov1$ID_v1 %in% inv1$ID_v1)])) # shifts
## [1] 1788
# Reason: Papers that fail in getting data
tmp. <- read_xlsx("v2/BIF Matrix_v1.xlsx",sheet = 1)
## New names:
## • `` -> `...12`
## • `` -> `...16`
tmp. <- tmp.[which(grepl("Fail",tmp.$Notes)),]
tmp.. <- strsplit(tmp.$Author, " ")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp. <- tmp.[,c("Author","Year","Title")]
## Are these papers in v1?
inv1 <- unlist(sapply(1:nrow(tmp.), function(x)
which((grepl(tmp.$Author[x], Refv1$Refstd) & grepl(tmp.$Year[x], Refv1$Refstd)) |
grepl(tmp.$Title[x], Refv1$Refstd))))
inv1 <- Refv1[inv1,]
## Feed
v2missing$R_fail[which(v2missing$ID_v1 %in% inv1$ID_v1)] <- 1
table(v2missing$R_fail) # articles
##
## 0 1
## 80 16
length(na.omit(biov1$SHIFT_v1[which(biov1$ID_v1 %in% inv1$ID_v1)])) # shifts
## [1] 1385
# Reason: Papers from which LAT shift was estimated from Azimuth transformation in v1
tmp. <- unique(biov1$ID_v1[which(biov1$Azi==1)])
v2missing$R_LatFromAz[which(v2missing$ID_v1 %in% tmp.)] <- 1
table(v2missing$R_LatFromAz)
##
## 0 1
## 86 10
# Reason: Papers from which organisms were not animals/plants in v1
tmp. <- filter(biov1_raw, biov1_raw$Phylum=="Tracheophyta" |
biov1_raw$Phylum=="Bryophyta" |
biov1_raw$Kingdom == "Animalia")
tmp. <- biov1_raw$ID[-which(biov1_raw$ID %in% tmp.$ID)]
v2missing$R_taxa[which(v2missing$ID_v1 %in% tmp.)] <- 1
table(v2missing$R_taxa) # articles
##
## 0
## 96
length(tmp.) # shifts
## [1] 129
# Reason: Shift extracted from figure in v1
tmp. <- read_xlsx("v1/Check_qualitatifShiftsBIOSHIFTS.xlsx")
## New names:
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
tmp. <- tmp.[grep(pattern = "yes", tmp.$ExtractedFigure),]
tmp. <- tmp.$ID_v1
v2missing$R_fig[which(v2missing$ID_v1 %in% tmp.)] <- 1
table(v2missing$R_fig) # articles
##
## 0 1
## 85 11
length(na.omit(biov1$SHIFT_v1[which(biov1$ID_v1 %in% tmp.)])) # shifts
## [1] 918
DT::datatable(v2missing,
rownames = FALSE,
extensions = 'Buttons',
options = list(pageLength = 100,
scrollY = "400px",
scrollX = T,
dom = 'Blfrtip',
buttons = c('csv', 'excel'),
lengthMenu = list(c(10,25,50,-1),
c(10,25,50,"All")))) %>%
DT::formatStyle(columns = colnames(v2missing), `fontSize` = '80%')
# Number of matches and mismatches excluding categorical shifts from v2
x1 <- list(v1 = unique(biov1$sp_name_v1),
v2 = unique(biov2cat$sp_name_v2))
names(x1) = c(paste0("Bioshifts v1 (", length(x1$v1),")"),
paste0("Bioshifts v2 (", length(x1$v2),")"))
ggvenn(x1,
fill_color = c("#0073C2FF", "#CD534CFF"),
stroke_size = 0.5, set_name_size = 4)
# total N species
totalsps <- length(unique(c(x1[[1]],x1[[2]])))
totalsps
## [1] 16149
# species unique to v1
uniquev1 <- length(which(!x1[[1]] %in% x1[[2]]))
uniquev1 ; uniquev1/totalsps
## [1] 3132
## [1] 0.1939439
# species unique to v2
uniquev2 <- length(which(!x1[[2]] %in% x1[[1]]))
uniquev2 ; uniquev2/totalsps
## [1] 3870
## [1] 0.2396433
# species in common
spincommon <- length(which(x1[[1]] %in% x1[[2]]))
spincommon ; spincommon/totalsps
## [1] 9147
## [1] 0.5664128
Merge should be made between the same species present in the same studies in both databases.
biov1 <- merge(biov1, Refv2AF[,1:2], all.x = T)
# differentiate multiple Study_IDs
## For v1
key <- paste(biov1$ID_v1, biov1$Type_v1, biov1$Param_v1, biov1$sp_name_v1, biov1$Study_ID_v1, sep = "__")
length(key)
## [1] 30341
length(unique(key))
## [1] 29274
# make key unique
key <- make.unique(key,sep = "-")
length(key) == length(unique(key))
## [1] TRUE
biov1$Study_ID_v1 <- as.character(sapply(key, function(x) strsplit(x, "__")[[1]][5]))
## For v2
key <- paste(biov2cat$ID_v2, biov2cat$Type_v2, biov2cat$Param_v2, biov2cat$sp_name_v2, biov2cat$Study_ID_v2, sep = "__")
length(key)
## [1] 29508
length(unique(key))
## [1] 24751
# make key unique
key <- make.unique(key,sep = "-")
length(key) == length(unique(key))
## [1] TRUE
biov2cat$Study_ID_v2 <- as.character(sapply(key, function(x) strsplit(x, "__")[[1]][5]))
# Merge
biov1$key <- paste(biov1$ID_v2,biov1$Type_v1,biov1$Param_v1,biov1$sp_name_v1, sep = "__")
biov2cat$key <- paste(biov2cat$ID_v2,biov2cat$Type_v2,biov2cat$Param_v2,biov2cat$sp_name_v2, sep = "__")
v1v2_merge <- plyr::join(biov1, biov2cat,
by= "key",
type="full")
test = which(is.na(v1v2_merge$Type_v1))
v1v2_merge[which(is.na(v1v2_merge$Type_v1)), c("ID_v2","Type_v1", "Param_v1", "sp_name_v1")] <- v1v2_merge[which(is.na(v1v2_merge$Type_v1)), c("ID_v2","Type_v2", "Param_v2", "sp_name_v2")]
v1v2_merge <- v1v2_merge[,c("ID_v1","ID_v2","Type_v1", "Param_v1", "sp_name_v1", "Study_ID_v1", "Study_ID_v2","Unit.trend_v1","Unit.trend_v2","Trend_v1","Trend_v2","DUR_v1","DUR_v2","SHIFT_v1", "SHIFT_v2", "key", "sp_reported_name_v1", "sp_reported_name_v2", "sp_common_name_v2", "START_v1", "START_v2", "END_v1", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2", "S_ID_v1", "S_ID_v2")]
colnames(v1v2_merge)[1:5] <- c("ID_v1","ID_v2","Type", "Param", "sp_name")
col_names = c('Type', 'Param', 'sp_name')
col_1 = colnames(v1v2_merge)[grep('_v1$', colnames(v1v2_merge))]
col_2 = colnames(v1v2_merge)[grep('_v2$', colnames(v1v2_merge))]
# Na to refs without shift in v1
# v1v2_merge$ID_v1[which(is.na(v1v2_merge$Unit.trend_v1))] <- NA
# Na to refs without shift in v2
# v1v2_merge$ID_v2[which(is.na(v1v2_merge$Unit.trend_v2))] <- NA
v1v2_merge$key <- paste(v1v2_merge$ID_v1,v1v2_merge$ID_v2,v1v2_merge$Type,v1v2_merge$Param,v1v2_merge$sp_name, sep = "__")
keys = unique(v1v2_merge$key)
# i = which(keys == '5__233__ELE__O__Ptilidium_ciliare') # 1 x 2
# i = which(keys == '1__150__ELE__O__Garrulus_glandarius') # 2 x 2
# i = which(keys == '1__150__ELE__O__Periparus_ater') # 2 x 2
# i = which(keys == '147__283__ELE__O__Eriogonum_microthecum') # 5 x 1
# i = which(keys == '55__103__ELE__LE__Peromyscus_boylii') # 3 x 3 # case of duplicated order
# i = which(keys == 'NA__103__ELE__LE__Hesperosciurus_griseus') # 3 x 3
# i = which(keys == '1__150__ELE__O__Parus_major') # 2 x 2
# i = which(keys == '1__NA__ELE__O__Sylvia_cantillans') # 2 x 2
# i = which(keys == '121__78__ELE__LE__Salix_serpyllifolia') #
# i = which(keys == 'NA__103__ELE__LE__Hesperosciurus_griseus') #
# i = which(keys == '1__150__ELE__O__Parus_major') #
# i = which(keys == '151__49__LAT__LE__Acropora_hyacinthus') #
# test species
# "Cyanocitta_stelleri"
library(parallel)
library(pbapply)
cl <- makeCluster(4)
clusterExport(cl, c("v1v2_merge","keys","col_names","col_1","col_2"))
DAT = pblapply(1:length(keys), function(i) {
# DAT = list()
# for(i in 1:length(keys)) {
# cat("\r", i)
dat = v1v2_merge[v1v2_merge$key == keys[i], ]
if (nrow(dat) > 1) {
if (!all(is.na(dat$SHIFT_v1))) {
na_1 = which(duplicated(dat$SHIFT_v1) & duplicated(dat$Study_ID_v1))
if (length(na_1) > 0) {
dat_1 = dat[-na_1, col_1]
dat_1 = dat_1[order(as.numeric(dat_1$SHIFT_v1)), ]
} else {
dat_1 = dat[ , col_1]
dat_1 = dat_1[order(as.numeric(dat_1$SHIFT_v1)), ]
}
dat_1$fake_id = 1:nrow(dat_1)
dat_1$SHIFT_v1 = as.numeric(dat_1$SHIFT_v1)
} else {
dat_1 = dat[ , col_1]
dat_1 = dat_1[order(as.numeric(dat_1$SHIFT_v1)), ]
dat_1$fake_id = 1:nrow(dat_1)
}
if (all(is.na(dat$SHIFT_v2))) {
dat_2 = dat[ , col_2]
dat_2 = dat_2[order(as.numeric(dat_2$SHIFT_v2)), ]
dat_2$fake_id = 1:nrow(dat_2)
} else {
na_2 = which(duplicated(dat$SHIFT_v2) & duplicated(dat$Study_ID_v2))
if (length(na_2) > 0) {
dat_2 = dat[-na_2, col_2]
dat_2 = dat_2[order(as.numeric(dat_2$SHIFT_v2)), ]
dat_2$fake_id = 1:nrow(dat_2)
} else {
dat_2 = dat[ , col_2]
dat_2 = dat_2[order(as.numeric(dat_2$SHIFT_v2)), ]
dat_2$fake_id = 1:nrow(dat_2)
}
if(all(is.na(dat$SHIFT_v1))){
dat_2$fake_id = 1:nrow(dat_2)
} else {
# force seq v2 to match seq in v1
myord <- data.frame()
# dat_1 <- as.numeric(dat_1$SHIFT_v1)
# dat_2 <- as.numeric(dat_2$SHIFT_v2)
for(j in 1:nrow(dat_2)){
tmp <- min(na.omit(abs(dat_2$SHIFT_v2[j] - dat_1$SHIFT_v1)))
tmp.1 <- which(tmp == abs(dat_2$SHIFT_v2[j] - dat_1$SHIFT_v1))[1]
myord <- rbind(myord,
data.frame(fake_id = tmp.1, dis = tmp))
}
# dat_1
# dat_2
# myord
mydups <- duplicated(dat_2$id)
if(any(mydups)){
mydups <- which(mydups)
dat_2 <- cbind(dat_2, myord)
for(k in 1:length(mydups)){
dup.tmp <- myord$fake_id[mydups[k]]
myrows <- which(myord$fake_id == dup.tmp)
dat_2.tmp <- dat_2[myrows,]
dat_2.tmp$fake_id <- order(-dat_2.tmp$fake_id, dat_2.tmp$dis)
dat_2 <- dat_2[,-which(names(dat_2) %in% names(myord))]
dat_2.tmp <- dat_2.tmp[,-which(names(dat_2.tmp) %in% names(myord))]
dat_2[myrows,] <- dat_2.tmp
}
}
}
}
mat <- merge(dat_1,dat_2,by="fake_id",all=T)
mat = cbind(dat[1:nrow(mat),col_names],mat)
mat = mat[,-which(names(mat)=="fake_id")]
}else{
mat = dat[,c(col_names,col_1,col_2)]
mat$ID_v1 <- unique(mat$ID_v1)
mat$ID_v2 <- unique(mat$ID_v2)
}
#mat
mat$key=keys[i]
return(mat)
#DAT[[i]]=mat
}
,
cl = cl)
stopCluster(cl)
DAT = do.call("rbind",DAT)
DAT = DAT[,c("ID_v1","ID_v2","Type", "Param", "sp_name", "Study_ID_v1", "Study_ID_v2","Unit.trend_v1","Unit.trend_v2","Trend_v1","Trend_v2","DUR_v1","DUR_v2","SHIFT_v1", "SHIFT_v2", "sp_reported_name_v1", "sp_reported_name_v2", "sp_common_name_v2", "START_v1", "START_v2", "END_v1", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2", "key", "S_ID_v1", "S_ID_v2")]
DAT$SHIFT_v2 <- as.numeric(DAT$SHIFT_v2)
# Flag with C if shift in v2 is qualitative
## qualitative ids from v2
idbiov2cat <- paste(biov2cat$ID_v2, biov2cat$Type_v2, biov2cat$Param_v2, biov2cat$sp_name_v2, sep = "__")
idbiov2cat <- idbiov2cat[which(is.na(biov2cat$SHIFT_v2))]
idbiov2cat <- unique(na.omit(idbiov2cat))
idbiov2cat <- which((paste(DAT$ID_v2, DAT$Type, DAT$Param, DAT$sp_name, sep = "__") %in% idbiov2cat) & is.na(DAT$Trend_v2))
DAT$SHIFT_v2[idbiov2cat] <- DAT$Trend_v2[idbiov2cat] <- "C"
# test
# All obs should be included in both DAT and biov1
idDATv1 <- paste(DAT$ID_v1, DAT$Type, DAT$Param, DAT$sp_name, sep = "__")
idDATv1 <- idDATv1[-grep("NA__",idDATv1)]
idDATv1 <- unique(na.omit(idDATv1))
idbiov1 <- paste(biov1$ID_v1, biov1$Type_v1, biov1$Param_v1, biov1$sp_name_v1, sep = "__")
idbiov1 <- unique(na.omit(idbiov1))
length(idDATv1) == length(idbiov1)
## [1] TRUE
idDATv2 <- paste(DAT$ID_v2, DAT$Type, DAT$Param, DAT$sp_name, sep = "__")
idDATv2 <- idDATv2[-grep("NA__",idDATv2)]
idDATv2 <- unique(na.omit(idDATv2))
idbiov2cat <- paste(biov2cat$ID_v2, biov2cat$Type_v2, biov2cat$Param_v2, biov2cat$sp_name_v2, sep = "__")
idbiov2cat <- unique(na.omit(idbiov2cat))
length(idDATv2) == length(idbiov2cat)
## [1] FALSE
DT::datatable(DAT,
rownames = FALSE,
extensions = 'Buttons',
options = list(pageLength = 100,
scrollY = "400px",
scrollX = T,
dom = 'Blfrtip',
buttons = c('csv', 'excel'),
lengthMenu = list(c(10,25,50,-1),
c(10,25,50,"All")))) %>%
DT::formatStyle(columns = colnames(DAT), `fontSize` = '80%')
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html