1 setup

# if (!require(devtools)) install.packages("devtools")
# devtools::install_github("yanlinlin82/ggvenn")

require(data.table)
library(knitr)
library(dplyr)
library(DT)
library(ggplot2)
library(ggvenn)
library(tidyverse)
library(readxl)
library(googledrive)
library(writexl)

2 Load references

# v1
stv1 <- fread('v1/References.csv')
names(stv1)[1:2] <- c('paperID', 'v1citation')
# Fix v1 ref ID
stv1id <- fread('v1/Table_reference_finalized.csv')
stv1 <- merge(stv1, stv1id, by.x = "paperID", by.y = "Mendeley_ref.see_word_document.", all.y = T)
stv1 <- stv1[,c("Article_ID","v1citation")]
stv1$Article_ID <- as.numeric(gsub("[^0-9.-]", "", stv1$Article_ID))

# v2
stv2 <- fread('v2/paper.ids.csv', encoding='Latin-1')
stv2[, citation := paste(author, yr, title, journal)]

## Use agrep to match on title
stv2[, v1citation := NA_character_] # the match citation from v2
stv2[, v1paperid := NA_integer_] # the match paperid from v2
stv2[, v2matches := NA_integer_] # N matches in v1

3 Compare reference list

Check if papers in v2 are present in v1.

for(i in 1:nrow(stv2)){ 
    cat(i)
    matches <- agrep(stv2$title[i], stv1$v1citation, costs = list(ins = 0, del = 2, subs =1))
    if(length(matches) == 1) {
        if(is.na(stv2$v1citation[i])){
            stv2$v1citation[i] <- stv1$v1citation[matches]
            stv2$v1paperid[i] <- stv1$Article_ID[matches]
            stv2$v2matches[i] <- 1
        }
        else{ # same reference matched - keep the closest match
            matches_ <- c(stv2$v1citation[i], stv1$v1citation[matches])
            distances <- adist(stv2$title[i], matches_)
            keep <- which.min(distances)
            cat(paste0('\nChose v1 citation=', matches_[keep], '\n'))
            j <- which(stv1$v1citation %in% matches_[keep])
            stv2$v1citation[i] <- stv1$v1citation[j]
            stv2$v1paperid[i] <- stv1$Article_ID[j]
            stv2$v2matches[i] <- 1
        }
    }
    if(length(matches) > 1) {
        if(is.na(stv2$v1citation[i])){
            cat(paste0('\n\nFound more than 1 match for i=',i, '. v2 title=', stv2$title[i]))
            distances <- adist(stv2$title[i], stv1$v1citation[matches])
            keep <- which.min(distances)
            cat(paste0('\nChose v1 citation=', stv1$v1citation[matches][keep], '\n'))
            j <- which(stv1$v1citation %in% stv1$v1citation[matches][keep])
            stv2$v1citation[i] <- stv1$v1citation[j]
            stv2$v1paperid[i] <- stv1$Article_ID[j]
            stv2$v2matches[i] <- as.numeric(length(matches))
        }
        else{ # same reference matched - keep the closest match
            matches_ <- c(stv2$v1citation[i], stv1$v1citation[matches])
            distances <- adist(stv2$title[i], matches_)
            keep <- which.min(distances)
            cat(paste0('\nChose v1 citation=', matches_[keep], '\n'))
            j <- which(stv1$v1citation %in% matches_[keep])
            stv2$v1citation[i] <- stv1$v1citation[j]
            stv2$v1paperid[i] <- stv1$Article_ID[j]
            stv2$v2matches[i] <- as.numeric(length(matches))
        }
    }
}

Some v2 citations with multiple matches in v1:

i = 2, 5, 11, 22, 38, 66, 78, 124, 173, 178, 179, 198, 208, 256, 266

Some citations in v2 were correctly identified v1, but others were not.

3.1 Manually check for inconsistencies in matching. Open in file in Excel and fix.

# write.csv(stv2, file = 'output/v1_v2_match_2check.csv', row.names=FALSE)
# 
# v1v2 <- read.csv('output/v1_v2_match_checked.csv')
# 
# # Fix matches
# v1v2$v1paperid[which(v1v2$match==0)] <- NA
# v1v2$v1citation[which(v1v2$match==0)] <- NA
# 
# # Any ref in v2 with multiple matches in v1?
# names(which(table(v1v2$v1paperid)>1)) # fix these
# 
# write.csv(v1v2, file = 'output/v1_v2_match_2check.csv', row.names=FALSE)

v1v2 <- read.csv('output/v1_v2_match_checked.csv')

# Fix matches
v1v2$v1paperid[which(v1v2$match==0)] <- NA
v1v2$v1citation[which(v1v2$match==0)] <- NA

# Any ref in v2 with multiple matches in v1?
names(which(table(v1v2$v1paperid)>1)) # fix these
## character(0)

From the 0 v2 citations with multiple matches in v1, 0 of these were correct matches.

4 Create a full reference list

RefFull <- v1v2[,c(1,8,9,10)]
names(RefFull) <- c("ID_v2","Ref_v2","Ref_v1", "ID_v1") 

# add non-matching references from v1
RefFull <- merge(RefFull,stv1,
                 by.x = "ID_v1",by.y = "Article_ID",
                 all = T)

RefFull <- RefFull[,-4]
names(RefFull) <- c("ID_v1","ID_v2", "Ref_v2","Ref_v1") 

RefFull$Refstd <- RefFull$Ref_v2
RefFull$Refstd[which(!is.na(RefFull$Ref_v1))] <- RefFull$Ref_v1[which(!is.na(RefFull$Ref_v1))]

# check
all(stv1$Article_ID %in% RefFull$ID_v1)
## [1] TRUE
all(stv2$paperid %in% RefFull$ID_v2)
## [1] TRUE
# check for duplicates
any(duplicated(na.omit(RefFull$ID_v1)))
## [1] FALSE
any(duplicated(na.omit(RefFull$ID_v2)))
## [1] FALSE
RefFull <- RefFull[,c(1,4,2,3,5)]

DT::datatable(RefFull,
              rownames = FALSE,
              extensions = 'Buttons',
              options = list(pageLength = 100, 
                             scrollY = "400px",
                             scrollX = T,
                             dom = 'Blfrtip',
                             buttons = c('csv', 'excel'),
                             lengthMenu = list(c(10,25,50,-1),
                                               c(10,25,50,"All")))) %>%
    DT::formatStyle(columns = colnames(RefFull), `fontSize` = '80%')

5 Match summary

# create database with matches (ref included in both databases) between databases
RefMatch <- na.omit(RefFull)

# v1 refs
Refv1 = RefFull[which(!is.na(RefFull$Ref_v1)),c("ID_v1","ID_v2","Refstd")]
# v2 refs
Refv2 = RefFull[which(!is.na(RefFull$Ref_v2)),c("ID_v1","ID_v2","Refstd")]
# v1 only
Refv1only <- Refv1[which(!Refv1$ID_v1 %in% Refv2$ID_v1),]
# v2 only
Refv2only <- Refv2[which(!Refv2$ID_v2 %in% Refv1$ID_v2),]

# Number of matches and mismatches
x <- list(v1 = Refv1$Refstd,
          v2 = Refv2$Refstd)

names(x) = c(paste0("Bioshifts v1 (", length(x$v1),")"),
             paste0("Bioshifts v2 (", length(x$v2),")"))

ggvenn(
    x,
    fill_color = c("#0073C2FF", "#CD534CFF"),
    stroke_size = 0.5, set_name_size = 4
)

Missing refs from v2 in v1 can be i) papers published after 2018 or ii) papers that register qualitative shifts, while missing papers from v1 in v2 can be due to papers looking at shifts in organisms that are not animals or plants (i.e., v2 did not included papers looking at shifts in algae, lichens…). Open the range shift data and clean.

6 Load range shift data

# Load databases

# Load v1
biov1_raw <- read.table("v1/Shifts2018_checkedtaxo.txt",header = T)
# Fix references in biov1
biov1_raw$Article_ID <- gsub("[^0-9.-]", "", biov1_raw$Article_ID)
biov1_raw$Article_ID <- as.numeric(biov1_raw$Article_ID)
biov1_raw <- biov1_raw[-which(biov1_raw$valid_species_name=="no"),]
# create a shift ID
biov1_raw$S_ID_v1 <- sapply(1:nrow(biov1_raw), function(x) paste("S",x,sep = "_"))

# Load v2 
# biov2_raw <- read.csv("v2/Bioshifts.v2.final.csv")
biov2_raw <- read_excel("v2/Bioshifts.v2.final.corrected.xlsx", sheet = 2) # new file sent by Sarah with start/end dates for shifts
# create a shift ID
biov2_raw$S_ID_v2 <- sapply(1:nrow(biov2_raw), function(x) paste("S",x,sep = "_"))

biov1 = biov1_raw
biov2 = biov2_raw

# Check if there are any references from the reference list absent from the raw databases
all(Refv1$ID_v1 %in% biov1_raw$Article_ID)
## [1] FALSE
all(Refv2$ID_v2 %in% biov2_raw$paperid)
## [1] FALSE
mis <- Refv1$ID_v1[which(!Refv1$ID_v1 %in% biov1_raw$Article_ID)]
mis
## [1]  91 101
# 2 papers from v1 reference list is missing from the v1 database
# Remove this paper from the v2 reference list
Refv1 = Refv1[which(!Refv1$ID_v1 %in% mis),]
Refv1only = Refv1only[which(!Refv1only$ID_v1 %in% mis),]
RefFull = RefFull[which(!RefFull$ID_v1 %in% mis),]

# check again
all(Refv1$ID_v1 %in% biov1_raw$Article_ID)
## [1] TRUE
# OK

6.1 Use shift type leading, tralling or centroid.

unique(biov1$Param)
## [1] "O"  "LE" "TE"
unique(biov2$Parameter)
## [1] "leading edge"    "mean"            "trailing edge"   "maximum/optimum"
## [5] "east"            "west"
# We kept "maximum/optimum" because some studies considered as "O" in v1 are classified as "maximum/optimum" in v2.

6.2 Use only latitudinal and elevational shifts

# V1 used the direction and angle (azimuth) to convert HOR shifts to LAT shifts, but these were entered as HOR shifts in the raw database. We have to call these LAT shifts instead

table(biov1$Type)
## 
##   BAT   ELE   HOR   LAT   LON 
##   566 13451  4031 12859  1032
biov1$Type[which(!is.na(biov1$Azimuth))] <- "LAT" # All obs type == HOR contain Azimuth value
# flag whether LAT comes from Azimuth conversion
biov1$Azi = 0
biov1$Azi[which(!is.na(biov1$Azimuth))] <- 1

unique(biov1$Type)
## [1] "ELE" "LAT" "BAT" "LON"
biov1 <- biov1[which(biov1$Type=="ELE" | biov1$Type=="LAT"),]

table(biov2$Dimension)
## 
##     depth elevation  latitude longitude 
##       739     13610     15898      2762
biov2 <- biov2[which(biov2$Dimension=="latitude" | biov2$Dimension=="elevation"),]

6.3 Standardize range shifts

Standardize shift measures to km/year

# v1
unique(biov1$UNIT)
## [1] "m/year"  "km/year"
# from m/year to km/year
biov1 <- biov1 %>% 
    mutate(
        kmyr = case_when(
            UNIT == "m/year" ~ SHIFT/1000, 
            UNIT == "km/year" ~ SHIFT, 
            TRUE ~ NA_real_))

biov2$`Total Years Studied` <- as.numeric(biov2$`Total Years Studied`)
biov2$`Numeric Change` <- as.numeric(biov2$`Numeric Change`)

# v2
biov2 <- biov2 %>%
    mutate(`Total Years Studied`=as.numeric(`Total Years Studied`)) %>% 
    mutate(
        kmyr = case_when(
            Unit == "ft" ~ `Numeric Change`/3280.8 /`Total Years Studied`,
            Unit == 'degrees lat' ~ `Numeric Change`*111/`Total Years Studied`,
            Unit =='degrees lat/year' ~ `Numeric Change`*111,
            Unit=='degree latitude per decade'~ `Numeric Change`*111/10,
            Unit=='km' ~ `Numeric Change`/`Total Years Studied`,
            Unit=='km/decade' ~ `Numeric Change`/10,
            Unit=='km/year' ~ `Numeric Change`,
            Unit=='m' ~ `Numeric Change`/1000/`Total Years Studied`,
            Unit=='m/year' ~ `Numeric Change`/1000,
            Unit=='m/decade'~ `Numeric Change`/1000/10,
            TRUE ~ NA_real_
        )
    )

6.4 Duration

# In v1 duration is (END - START) + 1
plot(biov1$DUR,biov1$END-biov1$START +1)

biov1$DUR[1]==biov1$END[1]-biov1$START[1]+1
## [1] TRUE
# In v2 duration should be the same, but years are not provided in their data
biov2$`Start Year` <- as.numeric(biov2$`Start Year`)
## Warning: NAs introduced by coercion
biov2$`End Year` <- as.numeric(biov2$`End Year`)
## Warning: NAs introduced by coercion
plot(biov2$`Total Years Studied`, biov2$`End Year`-biov2$`Start Year`+1)

biov2$`Total Years Studied`[1]==biov2$`End Year`[1]-biov2$`Start Year`[1]+1
## [1] TRUE

6.5 Get columns of interest

# get relevant columns
biov1 <- biov1[,c("Article_ID","Study_ID","Trend","Unit.trend","DUR","Type","Param","kmyr","Azi","New_name", "START", "END", "old_name","S_ID_v1")]
names(biov1) <- c("ID_v1","Study_ID_v1","Trend_v1","Unit.trend_v1","DUR_v1","Type_v1","Param_v1","SHIFT_v1","Azi","sp_name_v1", "START_v1", "END_v1","sp_reported_name_v1","S_ID_v1")

biov2 <- biov2[,c("Paper ID","Study Period","Numeric Change","Unit","Total Years Studied","Dimension","Parameter","kmyr","Scientific Name", "species", "Common Name", "Start Year", "End Year","Study Area", "Found on Table/Pg", "Notes","S_ID_v2")]
names(biov2) <- c("ID_v2","Study_ID_v2","Trend_v2","Unit.trend_v2","DUR_v2","Type_v2","Param_v2","SHIFT_v2","sp_name_v2","sp_reported_name_v2", "sp_common_name_v2", "START_v2", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2","S_ID_v2")
biov2$sp_name_v2 <- gsub(" ","_",biov2$sp_name_v2)

6.6 Standardize categories

Use refs from clean databases:
Type: latitudinal or altitudinal
Param: LE, TE or O

unique(biov1$Type_v1)
## [1] "ELE" "LAT"
unique(biov2$Type_v2) 
## [1] "latitude"  "elevation"
biov2$Type_v2[which(biov2$Type_v2 == "latitude")] <- "LAT"
biov2$Type_v2[which(biov2$Type_v2 == "elevation")] <- "ELE"

unique(biov1$Param_v1)
## [1] "O"  "LE" "TE"
unique(biov2$Param_v2)
## [1] "leading edge"    "mean"            "trailing edge"   "maximum/optimum"
biov2$Param_v2[which(biov2$Param_v2 == "leading edge")] <- "LE"
biov2$Param_v2[which(biov2$Param_v2 == "mean")] <- "O"
biov2$Param_v2[which(biov2$Param_v2 == "maximum/optimum")] <- "O"
biov2$Param_v2[which(biov2$Param_v2 == "trailing edge")] <- "TE"

6.7 Use only numberic shifts (remove qualitative shifts)

PS.: All shifts in v1 are quantitative, but v2 included qualitative shifts (e.g., yes/no).

# keep with categorical shifts for comparison
biov2cat <- biov2
biov2 <- biov2[-which(is.na(biov2$SHIFT_v2)),]

7 Save species list

# save species list
splist <- data.frame(species = unique(c(biov1$sp_name_v1, biov2$sp_name_v2)))
splist$v1 <- 0
splist$v1[which(splist$species %in% biov1_raw$New_name)] <- 1
splist$v2 <- 0
splist$v2[which(splist$species %in% gsub(" ","_",biov2_raw$`Scientific Name`))] <- 1

# taxonomy
taxonomyv1 <- biov1_raw[,c("Kingdom", "Phylum", "Class", "Order", "Family", "New_name")]
taxonomyv2 <- biov2_raw[,c("kingdom", "phylum", "class", "order", "family", "Scientific Name")]
names(taxonomyv1) <- names(taxonomyv2)
taxonomy <- rbind(taxonomyv2,taxonomyv1)
taxonomy <- taxonomy[-which(duplicated(taxonomy$`Scientific Name`)),]
taxonomy$`Scientific Name` <- gsub(" ","_",taxonomy$`Scientific Name`)

splist <- merge(splist, taxonomy, by.x = "species", by.y = "Scientific Name", all.x = TRUE)

write.csv(splist, "output/splist.csv", row.names = F)

8 Match summary using cleanned databases

RefFullAF <- RefFull[which((RefFull$ID_v1 %in% biov1$ID_v1) | (RefFull$ID_v2 %in% biov2$ID_v2)),]
RefFullAFc <- RefFull[which((RefFull$ID_v1 %in% biov1$ID_v1) | (RefFull$ID_v2 %in% biov2cat$ID_v2)),]

# v1 refs
Refv1AF = filter(Refv1, ID_v1 %in% biov1$ID_v1)
# v2 refs excluding categorical shifts
Refv2AF = filter(Refv2, ID_v2 %in% biov2$ID_v2)
# v2 refs including categorical shifts
Refv2AFc = filter(Refv2, ID_v2 %in% biov2cat$ID_v2)

# v1 only
Refv1onlyAF <- filter(Refv1AF, !ID_v1 %in% Refv2AF$ID_v1)
# v2 only excluding categorical shifts
Refv2onlyAF <- filter(Refv2AF, !ID_v2 %in% Refv1AF$ID_v2)
# v2 only including categorical shifts
Refv2onlyAFc <- filter(Refv2AFc, !ID_v2 %in% Refv1AF$ID_v2)

# Number of matches and mismatches excluding categorical shifts from v2
x1 <- list(v1 = unique(Refv1AF$Refstd),
           v2 = unique(Refv2AF$Refstd))

names(x1) = c(paste0("Bioshifts v1 (", length(x1$v1),")"),
              paste0("Bioshifts v2 (", length(x1$v2),")"))

# Number of matches and mismatches including categorical shifts from v2
x1c <- list(v1 = unique(Refv1AF$Refstd),
            v2 = unique(Refv2AFc$Refstd))

names(x1c) = c(paste0("Bioshifts v1 (", length(x1c$v1),")"),
               paste0("Bioshifts v2 (", length(x1c$v2),")"))

# Matches excluding categorical shifts from v2
ggvenn(x1,
       fill_color = c("#0073C2FF", "#CD534CFF"),
       stroke_size = 0.5, set_name_size = 4)

# Matches including categorical shifts from v2
ggvenn(x1c,
       fill_color = c("#0073C2FF", "#CD534CFF"),
       stroke_size = 0.5, set_name_size = 4)

Greater number of uniques in v2 including qualitative shifts >> v1 did not include qualitative shifts. Increase in uniques in v1 after excluding qualitative shifts >> indicative that some quantitative shifts in v1 are included as qualitative in v2.

Merging the two databases could lead to a total of 402 studies. There are 8721 shift measures from v1 not included in v2, and 2453 shift measures from v2 not included in v1.

8.1 Missing papers from v2 in v1

Papers absent in v1 but present in v2.
Possible reasons:
1) year of publication (v1 compiled data until 2018; v2 compiled data until 2020)
2) data requested was not received in v1 but received in v2
3) qualitative shift
4) excluded from v1 based on criteria

v1missing <- Refv2onlyAFc[,c(2:3)]
# total v1missing
nrow(v1missing)
## [1] 144
v1missing$R_yr <- 0
v1missing$R_request <- 0
v1missing$R_quali <- 0
v1missing$R_excluded <- 0

# Reason: Year >> papers published after >2018
tmp. <- select(filter(stv2, yr > 2018), paperid)
v1missing$R_yr[which(v1missing$ID_v2 %in% tmp.$paperid)] <- 1
table(v1missing$R_yr) # articles
## 
##   0   1 
## 121  23
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% tmp.$paperid)])) # shifts
## [1] 1735
# Reason: Data requested but not received
## Load papers requested in v1
tmp. <- read_xlsx("v1/Base_contact.xlsx")
## List of papers from which data was not received
tmp. <- filter(tmp., is.na(Article_ID))
tmp.. <- strsplit(tmp.$References, "_")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp.$Year <- sapply(tmp.., function(x) x[3])
tmp. <- tmp.[,c("Author","Year","Title")]
## Are these papers in v2?
v2author <- sapply(strsplit(stv2$author, " "), function(x) x[1])
inv2 <- stv2[which(paste(v2author,stv2$yr) %in% paste(tmp.$Author,tmp.$Year)),]
## Feed
v1missing$R_request[which(v1missing$ID_v2 %in% inv2$paperid)] <- 1

table(v1missing$R_request) # articles
## 
##   0   1 
## 142   2
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% inv2$paperid)])) # shifts
## [1] 15
# Reason: Qualitative data in v2
tmp. <- biov2cat$ID_v2[which(is.na(biov2cat$SHIFT_v2))]
v1missing$R_quali[which(v1missing$ID_v2 %in% tmp.)] <- 1

table(v1missing$R_quali) # articles
## 
##   0   1 
##  35 109
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% tmp.)])) # shifts
## [1] 1265
# List of excluded papers
tmp. <- read_xlsx("v1/Base_Rebut.xlsx")
tmp.. <- strsplit(tmp.$References, "_")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp.$Year <- sapply(tmp.., function(x) x[3])
tmp. <- tmp.[,c("Author","Year")]
## Are these papers in v2?
v2author <- sapply(strsplit(stv2$author, " "), function(x) x[1])
inv2 <- stv2[which(paste(v2author,stv2$yr) %in% paste(tmp.$Author,tmp.$Year)),]
## Feed
v1missing$R_excluded[which(v1missing$ID_v2 %in% inv2$paperid)] <- 1

table(v1missing$R_excluded) # articles
## 
##   0   1 
## 126  18
length(na.omit(biov2$SHIFT_v2[which(biov2$ID_v2 %in% inv2$paperid)])) # shifts
## [1] 27
DT::datatable(v1missing,
              rownames = FALSE,
              extensions = 'Buttons',
              options = list(pageLength = 100, 
                             scrollY = "400px",
                             scrollX = T,
                             dom = 'Blfrtip',
                             buttons = c('csv', 'excel'), 
                             lengthMenu = list(c(10,25,50,-1),
                                               c(10,25,50,"All")))) %>%
    DT::formatStyle(columns = colnames(v1missing), `fontSize` = '80%')

8.2 Missing papers from v1 in v2

Papers absent in v2 but present in v1.
Possible reasons:
1) Fail in extracting the data, fail in getting the data, fail because no complete species list…
2) data requested but not received
3) LAT calculated estimated from Azimuth (v1 converted LONG to LAT using Azimuth)
4) If paper included non-animal or plant organisms (e.g., fungi, virus, algae, phytoplancton…)
5) Shift in v1 extracted from figure

v2missing <- Refv1onlyAF[,c(1,3)]

# total v2missing
nrow(v2missing)
## [1] 96
v2missing$R_request <- 0
v2missing$R_fail <- 0
v2missing$R_LatFromAz <- 0
v2missing$R_taxa <- 0
v2missing$R_fig <- 0

# Reason: Data requested but not received
## Load papers requested in v2
tmp. <- read_xlsx("v2/Data Contact Sheet.xlsx",skip = 1)
## List of papers from which data was not received
tmp. <- tmp.[which(grepl("No,",tmp.$`Data obtained?`) | grepl("No ",tmp.$`Data obtained?`) | is.na(tmp.$`Data obtained?`)),]
tmp.. <- strsplit(tmp.$Author, " ")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp. <- tmp.[,c("Author","Year","Title")]
## Are these papers in v1?
inv1 <- unlist(sapply(1:nrow(tmp.), function(x) 
    which((grepl(tmp.$Author[x], Refv1$Refstd) & grepl(tmp.$Year[x], Refv1$Refstd)) |
              grepl(tmp.$Title[x], Refv1$Refstd))))
inv1 <- Refv1[inv1,]
## Feed
v2missing$R_request[which(v2missing$ID_v1 %in% inv1$ID_v1)] <- 1

table(v2missing$R_request) # articles
## 
##  0  1 
## 86 10
length(na.omit(biov1$SHIFT_v1[which(biov1$ID_v1 %in% inv1$ID_v1)])) # shifts
## [1] 1788
# Reason: Papers that fail in getting data
tmp. <- read_xlsx("v2/BIF Matrix_v1.xlsx",sheet = 1)
## New names:
## • `` -> `...12`
## • `` -> `...16`
tmp. <- tmp.[which(grepl("Fail",tmp.$Notes)),]
tmp.. <- strsplit(tmp.$Author, " ")
tmp.$Author <- sapply(tmp.., function(x) x[1])
tmp. <- tmp.[,c("Author","Year","Title")]
## Are these papers in v1?
inv1 <- unlist(sapply(1:nrow(tmp.), function(x) 
    which((grepl(tmp.$Author[x], Refv1$Refstd) & grepl(tmp.$Year[x], Refv1$Refstd)) |
              grepl(tmp.$Title[x], Refv1$Refstd))))
inv1 <- Refv1[inv1,]
## Feed
v2missing$R_fail[which(v2missing$ID_v1 %in% inv1$ID_v1)] <- 1

table(v2missing$R_fail) # articles
## 
##  0  1 
## 80 16
length(na.omit(biov1$SHIFT_v1[which(biov1$ID_v1 %in% inv1$ID_v1)])) # shifts
## [1] 1385
# Reason: Papers from which LAT shift was estimated from Azimuth transformation in v1
tmp. <- unique(biov1$ID_v1[which(biov1$Azi==1)])
v2missing$R_LatFromAz[which(v2missing$ID_v1 %in% tmp.)] <- 1
table(v2missing$R_LatFromAz)
## 
##  0  1 
## 86 10
# Reason: Papers from which organisms were not animals/plants in v1
tmp. <- filter(biov1_raw, biov1_raw$Phylum=="Tracheophyta" | 
                   biov1_raw$Phylum=="Bryophyta" | 
                   biov1_raw$Kingdom == "Animalia")
tmp. <- biov1_raw$ID[-which(biov1_raw$ID %in% tmp.$ID)]
v2missing$R_taxa[which(v2missing$ID_v1 %in% tmp.)] <- 1

table(v2missing$R_taxa) # articles
## 
##  0 
## 96
length(tmp.) # shifts
## [1] 129
# Reason: Shift extracted from figure in v1
tmp. <- read_xlsx("v1/Check_qualitatifShiftsBIOSHIFTS.xlsx")
## New names:
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
tmp. <- tmp.[grep(pattern = "yes", tmp.$ExtractedFigure),]
tmp. <- tmp.$ID_v1
v2missing$R_fig[which(v2missing$ID_v1 %in% tmp.)] <- 1

table(v2missing$R_fig) # articles
## 
##  0  1 
## 85 11
length(na.omit(biov1$SHIFT_v1[which(biov1$ID_v1 %in% tmp.)])) # shifts
## [1] 918
DT::datatable(v2missing,
              rownames = FALSE,
              extensions = 'Buttons',
              options = list(pageLength = 100, 
                             scrollY = "400px",
                             scrollX = T,
                             dom = 'Blfrtip',
                             buttons = c('csv', 'excel'), 
                             lengthMenu = list(c(10,25,50,-1),
                                               c(10,25,50,"All")))) %>%
    DT::formatStyle(columns = colnames(v2missing), `fontSize` = '80%')

8.3 Species in common between v1 in v2

# Number of matches and mismatches excluding categorical shifts from v2
x1 <- list(v1 = unique(biov1$sp_name_v1),
           v2 = unique(biov2cat$sp_name_v2))

names(x1) = c(paste0("Bioshifts v1 (", length(x1$v1),")"),
              paste0("Bioshifts v2 (", length(x1$v2),")"))

ggvenn(x1,
       fill_color = c("#0073C2FF", "#CD534CFF"),
       stroke_size = 0.5, set_name_size = 4)

# total N species
totalsps <- length(unique(c(x1[[1]],x1[[2]])))
totalsps
## [1] 16149
# species unique to v1
uniquev1 <- length(which(!x1[[1]] %in% x1[[2]]))
uniquev1 ; uniquev1/totalsps
## [1] 3132
## [1] 0.1939439
# species unique to v2
uniquev2 <- length(which(!x1[[2]] %in% x1[[1]]))
uniquev2 ; uniquev2/totalsps
## [1] 3870
## [1] 0.2396433
# species in common
spincommon <- length(which(x1[[1]] %in% x1[[2]]))
spincommon ; spincommon/totalsps
## [1] 9147
## [1] 0.5664128

9 Merge bioshifts v1 and v2

Merge should be made between the same species present in the same studies in both databases.

biov1 <- merge(biov1, Refv2AF[,1:2], all.x = T)

# differentiate multiple Study_IDs
## For v1
key <- paste(biov1$ID_v1, biov1$Type_v1, biov1$Param_v1, biov1$sp_name_v1, biov1$Study_ID_v1, sep = "__")
length(key)
## [1] 30341
length(unique(key))
## [1] 29274
# make key unique
key <- make.unique(key,sep = "-")
length(key) == length(unique(key))
## [1] TRUE
biov1$Study_ID_v1 <- as.character(sapply(key, function(x) strsplit(x, "__")[[1]][5]))

## For v2
key <- paste(biov2cat$ID_v2, biov2cat$Type_v2, biov2cat$Param_v2, biov2cat$sp_name_v2, biov2cat$Study_ID_v2, sep = "__")
length(key)
## [1] 29508
length(unique(key))
## [1] 24751
# make key unique
key <- make.unique(key,sep = "-")
length(key) == length(unique(key))
## [1] TRUE
biov2cat$Study_ID_v2 <- as.character(sapply(key, function(x) strsplit(x, "__")[[1]][5]))

# Merge
biov1$key <- paste(biov1$ID_v2,biov1$Type_v1,biov1$Param_v1,biov1$sp_name_v1, sep = "__")
biov2cat$key <- paste(biov2cat$ID_v2,biov2cat$Type_v2,biov2cat$Param_v2,biov2cat$sp_name_v2, sep = "__")

v1v2_merge <- plyr::join(biov1, biov2cat, 
                         by= "key",
                         type="full")
test = which(is.na(v1v2_merge$Type_v1))

v1v2_merge[which(is.na(v1v2_merge$Type_v1)), c("ID_v2","Type_v1", "Param_v1", "sp_name_v1")] <- v1v2_merge[which(is.na(v1v2_merge$Type_v1)), c("ID_v2","Type_v2", "Param_v2", "sp_name_v2")]

v1v2_merge <- v1v2_merge[,c("ID_v1","ID_v2","Type_v1", "Param_v1", "sp_name_v1", "Study_ID_v1", "Study_ID_v2","Unit.trend_v1","Unit.trend_v2","Trend_v1","Trend_v2","DUR_v1","DUR_v2","SHIFT_v1", "SHIFT_v2", "key", "sp_reported_name_v1", "sp_reported_name_v2", "sp_common_name_v2", "START_v1", "START_v2", "END_v1", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2", "S_ID_v1", "S_ID_v2")]

colnames(v1v2_merge)[1:5] <- c("ID_v1","ID_v2","Type", "Param", "sp_name")

col_names = c('Type', 'Param', 'sp_name')
col_1 = colnames(v1v2_merge)[grep('_v1$', colnames(v1v2_merge))]
col_2 = colnames(v1v2_merge)[grep('_v2$', colnames(v1v2_merge))]

# Na to refs without shift in v1
# v1v2_merge$ID_v1[which(is.na(v1v2_merge$Unit.trend_v1))] <- NA
# Na to refs without shift in v2
# v1v2_merge$ID_v2[which(is.na(v1v2_merge$Unit.trend_v2))] <- NA

v1v2_merge$key <- paste(v1v2_merge$ID_v1,v1v2_merge$ID_v2,v1v2_merge$Type,v1v2_merge$Param,v1v2_merge$sp_name, sep = "__")

keys = unique(v1v2_merge$key)

# i = which(keys == '5__233__ELE__O__Ptilidium_ciliare') # 1 x 2
# i = which(keys == '1__150__ELE__O__Garrulus_glandarius') # 2 x 2
# i = which(keys == '1__150__ELE__O__Periparus_ater') # 2 x 2
# i = which(keys == '147__283__ELE__O__Eriogonum_microthecum') # 5 x 1
# i = which(keys == '55__103__ELE__LE__Peromyscus_boylii') # 3 x 3 # case of duplicated order 
# i = which(keys == 'NA__103__ELE__LE__Hesperosciurus_griseus') # 3 x 3
# i = which(keys == '1__150__ELE__O__Parus_major') # 2 x 2
# i = which(keys == '1__NA__ELE__O__Sylvia_cantillans') # 2 x 2
# i = which(keys == '121__78__ELE__LE__Salix_serpyllifolia') #
# i = which(keys == 'NA__103__ELE__LE__Hesperosciurus_griseus') # 
# i = which(keys == '1__150__ELE__O__Parus_major') # 
# i = which(keys == '151__49__LAT__LE__Acropora_hyacinthus') # 

# test species
# "Cyanocitta_stelleri"

library(parallel)
library(pbapply)

cl <- makeCluster(4)
clusterExport(cl, c("v1v2_merge","keys","col_names","col_1","col_2"))

DAT = pblapply(1:length(keys), function(i) {
    # DAT = list()
    # for(i in 1:length(keys)) {
    #     cat("\r", i)
    
    dat = v1v2_merge[v1v2_merge$key == keys[i], ]
    
    if (nrow(dat) > 1) {
        
        if (!all(is.na(dat$SHIFT_v1))) {
            
            na_1 = which(duplicated(dat$SHIFT_v1) & duplicated(dat$Study_ID_v1))
            
            if (length(na_1) > 0) {
                dat_1 = dat[-na_1, col_1]
                dat_1 = dat_1[order(as.numeric(dat_1$SHIFT_v1)), ]
            } else {
                dat_1 = dat[ , col_1]
                dat_1 = dat_1[order(as.numeric(dat_1$SHIFT_v1)), ]
            }
            dat_1$fake_id = 1:nrow(dat_1)
            dat_1$SHIFT_v1 = as.numeric(dat_1$SHIFT_v1)
            
        } else {
            dat_1 = dat[ , col_1]
            dat_1 = dat_1[order(as.numeric(dat_1$SHIFT_v1)), ]
            dat_1$fake_id = 1:nrow(dat_1)
        }
        
        if (all(is.na(dat$SHIFT_v2))) {
            dat_2 = dat[ , col_2]
            dat_2 = dat_2[order(as.numeric(dat_2$SHIFT_v2)), ]
            dat_2$fake_id = 1:nrow(dat_2)
        } else {
            na_2 = which(duplicated(dat$SHIFT_v2) & duplicated(dat$Study_ID_v2))
            
            if (length(na_2) > 0) {
                dat_2 = dat[-na_2, col_2]
                dat_2 = dat_2[order(as.numeric(dat_2$SHIFT_v2)), ]
                dat_2$fake_id = 1:nrow(dat_2)
            } else {
                dat_2 = dat[ , col_2]
                dat_2 = dat_2[order(as.numeric(dat_2$SHIFT_v2)), ]
                dat_2$fake_id = 1:nrow(dat_2)
            }
            if(all(is.na(dat$SHIFT_v1))){
                dat_2$fake_id = 1:nrow(dat_2)
            } else {
                
                # force seq v2 to match seq in v1
                myord <- data.frame()
                # dat_1 <- as.numeric(dat_1$SHIFT_v1)
                # dat_2 <- as.numeric(dat_2$SHIFT_v2)
                
                for(j in 1:nrow(dat_2)){
                    tmp <- min(na.omit(abs(dat_2$SHIFT_v2[j] - dat_1$SHIFT_v1)))
                    tmp.1 <- which(tmp == abs(dat_2$SHIFT_v2[j] - dat_1$SHIFT_v1))[1]
                    myord <- rbind(myord,
                                   data.frame(fake_id = tmp.1, dis = tmp))
                }
                
                # dat_1
                # dat_2
                # myord
                
                mydups <- duplicated(dat_2$id)
                
                if(any(mydups)){
                    
                    mydups <- which(mydups)
                    dat_2 <- cbind(dat_2, myord)
                    for(k in 1:length(mydups)){
                        dup.tmp <- myord$fake_id[mydups[k]]
                        myrows <- which(myord$fake_id == dup.tmp)
                        dat_2.tmp <- dat_2[myrows,]
                        dat_2.tmp$fake_id <- order(-dat_2.tmp$fake_id, dat_2.tmp$dis)
                        dat_2 <- dat_2[,-which(names(dat_2) %in% names(myord))]
                        dat_2.tmp <- dat_2.tmp[,-which(names(dat_2.tmp) %in% names(myord))]
                        dat_2[myrows,] <- dat_2.tmp
                    }
                }
            } 
        }
        mat <- merge(dat_1,dat_2,by="fake_id",all=T)
        mat = cbind(dat[1:nrow(mat),col_names],mat)
        mat = mat[,-which(names(mat)=="fake_id")]
    }else{
        mat = dat[,c(col_names,col_1,col_2)]
        mat$ID_v1 <- unique(mat$ID_v1)
        mat$ID_v2 <- unique(mat$ID_v2)
    }
    #mat
    mat$key=keys[i]
    return(mat)
    #DAT[[i]]=mat
}
,
cl = cl)

stopCluster(cl)

DAT = do.call("rbind",DAT)

DAT = DAT[,c("ID_v1","ID_v2","Type", "Param", "sp_name", "Study_ID_v1", "Study_ID_v2","Unit.trend_v1","Unit.trend_v2","Trend_v1","Trend_v2","DUR_v1","DUR_v2","SHIFT_v1", "SHIFT_v2", "sp_reported_name_v1", "sp_reported_name_v2", "sp_common_name_v2", "START_v1", "START_v2", "END_v1", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2", "key", "S_ID_v1", "S_ID_v2")]

DAT$SHIFT_v2 <- as.numeric(DAT$SHIFT_v2)

# Flag with C if shift in v2 is qualitative
## qualitative ids from v2
idbiov2cat <- paste(biov2cat$ID_v2, biov2cat$Type_v2, biov2cat$Param_v2, biov2cat$sp_name_v2, sep = "__")
idbiov2cat <- idbiov2cat[which(is.na(biov2cat$SHIFT_v2))]
idbiov2cat <- unique(na.omit(idbiov2cat))
idbiov2cat <- which((paste(DAT$ID_v2, DAT$Type, DAT$Param, DAT$sp_name, sep = "__") %in% idbiov2cat) & is.na(DAT$Trend_v2))

DAT$SHIFT_v2[idbiov2cat] <- DAT$Trend_v2[idbiov2cat] <- "C"

# test
# All obs should be included in both DAT and biov1
idDATv1 <- paste(DAT$ID_v1, DAT$Type, DAT$Param, DAT$sp_name, sep = "__")
idDATv1 <- idDATv1[-grep("NA__",idDATv1)]
idDATv1 <- unique(na.omit(idDATv1))

idbiov1 <- paste(biov1$ID_v1, biov1$Type_v1, biov1$Param_v1, biov1$sp_name_v1, sep = "__")
idbiov1 <- unique(na.omit(idbiov1))

length(idDATv1) == length(idbiov1)
## [1] TRUE
idDATv2 <- paste(DAT$ID_v2, DAT$Type, DAT$Param, DAT$sp_name, sep = "__")
idDATv2 <- idDATv2[-grep("NA__",idDATv2)]
idDATv2 <- unique(na.omit(idDATv2))

idbiov2cat <- paste(biov2cat$ID_v2, biov2cat$Type_v2, biov2cat$Param_v2, biov2cat$sp_name_v2, sep = "__")
idbiov2cat <- unique(na.omit(idbiov2cat))

length(idDATv2) == length(idbiov2cat) 
## [1] FALSE

9.1 See the data

DT::datatable(DAT,
              rownames = FALSE,
              extensions = 'Buttons',
              options = list(pageLength = 100,
                             scrollY = "400px",
                             scrollX = T,
                             dom = 'Blfrtip',
                             buttons = c('csv', 'excel'),
                             lengthMenu = list(c(10,25,50,-1),
                                               c(10,25,50,"All")))) %>%
    DT::formatStyle(columns = colnames(DAT), `fontSize` = '80%')
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

9.2 Compare shifts

# Do comparison on papers that overlap in both datasets
DATover <- DAT[-grep("NA__",DAT$key),] 

plot(SHIFT_v2~SHIFT_v1,
     data = DATover,
     xlab = "Shift v1",
     ylab = "Shift v2") 
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

9.2.1 Split LAT and ELE shifts

# Only LAT
plot(SHIFT_v2~SHIFT_v1,
     data = DATover[which(DATover$Type == "LAT"),],
     main = "Latitude",
     xlab = "Shift v1",
     ylab = "Shift v2")
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

# Only ELE
plot(SHIFT_v2~SHIFT_v1,
     data = DATover[which(DATover$Type == "ELE"),],
     main = "Elevation",
     xlab = "Shift v1",
     ylab = "Shift v2")
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

9.3 Observations with qualitative shifts in v2 but quantitative shifts in v1

# qualitative in v2
mis <- DATover[which(DATover$SHIFT_v2=="C"),]
nrow(mis) # qualitative shifts in v2
## [1] 364
# but present in v1 with a quantitative shift
mis <- mis[which(!is.na(mis$SHIFT_v1)),]

# N studies with qualitative shifts in v2 but quantitative shift in v1
nrow(mis)
## [1] 108
# N studies from which qualitative shift in v2 was interpreted due to non-signification shift in v1
nrow(mis[which(round(mis$SHIFT_v1,2)==0),])
## [1] 54
DT::datatable(mis,
              rownames = FALSE,
              extensions = 'Buttons',
              options = list(pageLength = 100,
                             scrollY = "400px",
                             scrollX = T,
                             dom = 'Blfrtip',
                             buttons = c('csv', 'excel'),
                             lengthMenu = list(c(10,25,50,-1),
                                               c(10,25,50,"All")))) %>%
    DT::formatStyle(columns = colnames(mis), `fontSize` = '80%')

9.4 N obs

The databases can differ in the number of shifts reported for the same species (N of observations per shift). This can be due to different interpretation about the number of areas/populations or criteria for inclusion used in each database.
Check studies mismatches in N observations.

key1 <- paste(biov1$ID_v1, biov1$Type_v1, biov1$Param_v1, biov1$sp_name_v1, sep = "__")
key2 <- paste(biov2cat$ID_v2, biov2cat$Type_v2, biov2cat$Param_v2, biov2cat$sp_name_v2, sep = "__")
key12 <- data.frame(key1=key1,key2=biov1$key)

Nobsv1 <- data.frame(table(key1))
names(Nobsv1)[2] <- c("obs_v1")
Nobsv2 <- data.frame(table(key2))
names(Nobsv2)[2] <- c("obs_v2")

Nobs <- merge(Nobsv1, key12, all.x = T) 
Nobs <- merge(Nobs, Nobsv2, all.x = T) 

tmp <- lapply(1:nrow(Nobs), function(x) {
    y = strsplit(as.character(Nobs$key1[x]), "__")[[1]]
    y2 = strsplit(as.character(Nobs$key2[x]), "__")[[1]]
    
    data.frame(ID_v1 = y[1], ID_v2 = y2[1], Type = y[2], Param = y[3], sp_name = y[4])
})

tmp <- do.call(rbind,tmp)

Nobs <- cbind(Nobs,tmp)
Nobs <- Nobs[,-1:-2]

# compare N obs in each database
plot(obs_v2~obs_v1,
     data = Nobs,
     xlab = "N obs v1",
     ylab = "N obs v2")

Nobs.diff <- Nobs[which(!Nobs$obs_v1 == Nobs$obs_v2),]

# number of identical observations per shift (%)
nrow(Nobs[which(Nobs$obs_v1 == Nobs$obs_v2),]) ; nrow(Nobs[which(Nobs$obs_v1 == Nobs$obs_v2),])/nrow(na.omit(Nobs))
## [1] 15490
## [1] 0.8340064
# number of different observations per shift (%)
nrow(Nobs.diff) ; nrow(Nobs.diff)/nrow(na.omit(Nobs))
## [1] 3083
## [1] 0.1659936

Most of shifts measures are identical when N observation is the same, but still some mismatches.

9.5 Shift duration

The databases can differ in the duration of shifts reported for the same species. This can be due to different interpretation about duration in each database.

# Comparing average shift duration between v1 and v2
plot(DUR_v2~DUR_v1,
     data = na.omit(DATover),
     xlab = "Duration v1",
     ylab = "Duration v2")

DATover.diff <- DATover[which(!DATover$DUR_v1 == DATover$DUR_v2),]

# number of identical shift duration 
nrow(DATover[which(DATover$DUR_v1 == DATover$DUR_v2),])
## [1] 14240
# number of different shift duration
nrow(DATover.diff)
## [1] 3393
# plot relationships for shifts with the same duration
toplot <- DATover[which(DATover$DUR_v1 == DATover$DUR_v2),]

plot(SHIFT_v2 ~ SHIFT_v1,
     data = na.omit(toplot),
     xlab = "Shift v1",
     ylab = "Shift v2")

# plot relationships for studies with the same duration and N observations
toplot <- DATover[which(DATover$ID_v1 %in% Nobs$ID_v1[which(Nobs$obs_v1 == Nobs$obs_v2)]),]
toplot <- toplot[which(toplot$DUR_v1 == toplot$DUR_v2),]

plot(SHIFT_v2 ~ SHIFT_v1,
     data = na.omit(toplot),
     xlab = "Shift v1",
     ylab = "Shift v2")

9.6 Flag mismatches

We looked for mismatches in range shift, duration of shift. For mismatch in shift, we considered there is a mismatch when difference in LAT was >0.1km/year, or a ELE difference was >0.001km/year.

DATover$sdiff <- abs(DATover$SHIFT_v1 - as.numeric(DATover$SHIFT_v2))
## Warning: NAs introduced by coercion
summary(DATover$sdiff[which(DATover$Type=="LAT")])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   0.000   0.477   0.001  30.571    3595
summary(DATover$sdiff[which(DATover$Type=="ELE")])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.0004  0.0000  0.0512    2984
DATover$misS <- ifelse(DATover$Type=="LAT" & DATover$sdiff > 0.1 | # mismatch in LAT
                           DATover$Type=="ELE" & DATover$sdiff > 0.001 & # mismatch in ELE
                           !is.na(DATover$sdiff), # not NA in shift <- mismatch in N observations
                       1, 0)

# number of total matches across all observations
length(which(DATover$misS==0)) ; length(which(DATover$misS==0))/ length(DATover$misS)
## [1] 18147
## [1] 0.7606572
# number of total mismatches across all observations
length(which(DATover$misS==1)) ; length(which(DATover$misS==1))/ length(DATover$misS)
## [1] 2115
## [1] 0.08865323
# mismatches just because there are diff N obs
DATover$misN <- 0
DATover$misN[which(is.na(DATover$sdiff))] <- 1
# number of mismatches just because there are diff N obs
length(which(DATover$misN==1)) ; length(which(DATover$misN==1))/ length(which(DATover$misS==1))
## [1] 6579
## [1] 3.110638
# number of shifts (sps / type) that match at least one shift observation
iden <- which(DATover$misS==0)
length(unique(DATover$key[iden])) ; length(unique(DATover$key[iden])) / length(unique(DATover$key))
## [1] 14512
## [1] 0.8576832
# number of shifts (sps / type) that don't match at least one shift observation
iden <- which(DATover$misS==1)
length(unique(DATover$key[iden])) ; length(unique(DATover$key[iden]))/ length(unique(DATover$key))
## [1] 1905
## [1] 0.1125887
# number of papers that match at least one shift observation
iden <- which(DATover$misS==0)
length(unique(DATover$ID_v1[iden])) ; length(unique(DATover$ID_v1[iden])) / length(unique(DATover$ID_v1))
## [1] 138
## [1] 0.9019608
# number of papers that don't match at least one shift observation
iden <- which(DATover$misS==1)
length(unique(DATover$ID_v1[iden])) ; length(unique(DATover$ID_v1[iden])) / length(unique(DATover$ID_v1))
## [1] 60
## [1] 0.3921569
# number of species that match at least one shift observation
iden <- which(DATover$misS==0)
length(unique(DATover$sp_name[iden])) ; length(unique(DATover$sp_name[iden])) / length(unique(DATover$sp_name))
## [1] 9069
## [1] 0.9091729
# number of species that don't match at least one shift observation
iden <- which(DATover$misS==1)
length(unique(DATover$sp_name[iden])) ; length(unique(DATover$sp_name[iden])) / length(unique(DATover$sp_name))
## [1] 1486
## [1] 0.1489724
#########
# opposite sign

shfroundv1 <- DATover$SHIFT_v1
shfroundv1[which(DATover$Type=="LAT")] <- round(as.numeric(DATover$SHIFT_v1[which(DATover$Type=="LAT")]),1)
shfroundv1[which(DATover$Type=="ELE")] <- round(as.numeric(DATover$SHIFT_v1[which(DATover$Type=="ELE")]),2)

shfroundv2 <- DATover$SHIFT_v2
shfroundv2[which(DATover$Type=="LAT")] <- round(as.numeric(DATover$SHIFT_v2[which(DATover$Type=="LAT")]),1)
## Warning: NAs introduced by coercion
shfroundv2[which(DATover$Type=="ELE")] <- round(as.numeric(DATover$SHIFT_v2[which(DATover$Type=="ELE")]),2)
## Warning: NAs introduced by coercion
DATover$misOpos <- 0
DATover$misOpos[which(as.numeric(shfroundv1) == -as.numeric(shfroundv2))] <- 1
# remove zeros
DATover$misOpos[which(shfroundv1==0 | shfroundv2==0)] <- 0

# number of mismatches in sign
length(which(DATover$misOpos==1)) ; length(which(DATover$misOpos==1))/length(DATover$misS)
## [1] 30
## [1] 0.001257493
#########
# mismatch in duration
DATover$misD <- ifelse(DATover$DUR_v1 == DATover$DUR_v2, 0, 1) 

# number of matches in duration
length(which(DATover$misD==0))  ; length(which(DATover$misD==0))/length(DATover$misS)
## [1] 14240
## [1] 0.5968898
# number of mismatches in duration
length(which(DATover$misD==1))  ; length(which(DATover$misD==1))/length(DATover$misS)
## [1] 3393
## [1] 0.1422224
# number of papers with mismatches in duration
iden <- which(DATover$misD==1)
length(unique(DATover$ID_v1[iden])) ; length(unique(DATover$ID_v1[iden])) / length(unique(DATover$ID_v1))
## [1] 56
## [1] 0.3660131
# mismatch in shift despite same duration
length(which(DATover$misD==0 & DATover$misS==1)) ; length(which(DATover$misD==0 & DATover$misS==1))/ length(which(DATover$misS==1)) 
## [1] 892
## [1] 0.4217494
# number of mismatches in shift and and duration (exclude false mismatch due to N obs)
length(which(DATover$misS==1 & !is.na(DATover$sdiff) & DATover$misD==1)) ; length(which(DATover$misS==1 & !is.na(DATover$sdiff) & DATover$misD==1))/ length(which(DATover$misS==1 & !is.na(DATover$sdiff)))
## [1] 1223
## [1] 0.5782506
# Difference of shift rate in function of difference in duration
# Absolute values
plot(abs(DATover$DUR_v1-DATover$DUR_v2),
     DATover$sdiff,
     ylab = "Absolute diff. in shift",
     xlab = "Absolute diff. in duration")

# Raw values
plot(DATover$DUR_v1-DATover$DUR_v2,
     DATover$SHIFT_v1-as.numeric(DATover$SHIFT_v2),
     ylab = "Diff. in shift",
     xlab = "Diff. in duration")
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

#########
# Flag when LAT was calculated from LON using Azimuth transformation
idsAzi = unique(biov1$ID_v1[which(biov1$Azi==1)])
DATover$Azi = 0
DATover$Azi[which(DATover$ID_v1 %in% idsAzi)] = 1

# Number of LAT was calculated from LON using Azimuth with a mismatch in shift
iden <- which(DATover$Type=="LAT" & DATover$Azi==1)
length(which(DATover$misS[iden]==1)) ; length(which(DATover$misS[iden]==1)) / length(DATover$misS[iden])
## [1] 190
## [1] 0.0652921
#########
# qualitative shifts in v2 but quantitative shifts in v1
DATover$misQ = 0
DATover$misQ[which(DATover$SHIFT_v2=="C")] <- 1
length(which(DATover$misQ==1 & !is.na(DATover$SHIFT_v1))) # qualitative shifts in v2 but present in v1 with a quantitative shift
## [1] 108
#########
# Create mismatch table
misma <- apply(DATover[,c("misS", "misOpos", "misD", "misQ", "Azi")], 1, function(x){
    any(na.omit(as.numeric(x))==1)
})

misma <- DATover[misma,]

any(na.omit(as.numeric(DATover[which(rownames(DATover)==37745),c("sp_name","misS", "misOpos", "misD", "misQ", "Azi")]))==1)
## Warning in na.omit(as.numeric(DATover[which(rownames(DATover) == 37745), : NAs
## introduced by coercion
## [1] TRUE
# misS = missmatch in shift
# misD = missmatch in distance
# misN = missmatch in N obs

DT::datatable(misma,
              rownames = FALSE,
              extensions = 'Buttons',
              options = list(pageLength = 100,
                             scrollY = "400px",
                             scrollX = T,
                             dom = 'Blfrtip',
                             buttons = c('csv', 'excel'),
                             lengthMenu = list(c(10,25,50,-1),
                                               c(10,25,50,"All")))) %>%
    DT::formatStyle(columns = colnames(misma), `fontSize` = '80%')
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

10 Mismatch type

misma$Miss <- "Unknow"
misma$Miss[which(misma$misS == 1)] <- "Value"
misma$Miss[which(misma$misOpos == 1)] <- paste(misma$Miss[which(misma$misOpos == 1)],"Opo_Sign", sep = "+")
misma$Miss[which(misma$misD == 1)] <- paste(misma$Miss[which(misma$misD == 1)],"Duration", sep = "+")
misma$Miss[which(misma$misQ == 1)] <- paste(misma$Miss[which(misma$misQ == 1)], "Qualitative", sep = "+")

misma$Miss <- gsub("Unknow\\+","",misma$Miss,ignore.case = T)
unique(misma$Miss)
## [1] "Duration"                "Value+Duration"         
## [3] "Qualitative"             "Value"                  
## [5] "Unknow"                  "Value+Opo_Sign"         
## [7] "Value+Opo_Sign+Duration" "Duration+Qualitative"
toplot = data.frame(table(misma$Miss))

ggplot(toplot, aes(x = Var1, y = Freq))+
    geom_bar(stat="identity")+
    geom_text(aes(y=Freq+100, label=Freq), hjust = 1, vjust=0.1, size=3.5)+
    theme_minimal()+
    coord_flip()

11 Create table for checking

# v1links <- drive_ls("BIOSHIFTS Working Group/DATA/Bioshifts_v1/PDF_v1")
# v1links$Link_V1 <- drive_link(v1links)
# v1links <- data.frame(v1links)
# v1links <- v1links[,c(1,4)]
# v1links$ID_v1 <- sapply(v1links$name, function(x){
#     tmp = strsplit(x,split = "_")[[1]]
#     tmp = tmp[1]
#     tmp = gsub("[^0-9.-]", "", tmp)
#     tmp
# })
# v1links = v1links[,2:3]
# v1links = aggregate(Link_V1 ~ ID_v1, v1links, paste, collapse = ", ")
# 
# v2links <- drive_ls("Secondary Review")
# v2links$Link_V2 <- drive_link(v2links)
# v2links <- data.frame(v2links)
# v2links <- v2links[,c(1,4)]
# v2links$ID_v2 <- sapply(v2links$name, function(x){
#     tmp = strsplit(x,split = "[[:punct:]]")[[1]]
#     tmp = tmp[1]
#     tmp
# })
# v2links$ID_v2 <- sapply(v2links$ID_v2, function(x){
#     tmp = strsplit(x,split = " ")[[1]]
#     tmp = tmp[1]
#     tmp
# })
# 
# v2links$ID_v2 <- gsub("[^0-9.-]+","",v2links$ID_v2)
# v2links = v2links[,2:3]
# v2links = aggregate(Link_V2 ~ ID_v2, v2links, paste, collapse = ", ")
# 
# misma_ <- merge(misma,
#                 v1links,
#                 all.x = T)
# 
# misma_ <- merge(misma_,
#                 v2links,
#                 all.x = T)

# if you dont run the get links option you run this
misma_ <- misma
############

# add original ID in v1
IDv1 <- sapply(1:nrow(misma_), function(x) paste0("A", misma_$ID_v1[x], "_", misma_$Study_ID_v1[x]))
IDv1[grep("NA",IDv1)] <- NA
misma_$ID_v1 <- IDv1


misma_ <- misma_[order(misma_$ID_v2,misma_$sp_name,misma_$SHIFT_v1),]
misma_$checked_by <- NA


misma_meta <- data.frame(Variable = names(misma_), Description = NA)
misma_meta$Description = 
    c("v2 ID", "v1 ID", "Shift type", "Shift parameter", "Accepted species name",
      "v1 Study ID", "v2 study ID", "v1 trend unit", "v2 trend unit",
      "v1 trend", "v2 trend", "v1 shift duration", "v2 shift duration",
      "v1 shift (km/yr)", "v2 shift (km/yr)", "v1 species name as reported in original publication", "v2 species name as reported in original publication",
      "v2 species common name - this information exists in v2 only",
      "v1 shift start year", "v2 shift start year", 
      "v1 shift end year", "v2 shift end year", 
      "v2 description of study area - this information exists in v2 only",
      "v2 Fig, table, page where shift data where extract for the paper - this information exists in v2 only",
      "v2 general notes - this information exists in v2 only",
      "Identifier for Brunno",
      "Shift ID v1. Localizer for each shift reported in v1. See Sheet 'Raw_v1'",
      "Shift ID v2. Localizer for each shift reported in v2. See Sheet 'Raw_v2'",
      "Difference in shift value between v1 and v2 = abs(SHIFT_v1 - SHIFT_v2)",
      "Whether there is a mismatch in shift between v1 and v2: 1 = mismatch, 0 = match. A mismatch was considered when the difference is > 0.1 km/yr for latitudinal shift, or > 0.001 km/yr for elevational shift",
      "Whether there is a mismatch in N observations between v1 and v2",
      "Whether there is a mismatch in shift sign v1 and v2 (E.g., negative value in v1 but positive value in v2)",
      "Whether there is a mismatch in shift duration between v1 and v2",
      "Whether range shift in v1 was calculated from Azimuth transformation",
      "Whether the shift is recorded as qualitative in v2 but quantivative in v1",
      "Verbose for mismatch type = 'Duration', 'Value+Duration', 'Value', 'Value+Opo_Sign',
                    'Qualitative', 'Duration+Qualitative', 'Value+Opo_Sign+Duration'",
      'Person responsible for checking values')

#Fixtable

# gtable<-read_xlsx(here::here("output/gtable.xlsx"))
# #getchecked
# gtable<-gtable[-which(is.na(gtable$checked_by)),]
# 
# colsg<-c("S_ID_v1", "checked_by", "sp_reported_name_v3", "...39", "degree/decade", "Direction_v3", "Significance_v3", "START_firstperiod_v3", "END_firstperiod_v3", "MIDPOINT_firstperiod_v3", "START_secondperiod_v3", "END_secondperiod_v3", "MIDPOINT_secondperiod_v3", "Comments_v3", "Reason")
# colsmisma_<-c("ID_v1", "ID_v2", "Type", "Param", "sp_name", "Study_ID_v1", "Study_ID_v2", "Unit.trend_v1", "Unit.trend_v2", "Trend_v1", "Trend_v2", "DUR_v1", "DUR_v2", "SHIFT_v1", "SHIFT_v2", "sp_reported_name_v1", "sp_reported_name_v2", "sp_common_name_v2", "START_v1", "START_v2", "END_v1", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2", "key", "S_ID_v1", "S_ID_v2", "sdiff", "misS", "misN", "misOpos", "misD", "Azi", "misQ", "Miss")
# 
# misma_<-merge(misma_[,colsmisma_], gtable[,colsg], by="S_ID_v1", all.x = T)
# 
# misma_ <- misma_[,names(gtable)]

12 Save table results

#Fixtable

gtable<-read_xlsx(here::here("output/gtable.xlsx"))
#getchecked
gtable<-gtable[-which(is.na(gtable$checked_by)),]

colsg<-c("S_ID_v1", "checked_by", "sp_reported_name_v3", "...39", "degree/decade", "Direction_v3", "Significance_v3", "START_firstperiod_v3", "END_firstperiod_v3", "MIDPOINT_firstperiod_v3", "START_secondperiod_v3", "END_secondperiod_v3", "MIDPOINT_secondperiod_v3", "Comments_v3", "Reason")
colsmisma_<-c("ID_v1", "ID_v2", "Type", "Param", "sp_name", "Study_ID_v1", "Study_ID_v2", "Unit.trend_v1", "Unit.trend_v2", "Trend_v1", "Trend_v2", "DUR_v1", "DUR_v2", "SHIFT_v1", "SHIFT_v2", "sp_reported_name_v1", "sp_reported_name_v2", "sp_common_name_v2", "START_v1", "START_v2", "END_v1", "END_v2", "Study_area_v2", "FigTab_v2", "Notes_v2", "key", "S_ID_v1", "S_ID_v2", "sdiff", "misS", "misN", "misOpos", "misD", "Azi", "misQ", "Miss")

misma_<-merge(misma_[,colsmisma_], gtable[,colsg], by="S_ID_v1", all.x = T)

misma_ <- misma_[,names(gtable)]