Library

pacman::p_load(rio,
               here,
               dplyr,
               magrittr,
               tidyr,
               metagear,
               bibliometrix,
               rscopus, 
               synthesisr, 
               revtools,
               data.table)

Web of Science

All fields

# Step 1: read all the files downloaded from database 
bibfiles_allfields <- list.files(
  "./search_results_1/",
  full.names = TRUE
)
print(bibfiles_allfields)
# Step 2: we save them to a data.frame object (because return_df=TRUE) called imported_files
library(synthesisr)

imported_files_allfields <- read_refs(
  filename = bibfiles_allfields,
  return_df = TRUE)
# Step 3: Remove duplication 

df_doi_allfields <- deduplicate(
  imported_files_allfields,
  match_by = "doi",
  method = "exact"
)

df_title_allfields <- deduplicate(
  df_doi_allfields,
  match_by = "title",
  method = "exact"
)
write.csv(df_title_allfields,'./output/df_deduplicate_allfield.csv')

Topic only

# Step 1: read all the files downloaded from database 
bibfiles_topic <- list.files(
  "./search_results_2/",
  full.names = TRUE
)
print(bibfiles_topic)
# Step 2: 
imported_files_topic <- read_refs(
  filename = bibfiles_topic,
  return_df = TRUE)
# Step 3: Remove duplication 

df_doi_topic <- deduplicate(
  imported_files_topic,
  match_by = "doi",
  method = "exact"
)

df_title_topic <- deduplicate(
  df_doi_topic,
  match_by = "title",
  method = "exact"
)
write.csv(df_title_topic,'./output/df_deduplicate_topic.csv')

Combine all files

# # put all data frames into list
# df_list <- list(df1, df2)
# df_m <- Reduce(function(x, y) merge(x, y, all=TRUE), df_list) # without "doi", merge is intersec(). It is not what i want for this task 
# df_m.2 <- merge(df1, df2, by = "doi", all = TRUE) 

df_m <- rbind(df_title_allfields, df_title_topic) # 2150 
df_m1 <- data.table::rbindlist(list(df_title_allfields, df_title_topic), fill = TRUE) # the best code  

d1 <- df_m[!duplicated(df_m$doi), ] # 936
d2 <- unique(df_m$doi) # 936
d3 <- df_m %>% distinct(doi) # 936 
# deduplicate 
df_doi_m <- deduplicate(
  df_m,
  match_by = "doi",
  method = "exact"
)

df_title_m <- deduplicate(
  df_doi_m,
  match_by = "title",
  method = "exact"     # 1198
)

# Find duplicate values in R [duplicate]
dups <- df_title_m[duplicated(df_title_m$doi)|duplicated(df_title_m$doi, fromLast=TRUE),] # 263 is paper with doi NA. 1198 - 263 = 936
a_wos <- df_title_m

Data Check

# SAME ROWS among the data frames
## Option 1: ‘intersect’
df_title_topic <- subset(df_title_topic, select = c('doi'))  # %>% filter(!is.na(doi))
df_title_allfields <- subset(df_title_allfields, select = c('doi')) 
a1 <- intersect(df_title_topic, df_title_allfields) # 749 
a2 <- intersect(df_title_allfields, df_title_topic) # 749  

## Option 2: 'inner_join()'
a3 <- inner_join(df_title_topic, df_title_allfields) # 749 
## Joining with `by = join_by(doi)`
a4 <- inner_join(df_title_allfields, df_title_topic) # 749
## Joining with `by = join_by(doi)`
# NOT SAME ROW among the data frames
## Option 1: ‘setdiff’, 'anti_join' which would return only the left dataset's rows that are not match right dataset
t1 <- setdiff(df_title_topic, df_title_allfields) # 46
t2 <- setdiff(df_title_allfields, df_title_topic) # 140
t3 <- anti_join(df_title_topic, df_title_allfields) # 46
## Joining with `by = join_by(doi)`
t4 <- anti_join(df_title_allfields, df_title_topic) # 140
## Joining with `by = join_by(doi)`
## Option 2: unique()
df <- bind_rows(df_title_allfields, df_title_topic) # 1685
df <- df %>% filter(!is.na(doi))
u <- unique(df) # 935 unique row  |  # 935+750 = 1685 


## Check 
allfields <- df_title_allfields %>% mutate(check = ifelse(df_title_allfields$doi %in% df_title_topic$doi, TRUE, FALSE))
topic <- df_title_topic %>% mutate(check = ifelse(df_title_topic$doi %in% df_title_allfields$doi, TRUE, FALSE))

table(allfields['check'])
table(topic['check'])
# Find uncommon rows (observations) between 2 datasets 
data.1 <- subset(df_title_topic,!(doi %in% df_title_allfields$doi))
data.2 <- subset(df_title_allfields, !(doi %in% df_title_topic$doi))

Scopus

All fields

# Step 1: read all the files downloaded from database 
bibfiles_allfields <- list.files(
  "./search_results_5/",
  full.names = TRUE
)
print(bibfiles_allfields)
# Step 2: we save them to a data.frame object (because return_df=TRUE) called imported_files
imported_files_allfields <- read_refs(
  filename = bibfiles_allfields,
  return_df = TRUE)
# Step 3: Remove duplication 

df_doi_allfields <- deduplicate(
  imported_files_allfields,
  match_by = "doi",
  method = "exact"
)

df_title_allfields <- deduplicate(
  df_doi_allfields,
  match_by = "title",
  method = "exact"
)
write.csv(df_title_allfields,'./output/scopus_df_deduplicate_allfield.csv')

Topic only

# Step 1: read all the files downloaded from database 
bibfiles_topic <- list.files(
  "./search_results_4/",
  full.names = TRUE
)
print(bibfiles_topic)
# Step 2: 
imported_files_topic <- read_refs(
  filename = bibfiles_topic,
  return_df = TRUE)
# Step 3: Remove duplication 

df_doi_topic <- deduplicate(
  imported_files_topic,
  match_by = "doi",
  method = "exact"
)

df_title_topic <- deduplicate(
  df_doi_topic,
  match_by = "title",
  method = "exact"
)
write.csv(df_title_topic,'./output/scopus_df_deduplicate_topic.csv')

Combine all files

df_m <- rbind(df_title_allfields, df_title_topic) # 930
df_m1 <- rbindlist(list(df_title_allfields, df_title_topic), fill = TRUE) # the best code  

d1 <- df_m[!duplicated(df_m$doi), ] # 867
# depublication 
df_doi_m <- deduplicate(
  df_m,
  match_by = "doi",
  method = "exact"
)

df_title_m <- deduplicate(
  df_doi_m,
  match_by = "title",
  method = "exact"       # 903
)

# Find duplicate values in a dataframe 
dups <- df_title_m[duplicated(df_title_m$doi)|duplicated(df_title_m$doi, fromLast=TRUE),] # 37 is paper with doi NA. 903 - 37 = 866
a_scopus <- df_title_m

Data Check

# SAME ROWS among the data frames
## Option 1: ‘intersect’
df_title_topic <- subset(df_title_topic, select = c('doi'))  # %>% filter(!is.na(doi))
df_title_allfields <- subset(df_title_allfields, select = c('doi')) 
a1 <- intersect(df_title_topic, df_title_allfields) # 25 
a2 <- intersect(df_title_allfields, df_title_topic) # 25  

## Option 2: 'inner_join()'
a3 <- inner_join(df_title_topic, df_title_allfields) # 168 
## Joining with `by = join_by(doi)`
a4 <- inner_join(df_title_allfields, df_title_topic) # 168
## Joining with `by = join_by(doi)`
# NOT SAME ROW among the data frames
## Option 1: ‘setdiff’, 'anti_join' which would return only the left dataset's rows that are not match right dataset
t1 <- setdiff(df_title_topic, df_title_allfields) # 29
t2 <- setdiff(df_title_allfields, df_title_topic) # 813
t3 <- anti_join(df_title_topic, df_title_allfields) # 29
## Joining with `by = join_by(doi)`
t4 <- anti_join(df_title_allfields, df_title_topic) # 813
## Joining with `by = join_by(doi)`
## Option 2: unique()
df <- bind_rows(df_title_allfields, df_title_topic) # 890
df <- df %>% filter(!is.na(doi))
u <- unique(df) # 866 unique row  |  


## Check 
allfields <- df_title_allfields %>% mutate(check = ifelse(df_title_allfields$doi %in% df_title_topic$doi, TRUE, FALSE))
topic <- df_title_topic %>% mutate(check = ifelse(df_title_topic$doi %in% df_title_allfields$doi, TRUE, FALSE))

table(allfields['check'])
table(topic['check'])
# Find uncommon rows (observations) between 2 datasets 
data.1 <- subset(df_title_topic,!(doi %in% df_title_allfields$doi))
data.2 <- subset(df_title_allfields, !(doi %in% df_title_topic$doi))

TRID

# Step 1: read all the files downloaded from database 
bibfiles_allfields <- list.files(
  "./search_results_6/",
  full.names = TRUE
)
print(bibfiles_allfields)
# Step 2: we save them to a data.frame object (because return_df=TRUE) called imported_files
imported_files_allfields <- read_refs(
  filename = bibfiles_allfields,
  return_df = TRUE)
# Step 3: Remove duplication 

df_doi_allfields <- deduplicate(
  imported_files_allfields,
  match_by = "doi",
  method = "exact"
)

df_title_allfields <- deduplicate(
  df_doi_allfields,
  match_by = "title",
  method = "exact"     # 1547
)
df <- df_title_allfields[!duplicated(df_title_allfields$doi), ] # 1033  (1547 - 1033 = 514)
dups <- df_title_allfields[duplicated(df_title_allfields$doi)|duplicated(df_title_allfields$doi, fromLast=TRUE),] # extract 514 row duplication 
write.csv(df_title_allfields,'./output/TRID_df_deduplicate_allfield.csv')
a_trid <- df_title_allfields

All database (WoS - Scopus - TRID)

a_df <- rbindlist(list(a_wos, a_scopus, a_trid), fill = TRUE) # 3648 
a_df.1 <- a_df[!duplicated(a_df$doi), ] # 2757
# depublication 
df_doi_m <- deduplicate(
  a_df,
  match_by = "doi",
  method = "exact"       # 3571
)

df_title_m <- deduplicate(
  df_doi_m,
  match_by = "title",
  method = "exact"       # 3457
)

df_title_m.1 <- deduplicate(
  df_title_m, 
  "title",
  method = "string_osa",
  rm_punctuation = TRUE,
  to_lower = TRUE)       # 3386 

# Find duplicate values in a dataframe 
dups.2 <- df_doi_m[duplicated(df_doi_m$doi)|duplicated(df_doi_m$doi, fromLast=TRUE),] # 815 is paper with doi NA. 3571 - 815 = 2756
mydata <- df_title_m 

Reference

  1. Package ‘synthesisr’