Library
pacman::p_load(rio,
here,
dplyr,
magrittr,
tidyr,
metagear,
bibliometrix,
rscopus,
synthesisr,
revtools,
data.table)
Web of Science
All fields
# Step 1: read all the files downloaded from database
bibfiles_allfields <- list.files(
"./search_results_1/",
full.names = TRUE
)
print(bibfiles_allfields)
# Step 2: we save them to a data.frame object (because return_df=TRUE) called imported_files
library(synthesisr)
imported_files_allfields <- read_refs(
filename = bibfiles_allfields,
return_df = TRUE)
# Step 3: Remove duplication
df_doi_allfields <- deduplicate(
imported_files_allfields,
match_by = "doi",
method = "exact"
)
df_title_allfields <- deduplicate(
df_doi_allfields,
match_by = "title",
method = "exact"
)
write.csv(df_title_allfields,'./output/df_deduplicate_allfield.csv')
Topic only
# Step 1: read all the files downloaded from database
bibfiles_topic <- list.files(
"./search_results_2/",
full.names = TRUE
)
print(bibfiles_topic)
# Step 2:
imported_files_topic <- read_refs(
filename = bibfiles_topic,
return_df = TRUE)
# Step 3: Remove duplication
df_doi_topic <- deduplicate(
imported_files_topic,
match_by = "doi",
method = "exact"
)
df_title_topic <- deduplicate(
df_doi_topic,
match_by = "title",
method = "exact"
)
write.csv(df_title_topic,'./output/df_deduplicate_topic.csv')
Combine all files
# # put all data frames into list
# df_list <- list(df1, df2)
# df_m <- Reduce(function(x, y) merge(x, y, all=TRUE), df_list) # without "doi", merge is intersec(). It is not what i want for this task
# df_m.2 <- merge(df1, df2, by = "doi", all = TRUE)
df_m <- rbind(df_title_allfields, df_title_topic) # 2150
df_m1 <- data.table::rbindlist(list(df_title_allfields, df_title_topic), fill = TRUE) # the best code
d1 <- df_m[!duplicated(df_m$doi), ] # 936
d2 <- unique(df_m$doi) # 936
d3 <- df_m %>% distinct(doi) # 936
# deduplicate
df_doi_m <- deduplicate(
df_m,
match_by = "doi",
method = "exact"
)
df_title_m <- deduplicate(
df_doi_m,
match_by = "title",
method = "exact" # 1198
)
# Find duplicate values in R [duplicate]
dups <- df_title_m[duplicated(df_title_m$doi)|duplicated(df_title_m$doi, fromLast=TRUE),] # 263 is paper with doi NA. 1198 - 263 = 936
a_wos <- df_title_m
Data Check
# SAME ROWS among the data frames
## Option 1: ‘intersect’
df_title_topic <- subset(df_title_topic, select = c('doi')) # %>% filter(!is.na(doi))
df_title_allfields <- subset(df_title_allfields, select = c('doi'))
a1 <- intersect(df_title_topic, df_title_allfields) # 749
a2 <- intersect(df_title_allfields, df_title_topic) # 749
## Option 2: 'inner_join()'
a3 <- inner_join(df_title_topic, df_title_allfields) # 749
## Joining with `by = join_by(doi)`
a4 <- inner_join(df_title_allfields, df_title_topic) # 749
## Joining with `by = join_by(doi)`
# NOT SAME ROW among the data frames
## Option 1: ‘setdiff’, 'anti_join' which would return only the left dataset's rows that are not match right dataset
t1 <- setdiff(df_title_topic, df_title_allfields) # 46
t2 <- setdiff(df_title_allfields, df_title_topic) # 140
t3 <- anti_join(df_title_topic, df_title_allfields) # 46
## Joining with `by = join_by(doi)`
t4 <- anti_join(df_title_allfields, df_title_topic) # 140
## Joining with `by = join_by(doi)`
## Option 2: unique()
df <- bind_rows(df_title_allfields, df_title_topic) # 1685
df <- df %>% filter(!is.na(doi))
u <- unique(df) # 935 unique row | # 935+750 = 1685
## Check
allfields <- df_title_allfields %>% mutate(check = ifelse(df_title_allfields$doi %in% df_title_topic$doi, TRUE, FALSE))
topic <- df_title_topic %>% mutate(check = ifelse(df_title_topic$doi %in% df_title_allfields$doi, TRUE, FALSE))
table(allfields['check'])
table(topic['check'])
# Find uncommon rows (observations) between 2 datasets
data.1 <- subset(df_title_topic,!(doi %in% df_title_allfields$doi))
data.2 <- subset(df_title_allfields, !(doi %in% df_title_topic$doi))
Scopus
All fields
# Step 1: read all the files downloaded from database
bibfiles_allfields <- list.files(
"./search_results_5/",
full.names = TRUE
)
print(bibfiles_allfields)
# Step 2: we save them to a data.frame object (because return_df=TRUE) called imported_files
imported_files_allfields <- read_refs(
filename = bibfiles_allfields,
return_df = TRUE)
# Step 3: Remove duplication
df_doi_allfields <- deduplicate(
imported_files_allfields,
match_by = "doi",
method = "exact"
)
df_title_allfields <- deduplicate(
df_doi_allfields,
match_by = "title",
method = "exact"
)
write.csv(df_title_allfields,'./output/scopus_df_deduplicate_allfield.csv')
Topic only
# Step 1: read all the files downloaded from database
bibfiles_topic <- list.files(
"./search_results_4/",
full.names = TRUE
)
print(bibfiles_topic)
# Step 2:
imported_files_topic <- read_refs(
filename = bibfiles_topic,
return_df = TRUE)
# Step 3: Remove duplication
df_doi_topic <- deduplicate(
imported_files_topic,
match_by = "doi",
method = "exact"
)
df_title_topic <- deduplicate(
df_doi_topic,
match_by = "title",
method = "exact"
)
write.csv(df_title_topic,'./output/scopus_df_deduplicate_topic.csv')
Combine all files
df_m <- rbind(df_title_allfields, df_title_topic) # 930
df_m1 <- rbindlist(list(df_title_allfields, df_title_topic), fill = TRUE) # the best code
d1 <- df_m[!duplicated(df_m$doi), ] # 867
# depublication
df_doi_m <- deduplicate(
df_m,
match_by = "doi",
method = "exact"
)
df_title_m <- deduplicate(
df_doi_m,
match_by = "title",
method = "exact" # 903
)
# Find duplicate values in a dataframe
dups <- df_title_m[duplicated(df_title_m$doi)|duplicated(df_title_m$doi, fromLast=TRUE),] # 37 is paper with doi NA. 903 - 37 = 866
a_scopus <- df_title_m
Data Check
# SAME ROWS among the data frames
## Option 1: ‘intersect’
df_title_topic <- subset(df_title_topic, select = c('doi')) # %>% filter(!is.na(doi))
df_title_allfields <- subset(df_title_allfields, select = c('doi'))
a1 <- intersect(df_title_topic, df_title_allfields) # 25
a2 <- intersect(df_title_allfields, df_title_topic) # 25
## Option 2: 'inner_join()'
a3 <- inner_join(df_title_topic, df_title_allfields) # 168
## Joining with `by = join_by(doi)`
a4 <- inner_join(df_title_allfields, df_title_topic) # 168
## Joining with `by = join_by(doi)`
# NOT SAME ROW among the data frames
## Option 1: ‘setdiff’, 'anti_join' which would return only the left dataset's rows that are not match right dataset
t1 <- setdiff(df_title_topic, df_title_allfields) # 29
t2 <- setdiff(df_title_allfields, df_title_topic) # 813
t3 <- anti_join(df_title_topic, df_title_allfields) # 29
## Joining with `by = join_by(doi)`
t4 <- anti_join(df_title_allfields, df_title_topic) # 813
## Joining with `by = join_by(doi)`
## Option 2: unique()
df <- bind_rows(df_title_allfields, df_title_topic) # 890
df <- df %>% filter(!is.na(doi))
u <- unique(df) # 866 unique row |
## Check
allfields <- df_title_allfields %>% mutate(check = ifelse(df_title_allfields$doi %in% df_title_topic$doi, TRUE, FALSE))
topic <- df_title_topic %>% mutate(check = ifelse(df_title_topic$doi %in% df_title_allfields$doi, TRUE, FALSE))
table(allfields['check'])
table(topic['check'])
# Find uncommon rows (observations) between 2 datasets
data.1 <- subset(df_title_topic,!(doi %in% df_title_allfields$doi))
data.2 <- subset(df_title_allfields, !(doi %in% df_title_topic$doi))
TRID
# Step 1: read all the files downloaded from database
bibfiles_allfields <- list.files(
"./search_results_6/",
full.names = TRUE
)
print(bibfiles_allfields)
# Step 2: we save them to a data.frame object (because return_df=TRUE) called imported_files
imported_files_allfields <- read_refs(
filename = bibfiles_allfields,
return_df = TRUE)
# Step 3: Remove duplication
df_doi_allfields <- deduplicate(
imported_files_allfields,
match_by = "doi",
method = "exact"
)
df_title_allfields <- deduplicate(
df_doi_allfields,
match_by = "title",
method = "exact" # 1547
)
df <- df_title_allfields[!duplicated(df_title_allfields$doi), ] # 1033 (1547 - 1033 = 514)
dups <- df_title_allfields[duplicated(df_title_allfields$doi)|duplicated(df_title_allfields$doi, fromLast=TRUE),] # extract 514 row duplication
write.csv(df_title_allfields,'./output/TRID_df_deduplicate_allfield.csv')
a_trid <- df_title_allfields
All database (WoS - Scopus - TRID)
a_df <- rbindlist(list(a_wos, a_scopus, a_trid), fill = TRUE) # 3648
a_df.1 <- a_df[!duplicated(a_df$doi), ] # 2757
# depublication
df_doi_m <- deduplicate(
a_df,
match_by = "doi",
method = "exact" # 3571
)
df_title_m <- deduplicate(
df_doi_m,
match_by = "title",
method = "exact" # 3457
)
df_title_m.1 <- deduplicate(
df_title_m,
"title",
method = "string_osa",
rm_punctuation = TRUE,
to_lower = TRUE) # 3386
# Find duplicate values in a dataframe
dups.2 <- df_doi_m[duplicated(df_doi_m$doi)|duplicated(df_doi_m$doi, fromLast=TRUE),] # 815 is paper with doi NA. 3571 - 815 = 2756
mydata <- df_title_m