Clean RIS files

Author

Julius, Nastia

global variables

keywords

keywords for exclusion:

# Exclusion keywords: terms and phrases associated with records that should be
# screened out (clinical populations, non-cognitive interventions, reviews,
# protocols, secondary analyses, animal/in-vitro work, ...).
# NOTE(review): the entries mix lower-, title- and upper-case spellings and
# contain singular/plural pairs; how they are matched (case sensitivity,
# whole word vs. substring) is decided where the vector is used, which is not
# visible in this part of the file -- confirm before editing entries.
keywords_exclude <- c(
  "patients", "impairment", "cognitive impairment", "symptoms", "will be", "disease",
  "disorder", "deficits", "depression", "dementia", "protocol", "disorders",
  "cognitive decline", "mild cognitive impairment", "questionnaire", "anxiety",
  "cognitive rehabilitation", "schizophrenia", "MCI", "alzheimer", "alzheimer's",
  "difficulties", "physical activity", "depressive", "cognitive deficits", "chronic",
  "diagnosed", "stroke", "questionnaires", "ADHD", "disability", "hospital",
  "brain injury", "psychiatric", "at risk", "illness", "parkinson", "aerobic exercise",
  "parkinson's", "survivors", "cognitive dysfunction", "study protocol", "syndrome",
  "multiple sclerosis", "clinic", "healthy controls", "mindfulness", "cancer",
  "survey", "complaints", "alcohol", "TBI", "self-reported", "exercise training",
  "diet", "cognitive-behavioral", "high risk", "psychosis",
  "cognitive behavioral therapy", "physical training", "diabetes",
  "attention deficit hyperactivity disorder", "outpatient", "executive dysfunction",
  "surgery", "depressive disorder", "veterans", "CBT", "autism", "nutrition",
  "suffering from", "sports", "addiction", "motor training", "obesity", "Psychotic",
  "inpatient", "HIV", "PTSD", "regression analyses", "regression analysis", "trauma",
  "bipolar disorder", "meditation", "use disorder", "resistance training",
  "Psychotherapy", "chemotherapy", "counseling", "aerobic training", "secondary analysis",
  "attention deficit/hyperactivity disorder", "inpatients", "mental illness",
  "single session", "reviews", "epilepsy", "motor skills", "balance training",
  "mindfulness training", "tumor", "supplementation", "subjective cognitive decline",
  "first episode", "Theory of Mind", "use disorders", "practice effects",
  "abnormalities", "neurofeedback training", "acid", "motor task",
  "psychiatric disorders", "posttraumatic stress disorder", "healthy control", "obese",
  "retrospectively", "overview", "heart failure", "diabetes mellitus", "aphasia",
  "dataset", "dyslexia", "music training", "Learning Disabilities",
  "psychological intervention", "compensatory cognitive training",
  "cognitive-behavioural", "intellectual disability", "animals",
  "cognitive-behavioral therapy", "tumors", "severe mental illness", "tai chi",
  "vascular cognitive impairment", "concussion", "cerebral palsy", "substance abuse",
  "systematic review", "autistic", "case report", "fitness training", "functional ability",
  "obsessive compulsive disorder", "meta-analysis", "chronic pain", "poststroke",
  "single arm", "case control", "neurodevelopmental disorders", "ICU",
  "literature review", "Down syndrome", "sickle cell disease",
  "intellectual disabilities", "anorexia", "psychiatric disorder", "aerobic exercises",
  "intensive care unit", "Physical exercise training", "neurodevelopmental disorder",
  "COPD", "pulmonary disease", "cells", "abacus", "leukemia", "anhedonia",
  "very preterm", "hallucinations", "Executive Dysfunctions",
  "with subjective memory complaints", "Functional Training", "behavioral disorders",
  "dyslexic", "methamphetamine", "awareness training", "huntington's disease", "fish",
  "brain atrophy", "surviving", "this review", "physical intervention",
  "borderline personality disorder", "malaria", "piano training",
  "sensitivity and specificity", "NFT", "alcoholic", "learning disorder",
  "mental retardation", "Dance Intervention", "paranoid", "alcoholism", "dyscalculia",
  "conduct disorder", "retrospective study", "cocaine use disorder",
  "developmental language disorder", "spina bifida", "spinal cord injury",
  "coronary artery disease", "SLI", "dogs", "end-stage", "martial arts",
  "neurofibromatosis", "Music Interventions", "functional exercises", "reanalyzed",
  "mobility training", "chronic kidney disease", "CKD", "opiate", "in-patient",
  "sport training", "deletion syndrome", "substance abusers", "learning difficulty",
  "single training session", "Sprint Interval Training", "chronic fatigue syndrome",
  "ESRD", "in vitro", "reanalysis", "case reports", "Rett syndrome", "brain-injured",
  "self-harm", "out-patient", "karate training", "RUNNING TRAINING",
  "Arts Intervention", "Williams syndrome", "neurological damage",
  "retrospective cohort", "specific language impairment",
  "specific language impairments", "perceptual discrimination task",
  "High-Intensity Functional Training", "re-analysis", "encephalomyelitis",
  "motor skills training", "severe trauma", "Nature Retreat", "spastic diplegia",
  "behavioral disorder", "acupressure training", "martial art training",
  "social training program", "with cognitive declines", "visuomotor tracking task",
  "music instrument training"
)

keywords for inclusion:

# Inclusion keywords: terms and phrases associated with records that should be
# kept (cognitive training interventions, controlled designs, executive-function
# tasks, named training products, transfer effects, ...).
# NOTE(review): some entries differ only in case ("BrainHQ" / "brainHQ") or
# hyphenation, and "braining training" looks like a typo for "brain training";
# whether that matters depends on how the vector is matched, which is not
# visible in this part of the file -- confirm before changing any entry.
keywords_include <- c(
  "cognitive", "training", "intervention", "control", "cognitive training",
  "effects of", "working memory", "attention", "improve", "improvement",
  "improvements", "effect of", "efficacy", "healthy", "executive function",
  "improving", "randomized controlled trial", "effectiveness", "transfer",
  "randomly assigned", "benefits", "memory training", "executive functions",
  "working memory training", "cognitive assessment", "problem solving",
  "benefit", "gains", "inhibition", "controls", "improves", "executive functioning",
  "active control", "sham", "decision making", "stroop", "cognitive intervention",
  "placebo", "inhibitory control", "cognitive control", "control groups",
  "cognitive flexibility", "Cognitive Interventions", "planning", "problem-solving",
  "dual task", "untrained", "neuropsychological tests", "Digit Span", "efficiency",
  "decision-making", "EF", "RCT", "switching", "randomised controlled trial",
  "Brain Training", "CCT", "n-back", "shifting", "executive control",
  "fluid intelligence", "updating", "gain", "cogmed", "executive function training",
  "community-dwelling", "TMT", "computerized training", "attentional control",
  "attention training", "flanker", "task switching", "passive control",
  "card sorting", "untrained tasks", "EFs", "no contact", "dual-task training",
  "typically developing", "far-transfer", "video game training", "exergame",
  "near-transfer", "inhibition training", "task-switching", "Community-Based",
  "Lumosity", "serious games", "exergaming", "transfers", "Verbal Fluency Test",
  "Serious Game", "BrainHQ", "brainHQ", "different tasks", "pretest-posttest",
  "transferable", "brain games", "cognifit", "nontrained", "Stroop Color‐Word Test",
  "neuropsychological outcomes", "antisaccade", "simon task", "Brain HQ", "APT",
  "Brain Exercise", "Game-Based Training", "untrained task", "computerised training",
  "Corsi Block Tapping Task", "NeuroTracker", "Attention Process Training",
  "active-control", "different task", "reasoning abilities", "brain exercises",
  "randomised controlled trials", "Brain endurance training", "BET",
  "Relational integration", "brain game", "typically-developing", "Simon Effect",
  "anti-saccade task", "shifting training", "task-switch", "untrained control group",
  "SmartBrain", "Wisconsin Sorting Card Test", "task-switch training",
  "braining training"
)

load packages and get path to raw data files

# sets the directory of location of this script as the current directory
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))

### load packages
require(pacman)
p_load('tidyverse', 'DT', 'writexl', 'future.apply',
       'httr', 'jsonlite', 'urltools', 'stringdist', 'data.table')


### load data
setwd("data")

# List all the RIS files in the folder.
# `pattern` is a regular expression, not a shell glob: the previous "*.ris"
# matched any name that merely contained "ris" after some character
# (e.g. "x.ris.bak"). Anchor on the literal ".ris" extension instead.
ris_files <- list.files(path = getwd(), pattern = "\\.ris$", full.names = TRUE)

ris_files

 [1] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_cochranelib_1002_5193.ris"  
 [2] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_psycinfo_2402_3450.ris"     
 [3] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_1-1000.ris"   
 [4] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_1001-2000.ris"
 [5] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_2001-3000.ris"
 [6] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_3001-4000.ris"
 [7] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_4001-5000.ris"
 [8] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_5001-6000.ris"
 [9] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_6001-7000.ris"
[10] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_7001-8000.ris"
[11] "C:/Users/fenn/Nextcloud/PROGRAMMING PROJECTS/merge clean ris files/data/res_ris_wos_1002_8719_8001-8719.ris"

merge data files

Aim: load single RIS files, extract information, create list and data frame

Function to extract information (tags) and write it into list and data frame:

The function extracts the tagged fields from a single RIS entry; for the meaning of each tag, see the table at https://en.wikipedia.org/wiki/RIS_(file_format)

extract_information <- function(entry) {
  # Parse one RIS entry into a named list and a one-row data frame.
  #
  # Args:
  #   entry: character scalar with the lines of a single RIS record,
  #          separated by "\n".
  #
  # Returns:
  #   A list with two elements:
  #     record    - named list of the parsed fields; repeatable fields are
  #                 character vectors, fields absent from the entry are NA.
  #     record_df - one-row data.frame with one character column per field;
  #                 repeatable fields are collapsed with "; ", absent fields
  #                 are NA_character_ (so column types do not depend on
  #                 which fields happen to be present).
  #
  # Only lines that start with a known two-character tag followed by
  # two spaces and a hyphen are used; every other line is ignored.

  # RIS tag -> output field. Several tags may feed the same field.
  tag_to_field <- c(
    DO = "doi",
    SN = "issn",
    AU = "authors", A1 = "authors",
    TI = "title", T1 = "title",
    T2 = "secondary_title",
    AB = "abstract", N2 = "abstract",
    PU = "publisher",
    PY = "year", Y1 = "year",
    VL = "volume",
    IS = "issue",
    SP = "start_page",
    EP = "end_page",
    AD = "address",
    M3 = "medium",
    UR = "url",
    AN = "accession_number",
    KW = "keywords",
    C3 = "custom_field"
  )

  # Fields whose values accumulate across lines; for all other fields the
  # last occurrence in the entry wins.
  repeatable_fields <- c("issn", "authors", "address", "url", "keywords")

  # All expected fields, in the column order of record_df.
  all_fields <- c(
    "doi", "issn", "authors", "title", "secondary_title", "abstract", "publisher",
    "year", "volume", "issue", "start_page", "end_page", "address", "medium",
    "url", "accession_number", "keywords", "custom_field"
  )

  # Helper: collapse a field to a single string, or NA_character_ if empty.
  collapse_field <- function(x) {
    if (is.null(x) || length(x) == 0 || all(is.na(x))) {
      return(NA_character_)
    }
    paste(x, collapse = "; ")
  }

  lines <- trimws(strsplit(entry, "\n", fixed = TRUE)[[1]])

  record <- list()

  for (line in lines) {
    # Cheap structural check first: "<TAG>  -" at the start of the line.
    if (!grepl("^[A-Z][A-Z0-9]  -", line)) {
      next
    }

    field <- unname(tag_to_field[substr(line, 1L, 2L)])
    if (is.na(field)) {
      next  # tag we do not extract (e.g. TY, ER)
    }

    value <- sub("^[A-Z][A-Z0-9]  -\\s*", "", line)

    if (field %in% repeatable_fields) {
      record[[field]] <- c(record[[field]], value)
    } else {
      record[[field]] <- value
    }
  }

  # Default every expected field that did not occur in the entry to NA.
  for (field in all_fields) {
    if (!field %in% names(record)) {
      record[[field]] <- NA
    }
  }

  # One row, one column per expected field, in the order of all_fields.
  record_df <- as.data.frame(
    lapply(record[all_fields], collapse_field),
    stringsAsFactors = FALSE
  )

  list(record = record, record_df = record_df)
}

Load ris files and extract tags (as data frame and list):

# Read every RIS file, split it into single entries and parse each entry with
# extract_information(). The result, list_parsed_records, is a list keyed by
# file name; each element is the list of parsed entries of that file.
start_time <- Sys.time()

# Set up parallel workers (multisession works on Windows)
plan(multisession)  # Automatically chooses reasonable # of workers



list_parsed_records <- list()

for(file in ris_files){
  # Keep only the part after the last "/" (the bare file name) as list key
  file_name <- str_extract(file, "[^/]+$")
  
  # Read lines from file
  ris_file <- readLines(file)
  
  # Combine into one string
  ris_text <- paste(ris_file, collapse = "\n")
  
  # Extract all entries: each match starts at a "TY  -" tag and runs (lazily,
  # across line breaks) up to just before the next line starting with
  # "ER  -", or up to the end of the text if no such line follows
  entries <- str_extract_all(
    ris_text,
    regex("TY\\s+-\\s+.*?(?=\\nER\\s+-\\s+|$)", dotall = TRUE)
  )[[1]]
  
  cat("\n for file:", file_name, "the number of entries ris files (using TY as start and ER as end) is:", length(entries), "\n")
  
  # Parse the entries via the future backend configured by plan() above
  parsed_entries <- future_lapply(entries, extract_information)
  
  cat(">>> Number of parsed entries:", length(parsed_entries), "\n")
  
  list_parsed_records[[file_name]] <- parsed_entries
}


 for file: res_ris_cochranelib_1002_5193.ris the number of entries ris files (using TY as start and ER as end) is: 5193 
>>> Number of parsed entries: 5193 

 for file: res_ris_psycinfo_2402_3450.ris the number of entries ris files (using TY as start and ER as end) is: 3450 
>>> Number of parsed entries: 3450 

 for file: res_ris_wos_1002_8719_1-1000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_1001-2000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_2001-3000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_3001-4000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_4001-5000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_5001-6000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_6001-7000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_7001-8000.ris the number of entries ris files (using TY as start and ER as end) is: 1000 
>>> Number of parsed entries: 1000 

 for file: res_ris_wos_1002_8719_8001-8719.ris the number of entries ris files (using TY as start and ER as end) is: 719 
>>> Number of parsed entries: 719

###  Flatten and extract all record_df data frames
# Every parsed entry carries a one-row data frame (record_df). Collect them,
# tag each row with its source file, source database and position within the
# file, and stack everything into df_all.
pattern <- "(wos|psycinfo|cochranelib)"

# Preallocate one slot per entry instead of growing the list element by element
final_df_list <- vector("list", sum(lengths(list_parsed_records)))
counter <- 1

for (file_name in names(list_parsed_records)) {

  file_entries <- list_parsed_records[[file_name]]

  # Extract DB name from file name
  name_DB <- str_extract(file_name, pattern)

  # Position of the entry within its file (restarts at 1 for every file)
  counter_entry_number <- 1

  for (entry in file_entries) {

    # Extract the data frame
    df <- entry$record_df

    # Add metadata columns
    df$file_name <- file_name
    df$name_DB <- name_DB
    df$entry_number <- counter_entry_number
    counter_entry_number <- counter_entry_number + 1

    # Store in list
    final_df_list[[counter]] <- df
    counter <- counter + 1
  }
}

# Combine all into a single data frame.
# rbindlist() stacks the one-row data frames in a single pass (do.call(rbind, ...)
# is very slow for many thousands of small data frames); the result is
# converted back so that df_all remains a plain data.frame.
df_all <- as.data.frame(data.table::rbindlist(final_df_list))


end_time <- Sys.time()
as.numeric(difftime(end_time, start_time, units = "secs"))

[1] 51.51283

length(list_parsed_records)

[1] 11

dim(df_all)

[1] 17362    21

# Drop the temporaries left behind by the loading and flattening steps
rm(parsed_entries, entries, ris_file, ris_text)
rm(file_name, file)
rm(start_time, end_time)
rm(pattern, final_df_list, counter, counter_entry_number)
rm(file_entries, df, entry, name_DB)

add unique Rayyan ID

1. Load the RIS file exported from Rayyan:

# Read the Rayyan export (articles.ris) and split it into single RIS entries,
# using the same entry pattern as for the database exports.
start_time <- Sys.time()


# The Rayyan export is read relative to this directory
setwd("data/Rayyan")

# Set up parallel processing
plan(multisession)

# Read and split the RIS file
ris_lines <- readLines("articles.ris")
ris_text <- paste(ris_lines, collapse = "\n")

# Each match starts at a "TY  -" tag and runs (lazily, across line breaks) up
# to just before the next line starting with "ER  -", or to the end of the text
entries <- str_extract_all(
  ris_text,
  regex("TY\\s+-\\s+.*?(?=\\nER\\s+-\\s+|$)", dotall = TRUE)
)[[1]]

cat("Number of RIS entries:", length(entries), "\n")

Number of RIS entries: 10727

# Run the shared RIS parser over every Rayyan entry via the future backend
parsed_entries <- future_lapply(entries, extract_information)

# Pull the one-row record_df out of each parsed entry and stack them row-wise
df_Rayyan <- do.call(rbind, lapply(parsed_entries, `[[`, "record_df"))

end_time <- Sys.time()
as.numeric(difftime(end_time, start_time, units = "secs"))

[1] 18.63064

dim(df_Rayyan)

[1] 10727    18

# Drop the temporaries of the Rayyan loading step
rm(parsed_entries, entries, ris_lines, ris_text)
rm(start_time, end_time)

Create unique ID variable

if (eval_rayyan_fuzzyMatching) {
  # Build a composite identifier (match_id) per record in both data sets from
  # DOI, title, authors, abstract and year. It is the input to fuzzy matching.

  # Replace NA or "" with fixed placeholder
  safe_field <- function(x) {
    ifelse(is.na(x) | x == "", "M_MISSING_M", x)
  }

  # Construct match_id
  build_match_id <- function(df) {
    # The parsed data frames store the authors in column `authors`. The
    # previous code read `df$all_authors`, which does not exist in them; the
    # resulting NULL was pasted as an empty string, so authors never entered
    # the id. `all_authors` is still preferred when a data frame has it.
    authors <- if ("all_authors" %in% names(df)) {
      df[["all_authors"]]
    } else {
      df[["authors"]]
    }

    paste(
      safe_field(df$doi),
      safe_field(df$title),
      safe_field(authors),
      safe_field(df$abstract),
      safe_field(df$year),
      sep = " ||| "
    )
  }

  # Build match_id
  # NOTE(review): ids now include the authors; fuzzy-matching results that were
  # cached from ids built without them should be regenerated -- confirm.
  df_all$match_id <- build_match_id(df_all)
  df_Rayyan$match_id <- build_match_id(df_Rayyan)

  rm(safe_field); rm(build_match_id)
}

Identify matching entries between two data sets

This section performs approximate record linkage between two datasets: df_all and df_Rayyan. The goal is to identify potentially corresponding entries based on fuzzy matching of composite identifiers.

Preprocessing for Matching:

if (eval_rayyan_fuzzyMatching) {
  # Normalise an id: lower-case it, then delete every run of punctuation
  # and whitespace characters
  preprocess <- function(x) {
    gsub("[[:punct:][:space:]]+", "", tolower(x))
  }

  # Derive the normalised id for each data set and discard the raw one
  df_all$match_id_short <- preprocess(df_all$match_id)
  df_all$match_id <- NULL

  df_Rayyan$match_id_short <- preprocess(df_Rayyan$match_id)
  df_Rayyan$match_id <- NULL

  rm(preprocess)
}

To improve the robustness and efficiency of fuzzy string matching, match identifiers are normalized by converting all characters to lowercase and stripping whitespace and punctuation. This reduces irrelevant variation and standardizes string structure prior to similarity computation.

Batched Fuzzy Matching with Intermediate Saving:

# Batched fuzzy matching of df_all against df_Rayyan.
# For every row of df_all the top_n most similar rows of df_Rayyan
# (Jaro-Winkler similarity on match_id_short) are stored. Results are written
# to disk after every batch so that an interrupted run can be resumed.
# If the flag is FALSE, previously saved results are loaded instead.
setwd("outputs")
results_file <- "intermediate_results_fuzzyMatching.rds"  # Path to save intermediate results

if (eval_rayyan_fuzzyMatching) {
  # Track time
  start_time <- Sys.time()

  # Parameters
  batch_size <- 100
  top_n <- 3
  similarity_threshold <- 0.90

  # Prepare vectors
  all_vec <- df_all$match_id_short
  rayyan_vec <- df_Rayyan$match_id_short

  # Batch splitting: list of index vectors into all_vec
  batches <- split(seq_along(all_vec), ceiling(seq_along(all_vec) / batch_size))

  # Initialize or load results
  if (file.exists(results_file)) {
    results <- readRDS(results_file)

    # `results` holds one element per processed df_all row. Count how many
    # whole batches that covers; this stays an integer even when the last
    # batch is shorter than batch_size (length / batch_size + 1 did not).
    rows_after_batch <- cumsum(lengths(batches))
    n_done <- sum(rows_after_batch <= length(results))

    # Drop rows of an incompletely stored batch so they are not duplicated
    results <- results[seq_len(if (n_done > 0) rows_after_batch[[n_done]] else 0L)]

    start_batch <- n_done + 1
    cat("Resuming from batch", start_batch, "\n")
  } else {
    results <- list()
    start_batch <- 1
  }

  # Batches still to do; empty when everything was already processed
  # (start_batch:length(batches) would count downwards in that case)
  pending_batches <- seq_along(batches)
  pending_batches <- pending_batches[pending_batches >= start_batch]

  # Loop over batches
  for (b in pending_batches) {
    idx <- batches[[b]]
    cat("Processing batch", b, "of", length(batches), "...\n")

    dist_mat <- stringdistmatrix(all_vec[idx], rayyan_vec, method = "jw")
    sim_mat <- 1 - as.matrix(dist_mat)  # similarity matrix

    for (i in seq_along(idx)) {
      sim_vec <- sim_mat[i, ]
      # seq_len() keeps this empty (instead of c(1, 0)) if there is nothing to rank
      top_matches <- order(sim_vec, decreasing = TRUE)[seq_len(min(top_n, length(sim_vec)))]
      top_scores <- sim_vec[top_matches]

      results[[length(results) + 1]] <- data.table::data.table(
        df_all_index = idx[i],
        df_Rayyan_index = top_matches,
        similarity = top_scores,
        above_threshold = top_scores >= similarity_threshold
      )
    }

    # Save results after each batch
    saveRDS(results, results_file)
  }
  match_results <- data.table::rbindlist(results)

  # End timing
  end_time <- Sys.time()
  elapsed_sec <- as.numeric(difftime(end_time, start_time, units = "secs"))

  cat("Time elapsed (sec):", elapsed_sec, "\n")
  # took 10.5 hours!

  # Remove temporaries; only those that exist (loop variables are absent
  # when no batch had to be processed)
  rm(list = intersect(
    c(
      "all_vec", "rayyan_vec",
      "b", "i", "idx", "batch_size", "similarity_threshold", "start_time",
      "top_n", "top_matches", "top_scores", "start_batch",
      "sim_vec", "sim_mat", "dist_mat", "batches",
      "rows_after_batch", "n_done", "pending_batches"
    ),
    ls()
  ))

  df_all$match_id_short <- NULL
  df_Rayyan$match_id_short <- NULL
} else {
  results <- readRDS(results_file)
  match_results <- data.table::rbindlist(results)
}

rm(results_file); rm(results)

To efficiently scale approximate string matching to over 10,000 records, the process is batched and parallelizable. Within each batch, Jaro-Winkler similarity is computed using stringdistmatrix(). For each entry in df_all, the top N most similar entries from df_Rayyan are retained. Matches with similarity above a defined threshold (e.g., 0.90) are flagged. Intermediate results are persisted to disk after each batch using saveRDS(), enabling resumption in the case of interruption or long runtimes. Execution time is tracked for performance monitoring.

Lower the similarity threshold from the 0.90 used for flagging during matching to 0.83 to get stable results:

dim(match_results)

[1] 52086     4

# Keep only the first stored candidate for every df_all row
match_results_filtered <- group_by(match_results, df_all_index)
match_results_filtered <- slice(match_results_filtered, 1)
match_results_filtered <- ungroup(match_results_filtered)
dim(match_results_filtered)

[1] 17362     4

nrow(match_results_filtered) * 3

[1] 52086

sum(match_results_filtered$similarity != 1)

[1] 11150

sum(match_results_filtered$similarity == 1)

[1] 6212

sum(match_results_filtered$similarity >= .95)

[1] 14532

match_results_filtered <- match_results_filtered[match_results_filtered$similarity >= .83,]
dim(match_results_filtered)

[1] 16560     4

# Helper to extract first 10 words
#
# Args:
#   text: text to shorten; coerced to character, so a logical NA (as produced
#         for missing fields) no longer makes strsplit() fail.
# Returns:
#   A single string with at most the first 10 whitespace-separated words,
#   joined by single spaces. A missing text yields "NA".
first_10_words <- function(text) {
  # Trim first: leading whitespace would otherwise produce an empty first "word"
  text <- trimws(as.character(text))
  words <- unlist(strsplit(text, "\\s+"))
  paste(head(words, 10), collapse = " ")
}

# Print a side-by-side comparison of matched records from both data sets.
#
# Args:
#   matches:          data frame with columns df_all_index, df_Rayyan_index,
#                     similarity and above_threshold (one row per match).
#   df_all, df_Rayyan: the data frames the two index columns refer to; title,
#                     doi, authors and abstract are printed from each.
#   top_n:            maximum number of matches to print.
#   thresholdSmaller: if not NA, only matches with similarity strictly below
#                     this value are considered.
#   from:             "head" prints the first, "tail" the last top_n matches.
#
# Returns:
#   NULL, invisibly; called for its printed output.
print_matches <- function(matches, df_all, df_Rayyan, top_n = 10, thresholdSmaller = NA, from = c("head", "tail")) {
  from <- match.arg(from)  # ensures only "head" or "tail" is allowed

  # Optional threshold filtering
  if (!is.na(thresholdSmaller)) {
    matches <- matches[matches$similarity < thresholdSmaller, ]
  }

  # Choose top_n matches from head or tail. head()/tail() return zero rows for
  # an empty `matches`; the former index range (nrow - n + 1):nrow turned into
  # 1:0 in that case and printed a spurious NA match.
  n <- max(0, min(nrow(matches), top_n))
  selected <- if (from == "head") head(matches, n) else tail(matches, n)

  # Print selected matches
  for (i in seq_len(nrow(selected))) {
    all_idx <- selected$df_all_index[i]
    rayyan_idx <- selected$df_Rayyan_index[i]

    cat("Match", i, "\n")
    cat("Similarity:", selected$similarity[i],
        "| Above threshold:", selected$above_threshold[i], "\n\n")

    cat("🔹 From df_all:\n")
    cat("Title:   ", df_all$title[all_idx], "\n")
    cat("DOI:     ", df_all$doi[all_idx], "\n")
    cat("Authors: ", df_all$authors[all_idx], "\n")
    cat("Abstract snippet: ", first_10_words(df_all$abstract[all_idx]), "\n\n")

    cat("🔸 From df_Rayyan:\n")
    cat("Title:   ", df_Rayyan$title[rayyan_idx], "\n")
    cat("DOI:     ", df_Rayyan$doi[rayyan_idx], "\n")
    cat("Authors: ", df_Rayyan$authors[rayyan_idx], "\n")
    cat("Abstract snippet: ", first_10_words(df_Rayyan$abstract[rayyan_idx]), "\n")

    cat(strrep("-", 60), "\n\n")
  }

  invisible(NULL)
}


# Show the first 20 retained matches with similarity below .85.
# The argument is spelled out in full; `threshold = ` only worked through
# R's partial argument matching.
print_matches(match_results_filtered, df_all, df_Rayyan, top_n = 20, thresholdSmaller = .85, from = "head")

Match 1 
Similarity: 0.833622 | Above threshold: FALSE 

🔹 From df_all:
Title:    Effect of cognitive training on episodic memory retrieval in amnestic mild cognitive impairment patients: study protocol for a clinical randomized controlled trial 
DOI:      NA 
Authors:  Zhang, K; Wang, J; Peng, G; Liu, P; He, F; Zhu, Z; Luo, B 
Abstract snippet:  Background: Mild cognitive impairment (MCI) is a transition state between 

🔸 From df_Rayyan:
Title:    Effect of cognitive training on episodic memory retrieval in amnestic mild cognitive impairment patients: study protocol for a clinical randomized controlled trial 
DOI:      10.1186/s13063-018-3143-0 
Authors:  Zhang, K; Wang, J; Peng, G; Liu, P; He, F; Zhu, Z; Luo, B 
Abstract snippet:  Background: Mild cognitive impairment (MCI) is a transition state between 
------------------------------------------------------------ 

Match 2 
Similarity: 0.8412729 | Above threshold: FALSE 

🔹 From df_all:
Title:    Cognitive and Brain Activity Changes After Mnemonic Strategy Training in Amnestic Mild Cognitive Impairment: evidence From a Randomized Controlled Tria 
DOI:      NA 
Authors:  Simon, SS; Hampstead, BM; Nucci, MP; Duran, FLS; Fonseca, LM; Martino, MDM; Avila, R; Porto, FHG; Brucki, SMD; Martins, CB; et al. 
Abstract snippet:  Background: Mnemonic strategy training (MST) has been shown to improve 

🔸 From df_Rayyan:
Title:    Cognitive and Brain Activity Changes After Mnemonic Strategy Training in Amnestic Mild Cognitive Impairment: evidence From a Randomized Controlled Trial 
DOI:      10.3389/fnagi.2018.00342 
Authors:  Simon, SS; Hampstead, BM; Nucci, MP; Duran, FLS; Fonseca, LM; Martin, MDGM; Ávila, R; Porto, FHG; Brucki, SMD; Martins, CB; et al. 
Abstract snippet:  Background: Mnemonic strategy training (MST) has been shown to improve 
------------------------------------------------------------ 

Match 3 
Similarity: 0.8369171 | Above threshold: FALSE 

🔹 From df_all:
Title:    The efficacy of Cognitive training in patients with VAsCular Cognitive Impairment, No dEmentia (the Cog-VACCINE study): study protocol for a randomized controlled trial 
DOI:      NA 
Authors:  NA 
Abstract snippet:  Background: Vascular cognitive impairment, no dementia (VCIND) refers to cognitive 

🔸 From df_Rayyan:
Title:    The efficacy of Cognitive training in patients with VAsCular Cognitive Impairment, No dEmentia (the Cog-VACCINE study): study protocol for a randomized controlled trial 
DOI:      10.1186/s13063-016-1523-x 
Authors:  Tang, Y; Zhu, Z; Liu, Q; Li, F; Yang, J; Li, F; Xing, Y; Jia, J 
Abstract snippet:  BACKGROUND: Vascular cognitive impairment, no dementia (VCIND) refers to cognitive 
------------------------------------------------------------ 

Match 4 
Similarity: 0.8348307 | Above threshold: FALSE 

🔹 From df_all:
Title:    Cognitive Stimulation in Moderate Alzheimer's Disease 
DOI:      NA 
Authors:  Gonzalez-Moreno, J; Satorres, E; Soria-Urios, G; Melendez, JC 
Abstract snippet:  Cognitive stimulation is one of the non‐pharmacological therapies recommended for 

🔸 From df_Rayyan:
Title:    Cognitive Stimulation in Moderate Alzheimer's Disease 
DOI:      10.1177/07334648221089283 
Authors:  Gonzalez-Moreno, J; Satorres, E; Soria-Urios, G; Meléndez, JC 
Abstract snippet:  Cognitive stimulation is one of the non‐pharmacological therapies recommended for 
------------------------------------------------------------ 

Match 5 
Similarity: 0.8396087 | Above threshold: FALSE 

🔹 From df_all:
Title:    The Effect of Repetitive Transcranial Magnetic Stimulation (rTMS) on Cognition in Patients With Traumatic Brain Injury: a Protocol for a Randomized Controlled Trial 
DOI:      NA 
Authors:  Zhang, H; Zhao, Y; Qu, Y; Huang, Y; Chen, Z; Lan, H; Peng, Y; Ren, H 
Abstract snippet:  Cognitive impairment, defined as a decline in memory and executive 

🔸 From df_Rayyan:
Title:    The Effect of Repetitive Transcranial Magnetic Stimulation (rTMS) on Cognition in Patients With Traumatic Brain Injury: a Protocol for a Randomized Controlled Trial 
DOI:      10.3389/fneur.2022.832818 
Authors:  Zhang, H; Zhao, Y; Qu, Y; Huang, Y; Chen, Z; Lan, H; Peng, Y; Ren, H 
Abstract snippet:  Cognitive impairment, defined as a decline in memory and executive 
------------------------------------------------------------ 

Match 6 
Similarity: 0.8450666 | Above threshold: FALSE 

🔹 From df_all:
Title:    Cognitive behavioral rehabilitation for bipolar disorder patients: a randomized controlled trial 
DOI:      10.1111/bdi.12746 
Authors:  Gomes, B; Rocca, C; Belizario, G; Fernandes, F; Valois, I; Olmo, G; Fachin, R; Farhat, LC; Lafer, B 
Abstract snippet:  Introduction: Bipolar disorder is frequently associated with cognitive impairment even 

🔸 From df_Rayyan:
Title:    Cognitive behavioral rehabilitation for bipolar disorder patients: a randomized controlled trial 
DOI:      10.1111/bdi.12784 
Authors:  Gomes, BC; Rocca, CC; Belizario, GO; de B F Fernandes F,; Valois, I; Olmo, GC; Fachin, RVP; Farhat, LC; Lafer, B 
Abstract snippet:  Objectives: Bipolar disorder is frequently associated with cognitive impairment even 
------------------------------------------------------------ 

Match 7 
Similarity: 0.8421543 | Above threshold: FALSE 

🔹 From df_all:
Title:    Effects of Cognitive Training on Cognitive Performance of Healthy Older Adults 
DOI:      NA 
Authors:  Santos Golino, MT; Mendoza, CF; Golino, HF 
Abstract snippet:  The purpose of this study was to determine the immediate 

🔸 From df_Rayyan:
Title:    Effects of Cognitive Training on Cognitive Performance of Healthy Older Adults 
DOI:      10.1017/sjp.2017.38 
Authors:  Golino, MTS; Flores Mendoza, C; Golino, HF 
Abstract snippet:  The purpose of this study was to determine the immediate 
------------------------------------------------------------ 

Match 8 
Similarity: 0.8343647 | Above threshold: FALSE 

🔹 From df_all:
Title:    Long-Lasting Neuropsychological Effects of a Computerized Cognitive Training in Patients Affected by Early Stage Alzheimer's Disease: are They Stable Over Time? 
DOI:      NA 
Authors:  Cavallo, M; Angilletta, C 
Abstract snippet:  INTRODUCTION: We investigated the stability of effects of a computerized 

🔸 From df_Rayyan:
Title:    Long-Lasting Neuropsychological Effects of a Computerized Cognitive Training in Patients Affected by Early Stage Alzheimer's Disease: are They Stable Over Time? 
DOI:      10.1177/0733464817750276 
Authors:  Cavallo, M; Angilletta, C 
Abstract snippet:  INTRODUCTION: We investigated the stability of effects of a computerized 
------------------------------------------------------------ 

Match 9 
Similarity: 0.8398159 | Above threshold: FALSE 

🔹 From df_all:
Title:    Cognitive Training Using a Novel Memory Game on an iPad in Patients with Amnestic Mild Cognitive Impairment (aMCI) 
DOI:      NA 
Authors:  Savulich, G; Piercy, T; Fox, C; Suckling, J; Rowe, JB; O'Brien, JT; Sahakian, BJ 
Abstract snippet:  Background: Cognitive training is effective in patients with mild cognitive 

🔸 From df_Rayyan:
Title:    Cognitive Training Using a Novel Memory Game on an iPad in Patients with Amnestic Mild Cognitive Impairment (aMCI) 
DOI:      10.1093/ijnp/pyx040 
Authors:  Savulich, G; Piercy, T; Fox, C; Suckling, J; Rowe, JB; O'Brien, JT; Sahakian, BJ 
Abstract snippet:  Background: Cognitive training is effective in patients with mild cognitive 
------------------------------------------------------------ 

Match 10 
Similarity: 0.834186 | Above threshold: FALSE 

🔹 From df_all:
Title:    Randomized Trial of Working Memory Training as an Adjunct to Inpatient Substance Use Disorder Treatment 
DOI:      NA 
Authors:  Hendershot, CS; Wardell, JD; Vandervoort, J; McPhee, MD; Keough, MT; Quilty, LC 
Abstract snippet:  Despite interest in computerized working memory training as a transdiagnostic 

🔸 From df_Rayyan:
Title:    Randomized Trial of Working Memory Training as an Adjunct to Inpatient Substance Use Disorder Treatment 
DOI:      10.1037/adb0000415 
Authors:  Hendershot, CS; Wardell, JD; Vandervoort, J; McPhee, MD; Keough, MT; Quilty, LC 
Abstract snippet:  Despite interest in computerized working memory training as a transdiagnostic 
------------------------------------------------------------ 

Match 11 
Similarity: 0.8465459 | Above threshold: FALSE 

🔹 From df_all:
Title:    Working memory training for older adults after major surgery: benefits to cognitive and emotional functioning 
DOI:      NA 
Authors:  Carbone, E; Vianello, E; Carretti, B; Borella, E 
Abstract snippet:  Objectives: Cognitive and mood changes can affect postoperative recovery in 

🔸 From df_Rayyan:
Title:    Working Memory Training for Older Adults After Major Surgery: benefits to Cognitive and Emotional Functioning 
DOI:      10.1016/j.jagp.2019.05.023 
Authors:  Carbone, E; Vianello, E; Carretti, B; Borella, E 
Abstract snippet:  OBJECTIVES: Cognitive and mood changes can affect postoperative recovery in 
------------------------------------------------------------ 

Match 12 
Similarity: 0.8345881 | Above threshold: FALSE 

🔹 From df_all:
Title:    Cognitive effects of anti-epileptic drugs in nigerians with epilepsy 
DOI:      10.1159/000356326 
Authors:  Ogunrin, O 
Abstract snippet:  Background: Epilepsy is particularly highly prevalent in developing African countries 

🔸 From df_Rayyan:
Title:    Cognitive effects of anti-epileptic drugs in Nigerians with epilepsy 
DOI:      NA 
Authors:  Ogunrin, O; Adamolekun, B; Ogunniyi, A 
Abstract snippet:  Background: Epilepsy is particularly highly prevalent in developing African countries 
------------------------------------------------------------ 

Match 13 
Similarity: 0.8345939 | Above threshold: FALSE 

🔹 From df_all:
Title:    Verbal working memory training in older adults: an investigation of dose response 
DOI:      NA 
Authors:  Brum, PS; Borella, E; Carretti, B; Sanches Yassuda, M 
Abstract snippet:  The WM training protocol proposed by Borella et al. found 

🔸 From df_Rayyan:
Title:    Verbal working memory training in older adults: an investigation of dose response 
DOI:      10.1080/13607863.2018.1531372 
Authors:  Brum, PS; Borella, E; Carretti, B; Sanches Yassuda, M 
Abstract snippet:  The WM training protocol proposed by Borella et al. found 
------------------------------------------------------------ 

Match 14 
Similarity: 0.8391278 | Above threshold: FALSE 

🔹 From df_all:
Title:    Comparing memory group training and computerized cognitive training for improving memory function following stroke: a phase II randomized controlled trial 
DOI:      NA 
Authors:  Withiel, TD; Wong, D; Ponsford, JL; Cadilhac, DA; New, P; Mihaljcic, T; Stolwyk, RJ 
Abstract snippet:  OBJECTIVES: Memory deficits are common after stroke, yet remain a 

🔸 From df_Rayyan:
Title:    Comparing memory group training and computerized cognitive training for improving memory function following stroke: a phase II randomized controlled trial 
DOI:      10.2340/16501977-2540 
Authors:  Withiel, TD; Wong, D; Ponsford, JL; Cadilhac, DA; New, P; Mihaljcic, T; Stolwyk, RJ 
Abstract snippet:  OBJECTIVES: Memory deficits are common after stroke, yet remain a 
------------------------------------------------------------ 

Match 15 
Similarity: 0.836012 | Above threshold: FALSE 

🔹 From df_all:
Title:    Adaptive computerized working memory training in patients with mild cognitive impairment. A randomized double-blind active controlled trial 
DOI:      NA 
Authors:  Flak, MM; Hol, HR; Hernes, SS; Chang, L; Engvig, A; Bjuland, KJ; Pripp, A; Madsen, B-O; Knapskog, A-B; Ulstein, I; et al. 
Abstract snippet:  Objective:We investigated if a 5‐week computerized adaptive working memory training 

🔸 From df_Rayyan:
Title:    Adaptive computerized working memory training in patients with mild cognitive impairment. A randomized double-blind active controlled trial. 
DOI:      10.3389/fpsyg.2019.00807 
Authors:  Flak, Marianne M.; Hol, Haakon R.; Hernes, Susanne S.; Chang, Linda; Engvig, Andreas; Bjuland, Knut Jørgen; Pripp, Are; Madsen, Bengt-Ove; Knapskog, Anne-Brita; Ulstein, Ingun; Lona, Trine; Skranes, Jon; Løhaugen, Gro C. C. 
Abstract snippet:  Objective:We investigated if a 5-week computerized adaptive working memory training 
------------------------------------------------------------ 

Match 16 
Similarity: 0.8436802 | Above threshold: FALSE 

🔹 From df_all:
Title:    Cognitive Flexibility and Reaction Time Improvements After Cognitive Training Designed for Men Perpetrators of Intimate Partner Violence: results of a Pilot Randomized Controlled Trial 
DOI:      NA 
Authors:  Romero-Martínez, Á; Santirso, F; Lila, M; Comes-Fayos, J; Moya-Albiol, L 
Abstract snippet:  Purpose Current interventions for intimate partner violence (IPV) perpetrators are 

🔸 From df_Rayyan:
Title:    Cognitive Flexibility and Reaction Time Improvements After Cognitive Training Designed for Men Perpetrators of Intimate Partner Violence: results of a Pilot Randomized Controlled Trial 
DOI:      10.1007/s10896-021-00304-2 
Authors:  Romero-Martínez, Á; Santirso, F; Lila, M; Comes-Fayos, J; Moya-Albiol, L 
Abstract snippet:  Purpose Current interventions for intimate partner violence (IPV) perpetrators are 
------------------------------------------------------------ 

Match 17 
Similarity: 0.8359828 | Above threshold: FALSE 

🔹 From df_all:
Title:    Structured Cognitive Training Yields Best Results in Healthy Older Adults, and Their ApoE4 State and Baseline Cognitive Level Predict Training Benefits 
DOI:      10.1097/WNN.0000000000000195 
Authors:  Roheger, M; Kessler, J; Kalbe, E 
Abstract snippet:  BACKGROUND: Cognitive training has been shown to improve cognitive functions 

🔸 From df_Rayyan:
Title:    Structured Cognitive Training Yields Best Results in Healthy Older Adults, and Their <i>ApoE4</i> State and Baseline Cognitive Level Predict Training Benefits 
DOI:      10.1097/WNN.0000000000000195 
Authors:  Roheger, Mandy; Kessler, Josef; Kalbe, Elke 
Abstract snippet:  Background: Cognitive training has been shown to improve cognitive functions 
------------------------------------------------------------ 

Match 18 
Similarity: 0.8487515 | Above threshold: FALSE 

🔹 From df_all:
Title:    Efforts of systematic categorization training on cognitive performance in healthy older adults and in adults with traumatic brain injury 
DOI:      NA 
Authors:  Constantinidou, F 
Abstract snippet:  This study investigated the effects of hierarchical cognitive training using 

🔸 From df_Rayyan:
Title:    Efforts of systematic categorization training on cognitive performance in healthy older adults and in adults with traumatic brain injury. 
DOI:      10.1155/2019/9785319 
Authors:  Constantinidou, Fofi 
Abstract snippet:  This study investigated the effects of hierarchical cognitive training using 
------------------------------------------------------------ 

Match 19 
Similarity: 0.8304724 | Above threshold: FALSE 

🔹 From df_all:
Title:    The effect of aerobic dance intervention on brain spontaneous activity in older adults with mild cognitive impairment: a resting-state functional MRI study 
DOI:      NA 
Authors:  Qi, M; Zhu, Y; Zhang, L; Wu, T; Wang, J 
Abstract snippet:  The current study aimed to evaluate the effect of a 

🔸 From df_Rayyan:
Title:    The effect of aerobic dance intervention on brain spontaneous activity in older adults with mild cognitive impairment: A resting-state functional MRI study 
DOI:      10.3892/etm.2018.7006 
Authors:  Qi, Ming; Zhu, Yi; Zhang, Ling; Wu, Ting; Wang, Jie 
Abstract snippet:  The current study aimed to evaluate the effect of a 
------------------------------------------------------------ 

Match 20 
Similarity: 0.8378899 | Above threshold: FALSE 

🔹 From df_all:
Title:    Randomized double-masked controlled trial of cognitive training in breast cancer survivors: a preliminary study 
DOI:      NA 
Authors:  Ah, DV; McDonald, BC; Crouch, AD; Ofner, S; Perkins, S; Storey, S; Considine, R; Unverzagt, F 
Abstract snippet:  Purpose To evaluate the acceptability, satisfaction, and preliminary efficacy of 

🔸 From df_Rayyan:
Title:    Randomized double-masked controlled trial of cognitive training in breast cancer survivors: a preliminary study 
DOI:      10.1007/s00520-022-07182-4 
Authors:  Von Ah, D; McDonald, BC; Crouch, AD; Ofner, S; Perkins, S; Storey, S; Considine, R; Unverzagt, F 
Abstract snippet:  Purpose: To evaluate the acceptability, satisfaction, and preliminary efficacy of 
------------------------------------------------------------

# Print 20 candidate df_all / df_Rayyan pairs for manual inspection, each
# flagged against a similarity threshold of 0.85.
# NOTE(review): from = "tail" presumably takes the pairs from the bottom of
# match_results_filtered - confirm against the definition of print_matches().
print_matches(match_results_filtered, df_all, df_Rayyan, top_n = 20, threshold = .85, from = "tail")

Match 1 
Similarity: 0.8403169 | Above threshold: FALSE 

🔹 From df_all:
Title:    Table_1_Cortical Thickness Changes After Computerized Working Memory Training in Patients With Mild Cognitive Impairment.docx 
DOI:      10.3389/fnagi.2022.796110.s002 
Authors:  Hol, Haakon R; Flak, Marianne M; Chang, Linda; Lohaugen, Gro Christine Christensen; Bjuland, Knut Jorgen; Rimol, Lars M; Engvig, Andreas; Skranes, Jon; Ernst, Thomas; Madsen, Bengt-Ove; Hernes, Susanne S 
Abstract snippet:  BackgroundAdaptive computerized working memory (WM) training has shown favorable effects 

🔸 From df_Rayyan:
Title:    Cortical Thickness Changes After Computerized Working Memory Training in Patients With Mild Cognitive Impairment 
DOI:      10.3389/fnagi.2022.796110 
Authors:  Hol, HR; Flak, MM; Chang, L; Lohaugen, GCC; Bjuland, KJ; Rimol, LM; Engvig, A; Skranes, J; Ernst, T; Madsen, B-O; et al. 
Abstract snippet:  Background: Adaptive computerized working memory (WM) training has shown favorable 
------------------------------------------------------------ 

Match 2 
Similarity: 0.8410584 | Above threshold: FALSE 

🔹 From df_all:
Title:    Table_1_Enriched Rehabilitation Improves Gait Disorder and Cognitive Function in Parkinsons Disease: A Randomized Clinical Trial.doc 
DOI:      10.3389/fnins.2021.733311.s002 
Authors:  Wang, Xin; Chen, LanLan; Zhou, Hongyu; Xu, Yao; Zhang, Hongying; Yang, Wenrui; Tang, XiaoJia; Wang, Junya; Lv, Yichen; Yan, Ping; Peng, Yuan 
Abstract snippet:  Background: Studies on non-pharmacological strategies for improving gait performance and 

🔸 From df_Rayyan:
Title:    Enriched Rehabilitation Improves Gait Disorder and Cognitive Function in Parkinson's Disease: a Randomized Clinical Trial 
DOI:      10.3389/fnins.2021.733311 
Authors:  Wang, X; Chen, L; Zhou, H; Xu, Y; Zhang, H; Yang, W; Tang, X; Wang, J; Lv, Y; Yan, P; et al. 
Abstract snippet:  Background: Studies on non‐pharmacological strategies for improving gait performance and 
------------------------------------------------------------ 

Match 3 
Similarity: 0.8379978 | Above threshold: FALSE 

🔹 From df_all:
Title:    Data_Sheet_1_Cortical Thickness Changes After Computerized Working Memory Training in Patients With Mild Cognitive Impairment.PDF 
DOI:      10.3389/fnagi.2022.796110.s001 
Authors:  Hol, Haakon R; Flak, Marianne M; Chang, Linda; Lohaugen, Gro Christine Christensen; Bjuland, Knut Jorgen; Rimol, Lars M; Engvig, Andreas; Skranes, Jon; Ernst, Thomas; Madsen, Bengt-Ove; Hernes, Susanne S 
Abstract snippet:  BackgroundAdaptive computerized working memory (WM) training has shown favorable effects 

🔸 From df_Rayyan:
Title:    Cortical Thickness Changes After Computerized Working Memory Training in Patients With Mild Cognitive Impairment 
DOI:      10.3389/fnagi.2022.796110 
Authors:  Hol, HR; Flak, MM; Chang, L; Lohaugen, GCC; Bjuland, KJ; Rimol, LM; Engvig, A; Skranes, J; Ernst, T; Madsen, B-O; et al. 
Abstract snippet:  Background: Adaptive computerized working memory (WM) training has shown favorable 
------------------------------------------------------------ 

Match 4 
Similarity: 0.8367257 | Above threshold: FALSE 

🔹 From df_all:
Title:    Data_Sheet_1_Effectiveness of a stepped-care programme of WHO psychological interventions in migrant populations resettled in Italy: Study protocol for the RESPOND randomized controlled trial.PDF 
DOI:      10.3389/fpubh.2023.1100546.s001 
Authors:  Purgato, Marianna; Turrini, Giulia; Tedeschi, Federico; Serra, Riccardo; Tarsitani, Lorenzo; Compri, Beatrice; Muriago, Giulia; Cadorin, Camilla; Ostuzzi, Giovanni; Nicaise, Pablo; Lorant, Vincent; Sijbrandij, Marit; Witteveen, Anke B; Ayuso-Mateos, Jose Luis; Mediavilla, Roberto; Haro, Josep Maria; Felez-Nobrega, Mireia; Figueiredo, Natasha; Pollice, Giulia; McDaid, David; Park, A-La; Kalisch, Raffael; Petri-Romao, Papoula; Underhill, James; Bryant, Richard A; Nose, Michela; Barbui, Corrado 
Abstract snippet:  IntroductionMigrant populations, including workers, undocumented migrants, asylum seekers, refugees, internationally 

🔸 From df_Rayyan:
Title:    Effectiveness of a stepped-care programme of WHO psychological interventions in migrant populations resettled in Italy: Study protocol for the RESPOND randomized controlled trial 
DOI:      10.3389/fpubh.2023.1100546 
Authors:  Purgato, Marianna; Turrini, Giulia; Tedeschi, Federico; Serra, Riccardo; Tarsitani, Lorenzo; Compri, Beatrice; Muriago, Giulia; Cadorin, Camilla; Ostuzzi, Giovanni; Nicaise, Pablo; Lorant, Vincent; Sijbrandij, Marit; Witteveen, Anke B. B.; Ayuso-Mateos, Jose Luis; Mediavilla, Roberto; Haro, Josep Maria; Felez-Nobrega, Mireia; Figueiredo, Natasha; Pollice, Giulia; McDaid, David; Park, A-La; Kalisch, Raffael; Petri-Romao, Papoula; Underhill, James; Bryant, Richard A. A.; Nose, Michela; Barbui, Corrado; Respond Consortium 
Abstract snippet:  Introduction: Migrant populations, including workers, undocumented migrants, asylum seekers, refugees, 
------------------------------------------------------------ 

Match 5 
Similarity: 0.845302 | Above threshold: FALSE 

🔹 From df_all:
Title:    Participant characteristics: Nutritional markers. 
DOI:      10.1371/journal.pgph.0002531.t002 
Authors:  McCann, Samantha; Mason, Luke; Milosavljevic, Bosiljka; Mbye, Ebrima; Touray, Ebou; Colley, Alhassan; Johnson, William; Lloyd-Fox, Sarah; Elwell, Clare E; Moore, Sophie E 
Abstract snippet:  IntroductionIron deficiency is among the leading risk factors for poor 

🔸 From df_Rayyan:
Title:    Participant characteristics: Demographics. 
DOI:      10.1371/journal.pgph.0002531.t001 
Authors:  McCann, Samantha; Mason, Luke; Milosavljevic, Bosiljka; Mbye, Ebrima; Touray, Ebou; Colley, Alhassan; Johnson, William; Lloyd-Fox, Sarah; Elwell, Clare E; Moore, Sophie E 
Abstract snippet:  IntroductionIron deficiency is among the leading risk factors for poor 
------------------------------------------------------------ 

Match 6 
Similarity: 0.8346916 | Above threshold: FALSE 

🔹 From df_all:
Title:    Table_1_Association of markers of inflammation on attention and neurobehavioral outcomes in survivors of childhood acute lymphoblastic leukemia.docx 
DOI:      10.3389/fonc.2023.1117096.s001 
Authors:  Cheung, Yin Ting; To, Kenneth Kin-Wah; Hua, Rong; Lee, Chui Ping; Chan, Agnes Sui-Ying; Li, Chi Kong 
Abstract snippet:  BackgroundSurvivors of childhood acute lymphoblastic leukemia (ALL) are at-risk of 

🔸 From df_Rayyan:
Title:    Association of markers of inflammation on attention and neurobehavioral outcomes in survivors of childhood acute lymphoblastic leukemia 
DOI:      10.3389/fonc.2023.1117096 
Authors:  Cheung, Yin Ting; To, Kenneth Kin-Wah; Hua, Rong; Lee, Chui Ping; Chan, Agnes Sui-Ying; Li, Chi Kong 
Abstract snippet:  BackgroundSurvivors of childhood acute lymphoblastic leukemia (ALL) are at-risk of 
------------------------------------------------------------ 

Match 7 
Similarity: 0.8412083 | Above threshold: FALSE 

🔹 From df_all:
Title:    Carolina Abecedarian Project and the Carolina Approach to Responsive Education (CARE), United States, 1972-1992 
DOI:      10.3886/ICPSR04091.v2 
Authors:  Ramey, Craig T; Gallagher, James J; Campbell, Frances; Wasik, Barbara Hanna; Sparling, Joseph 
Abstract snippet:  The Carolina Abecedarian (ABC) Project and the Carolina Approach to 

🔸 From df_Rayyan:
Title:    Carolina Abecedarian Project and the Carolina Approach to Responsive Education (CARE), United States, 1972-1992 
DOI:      10.3886/ICPSR04091 
Authors:  Ramey, Craig T; Gallagher, James J; Campbell, Frances; Wasik, Barbara Hanna; Sparling, Joseph 
Abstract snippet:  The Carolina Abecedarian (ABC) Project and the Carolina Approach to 
------------------------------------------------------------ 

Match 8 
Similarity: 0.8300687 | Above threshold: FALSE 

🔹 From df_all:
Title:    Data_Sheet_1_Predicting Outcome for Early Attention Training After Acquired Brain Injury.pdf 
DOI:      10.3389/fnhum.2022.767276.s001 
Authors:  Bartfai, Aniko; Elg, Mattias; Schult, Marie-Louise; Markovic, Gabriela 
Abstract snippet:  BackgroundThe training of impaired attention after acquired brain injury is 

🔸 From df_Rayyan:
Title:    Predicting Outcome for Early Attention Training After Acquired Brain Injury 
DOI:      10.3389/fnhum.2022.767276 
Authors:  Bartfai, Aniko; Elg, Mattias; Schult, Marie-Louise; Markovic, Gabriela 
Abstract snippet:  BackgroundThe training of impaired attention after acquired brain injury is 
------------------------------------------------------------ 

Match 9 
Similarity: 0.8316396 | Above threshold: FALSE 

🔹 From df_all:
Title:    Data_Sheet_1_Economical Assessment of Working Memory and Response Inhibition in ADHD Using a Combined n-back/Nogo Paradigm: An ERP Study.PDF 
DOI:      10.3389/fnhum.2020.00322.s001 
Authors:  Breitling-Ziegler, Carolin; Tegelbeckers, Jana; Flechtner, Hans-Henning; Krauel, Kerstin 
Abstract snippet:  The development of cognitive interventions in attention-deficit/hyperactivity disorder (ADHD) often 

🔸 From df_Rayyan:
Title:    Economical Assessment of Working Memory and Response Inhibition in ADHD Using a Combined<i>n</i>-back/Nogo Paradigm: An ERP Study 
DOI:      10.3389/fnhum.2020.00322 
Authors:  Breitling-Ziegler, Carolin; Tegelbeckers, Jana; Flechtner, Hans-Henning; Krauel, Kerstin 
Abstract snippet:  The development of cognitive interventions in attention-deficit/hyperactivity disorder (ADHD) often 
------------------------------------------------------------ 

Match 10 
Similarity: 0.8363526 | Above threshold: FALSE 

🔹 From df_all:
Title:    Supplementary Material for: Effects of eligibility criteria on patient selection and treatment implications from 10 multidomain dementia prevention trials: a population-based study 
DOI:      10.6084/m9.figshare.21574458.v1 
Authors:  Licher, S; Wolters, FJ; Pavlovic, J; Kavousi, M; Leening, MJ; Ikram, MK; Ikram, MA 
Abstract snippet:  Introduction: Dementia prevention trials have so far shown little benefit 

🔸 From df_Rayyan:
Title:    Effects of eligibility criteria on patient selection and treatment implications from 10 multidomain dementia prevention trials: a population-based study 
DOI:      10.1159/000528120 
Authors:  Licher, Silvan; Wolters, Frank J.; Pavlovic, Jelena; Kavousi, Maryam; Leening, Maarten J. G.; Ikram, M. Kamran; Ikram, M. Arfan 
Abstract snippet:  Introduction: Dementia prevention trials have so far shown little benefit 
------------------------------------------------------------ 

Match 11 
Similarity: 0.8300143 | Above threshold: FALSE 

🔹 From df_all:
Title:    Multidomain Interventions to Prevent Cognitive Impairment, Alzheimer's Disease, and Dementia: From FINGER to World-Wide FINGERS 
DOI:      NA 
Authors:  Rosenberg, A.; Mangialasche, F.; Ngandu, T.; Solomon, A.; Kivipelto, Miia 
Abstract snippet:  Alzheimer's disease (AD) and dementia are a global public health 

🔸 From df_Rayyan:
Title:    Multidomain Interventions to Prevent Cognitive Impairment, Alzheimer's Disease, and Dementia: From FINGER to World-Wide FINGERS 
DOI:      10.14283/jpad.2019.41 
Authors:  Rosenberg, A.; Mangialasche, F.; Ngandu, T.; Solomon, A.; Kivipelto, M. 
Abstract snippet:  Alzheimer's disease (AD) and dementia are a global public health 
------------------------------------------------------------ 

Match 12 
Similarity: 0.8360168 | Above threshold: FALSE 

🔹 From df_all:
Title:    Data_Sheet_1_Effectiveness of an Individual Cognitive-Behavioral Intervention for Serious, Young Male Violent Offenders: Randomized Controlled Study With Twenty-Four-Month Follow-Up.docx 
DOI:      10.3389/fpsyt.2021.670957.s001 
Authors:  Larden, Martin; Hogstrom, Jens; Langstrom, Niklas 
Abstract snippet:  Background: Psychological recidivism-reducing interventions with serious, young violent offenders in 

🔸 From df_Rayyan:
Title:    Effectiveness of an Individual Cognitive-Behavioral Intervention for Serious, Young Male Violent Offenders: Randomized Controlled Study With Twenty-Four-Month Follow-Up 
DOI:      10.3389/fpsyt.2021.670957 
Authors:  Larden, Martin; Hogstrom, Jens; Langstrom, Niklas 
Abstract snippet:  Background: Psychological recidivism-reducing interventions with serious, young violent offenders in 
------------------------------------------------------------ 

Match 13 
Similarity: 0.841545 | Above threshold: FALSE 

🔹 From df_all:
Title:    Video gaming for cognitive functioning of people with schizophrenia: a single-blind, individually randomised three-arm parallel-group controlled trial in Hong Kong (GAME-S) 
DOI:      10.17632/J4SD32S2FR 
Authors:  Valimaki, Maritta 
Abstract snippet:  In this single-blind, individually randomised three-arm parallel-group controlled trial in 

🔸 From df_Rayyan:
Title:    Video gaming for cognitive functioning of people with schizophrenia: a single-blind, individually randomised three-arm parallel-group controlled trial in Hong Kong (GAME-S) 
DOI:      10.17632/J4SD32S2FR.1 
Authors:  Valimaki, Maritta 
Abstract snippet:  In this single-blind, individually randomised three-arm parallel-group controlled trial in 
------------------------------------------------------------ 

Match 14 
Similarity: 0.8336054 | Above threshold: FALSE 

🔹 From df_all:
Title:    Data_Sheet_1_Learning Effectiveness of Social Work Methods With Groups, in Online and Face-to-Face Contexts.docx 
DOI:      10.3389/fpsyg.2021.649691.s001 
Authors:  Neamtu, Nicoleta; Faludi, Cristina 
Abstract snippet:  During the last three decades, thousands of highly qualified social 

🔸 From df_Rayyan:
Title:    Learning Effectiveness of Social Work Methods With Groups, in Online and Face-to-Face Contexts 
DOI:      10.3389/fpsyg.2021.649691 
Authors:  Neamtu, Nicoleta; Faludi, Cristina 
Abstract snippet:  During the last three decades, thousands of highly qualified social 
------------------------------------------------------------ 

Match 15 
Similarity: 0.8340324 | Above threshold: FALSE 

🔹 From df_all:
Title:    Data_Sheet_1_Promoting Resilience to Food Commercials Decreases Susceptibility to Unhealthy Food Decision-Making.docx 
DOI:      10.3389/fpsyg.2020.599663.s001 
Authors:  Ha, Oh-Ryeong; Killian, Haley J; Davis, Ann M; Lim, Seung-Lark; Bruce, Jared M; Sotos, Jarrod J; Nelson, Samuel C; Bruce, Amanda S 
Abstract snippet:  Children are vulnerable to adverse effects of food advertising. Food 

🔸 From df_Rayyan:
Title:    Promoting Resilience to Food Commercials Decreases Susceptibility to Unhealthy Food Decision-Making 
DOI:      10.3389/fpsyg.2020.599663 
Authors:  Ha, Oh-Ryeong; Killian, Haley J.; Davis, Ann M.; Lim, Seung-Lark; Bruce, Jared M.; Sotos, Jarrod J.; Nelson, Samuel C.; Bruce, Amanda S. 
Abstract snippet:  Children are vulnerable to adverse effects of food advertising. Food 
------------------------------------------------------------ 

Match 16 
Similarity: 0.8337978 | Above threshold: FALSE 

🔹 From df_all:
Title:    Video_1_Providing Task Instructions During Motor Training Enhances Performance and Modulates Attentional Brain Networks.mp4 
DOI:      10.3389/fnins.2021.755721.s002 
Authors:  Penalver-Andres, Joaquin; Buetler, Karin A; Koenig, Thomas; Muri, Rene Martin; Marchal-Crespo, Laura 
Abstract snippet:  Learning a new motor task is a complex cognitive and 

🔸 From df_Rayyan:
Title:    Providing Task Instructions During Motor Training Enhances Performance and Modulates Attentional Brain Networks 
DOI:      10.3389/fnins.2021.755721 
Authors:  Penalver-Andres, Joaquin; Buetler, Karin A.; Koenig, Thomas; Muri, Rene Martin; Marchal-Crespo, Laura 
Abstract snippet:  Learning a new motor task is a complex cognitive and 
------------------------------------------------------------ 

Match 17 
Similarity: 0.8460485 | Above threshold: FALSE 

🔹 From df_all:
Title:    Cognitive Behavioral Interventions for Medium- and High-Risk Juvenile Offenders: A Statewide Randomized Controlled Trial in Virginia, 2003-2018 
DOI:      10.3886/ICPSR38762.v1 
Authors:  Kim, KiDeuk 
Abstract snippet:  The Urban Institute developed and fostered a research partnership with 

🔸 From df_Rayyan:
Title:    Cognitive Behavioral Interventions for Medium- and High-Risk Juvenile Offenders: A Statewide Randomized Controlled Trial in Virginia, 2003-2018 
DOI:      10.3886/ICPSR38762 
Authors:  Kim, KiDeuk 
Abstract snippet:  The Urban Institute developed and fostered a research partnership with 
------------------------------------------------------------ 

Match 18 
Similarity: 0.8303407 | Above threshold: FALSE 

🔹 From df_all:
Title:    Table_1_The Role of Serotonin in the Influence of Intense Locomotion on the Behavior Under Uncertainty in the Mollusk Lymnaea stagnalis.docx 
DOI:      10.3389/fphys.2020.00221.s001 
Authors:  Aonuma, Hitoshi; Mezheritskiy, Maxim; Boldyshev, Boris; Totani, Yuki; Vorontsov, Dmitry; Zakharov, Igor; Ito, Etsuro; Dyakonova, Varvara 
Abstract snippet:  The role of serotonin in the immediate and delayed influence 

🔸 From df_Rayyan:
Title:    The Role of Serotonin in the Influence of Intense Locomotion on the Behavior Under Uncertainty in the Mollusk <i>Lymnaea stagnalis</i> 
DOI:      10.3389/fphys.2020.00221 
Authors:  Aonuma, Hitoshi; Mezheritskiy, Maxim; Boldyshev, Boris; Totani, Yuki; Vorontsov, Dmitry; Zakharov, Igor; Ito, Etsuro; Dyakonova, Varvara 
Abstract snippet:  The role of serotonin in the immediate and delayed influence 
------------------------------------------------------------ 

Match 19 
Similarity: 0.8302921 | Above threshold: FALSE 

🔹 From df_all:
Title:    The Efficacy of a Metacognitive Training Program in Amnestic Mild Cognitive Impairment: A 6-Month Follow-Up Random-ized Controlled Trial 
DOI:      10.17632/S24BB5P42T.1 
Authors:  Bampa, Grigoria 
Abstract snippet:  This study was conducted in response to the increasing prevalence 

🔸 From df_Rayyan:
Title:    The Efficacy of a Metacognitive Training Program in Amnestic Mild Cognitive Impairment: A 6-Month Follow-Up Clinical Study 
DOI:      10.3390/healthcare12101019 
Authors:  Bampa, Grigoria; Moraitou, Despina; Metallidou, Panagiota; Masoura, Elvira; Papantoniou, Georgia; Sofologi, Maria; Kougioumtzis, Georgios A.; Tsolaki, Magdalini 
Abstract snippet:  This study was conducted in response to the increasing prevalence 
------------------------------------------------------------ 

Match 20 
Similarity: 0.8474194 | Above threshold: FALSE 

🔹 From df_all:
Title:    CISC-LIVE-LAB-3/dataset: v1.0.3 
DOI:      10.5281/ZENODO.10694563 
Authors:  Abbas, Ammar N; Winniewelsh 
Abstract snippet:  Human-in-the-Loop Decision Support in Process Control Rooms Dataset Overview This 

🔸 From df_Rayyan:
Title:    CISC-LIVE-LAB-3/dataset: v1.1.0 
DOI:      10.5281/ZENODO.10569181 
Authors:  Abbas, Ammar N; Winniewelsh 
Abstract snippet:  Human-in-the-Loop Decision Support in Process Control Rooms Dataset Overview This 
------------------------------------------------------------

Add unique Rayyan ID to data set:

# Every record starts without a Rayyan ID; only matched rows are filled in.
df_all$Rayyan_ID <- NA

# Row positions of each matched pair: target row in df_all, source row in
# df_Rayyan.
all_idx <- match_results_filtered$df_all_index
rayyan_idx <- match_results_filtered$df_Rayyan_index

# A pair without a target row cannot be written anywhere, so it is left out
# (NA subscripts are not allowed in a vectorized assignment).
has_target <- !is.na(all_idx)

# Copy the accession numbers in one step. If a df_all row appears in several
# pairs, the last pair wins.
df_all$Rayyan_ID[all_idx[has_target]] <-
  df_Rayyan$accession_number[rayyan_idx[has_target]]

# Numeric part of the ID: the first run of digits in Rayyan_ID.
df_all$Rayyan_ID_num <- as.numeric(
  str_extract(string = df_all$Rayyan_ID, pattern = "[:digit:]+")
)

# Total number of records in df_all
nrow(df_all)

[1] 17362

# Number of distinct Rayyan IDs. unique() keeps NA, so records without an ID
# contribute one extra value to this count.
length(unique(df_all$Rayyan_ID))

[1] 10718

# Number of records that did not receive a Rayyan ID
sum(is.na(df_all$Rayyan_ID))

[1] 802

add decisions made in Rayyan

load data:

# Read the Rayyan customization log. The path is given relative to the
# current working directory instead of calling setwd(), so the working
# directory is not changed as a side effect of loading the file.
df_customizations_log <- read.csv(
  file = file.path("data", "Rayyan", "customizations_log.csv")
)

# Convert created_at to POSIXct datetime (UTC). Values that do not follow the
# "%Y-%m-%dT%H:%M:%OSZ" layout become NA.
df_customizations_log$created_at <- as.POSIXct(
  df_customizations_log$created_at,
  format = "%Y-%m-%dT%H:%M:%OSZ",
  tz = "UTC"
)

dim(df_customizations_log)

[1] 8264    6

append decisions made to df_all:

# For every record with a Rayyan ID, take the most recent "included" entry of
# that article in the customization log and store its value as the decision.
# Records without an ID, or without any log entry, keep NA.
df_all$Rayyan_decision <- NA

# seq_len() instead of 1:nrow(): an empty df_all must yield zero iterations.
for (i in seq_len(nrow(df_all))) {
  tmp_id <- df_all$Rayyan_ID_num[i]

  # No Rayyan ID, nothing to look up
  if (is.na(tmp_id)) {
    next
  }

  # which() drops NA comparisons, so log rows with a missing article_id are
  # not pulled in as all-NA rows.
  tmp_log <-
    df_customizations_log[which(df_customizations_log$article_id == tmp_id), ]

  # Article never appears in the log: leave the decision as NA
  if (nrow(tmp_log) == 0) {
    next
  }

  # Keep only inclusion decisions and pick the latest one (first row on ties)
  tmp_log <- tmp_log[which(tmp_log$key == "included"), ]
  tmp_log <- tmp_log[which.max(tmp_log$created_at), ]

  # An article with log entries but no usable "included" entry is unexpected;
  # fail loudly rather than guess.
  if (nrow(tmp_log) != 1) {
    stop("Error - adjust function")
  }

  df_all$Rayyan_decision[i] <- tmp_log$value
}

# Frequency of each decision value. table() omits NA by default, so records
# without a decision are not shown here.
# NOTE(review): presumably Rayyan codes -1 = excluded, 0 = maybe,
# 1 = included - TODO confirm.
table(df_all$Rayyan_decision)


   -1     0     1 
10239    41  1439

# Number of records without a Rayyan decision
sum(is.na(df_all$Rayyan_decision))

[1] 5643

Overview of the data

# Dimensions of df_all: number of rows, number of columns
dim(df_all)

[1] 17362    24

# Number of records per value of name_DB
table(df_all$name_DB)


cochranelib    psycinfo         wos 
       5193        3450        8719

number of missing and unique values:

# Row count, used to express both counts as a percentage of all records
n_records <- nrow(df_all)

# Missing values (NA) per variable, one row per variable
missing_values <- df_all %>%
  summarise(across(everything(), function(column) sum(is.na(column)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Missing_Count"
  ) %>%
  mutate(Missing_Percent = round((Missing_Count / n_records) * 100, 2))

# Distinct values per variable (NA counts as a value), one row per variable
unique_values <- df_all %>%
  summarise(across(everything(), function(column) length(unique(column)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Unique_Count"
  ) %>%
  mutate(Unique_Proportion = round(Unique_Count / n_records * 100, 2))

# Put both summaries side by side and show them as an interactive table
variable_summary <- left_join(
  x = missing_values,
  y = unique_values,
  by = "Variable"
)
DT::datatable(variable_summary)

Prepare combined data frame

Note: lower-casing and trimming all columns would have implications for later findings, so this step is currently not done.

# Loop through each column of the dataframe and set all columns to lower case and remove whitespace from start and end of string
# for (col in names(df_all)) {
#   df_all[[col]] <- str_trim(str_to_lower(df_all[[col]]))
# }

add frequencies of keywords to include and exclude

# Combine title and abstract into one text field. Missing values are replaced
# by empty strings first: paste() would otherwise insert the literal text
# "NA", which adds spurious words to the word count and means a record
# without any text never ends up with zero words.
title_text <- ifelse(is.na(df_all$title), "", as.character(df_all$title))
abstract_text <- ifelse(
  is.na(df_all$abstract), "", as.character(df_all$abstract)
)
df_all$text_combined <- paste(title_text, abstract_text, sep = " ")

# Normalize text and keywords to lowercase so matching is case-insensitive
df_all$text_combined <- tolower(df_all$text_combined)
keywords_include_lower <- tolower(keywords_include)
keywords_exclude_lower <- tolower(keywords_exclude)

# Keyword matcher: returns all matches (not unique).
#
# text:         a single string to search in.
# keyword_list: character vector of keywords. Each keyword is matched as a
#               literal, case-sensitive substring (fixed()).
# Returns a character vector in which every keyword is repeated once per
# occurrence in text, in keyword_list order; character(0) if nothing matches,
# keyword_list is empty, or text is NA.
#
# NOTE(review): substring matching also counts hits inside longer words
# (e.g. "clinic" inside "clinical") - confirm this is intended.
get_all_keyword_matches <- function(text, keyword_list) {
  if (length(keyword_list) == 0 || is.na(text)) {
    return(character(0))
  }

  # str_count() is vectorized over the pattern: one count per keyword
  hits <- str_count(text, fixed(keyword_list))
  rep(keyword_list, times = hits)
}

# Keyword statistics per record.
# Computed column-wise (lapply/lengths/vapply) instead of via rowwise(),
# which evaluates every expression once per row. The resulting columns are
# the same: total_words, include_/exclude_count, include_/exclude_keywords
# and include_/exclude_proportion.

# Collapse the distinct keywords of one record into a comma-separated string
collapse_unique <- function(matches) {
  paste(unique(matches), collapse = ", ")
}

df_all <- df_all %>%
  ungroup() %>%
  as_tibble() %>%
  mutate(
    # Number of whitespace-separated tokens in title + abstract
    total_words = str_count(text_combined, "\\S+"),

    # One list element per record holding all keyword hits (repeats kept)
    include_all_matches = lapply(
      text_combined, get_all_keyword_matches,
      keyword_list = keywords_include_lower
    ),
    exclude_all_matches = lapply(
      text_combined, get_all_keyword_matches,
      keyword_list = keywords_exclude_lower
    ),

    include_count = lengths(include_all_matches),
    exclude_count = lengths(exclude_all_matches),

    include_keywords = vapply(
      include_all_matches, collapse_unique, character(1),
      USE.NAMES = FALSE
    ),
    exclude_keywords = vapply(
      exclude_all_matches, collapse_unique, character(1),
      USE.NAMES = FALSE
    ),

    # Hits per word; records without any words get 0 instead of NaN
    include_proportion = ifelse(total_words > 0, include_count / total_words, 0),
    exclude_proportion = ifelse(total_words > 0, exclude_count / total_words, 0)
  ) %>%
  # The raw match lists are only needed to derive the columns above
  select(-include_all_matches, -exclude_all_matches)

# Optional preview: keyword columns of the first three records
preview_cols <- c(
  "total_words", "include_keywords", "exclude_keywords",
  "include_count", "exclude_count",
  "include_proportion", "exclude_proportion"
)
head(df_all[, preview_cols], 3)

# A tibble: 3 × 7
  total_words include_keywords      exclude_keywords include_count exclude_count
        <int> <chr>                 <chr>                    <int>         <int>
1         155 cognitive, training,… patients, impai…            27            13
2        1890 cognitive, training,… patients, impai…           183           106
3         138 training, interventi… symptoms                    28             1
# ℹ 2 more variables: include_proportion <dbl>, exclude_proportion <dbl>

Clean data frame (flag entries to delete)

df_all$flag_delete <- 0

Cleaning RIS Data (M3 and T2 Fields)

We are cleaning bibliographic data extracted from an RIS-formatted source.

M3 (medium): Contains publication type details like “Book”, “Comment”, etc.
T2 (secondary_title): Usually the journal or repository name.

Steps taken: 1. Normalization: convert all entries to Title Case and trim leading/trailing whitespace. 2. Filtering: flag known problematic or ambiguous entries (identified by red background in source) with flag_delete = 1; flagged rows are removed later, when df_all_clean is created. 3. Deduplication: drop duplicate rows based on cleaned medium and secondary_title (not implemented in the code below).

This improves consistency and prepares the dataset for analysis or integration into structured reference managers.

# Define problematic values for the M3 (medium) and T2 (secondary_title)
# fields. Records whose cleaned field matches one of these values are
# flagged for deletion further below. Both sides of that comparison are
# converted with str_to_title(), so entries that differ only in case
# (e.g. "Conference proceeding" / "Conference Proceeding") are equivalent.
problematic_m3 <- c(
  "Book", "Book Chapter", "Comment", "Data study", "Editorial", "Erratum",
  "Journal article; Conference proceeding", "Journal article; Erratum",
  "Journal article; Retracted Publication", "Letter", "Preprint",
  "Published Erratum", "Retracted Publication", "Trial registry record",
  "Data set", "Editorial Material", "Meeting", "Meeting Abstract",
  "Conference proceeding", "Conference Proceeding", "Correction", "Data Paper"
)

# Data repositories that appear as secondary title
problematic_t2 <- c("Figshare", "Mendeley Data", "Open Science Framework", "Zenodo")


# Standardise both RIS fields: strip surrounding whitespace, then convert
# to Title Case; results go to medium_clean and secondary_title_clean
df_all <- df_all %>%
  mutate(
    across(
      c(medium, secondary_title),
      function(field) str_to_title(str_trim(field)),
      .names = "{.col}_clean"
    )
  )

# Set flag_delete = 1 for rows with a problematic medium or secondary title.
# Rows that do not match keep their current flag_delete value, so flags set
# by an earlier step are not reset to 0.
df_all <- df_all %>%
  mutate(
    flag_delete = ifelse(
      medium_clean %in% str_to_title(problematic_m3) |
        secondary_title_clean %in% str_to_title(problematic_t2),
      1,
      flag_delete
    )
  )

table(df_all$flag_delete)


    0     1 
13858  3504

discuss !!!

sum(is.na(df_all$title))

[1] 14

df_all$flag_delete[is.na(df_all$title)] <- 1
sum(is.na(df_all$abstract))

[1] 757

table(df_all$flag_delete[is.na(df_all$abstract)])


  0   1 
155 602

df_all$flag_delete[is.na(df_all$abstract)] <- 1

table(df_all$flag_delete)


    0     1 
13689  3673

discuss !!!

dim(df_all)

[1] 17362    35

df_all_clean <- df_all[df_all$flag_delete != 1, ]
dim(df_all_clean)

[1] 13689    35

sum(is.na(df_all_clean$Rayyan_decision))

[1] 4564

table(df_all_clean$Rayyan_decision)


  -1    0    1 
7723   40 1362

Save data frame

# Write the full and the cleaned data frame to the "outputs" folder as RDS
# and CSV. Paths are built with file.path() instead of calling setwd(), so
# the working directory of the session is left untouched, and the folder is
# created first if it does not exist yet.
output_dir <- "outputs"
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

saveRDS(object = df_all, file = file.path(output_dir, "df_all.rds"))
# writexl::write_xlsx(x = df_all, path = file.path(output_dir, "df_all.xlsx"))

saveRDS(object = df_all_clean, file = file.path(output_dir, "df_all_clean.rds"))
# writexl::write_xlsx(x = df_all_clean, path = file.path(output_dir, "df_all_clean.xlsx"))

write.csv(df_all, file = file.path(output_dir, "df_all.csv"), row.names = FALSE)
write.csv(
  df_all_clean,
  file = file.path(output_dir, "df_all_clean.csv"),
  row.names = FALSE
)

have fun

keyword frequencies

summary(df_all$include_proportion)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.08386 0.11765 0.12766 0.15751 1.00000

summary(df_all$exclude_proportion)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000000 0.008403 0.033333 0.039701 0.059524 0.611111

boxplot(df_all$total_words)

t.test(df_all$total_words ~ df_all$flag_delete)


    Welch Two Sample t-test

data:  df_all$total_words by df_all$flag_delete
t = -7.5011, df = 3773.5, p-value = 7.848e-14
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -51.37236 -30.08219
sample estimates:
mean in group 0 mean in group 1 
       250.6808        291.4081

t.test(df_all$include_proportion  ~ df_all$flag_delete)


    Welch Two Sample t-test

data:  df_all$include_proportion by df_all$flag_delete
t = -12.987, df = 4161, p-value < 2.2e-16
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -0.02696346 -0.01989036
sample estimates:
mean in group 0 mean in group 1 
      0.1226999       0.1461268

t.test(df_all$exclude_proportion  ~ df_all$flag_delete)


    Welch Two Sample t-test

data:  df_all$exclude_proportion by df_all$flag_delete
t = -17.738, df = 4444.4, p-value < 2.2e-16
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -0.01812654 -0.01451845
sample estimates:
mean in group 0 mean in group 1 
     0.03624772      0.05257022

df_all[df_all$include_proportion > .99,]$title

[1] "WORKING MEMORY TRAINING IMPROVES ATTENTIONAL CONTROL IN DYSPHORIA"

df_all[df_all$total_words > 5000,]$title

[1] "General Practice and the Community: Research on health service, quality improvements and training. Selected abstracts from the EGPRN Meeting in Vigo, Spain, 17-20 October 2019 Abstracts"

heuristics to clean data

set a variable which flags that the respective entry should be deleted:

# df_all$flag_delete <- 0

simple heuristics possible:

## duplicates just based on DOIs
## Share of all records that repeat a DOI already seen in another record.
## Records without a DOI are left out of the duplicate count; otherwise all
## missing DOIs would be treated as copies of one another.
doi_known <- df_all$doi[!is.na(df_all$doi)]
sum(duplicated(doi_known)) / nrow(df_all)

[1] 0.4940099

## duplicates just based on title (! perfect match)
## Share of all records that repeat a title already seen in another record.
## Records without a title are left out of the duplicate count; otherwise
## all missing titles would be treated as copies of one another.
title_known <- df_all$title[!is.na(df_all$title)]
sum(duplicated(title_known)) / nrow(df_all)

[1] 0.1039051

keep only records which contain the most information

# Number of missing (NA) fields in each record
df_all$missing_rowsise <- rowSums(is.na(df_all))

# Record count and average / spread of missing fields per source database
df_all %>%
  group_by(name_DB) %>%
  summarise(
    N = n(),
    mean_missing = mean(missing_rowsise),
    sd_missing = sd(missing_rowsise)
  )

# A tibble: 3 × 4
  name_DB         N mean_missing sd_missing
  <chr>       <int>        <dbl>      <dbl>
1 cochranelib  5193         8.29       2.13
2 psycinfo     3450         9.32       1.44
3 wos          8719         5.16       2.18

Some DOIs even occur more than five times:

names(table(df_all$doi))[table(df_all$doi) > 5]

[1] "10.1027/0269-8803/a000095"     "10.1080/09297049.2022.2138301"
[3] "10.1093/schbul/sbv010"         "10.14283/jpad.2022.97"        
[5] "10.3109/02699052.2014.892379"  "10.3109/02699052.2016.1162060"

# DOIs that occur more than five times (tabulate once, then filter)
doi_counts <- table(df_all$doi)
tmp_filter <- names(doi_counts)[doi_counts > 5]

tmp_filter[1]

[1] "10.1027/0269-8803/a000095"

DT::datatable(data = df_all[df_all$doi %in% tmp_filter[1], colnames(df_all)[colnames(df_all) != "abstract"]])

Appendix

enrich data by APIs (not applied)

in some cases good results:

search via DOI:

# Get the DOIs that occur exactly once in df_all
tmp_filter <- names(table(df_all$doi))[table(df_all$doi) == 1]

# Use the second DOI in the filtered list and fetch its record
doi <- tmp_filter[2]
row <- df_all[df_all$doi == doi & !is.na(df_all$doi), ]

# Percent-encode the DOI for use in a URL. URLencode() is called with its
# defaults here, which leave reserved characters such as "/" unchanged.
doi_encoded <- URLencode(doi)

# API URLs: DOI handle lookup and Crossref metadata for the same DOI
handle_api <- paste0("https://doi.org/api/handles/", doi_encoded)
crossref_api <- paste0("https://api.crossref.org/works/", doi_encoded)



# Fetch a URL and return the parsed response body.
#
# url: the address to request.
# Returns the parsed content when the server answers with status 200.
# Returns NULL with a warning when the request itself fails (e.g. no
# connection) or the server answers with any other status.
fetch_api <- function(url) {
  # GET() raises an error when the request cannot be completed; turn that
  # into the same warning + NULL result as a non-200 answer.
  res <- tryCatch(
    GET(url),
    error = function(e) {
      warning(
        "Failed to fetch from: ", url, " (", conditionMessage(e), ")",
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(res)) {
    return(NULL)
  }

  if (status_code(res) == 200) {
    content(res, "parsed", simplifyVector = TRUE)
  } else {
    warning(
      "Failed to fetch from: ", url, " (HTTP ", status_code(res), ")",
      call. = FALSE
    )
    NULL
  }
}

# Get results
handle_result <- fetch_api(handle_api)
crossref_result <- fetch_api(crossref_api)

# Print out some key parts of each result
print("Handle API Result:")

[1] "Handle API Result:"

print(handle_result)

$responseCode
[1] 1

$handle
[1] "10.1001/archinternmed.2009.494"

$values
   index     type data.format
1      1      URL      string
2 700050   700050      string
3    100 HS_ADMIN       admin
                                                                       data.value
1 http://archinte.jamanetwork.com/article.aspx?doi=10.1001/archinternmed.2009.494
2                                                                  20131030193117
3                                                 0.na/10.1001, 200, 111111110010
    ttl            timestamp
1 86400 2013-10-30T19:52:24Z
2 86400 2013-10-30T19:52:24Z
3 86400 2013-10-30T19:52:24Z

print("Crossref API Result:")

[1] "Crossref API Result:"

print(crossref_result$message$title)

[1] "Resistance Training and Executive Functions"

print(crossref_result$message$DOI)

[1] "10.1001/archinternmed.2009.494"

print("Entries in data base:")

[1] "Entries in data base:"

print(row$title)

[1] "Resistance Training and Executive Functions <i>A 12</i>-<i>Month Randomized Controlled Trial</i>"

print(row$doi)

[1] "10.1001/archinternmed.2009.494"

in some cases bad results:

search via DOI:

# Use the first DOI in the filtered list and fetch its record
doi <- tmp_filter[1] # !!! this DOI could not be resolved by either API (see warnings below)
row <- df_all[df_all$doi == doi & !is.na(df_all$doi), ]

# Percent-encode the DOI for use in a URL. URLencode() is called with its
# defaults here, which leave reserved characters such as "/" unchanged.
doi_encoded <- URLencode(doi)

# API URLs: DOI handle lookup and Crossref metadata for the same DOI
handle_api <- paste0("https://doi.org/api/handles/", doi_encoded)
crossref_api <- paste0("https://api.crossref.org/works/", doi_encoded)



# Fetch a URL and return the parsed response body.
#
# url: the address to request.
# Returns the parsed content when the server answers with status 200.
# Returns NULL with a warning when the request itself fails (e.g. no
# connection) or the server answers with any other status.
fetch_api <- function(url) {
  # GET() raises an error when the request cannot be completed; turn that
  # into the same warning + NULL result as a non-200 answer.
  res <- tryCatch(
    GET(url),
    error = function(e) {
      warning(
        "Failed to fetch from: ", url, " (", conditionMessage(e), ")",
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(res)) {
    return(NULL)
  }

  if (status_code(res) == 200) {
    content(res, "parsed", simplifyVector = TRUE)
  } else {
    warning(
      "Failed to fetch from: ", url, " (HTTP ", status_code(res), ")",
      call. = FALSE
    )
    NULL
  }
}

# Get results
handle_result <- fetch_api(handle_api)

Warning in fetch_api(handle_api): Failed to fetch from:
https://doi.org/api/handles/10.10002/pon.3245

crossref_result <- fetch_api(crossref_api)

Warning in fetch_api(crossref_api): Failed to fetch from:
https://api.crossref.org/works/10.10002/pon.3245

# Print out some key parts of each result
print("Handle API Result:")

[1] "Handle API Result:"

print(handle_result)

NULL

print("Crossref API Result:")

[1] "Crossref API Result:"

print(crossref_result$message$title)

NULL

print(crossref_result$message$DOI)

NULL

print("Entries in data base:")

[1] "Entries in data base:"

print(row$title)

[1] "Rehabilitation of cognitive changes in breast cancer survivors"

print(row$doi)

[1] "10.10002/pon.3245"

search via terms:

# Look up the record that belongs to the current DOI
row <- df_all[!is.na(df_all$doi) & df_all$doi == doi, ]

# Title and first listed author (authors are split on ";" or ",")
title <- row$title
author_parts <- strsplit(row$authors, ";|,")[[1]]
first_author <- author_parts[1]

# Build the Crossref search URL from the encoded title and author,
# limited to five results
query_title <- URLencode(title, reserved = TRUE)
query_author <- URLencode(first_author, reserved = TRUE)
search_api <- paste0(
  "https://api.crossref.org/works?",
  "query.title=", query_title,
  "&query.author=", query_author,
  "&rows=5"
)

# Fetch a URL and return the parsed response body (used here for the
# Crossref search request).
#
# url: the address to request.
# Returns the parsed content when the server answers with status 200.
# Returns NULL with a warning when the request itself fails (e.g. no
# connection) or the server answers with any other status.
fetch_api <- function(url) {
  # GET() raises an error when the request cannot be completed; turn that
  # into the same warning + NULL result as a non-200 answer.
  res <- tryCatch(
    GET(url),
    error = function(e) {
      warning(
        "Failed to fetch from: ", url, " (", conditionMessage(e), ")",
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(res)) {
    return(NULL)
  }

  if (status_code(res) == 200) {
    content(res, "parsed", simplifyVector = TRUE)
  } else {
    warning(
      "Failed to fetch from: ", url, " (HTTP ", status_code(res), ")",
      call. = FALSE
    )
    NULL
  }
}

# Get and view result
search_result <- fetch_api(search_api)

Warning in fetch_api(search_api): Failed to fetch from:
https://api.crossref.org/works?query.title=Rehabilitation%20of%20cognitive%20changes%20in%20breast%20cancer%20survivors&query.author=Haynes&rows=5

# Compare the top search hit with the entry in df_all.
# Nothing is printed when the search request failed (search_result is NULL).
if (!is.null(search_result)) {
  cat("Search API Titles:\n")
  # First title, first author's given name and DOI among the returned items
  print(unlist(search_result$message$items$title)[1])
  print(search_result$message$items$author[[1]]$given[1])
  print(search_result$message$items$DOI[1])
  # The values that were used to build the query
  cat("vs. real entry:")
  print(title)
  print(first_author)
  print(doi)
}