code written: 2020-06-29
last ran: 2020-06-30
code: /projects/ncalarco/thesis/PNS/analyses/00_participantOverlap.Rmd
website: https://rpubs.com/navona/sharedParticipants

Description. This script checks the overlap of shared participants between the SPINS and PNS studies, to ensure that we only include data once per participant. Note: this summary includes participants scanned on both the ‘CMH’ (GE) and ‘CMP’ (Siemens) scanners.

# Workspace is deliberately not cleared here. rm(list = ls()) deletes the
# user's objects when this chunk is run interactively, yet leaves loaded
# packages and options untouched, so it does not give a clean session. Every
# object used below is created by this script; restart R (or knit the
# document) for a genuinely fresh environment.

#libraries
library(magrittr)
library(dplyr)
library(stringr)

# 'ls' listing of the SPINS data
# (from /projects/ncalarco/thesis/SPINS/Slicer/data/01_dmriprep).
# The file has no header row, so the entries are read into column V1.
# n=445 (before data cleaning)
SPINS <- read.csv(
  file = "../outputs/participantsSPINS_2020-06-29.txt",
  header = FALSE
)

# 'ls' listing of the PNS data, also without a header row.
# Taken from archive, as M missing newer data in BIDS.
PNS <- read.csv(
  file = "../outputs/archiveParticipantsPNS_2020-06-29.txt",
  header = FALSE
)

# PNS REDCap export, which explicitly indicates overlapping IDs.
# Empty strings and "NA" are both read as missing; text columns stay character.
df_PNS <- read.csv(
  file = "../clinical/PNS_DATA_2020-06-29_1355.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)

# SPINS REDCap export, read with the same missing-value and string settings.
df_SPINS <- read.csv(
  file = "/projects/ncalarco/thesis/SPINS/clin_data/SPINS_complete_2020-06-29.csv",
  header = TRUE,
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)

# Pull the MRI summary field and the SPINS<->PNS ID links out of the REDCap
# exports. Event rows are selected with %in% rather than ==, because == returns
# NA for a missing redcap_event_name and indexing by a logical NA would add an
# all-NA row to the result.

# take PNS MRI vars (rows whose event is 'month9__study_comp_arm_1')
is_month9 <- df_PNS$redcap_event_name %in% 'month9__study_comp_arm_1'
mri_pns <- df_PNS[is_month9, c('record_id', 'subsum_mri')]

# take SPINS MRI vars (all rows of the SPINS export)
mri_spins <- df_SPINS[, c('record_id', 'subsum_mri')]

# get linked IDs (rows whose event is 'baseline__screenin_arm_1')
is_baseline <- df_PNS$redcap_event_name %in% 'baseline__screenin_arm_1'
linked <- df_PNS[is_baseline, c('record_spins_id', 'record_id')]

# fix typo: this one SPINS ID is recorded with CMH but should read CMP;
# missing IDs are left as NA
is_typo <- linked$record_spins_id %in% 'SPN01_CMH_0211'
linked$record_spins_id[is_typo] <- 'SPN01_CMP_0211'

# change names for clarity
names(linked) <- c('id_SPINS', 'id_PNS')

# Turn each study's MRI summary code into a label, then drop the raw code:
# a code of 1 becomes 'yes', any other non-missing code becomes 'no', and a
# missing code stays NA.

# PNS
mri_pns[['completeMRI_PNS']] <- ifelse(mri_pns[['subsum_mri']] == 1, 'yes', 'no')
mri_pns[['subsum_mri']] <- NULL

# SPINS
mri_spins[['completeMRI_SPINS']] <- ifelse(mri_spins[['subsum_mri']] == 1, 'yes', 'no')
mri_spins[['subsum_mri']] <- NULL

# merge together the dataframes from different timepoints: attach each study's
# MRI-completion label to the linked-ID table. Both merges keep every row of
# the left-hand table (all.x = TRUE), so a linked pair without a matching
# REDCap record is retained with NA. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
df <- merge(linked, mri_spins, by.x = 'id_SPINS', by.y = 'record_id', all.x = TRUE)
df <- merge(df, mri_pns, by.x = 'id_PNS', by.y = 'record_id', all.x = TRUE)

# get SPINS REDCap/study ID and BIDS in same format: characters 5-7 and 8-11
# of each listing entry are re-joined as SPN01_<5-7>_<8-11>
SPINS$V1 <- paste0('SPN01_', substr(SPINS$V1, 5, 7), '_', substr(SPINS$V1, 8, 11))

# get PNS REDCap/study ID and BIDS in same format: drop the last three
# characters of each listing entry
PNS$V1 <- str_sub(PNS$V1, end = -4)

# make additional variables for comparison: every ID present in a listing is
# flagged 'yes' for that archive
PNS$archive_PNS <- 'yes'
SPINS$archive_SPINS <- 'yes'

# keep only SPINS IDs containing "CM"; note this retains both the CMH and the
# CMP scanner codes, not CMH alone
SPINS <- filter(SPINS, grepl("CM", V1))

# Add the archive flags with full outer joins (rows from both sides are kept),
# so the table shows IDs missing from an archive as well as archive IDs that
# have no linked REDCap entry.
df <- merge(df, SPINS, by.x = 'id_SPINS', by.y = 'V1', all.x = TRUE, all.y = TRUE)
df <- merge(df, PNS, by.x = 'id_PNS', by.y = 'V1', all.x = TRUE, all.y = TRUE)

# swap the first two columns for readability; the remaining columns keep
# their order
df <- df[, c(2, 1, seq(3, ncol(df)))]

# an ID absent from a listing has NA in that archive flag after the join;
# state explicitly that the data doesn't exist in the archive
df$archive_SPINS[is.na(df$archive_SPINS)] <- 'no'
df$archive_PNS[is.na(df$archive_PNS)] <- 'no'

# where the MRI-completion label is still missing, fall back on the archive
# flag: not in archive -> 'no', in archive -> 'yes'
spins_unknown <- is.na(df$completeMRI_SPINS)
df$completeMRI_SPINS[spins_unknown & df$archive_SPINS == 'no'] <- 'no'
df$completeMRI_SPINS[spins_unknown & df$archive_SPINS == 'yes'] <- 'yes'

pns_unknown <- is.na(df$completeMRI_PNS)
df$completeMRI_PNS[pns_unknown & df$archive_PNS == 'no'] <- 'no'
df$completeMRI_PNS[pns_unknown & df$archive_PNS == 'yes'] <- 'yes'

# order rows by SPINS ID and discard the old row names so they are renumbered
df <- df[order(df$id_SPINS), ]
row.names(df) <- NULL

# write out df; the dated path is built once so that the file encoded below is
# always the file that was just written (a hard-coded date would read a stale
# or missing file whenever the script runs on another day)
overlap_csv <- paste0('../outputs/participantOverlap_', Sys.Date(), '.csv')
write.csv(df, overlap_csv, row.names = FALSE)

# allow for embedding: join the CSV lines with newlines and base64-encode the
# text (presumably consumed by a download link in the rendered page; confirm)
encoded <- readLines(overlap_csv) %>%
  paste0(collapse = "\n") %>%
  openssl::base64_encode()

Participants that ‘should’ have data in archive (scan indicated as complete in REDCap; I have not checked XNAT or MR server), but don’t:

SPINS: NA

PNS: total: 10 | PNS_CMH_0001, PNS_CMH_0002, PNS_CMH_0008, PNS_CMH_0009, PNS_CMH_0015, PNS_CMH_0016, PNS_CMH_0017, PNS_CMH_0018, PNS_CMH_0022, PNS_CMH_0103

Participants completing MRI in both studies (n=59) that have MR data only in one archive location:

SPINS: total: 10 | SPN01_CMH_0059, SPN01_CMH_0076, SPN01_CMH_0079, SPN01_CMH_0087, SPN01_CMH_0094, SPN01_CMH_0095, SPN01_CMH_0096, SPN01_CMH_0097, SPN01_CMH_0100, SPN01_CMH_0180

Participants participating in both studies that have data in both archive locations (SPINS and PNS):
total: 46

Download CSV

Participant data check: Overlap between SPINS and PNS datasets