imaging data pulled: 2021-10-12
clinical data pulled: 2020-11-16
code written: 2021-11-05
last ran: 2021-11-17
Description. This script pulls together all demographic, clinical, cognition, and LC NM-MRI variables of interest, for the n=48 participants included in our study. We perform basic data cleaning, e.g., checking for missing values, etc. The cognition and LC NM-MRI variables are corrected for age and sex (_cor
). The dataset created is used in several subsequent analyses (note: the reaction time cognition variables on the DKEFS are further corrected for normality (_normcor
), in a subsequent script, 03_dataCorrection
.
Load libraries and data
#clear environment
rm(list = ls())
#list required libraries
packages <- c(
'tidyverse',
'naniar', #missing values
'kableExtra', #pretty tables
'vtable', #summary tables
'reshape2' #wrangling
)
#load required libraries
lapply(packages, require, character.only = TRUE)
#read in LC NM-MRI data (correctly registered, from Ben, October)
lc <- read.csv('../clinical/registered/LC_stats_Oct12.csv')
#read in redcap data (demographic, clinical)
redcap <- read.csv('../clinical/SEN01TheSenDepStudyD_DATA_2020-12-08_1052.csv', stringsAsFactors = F)
#clean up redcap data (remove adverse repeating form rows)
redcap <- redcap[!(redcap$redcap_repeat_instrument == 'participant_adverse_event_log'),]
#adjust redcap IDs, so match participant list
redcap$id <- substr(redcap$id, 11, 16)
#read in participant list (n=48)
participants <- read.csv('../clinical/participantList.csv', header=FALSE)
Munge, clean demographic data of interest
#pull out demographic variables of interest
demo <- redcap[redcap$redcap_event_name == 'screening_arm_1',
c('id',
'diagnosis',
'demo_age',
'demo_sex',
'demo_handedness',
'demo_yearseducation')]
#keep data only from participants include in present analysis (n=48)
demo <- demo[demo$id %in% participants$V1,]
#recode sex
demo$demo_sex <- factor(demo$demo_sex, levels=c(1, 2), labels= c('male', 'female'))
#recode diagnosis
demo$diagnosis <- factor(demo$diagnosis, levels=c(1, 2), labels= c('LLD', 'HC'))
#recode handedness
demo$demo_handedness <- factor(demo$demo_handedness, levels=c(1, 2, 3, 4), labels= c('L', 'R', 'M', 'FR'))
#rename select demographic variables, for nice tables
demo <- demo %>% rename(
'Sex'=demo_sex,
'Diagnosis'=diagnosis,
'Age'=demo_age,
'Education'=demo_yearseducation,
'Handedness'=demo_handedness)
Munge, clean clinical data
#make a copy of redcap data for clinical data munging
clinical <- redcap
#pull out phq9 vars
phq9 <- clinical[clinical$redcap_event_name =='screening_arm_1', c(
"id",
"phq9_1_interest",
"phq9_2_depressed",
"phq9_3_sleep",
"phq9_4_energy",
"phq9_5_appetite",
"phq9_6_guilt",
"phq9_7_concentrate",
"phq9_8_motor",
"phq9_9_selfharm",
"phq9_diff",
"phq9_total")]
#for phq9, replace values of 888 (incomplete) with NA
phq9[phq9==888] <- NA
#pull out madrs vars
madrs <- clinical[clinical$redcap_event_name =='t0_arm_1', c(
"id",
"madrs_apparant_sadness",
"madrs_reported_sadness",
"madrs_inner_tension",
"madrs_reduced_sleep",
"madrs_reduced_appetite",
"madrs_concentration_diff",
"madrs_lassitude",
"madrs_inability_to_feel",
"madrs_pessimistic",
"madrs_suicidalthought",
"madrs_total_calc")]
#pull out health-related vars, from baseline timepoint
health_t0 <- clinical[clinical$redcap_event_name =='t0_arm_1', c(
'id',
"cirsg_total", #general health
"wrat4_wr_total", #reading
'whodasv2_total', #whodas
'ucla3_total', #lonliness
'fas_total', #fatigue
'frail_total' #frailty
)]
#replace values of 999 (incomplete) with NA
health_t0[health_t0==999] <- NA
#add the string 'health' to all health variables
names(health_t0)[-1] <- paste("health_", names(health_t0[,-1]), sep = "")
#pull out health-related vars, from screening timepoint
health_screening <- clinical[clinical$redcap_event_name =='screening_arm_1', c("id","moca_total")]
#rename health from screening timepoint
names(health_screening) <- c('id', 'health_moca_total')
#pull out pss scale (perceived stress)
pss <- clinical[clinical$redcap_event_name =='t0_arm_1', grep('^id$|^pss_[0-9]|pss_total', names(clinical))]
#pull of gad7 scale (generalized anxiety
gad7 <- clinical[clinical$redcap_event_name =='t0_arm_1', grep('^id$|^gad7_[0-9]|gad7_total', names(clinical))]
#merge all clinical dfs
clinical <- list(phq9, madrs, health_t0, health_screening, pss, gad7) %>%
reduce(full_join, by='id')
#make sure limited to participants included in present analysis
clinical <- clinical[clinical$id %in% participants$V1,]
#check if missing values
sum(is.na(clinical)) #12
#identify variables for which missing values exist
names(which(colSums(is.na(clinical)) > 0)) #health_cirgs_total -- not a concern
#make sure no 'missing value'-type values (coded) exist in data
naniar::common_na_numbers %in% clinical #none
naniar::common_na_strings %in% clinical #none
#make sure all variables numeric
clinical[-1] <- sapply(clinical[-1], as.numeric)
#merge the clinical and demographic datasets
df <- merge(demo, clinical, by='id', all.x=TRUE)
#for clarity, remove clinical dfs no longer needed
rm(phq9, madrs, health_t0, health_screening, pss, gad7)
#for clarity, remove the redcap and participants dfs
rm(participants, demo, clinical)
Munge cognitive data
#make a copy of redcap data for clinical data munging
cognitive <- redcap
#pull out rbans
rbans <- cognitive[cognitive$redcap_event_name =='t0_arm_1', c(
"id",
"rbans_immmemory_index" ,
"rbans_visuo_index",
"rbans_language_index",
"rbans_attention_index",
"rbans_delmem_index",
"rbans_totalscale_tscore"
)]
#pull out dkefs
dkefs <- cognitive[cognitive$redcap_event_name =='t0_arm_1', c(
"id",
'dkefs_trails4_time', #trails
'dkefs_trails5_time',
'dkefs_cwi_1_time', #colour
'dkefs_cwi_2_time',
'dkefs_cwi_3_time',
'dkefs_cwi_4_time',
'dkefs_vf_lftotalcorrect', #fluency
'dkefs_vf_cftotalcorrect',
'dkefs_vf_csswitchtotcor'
)]
#replace impossible dkefs values with NA
dkefs <- dkefs %>% naniar::replace_with_na_all(condition = ~.x %in% c(0, 995, 999))
#also, turn time values under 10 to NA - highly unlikely to be real
dkefs <- dkefs %>% naniar::replace_with_na_at(.vars = grep('time', names(dkefs)), condition = ~.x <= 10)
#pull out everyday cognition
ecog <- cognitive[cognitive$redcap_event_name =='t0_arm_1', c(
"id",
"ecog12_1_obj",
"ecog12_2_day",
"ecog12_3_communic",
"ecog12_4_understanddir",
"ecog12_5_readmap",
"ecog12_6_findwayhome",
"ecog12_7_weatherplan",
"ecog12_8_thinkahead",
"ecog12_9_wrkspaceorg",
"ecog12_10_chqbal",
"ecog12_11_twothings",
"ecog12_12_cookwork")]
#merge all cognitive dfs
cognitive <- list(rbans, dkefs, ecog) %>%
reduce(full_join, by='id')
#make sure no 'missing value'-type values exist
naniar::common_na_numbers %in% cognitive #none
naniar::common_na_strings %in% cognitive #none
#make sure all variables numeric
cognitive[-1] <- sapply(cognitive[-1], as.numeric)
#merge the cognitive and demographic / clinical datasets
df <- merge(df, cognitive, by='id', all.x=TRUE)
#for clarity, remove cognitive dfs no longer needed
rm(rbans,dkefs,ecog)
Correct cognition variables (RBANS, DKEFS) for age and sex
(Note: this correction introduces negative values into our variables, including RT)
#identify all cognition variables
vars_cog <- names(cognitive)[grep('rbans|dkefs', names(cognitive))]
#for ease, make a separate df of cognition, and age and sex
cognition <- merge(df[,c('id', 'Age', 'Sex')], cognitive[c('id', vars_cog)])
#create function to correct for age and sex
adjustVars_fn <- function(x, data){
fit <- lm(x ~ Age + Sex, na.action = na.exclude, data)
adjusted <- resid(fit) + coef(fit)[1] #residual intercept
return(adjusted)
}
#apply function to all variables, put in new df for review
cognition_adjusted <- data.frame(cognition$id, sapply(cognition[vars_cog], function(x) adjustVars_fn(x, data=cognition)))
#adjust names in adjusted df, so clear have been corrected
names(cognition_adjusted)[-1] <- paste(names(cognition_adjusted[,-1]), '_cor', sep = "")
#add the adjusted variables back to our cognitive df
df <- merge(df, cognition_adjusted, by.x='id', by.y='cognition.id')
#for clarity, remove the unneeded dfs
rm(cognitive, cognition, cognition_adjusted)
Munge LC NM-MRI data
#from the LC NM-MRI data, keep only avg_max values
lc <- lc[, grep('record_id|avg_max', names(lc))]
#keep only data from participants in our study
lc <- lc[lc$record_id %in% df$id,]
#check for missing values
sum(is.na(lc)) #none
#add an average avg_max value across all 6 LC segment variables
lc$avg_max <- rowMeans(lc[ , c(2:ncol(lc))], na.rm=TRUE)
#add back to the dataframe
df <- merge(df, lc, by.x='id', by.y='record_id')
Correct LC for age and sex
#add age and sex to the same df
lc <- merge(df[,c('id', 'Age', 'Sex')], lc, by.x='id', by.y='record_id')
#apply our function to all LC NM-MRI variables, put in new df for review
lc_adjusted <- data.frame(lc$id, sapply(lc[,grep('avg_max', names(lc))], function(x) adjustVars_fn(x, data=lc)))
#adjust names in adjusted df, so clear have been corrected
names(lc_adjusted)[-1] <- paste(names(lc_adjusted[,-1]), '_cor', sep = "")
#add the adjusted variables back to df
df <- merge(df, lc_adjusted, by.x='id', by.y='lc.id')
#for clarity, remove the unneeded dfs
rm(lc, lc_adjusted)
Summary table
The following table summarises mean(df) values for variables of interest, by diagnostic group. Cognition and LC variables have been corrected for age and sex.
#organize data we want in table
vars <- c(
#demographics
"Sex",
"Age",
"health_wrat4_wr_total",
"phq9_total",
"madrs_total_calc",
"health_moca_total", #mci
"health_cirsg_total", #health
"health_whodasv2_total", #disability
"health_ucla3_total", #loneliness
"health_fas_total", #fatigue
"health_frail_total", #frailty
"gad7_total", #generalized anxiety
"pss_total", #perceived stress
"rbans_immmemory_index_cor",
"rbans_visuo_index_cor",
"rbans_language_index_cor",
"rbans_attention_index_cor",
"rbans_delmem_index_cor",
"dkefs_trails5_time_cor",
"dkefs_trails4_time_cor",
"dkefs_cwi_1_time_cor",
"dkefs_cwi_2_time_cor",
"dkefs_cwi_3_time_cor",
"dkefs_cwi_4_time_cor",
"dkefs_vf_lftotalcorrect_cor",
"dkefs_vf_cftotalcorrect_cor",
"dkefs_vf_csswitchtotcor_cor",
"avg_max_seg_1_cor",
"avg_max_seg_2_cor",
"avg_max_seg_3_cor",
"avg_max_seg_4_cor",
"avg_max_seg_5_cor",
"avg_max_seg_6_cor"
)
#initialize dataframe and name columns and rows
tbl <- data.frame(matrix(ncol=3, nrow=length(vars)))
names(tbl) <- c( paste('HC, n=', sum(df$Diagnosis == 'HC'), sep=''), paste('LLD, n=', sum(df$Diagnosis == 'LLD'), sep=''), '_p_')
#set rownames
row.names(tbl) <- c(
"Sex (F:M)",
"Age (years)",
"Wide Range Achievement Test (WRAT)",
"Patient Health Questionnaire (PHQ-9)",
"Montgomery-Asberg Depression Rating Scale (MADRS)",
"Montreal Cognitive Assessment (MoCA)",
"Cumulative Illness Rating Scale-Geriatric (CIRS-G)",
"WHO Disability Assessment (WHODAS)",
"UCLA Loneliness Scale",
"Fatigue Assessment Scale (FAS)",
"Frail Scale",
"Generalized Anxiety Disorder Scale (GAD-7) ",
"Perceived Stress Scale (PSS)",
#Repeatable Battery for the Assessment of Neuropsychological Status (RBANS)
"Immediate Memory",
"Visuospatial/Constructional",
"Language",
"Attention",
"Delayed Memory",
#Delis-Kaplan Executive Function System (DKEFS)
"Trail Making Test - Numbers",
"Trail Making Test - Number-Letter Switching",
"Color-Word Interference Test - 1 : Colour Naming",
"Color-Word Interference Test - 2: Word Reading",
"Color-Word Interference Test - 3: Inhibition",
"Color-Word Interference Test - 4: Inhibition/Switching",
'Verbal fluency: letters',
'Verbal fluency: categories',
'Verbal fluency: switching',
#LC contrast values (age and sex corrected)
"Segment 1 (rostral left)",
"Segment 2 (middle left)",
"Segment 3 (caudal left)",
"Segment 4 (rostral right)",
"Segment 5 (middle right)",
"Segment 6 (caudal right)"
)
#make sure group stays a factor...
group <- factor(x = c('HC', 'LLD'), levels = c('HC', 'LLD'))
#initialize counters (j = row, k = column)
j <- 1
k <- 1
#for loop for categorical VARS
for (var in vars[1:1]) {
# count observations for each cluster
for (group in group) {
out <- table(df$Diagnosis, df[[var]])
tbl[j,k] <- paste(out[group,], collapse = ':')
k <- k + 1}
# run chi-square tests
chi_sq <- chisq.test(out)
tbl[j,k] <- sub("^(-?)0.", "\\1.", sprintf("%.3f", chi_sq$p.value))
k <- 2
j <- j + 1}
# initialize counters (j = row, k = column)
j <- 2
k <- 1
#for loop for continuous variables
for (var in vars[2:length(vars)]) {
#make sure stays factor
group <- factor(x = c('HC', 'LLD'), levels = c('HC', 'LLD'))
# calculate means and SDs for each scanner
for (group in group) {
M <- sprintf('%.02f', mean(df[df$Diagnosis == group, var], na.rm = TRUE))
SD <- sprintf('%.02f', sd(df[df$Diagnosis == group, var], na.rm = TRUE))
tbl[j,k] <- paste( M,' (',SD,')', sep='')
k <- k + 1
}
# run one-way ANOVA with diagnosis as between-subjects variable
t_test <- t.test(df[[var]] ~ df$Diagnosis, na.action=na.omit)
t_test.p <- t_test$p.value
t_test.p <- sub("^(-?)0.", "\\1.", sprintf("%.3f", t_test.p))
tbl[j,k] <- t_test.p
#move counter
k <- 1
j <- j + 1
}
#pipe into pretty table
tbl %>%
kable(align=c('l', 'c', 'c', 'c'), digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
pack_rows(index = c("Demographics" = 3,
"Clinical" = 10,
"Repeatable Battery for the Assessment of Neuropsychological Status (RBANS)" = 5,
"Delis-Kaplan Executive Function System (DKEFS)" = 9,
"LC Signal Intensity" = 6))
HC, n=23 | LLD, n=25 | p | |
---|---|---|---|
Demographics | |||
Sex (F:M) | 11:12 | 7:18 | .263 |
Age (years) | 70.00 (8.02) | 68.08 (5.41) | .341 |
Wide Range Achievement Test (WRAT) | 64.52 (3.42) | 62.52 (5.09) | .115 |
Clinical | |||
Patient Health Questionnaire (PHQ-9) | 0.78 (1.28) | 12.28 (4.31) | .000 |
Montgomery-Asberg Depression Rating Scale (MADRS) | 1.00 (1.41) | 16.60 (5.69) | .000 |
Montreal Cognitive Assessment (MoCA) | 26.22 (2.04) | 25.72 (2.65) | .469 |
Cumulative Illness Rating Scale-Geriatric (CIRS-G) | 6.43 (4.48) | 10.29 (4.33) | .004 |
WHO Disability Assessment (WHODAS) | 5.98 (7.68) | 27.25 (12.87) | .000 |
UCLA Loneliness Scale | 6.39 (6.17) | 34.16 (14.34) | .000 |
Fatigue Assessment Scale (FAS) | 15.57 (4.50) | 31.28 (6.55) | .000 |
Frail Scale | 0.26 (0.54) | 0.84 (0.80) | .005 |
Generalized Anxiety Disorder Scale (GAD-7) | 1.35 (1.67) | 9.56 (5.09) | .000 |
Perceived Stress Scale (PSS) | 7.83 (4.83) | 22.64 (6.94) | .000 |
Repeatable Battery for the Assessment of Neuropsychological Status (RBANS) | |||
Immediate Memory | 93.04 (11.77) | 94.56 (13.72) | .681 |
Visuospatial/Constructional | 96.61 (15.75) | 92.81 (17.29) | .431 |
Language | 85.03 (12.04) | 85.42 (9.44) | .901 |
Attention | 97.12 (16.01) | 90.49 (16.77) | .168 |
Delayed Memory | 124.17 (14.32) | 126.69 (11.12) | .502 |
Delis-Kaplan Executive Function System (DKEFS) | |||
Trail Making Test - Numbers | 23.41 (27.18) | 25.37 (15.04) | .762 |
Trail Making Test - Number-Letter Switching | -76.54 (44.37) | -59.65 (53.98) | .241 |
Color-Word Interference Test - 1 : Colour Naming | 38.72 (6.84) | 38.71 (10.41) | .996 |
Color-Word Interference Test - 2: Word Reading | 23.18 (5.27) | 22.24 (6.53) | .585 |
Color-Word Interference Test - 3: Inhibition | 35.57 (16.57) | 42.46 (23.09) | .239 |
Color-Word Interference Test - 4: Inhibition/Switching | 43.65 (13.22) | 46.96 (20.72) | .510 |
Verbal fluency: letters | 43.98 (10.93) | 41.83 (12.26) | .524 |
Verbal fluency: categories | 51.82 (8.87) | 52.75 (8.36) | .710 |
Verbal fluency: switching | 21.05 (3.16) | 21.01 (2.98) | .961 |
LC Signal Intensity | |||
Segment 1 (rostral left) | 33.30 (3.48) | 35.03 (3.05) | .074 |
Segment 2 (middle left) | 41.74 (4.77) | 42.00 (6.17) | .869 |
Segment 3 (caudal left) | 24.51 (4.18) | 25.98 (4.87) | .267 |
Segment 4 (rostral right) | 25.92 (3.22) | 27.36 (2.72) | .103 |
Segment 5 (middle right) | 35.62 (3.87) | 35.04 (5.11) | .662 |
Segment 6 (caudal right) | 18.11 (5.17) | 16.66 (4.20) | .295 |
Write out data
write.csv(df, paste0('../clinical/df_', Sys.Date(), '.csv', sep=''), row.names = F)
write.csv(tbl, paste0('../paperTables/tbl_demo_', Sys.Date(), '.csv', sep=''), row.names = F)