01_dataCleaning.utf8

imaging data pulled: 2021-10-12
clinical data pulled: 2020-11-16
code written: 2021-11-05
last ran: 2021-11-17

Description. This script pulls together all demographic, clinical, cognition, and LC NM-MRI variables of interest, for the n=48 participants included in our study. We perform basic data cleaning, e.g., checking for missing values, etc. The cognition and LC NM-MRI variables are corrected for age and sex (_cor). The dataset created is used in several subsequent analyses (note: the reaction time cognition variables on the DKEFS are further corrected for normality (_normcor), in a subsequent script, 03_dataCorrection.

Load libraries and data

#clear environment
rm(list = ls())

#list required libraries
packages <- c(
  'tidyverse',
  'naniar', #missing values
  'kableExtra', #pretty tables
  'vtable', #summary tables
  'reshape2' #wrangling
)

#load required libraries
lapply(packages, require, character.only = TRUE)

#read in LC NM-MRI data (correctly registered, from Ben, October) 
lc <- read.csv('../clinical/registered/LC_stats_Oct12.csv')

#read in redcap data (demographic, clinical)
redcap <- read.csv('../clinical/SEN01TheSenDepStudyD_DATA_2020-12-08_1052.csv', stringsAsFactors = F)

#clean up redcap data (remove adverse repeating form rows)
redcap <- redcap[!(redcap$redcap_repeat_instrument == 'participant_adverse_event_log'),]

#adjust redcap IDs, so match participant list
redcap$id <- substr(redcap$id, 11, 16)

#read in participant list (n=48)
participants <- read.csv('../clinical/participantList.csv', header=FALSE)

Demographics

Munge, clean demographic data of interest

#pull out demographic variables of interest
demo <- redcap[redcap$redcap_event_name == 'screening_arm_1', 
  c('id', 
    'diagnosis', 
    'demo_age', 
    'demo_sex', 
    'demo_handedness', 
    'demo_yearseducation')]

#keep data only from participants include in present analysis (n=48)
demo <- demo[demo$id %in% participants$V1,]

#recode sex 
demo$demo_sex <- factor(demo$demo_sex, levels=c(1, 2), labels= c('male', 'female'))

#recode diagnosis 
demo$diagnosis <- factor(demo$diagnosis, levels=c(1, 2), labels= c('LLD', 'HC'))

#recode handedness
demo$demo_handedness <- factor(demo$demo_handedness, levels=c(1, 2, 3, 4), labels= c('L', 'R', 'M', 'FR'))

#rename select demographic variables, for nice tables
demo <- demo %>% rename(
    'Sex'=demo_sex,
    'Diagnosis'=diagnosis,
    'Age'=demo_age,
    'Education'=demo_yearseducation,
    'Handedness'=demo_handedness)

Clinical

Munge, clean clinical data

#make a copy of redcap data for clinical data munging
clinical <- redcap

#pull out phq9 vars
phq9 <- clinical[clinical$redcap_event_name =='screening_arm_1', c(
  "id",
  "phq9_1_interest",                            
  "phq9_2_depressed",                          
  "phq9_3_sleep",                               
  "phq9_4_energy",                             
  "phq9_5_appetite",                            
  "phq9_6_guilt",                              
  "phq9_7_concentrate",                         
  "phq9_8_motor",                              
  "phq9_9_selfharm",                            
  "phq9_diff",                          
  "phq9_total")]

#for phq9, replace values of 888 (incomplete) with NA
phq9[phq9==888] <- NA
  
#pull out madrs vars
madrs <- clinical[clinical$redcap_event_name =='t0_arm_1', c(
  "id",
  "madrs_apparant_sadness",                                
  "madrs_reported_sadness",                                 
  "madrs_inner_tension",                                    
  "madrs_reduced_sleep",                                    
  "madrs_reduced_appetite",                                 
  "madrs_concentration_diff",                               
  "madrs_lassitude",                                        
  "madrs_inability_to_feel",                                
  "madrs_pessimistic",                                      
  "madrs_suicidalthought",                                  
  "madrs_total_calc")]  

#pull out health-related vars, from baseline timepoint
health_t0 <- clinical[clinical$redcap_event_name =='t0_arm_1', c(
  'id',
  "cirsg_total", #general health
  "wrat4_wr_total", #reading
  'whodasv2_total', #whodas
  'ucla3_total', #lonliness
  'fas_total', #fatigue
  'frail_total' #frailty
)]

#replace values of 999 (incomplete) with NA
health_t0[health_t0==999] <- NA

#add the string 'health' to all health variables
names(health_t0)[-1] <- paste("health_", names(health_t0[,-1]), sep = "")

#pull out health-related vars, from screening timepoint
health_screening <- clinical[clinical$redcap_event_name =='screening_arm_1', c("id","moca_total")]

#rename health from screening timepoint
names(health_screening) <- c('id', 'health_moca_total')

#pull out pss scale (perceived stress)
pss <- clinical[clinical$redcap_event_name =='t0_arm_1', grep('^id$|^pss_[0-9]|pss_total', names(clinical))]

#pull of gad7 scale (generalized anxiety
gad7 <- clinical[clinical$redcap_event_name =='t0_arm_1', grep('^id$|^gad7_[0-9]|gad7_total', names(clinical))]

#merge all clinical dfs
clinical <- list(phq9, madrs, health_t0, health_screening, pss, gad7) %>%
  reduce(full_join, by='id')

#make sure limited to participants included in present analysis
clinical <- clinical[clinical$id %in% participants$V1,]

#check if missing values
sum(is.na(clinical)) #12

#identify variables for which missing values exist
names(which(colSums(is.na(clinical)) > 0)) #health_cirgs_total -- not a concern

#make sure no 'missing value'-type values (coded) exist in data
naniar::common_na_numbers %in% clinical #none
naniar::common_na_strings %in% clinical #none

#make sure all variables numeric
clinical[-1] <- sapply(clinical[-1], as.numeric)

#merge the clinical and demographic datasets
df <- merge(demo, clinical, by='id', all.x=TRUE)

#for clarity, remove clinical dfs no longer needed 
rm(phq9, madrs, health_t0, health_screening, pss, gad7)

#for clarity, remove the redcap and participants dfs
rm(participants, demo, clinical)

Cognitive

Munge cognitive data

#make a copy of redcap data for clinical data munging
cognitive <- redcap

#pull out rbans  
rbans <- cognitive[cognitive$redcap_event_name =='t0_arm_1', c(
  "id",
  "rbans_immmemory_index" ,  
  "rbans_visuo_index",      
  "rbans_language_index",    
  "rbans_attention_index",  
  "rbans_delmem_index",      
  "rbans_totalscale_tscore"
)]

#pull out dkefs 
dkefs <- cognitive[cognitive$redcap_event_name =='t0_arm_1', c(
  "id",
  'dkefs_trails4_time', #trails
  'dkefs_trails5_time',
  'dkefs_cwi_1_time', #colour
  'dkefs_cwi_2_time',
  'dkefs_cwi_3_time',
  'dkefs_cwi_4_time',
  'dkefs_vf_lftotalcorrect', #fluency 
  'dkefs_vf_cftotalcorrect', 
  'dkefs_vf_csswitchtotcor' 
  )]

#replace impossible dkefs values with NA
dkefs <- dkefs %>% naniar::replace_with_na_all(condition = ~.x %in% c(0, 995, 999))

#also, turn time values under 10 to NA - highly unlikely to be real
dkefs <- dkefs %>% naniar::replace_with_na_at(.vars = grep('time', names(dkefs)), condition = ~.x <= 10)

#pull out everyday cognition
ecog <- cognitive[cognitive$redcap_event_name =='t0_arm_1', c(
  "id",
  "ecog12_1_obj",                          
  "ecog12_2_day",                            
  "ecog12_3_communic",                      
  "ecog12_4_understanddir",              
  "ecog12_5_readmap",                       
  "ecog12_6_findwayhome",                    
  "ecog12_7_weatherplan",                   
  "ecog12_8_thinkahead",                     
  "ecog12_9_wrkspaceorg",                   
  "ecog12_10_chqbal",                        
  "ecog12_11_twothings",                    
  "ecog12_12_cookwork")]  

#merge all cognitive dfs
cognitive <- list(rbans, dkefs, ecog) %>%
  reduce(full_join, by='id')

#make sure no 'missing value'-type values exist
naniar::common_na_numbers %in% cognitive #none
naniar::common_na_strings %in% cognitive #none

#make sure all variables numeric
cognitive[-1] <- sapply(cognitive[-1], as.numeric)

#merge the cognitive and demographic / clinical datasets
df <- merge(df, cognitive, by='id', all.x=TRUE)

#for clarity, remove cognitive dfs no longer needed 
rm(rbans,dkefs,ecog)

Correct cognition variables (RBANS, DKEFS) for age and sex
(Note: this correction introduces negative values into our variables, including RT)

#identify all cognition variables
vars_cog <- names(cognitive)[grep('rbans|dkefs', names(cognitive))]

#for ease, make a separate df of cognition, and age and sex
cognition <- merge(df[,c('id', 'Age', 'Sex')], cognitive[c('id', vars_cog)])

#create function to correct for age and sex
adjustVars_fn <- function(x, data){
  fit <- lm(x ~ Age + Sex, na.action = na.exclude, data) 
  adjusted <- resid(fit) + coef(fit)[1] #residual intercept
  return(adjusted)
}

#apply function to all variables, put in new df for review
cognition_adjusted <- data.frame(cognition$id, sapply(cognition[vars_cog], function(x) adjustVars_fn(x, data=cognition)))

#adjust names in adjusted df, so clear have been corrected
names(cognition_adjusted)[-1] <- paste(names(cognition_adjusted[,-1]), '_cor', sep = "")

#add the adjusted variables back to our cognitive df
df <- merge(df, cognition_adjusted, by.x='id', by.y='cognition.id')

#for clarity, remove the unneeded dfs
rm(cognitive, cognition, cognition_adjusted)

Imaging

Munge LC NM-MRI data

#from the LC NM-MRI data, keep only avg_max values
lc <- lc[, grep('record_id|avg_max', names(lc))]

#keep only data from participants in our study
lc <- lc[lc$record_id %in% df$id,]

#check for missing values
sum(is.na(lc)) #none

#add an average avg_max value across all 6 LC segment variables
lc$avg_max <- rowMeans(lc[ , c(2:ncol(lc))], na.rm=TRUE)

#add back to the dataframe
df <- merge(df, lc, by.x='id', by.y='record_id')

Correct LC for age and sex

#add age and sex to the same df
lc <- merge(df[,c('id', 'Age', 'Sex')], lc, by.x='id', by.y='record_id')

#apply our function to all LC NM-MRI variables, put in new df for review
lc_adjusted <- data.frame(lc$id, sapply(lc[,grep('avg_max', names(lc))], function(x) adjustVars_fn(x, data=lc)))

#adjust names in adjusted df, so clear have been corrected
names(lc_adjusted)[-1] <- paste(names(lc_adjusted[,-1]), '_cor', sep = "")

#add the adjusted variables back to df
df <- merge(df, lc_adjusted, by.x='id', by.y='lc.id')

#for clarity, remove the unneeded dfs
rm(lc, lc_adjusted)

Summary table
The following table summarises mean(df) values for variables of interest, by diagnostic group. Cognition and LC variables have been corrected for age and sex.

#organize data we want in table 
vars <- c(
#demographics
"Sex",                 
"Age",   
"health_wrat4_wr_total", 

"phq9_total",
"madrs_total_calc",

"health_moca_total", #mci
"health_cirsg_total", #health
"health_whodasv2_total", #disability
"health_ucla3_total", #loneliness
"health_fas_total", #fatigue
"health_frail_total", #frailty

"gad7_total",  #generalized anxiety
"pss_total", #perceived stress

"rbans_immmemory_index_cor",   
"rbans_visuo_index_cor",       
"rbans_language_index_cor",     
"rbans_attention_index_cor",    
"rbans_delmem_index_cor",       

"dkefs_trails5_time_cor",
"dkefs_trails4_time_cor",   
"dkefs_cwi_1_time_cor",         
"dkefs_cwi_2_time_cor",         
"dkefs_cwi_3_time_cor",        
"dkefs_cwi_4_time_cor",  
"dkefs_vf_lftotalcorrect_cor",
"dkefs_vf_cftotalcorrect_cor",
"dkefs_vf_csswitchtotcor_cor",

"avg_max_seg_1_cor",        
"avg_max_seg_2_cor",        
"avg_max_seg_3_cor",       
"avg_max_seg_4_cor",        
"avg_max_seg_5_cor",
"avg_max_seg_6_cor"
)   

#initialize dataframe and name columns and rows
tbl <- data.frame(matrix(ncol=3, nrow=length(vars)))
names(tbl) <- c( paste('HC, n=', sum(df$Diagnosis == 'HC'), sep=''), paste('LLD, n=', sum(df$Diagnosis == 'LLD'), sep=''), '_p_')

#set rownames
row.names(tbl) <- c(
"Sex (F:M)",                 
"Age (years)",  
"Wide Range Achievement Test (WRAT)", 

"Patient Health Questionnaire (PHQ-9)",
"Montgomery-Asberg Depression Rating Scale (MADRS)",

"Montreal Cognitive Assessment (MoCA)", 
"Cumulative Illness Rating Scale-Geriatric (CIRS-G)",  
"WHO Disability Assessment (WHODAS)",   
"UCLA Loneliness Scale",       
"Fatigue Assessment Scale (FAS)",         
"Frail Scale",      
"Generalized Anxiety Disorder Scale (GAD-7) ",        
"Perceived Stress Scale (PSS)",    

#Repeatable Battery for the Assessment of Neuropsychological Status (RBANS)        
"Immediate Memory",
"Visuospatial/Constructional", 
"Language", 
"Attention", 
"Delayed Memory",  
      
#Delis-Kaplan Executive Function System (DKEFS)
"Trail Making Test - Numbers",       
"Trail Making Test - Number-Letter Switching",       
"Color-Word Interference Test - 1 : Colour Naming",         
"Color-Word Interference Test - 2: Word Reading",         
"Color-Word Interference Test - 3: Inhibition",        
"Color-Word Interference Test - 4: Inhibition/Switching",  
'Verbal fluency: letters',
'Verbal fluency: categories',
'Verbal fluency: switching',

#LC contrast values (age and sex corrected)
"Segment 1 (rostral left)",        
"Segment 2 (middle left)",        
"Segment 3 (caudal left)",       
"Segment 4 (rostral right)",        
"Segment 5 (middle right)",
"Segment 6 (caudal right)"
)   

#make sure group stays a factor...
group <- factor(x = c('HC', 'LLD'), levels = c('HC', 'LLD'))

#initialize counters (j = row, k = column)
j <- 1
k <- 1

#for loop for categorical VARS
    for (var in vars[1:1]) {
      
      # count observations for each cluster
          for (group in group) {
            out <- table(df$Diagnosis, df[[var]])
            tbl[j,k] <- paste(out[group,], collapse = ':')
            k <- k + 1}
      
     # run chi-square tests 
          chi_sq <- chisq.test(out)
          tbl[j,k] <- sub("^(-?)0.", "\\1.", sprintf("%.3f", chi_sq$p.value))
          k <- 2
          j <- j + 1}

# initialize counters (j = row, k = column)
j <- 2
k <- 1

#for loop for continuous variables
    for (var in vars[2:length(vars)]) {
  
#make sure stays factor
    group <- factor(x = c('HC', 'LLD'), levels = c('HC', 'LLD'))

      # calculate means and SDs for each scanner
          for (group in group) {
            M <- sprintf('%.02f', mean(df[df$Diagnosis == group, var], na.rm = TRUE))
            SD <- sprintf('%.02f', sd(df[df$Diagnosis == group, var], na.rm = TRUE))
            tbl[j,k] <- paste( M,' (',SD,')', sep='') 
            k <- k + 1 
          }
      
      # run one-way ANOVA with diagnosis as between-subjects variable
          t_test <- t.test(df[[var]] ~ df$Diagnosis, na.action=na.omit)
          t_test.p <- t_test$p.value
          t_test.p <- sub("^(-?)0.", "\\1.", sprintf("%.3f", t_test.p))
          tbl[j,k] <- t_test.p 

      #move counter
        k <- 1
        j <- j + 1 

    }

#pipe into pretty table
tbl %>%
kable(align=c('l', 'c', 'c', 'c'), digits = 2) %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
   pack_rows(index = c("Demographics" = 3, 
                      "Clinical" = 10, 
                      "Repeatable Battery for the Assessment of Neuropsychological Status (RBANS)" = 5,
                      "Delis-Kaplan Executive Function System (DKEFS)" = 9,
                      "LC Signal Intensity" = 6))

	HC, n=23	LLD, n=25	p
Demographics
Sex (F:M)	11:12	7:18	.263
Age (years)	70.00 (8.02)	68.08 (5.41)	.341
Wide Range Achievement Test (WRAT)	64.52 (3.42)	62.52 (5.09)	.115
Clinical
Patient Health Questionnaire (PHQ-9)	0.78 (1.28)	12.28 (4.31)	.000
Montgomery-Asberg Depression Rating Scale (MADRS)	1.00 (1.41)	16.60 (5.69)	.000
Montreal Cognitive Assessment (MoCA)	26.22 (2.04)	25.72 (2.65)	.469
Cumulative Illness Rating Scale-Geriatric (CIRS-G)	6.43 (4.48)	10.29 (4.33)	.004
WHO Disability Assessment (WHODAS)	5.98 (7.68)	27.25 (12.87)	.000
UCLA Loneliness Scale	6.39 (6.17)	34.16 (14.34)	.000
Fatigue Assessment Scale (FAS)	15.57 (4.50)	31.28 (6.55)	.000
Frail Scale	0.26 (0.54)	0.84 (0.80)	.005
Generalized Anxiety Disorder Scale (GAD-7)	1.35 (1.67)	9.56 (5.09)	.000
Perceived Stress Scale (PSS)	7.83 (4.83)	22.64 (6.94)	.000
Repeatable Battery for the Assessment of Neuropsychological Status (RBANS)
Immediate Memory	93.04 (11.77)	94.56 (13.72)	.681
Visuospatial/Constructional	96.61 (15.75)	92.81 (17.29)	.431
Language	85.03 (12.04)	85.42 (9.44)	.901
Attention	97.12 (16.01)	90.49 (16.77)	.168
Delayed Memory	124.17 (14.32)	126.69 (11.12)	.502
Delis-Kaplan Executive Function System (DKEFS)
Trail Making Test - Numbers	23.41 (27.18)	25.37 (15.04)	.762
Trail Making Test - Number-Letter Switching	-76.54 (44.37)	-59.65 (53.98)	.241
Color-Word Interference Test - 1 : Colour Naming	38.72 (6.84)	38.71 (10.41)	.996
Color-Word Interference Test - 2: Word Reading	23.18 (5.27)	22.24 (6.53)	.585
Color-Word Interference Test - 3: Inhibition	35.57 (16.57)	42.46 (23.09)	.239
Color-Word Interference Test - 4: Inhibition/Switching	43.65 (13.22)	46.96 (20.72)	.510
Verbal fluency: letters	43.98 (10.93)	41.83 (12.26)	.524
Verbal fluency: categories	51.82 (8.87)	52.75 (8.36)	.710
Verbal fluency: switching	21.05 (3.16)	21.01 (2.98)	.961
LC Signal Intensity
Segment 1 (rostral left)	33.30 (3.48)	35.03 (3.05)	.074
Segment 2 (middle left)	41.74 (4.77)	42.00 (6.17)	.869
Segment 3 (caudal left)	24.51 (4.18)	25.98 (4.87)	.267
Segment 4 (rostral right)	25.92 (3.22)	27.36 (2.72)	.103
Segment 5 (middle right)	35.62 (3.87)	35.04 (5.11)	.662
Segment 6 (caudal right)	18.11 (5.17)	16.66 (4.20)	.295

Write out data

write.csv(df, paste0('../clinical/df_', Sys.Date(), '.csv', sep=''), row.names = F) 
write.csv(tbl, paste0('../paperTables/tbl_demo_', Sys.Date(), '.csv', sep=''), row.names = F)

NM-MRI 01_dataCleaning

Demographics

Clinical

Cognitive

Imaging

NM-MRI
01_dataCleaning