Code written: 2020-01-16
Last ran: 2020-01-16
Website: http://rpubs.com/navona/thesis_dataInterpolationFA
Description. Here, we review and clean the FA data from Slicer. First, we remove tracts, across all participants, that are missing data from ≥50% of participants. Then, we remove data from participants who are missing data from ≥15 tracts. Lastly, we interpolate data for participants missing <15 tracts, using the same multivariate imputation method that was applied to the clinical data, i.e., MICE. (Hopefully, when we rerun Slicer with improved preprocessing, we won’t need to interpolate as many values; for more details on the methods, see http://rpubs.com/navona/thesis_dataInterpolationClinical.)
Data description.
# Tract count before any cleaning (left, right and commissural variants are
# counted separately); the first column, participant_id, is excluded
ncol(df[, -1])
## [1] 74
# Participant count
nrow(df)
## [1] 407
Find and remove tracts with too much missing data.
# Percentage of participants with a missing value, computed for every column.
# The result is a one-row data frame with the same column names as df.
# across() replaces the deprecated summarise_each()/funs() pair and
# requires dplyr >= 1.0.0.
(missing_tracts <- df %>%
  dplyr::summarise(
    dplyr::across(dplyr::everything(), ~ 100 * mean(is.na(.x)))
  ))
## participant_id FA.AF_R FA.AF_L FA.CB_R FA.CB_L FA.CC1_C FA.CC2_C
## 1 0 15.72482 3.685504 0 0 0.982801 0.2457002
## FA.CC3_C FA.CC4_C FA.CC5_C FA.CC6_C FA.CC7_C FA.CPC_L FA.CPC_R
## 1 0.2457002 0.7371007 0.7371007 0.2457002 1.965602 44.96314 34.64373
## FA.CR.F_L FA.CR.F_R FA.CR.P_R FA.CR.P_L FA.CST_L FA.CST_R FA.EC_R
## 1 0.7371007 0.7371007 20.63882 20.63882 0.2457002 0.982801 10.07371
## FA.EC_L FA.EmC_L FA.EmC_R FA.ICP_L FA.ICP_R FA.ILF_R FA.ILF_L
## 1 7.616708 19.41032 21.86732 9.82801 12.28501 0.982801 0.4914005
## FA.Intra.CBLM.I.P_R FA.Intra.CBLM.I.P_L FA.Intra.CBLM.PaT_L
## 1 1.474201 1.228501 1.228501
## FA.Intra.CBLM.PaT_R FA.IOFF_L FA.IOFF_R FA.MCP_C FA.MdLF_R FA.MdLF_L
## 1 3.194103 30.95823 28.99263 4.422604 0.2457002 0.2457002
## FA.PLIC_R FA.PLIC_L FA.SF_R FA.SF_L FA.SLF.I_R FA.SLF.I_L
## 1 1.965602 5.896806 0.2457002 0.2457002 88.20639 87.96069
## FA.SLF.II_L FA.SLF.II_R FA.SLF.III_R FA.SLF.III_L FA.SO_L FA.SO_R
## 1 48.89435 36.85504 1.228501 1.228501 14.25061 9.090909
## FA.SP_R FA.SP_L FA.Sup.F_R FA.Sup.F_L FA.Sup.FP_L FA.Sup.FP_R
## 1 0.4914005 0.2457002 0 0 0 0
## FA.Sup.O_L FA.Sup.O_R FA.Sup.OT_R FA.Sup.OT_L FA.Sup.P_L FA.Sup.P_R
## 1 6.388206 12.53071 0.2457002 0.2457002 0 0
## FA.Sup.PO_L FA.Sup.PO_R FA.Sup.PT_R FA.Sup.PT_L FA.Sup.T_R FA.Sup.T_L
## 1 0.982801 2.457002 0 0.4914005 0.2457002 0.2457002
## FA.TF_L FA.TF_R FA.TO_L FA.TO_R FA.TP_L FA.TP_R FA.UF_L
## 1 0.2457002 0 2.948403 2.702703 0.2457002 0.2457002 0.2457002
## FA.UF_R
## 1 0.2457002
# Keep the names of the tracts whose missingness is at or above the 50%
# threshold (missing_tracts has a single row of per-column percentages)
(missing_tracts <- apply(
  missing_tracts,
  MARGIN = 1,
  FUN = function(pct_missing) colnames(df)[which(pct_missing >= 50)]
))
## [,1]
## [1,] "FA.SLF.I_R"
## [2,] "FA.SLF.I_L"
# Drop the tracts at or above the missingness threshold
df <- df[, !is.element(colnames(df), missing_tracts)]
# Number of tracts remaining (participant_id column excluded):
ncol(df[, -1])
## [1] 72
Find and remove participants with too much missing data.
# Count missing values per participant (one count per row of df).
# rowSums() over the logical is.na() matrix is vectorised and avoids apply()
# coercing the whole data frame to a character matrix. as.integer() returns
# a plain, unnamed integer vector of counts.
(missing_pars <- as.integer(rowSums(is.na(df))))
## [1] 0 2 1 3 1 0 1 3 3 0 1 2 1 2 1 2 3 3 1 0 2 0 1
## [24] 3 2 0 4 0 2 1 2 4 4 6 1 2 1 1 0 0 1 2 2 1 0 1
## [47] 2 0 0 0 0 1 1 0 3 1 0 0 4 1 2 3 1 2 3 0 0 1 1
## [70] 2 1 1 0 0 3 1 0 1 1 2 2 1 1 0 1 1 5 2 1 1 1 1
## [93] 0 1 1 1 1 1 2 1 0 0 5 0 1 3 1 2 1 1 1 0 2 3 1
## [116] 0 2 1 1 0 1 4 4 2 1 1 0 3 2 1 1 1 14 17 13 8 18 17
## [139] 12 9 4 9 11 2 1 5 8 14 9 10 1 6 11 17 2 4 15 6 3 7 5
## [162] 5 0 7 15 5 1 3 5 19 1 5 1 6 0 6 5 1 4 16 4 2 2 14
## [185] 9 7 2 2 11 15 6 3 6 14 11 8 9 7 3 4 20 13 2 6 9 2 0
## [208] 4 3 5 6 14 6 2 3 4 5 6 6 2 1 12 8 10 3 7 6 6 15 6
## [231] 12 1 15 3 12 4 9 3 11 5 13 7 1 7 4 6 10 5 8 8 9 2 1
## [254] 12 3 13 14 5 3 2 4 1 2 10 2 5 2 3 1 3 4 12 10 6 3 13
## [277] 10 6 9 7 5 57 4 13 0 1 3 2 2 2 1 3 2 2 2 2 3 1 2
## [300] 1 1 2 3 2 2 2 1 3 2 1 3 0 0 1 0 0 3 2 2 2 2 3
## [323] 3 5 5 2 1 6 0 4 1 11 8 9 2 3 8 5 5 11 9 0 3 12 12
## [346] 1 2 8 7 5 12 8 1 10 3 3 36 4 5 9 3 1 3 1 1 8 1 4
## [369] 2 8 2 3 18 7 6 8 3 1 1 3 7 3 6 9 5 5 0 4 6 3 2
## [392] 15 11 13 6 7 6 11 4 20 1 0 3 13 8 13 2
# Collect the IDs of participants missing data for 15 or more tracts.
# Note: missing_pars is overwritten here (per-row counts -> character IDs).
(missing_pars <- as.character(
  df[["participant_id"]][which(missing_pars >= 15)]
)) # 17 participants
## [1] "SPN01_CMP_0175" "SPN01_CMP_0186" "SPN01_CMP_0187" "SPN01_MRC_0006"
## [5] "SPN01_MRC_0009" "SPN01_MRC_0017" "SPN01_MRC_0024" "SPN01_MRC_0034"
## [9] "SPN01_MRC_0047" "SPN01_MRC_0061" "SPN01_MRP_0093" "SPN01_MRP_0097"
## [13] "SPN01_MRP_0164" "SPN01_ZHP_0114" "SPN01_ZHP_0131" "SPN01_ZHP_0154"
## [17] "SPN01_ZHP_0165"
# Drop the participants at or above the missingness threshold
df <- df[!is.element(df$participant_id, missing_pars), ]
# Number of participants remaining:
nrow(df)
## [1] 390
Interpolate missing data from other tracts, using the MICE package.
#run imputation, producing 5 imputed datasets (m = 5) with only 5 iterations each (maxit = 5) (for PAC)
#imputed_FA <- mice(df, m=5, maxit = 5, method = 'pmm', seed = 500)
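#extract a completed dataset into a single df (note: complete() with its default action = 1 returns only the first of the m = 5 imputed datasets; it does not pool or combine imputations)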
#df_imputedFA <- complete(imputed_FA)
#write out csv
#write.csv(df_imputedFA, paste0('../../../data/df_imputedFA_5it_', Sys.Date(), '.csv', sep=''), row.names = F) #