1.1 Sample Data
The
Traumatic
Brain Injury dataset from the book
Clinical Prediction
Models was used for this illustrated example.
Preliminary dataset assessment:
[A] 2159 rows (observations)
[B] 16 columns (variables)
[B.1] 2/16 response = unfav and mort variables (factor)
[B.2] 14/16 predictors = All remaining variables (4/14
numeric + 10/14 factor)
##################################
# Loading R libraries
##################################
library(datasets)
library(caret)
library(moments)
library(missCompare)
library(mi)
library(missMDA)
library(pcaMethods)
library(missForest)
library(rms)
library(mice)
library(VIM)
library(misty)
library(Hmisc)
library(Amelia)
library(foreign)
library(RBtest)
######################################
# Setting the working directory
######################################
setwd("C:/Users/User/Desktop/RMarkdown Projects/MissingDataImputation")
##################################
# Loading dataset
##################################
TBI <- read.spss("TBI.sav",use.value.labels=F,to.data.frame=T)
##################################
# Performing a general exploration of the dataset
##################################
dim(TBI)
## [1] 2159 24
## trial d.gos d.mort d.unfav cause
## 74:1118 Min. :1.000 Min. :0.000 Min. :0.0000 Min. :3.000
## 75:1041 1st Qu.:2.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:3.000
## Median :4.000 Median :0.000 Median :0.0000 Median :4.000
## Mean :3.543 Mean :0.233 Mean :0.3942 Mean :4.908
## 3rd Qu.:5.000 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:6.000
## Max. :5.000 Max. :1.000 Max. :1.0000 Max. :9.000
##
## age d.motor d.pupil pupil.i
## Min. :14.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:22.00 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:1.000
## Median :30.00 Median :4.000 Median :1.000 Median :1.000
## Mean :33.21 Mean :3.991 Mean :1.458 Mean :1.462
## 3rd Qu.:43.00 3rd Qu.:5.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :79.00 Max. :6.000 Max. :3.000 Max. :3.000
## NA's :123
## hypoxia hypotens ctclass tsah
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :3.000 Median :0.0000
## Mean :0.2167 Mean :0.1795 Mean :3.281 Mean :0.4779
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:5.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :6.000 Max. :1.0000
## NA's :249 NA's :59 NA's :25 NA's :96
## edh cisterns shift d.sysbpt
## Min. :0.0000 Min. :1.000 Min. :0.0000 Min. : 60.0
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:121.1
## Median :0.0000 Median :1.000 Median :0.0000 Median :131.3
## Mean :0.1268 Mean :1.455 Mean :0.1987 Mean :131.4
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.:0.0000 3rd Qu.:142.1
## Max. :1.0000 Max. :3.000 Max. :1.0000 Max. :207.4
## NA's :38 NA's :257 NA's :252
## glucose glucoset ph sodium
## Min. : 0.5556 Min. : 3.000 Min. :6.790 Min. :114.0
## 1st Qu.: 6.7000 1st Qu.: 6.700 1st Qu.:7.320 1st Qu.:137.0
## Median : 8.2000 Median : 8.200 Median :7.380 Median :140.0
## Mean : 8.9078 Mean : 8.863 Mean :7.376 Mean :139.3
## 3rd Qu.:10.4000 3rd Qu.:10.400 3rd Qu.:7.450 3rd Qu.:142.0
## Max. :41.4000 Max. :20.000 Max. :7.660 Max. :154.0
## NA's :64 NA's :64 NA's :457 NA's :62
## sodiumt hb hbt
## Min. :125.0 Min. : 2.50 Min. : 6.00
## 1st Qu.:137.0 1st Qu.:10.90 1st Qu.:10.90
## Median :140.0 Median :12.90 Median :12.90
## Mean :139.3 Mean :12.45 Mean :12.47
## 3rd Qu.:142.0 3rd Qu.:14.30 3rd Qu.:14.30
## Max. :154.0 Max. :18.60 Max. :17.00
## NA's :62 NA's :24 NA's :24
## TBI
##
## 24 Variables 2159 Observations
## --------------------------------------------------------------------------------
## trial
## n missing distinct
## 2159 0 2
##
## Value 74 75
## Frequency 1118 1041
## Proportion 0.518 0.482
## --------------------------------------------------------------------------------
## d.gos
## n missing distinct Info Mean Gmd
## 2159 0 5 0.894 3.543 1.726
##
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##
## Value 1 2 3 4 5
## Frequency 503 86 262 351 957
## Proportion 0.233 0.040 0.121 0.163 0.443
## --------------------------------------------------------------------------------
## d.mort
## n missing distinct Info Sum Mean Gmd
## 2159 0 2 0.536 503 0.233 0.3576
##
## --------------------------------------------------------------------------------
## d.unfav
## n missing distinct Info Sum Mean Gmd
## 2159 0 2 0.716 851 0.3942 0.4778
##
## --------------------------------------------------------------------------------
## cause
## n missing distinct Info Mean Gmd
## 2159 0 5 0.921 4.908 2.301
##
## lowest : 3 4 5 6 9, highest: 3 4 5 6 9
##
## Value 3 4 5 6 9
## Frequency 848 420 134 370 387
## Proportion 0.393 0.195 0.062 0.171 0.179
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 64 0.999 33.21 15.17 17 18
## .25 .50 .75 .90 .95
## 22 30 43 54 59
##
## lowest : 14 15 16 17 18, highest: 74 75 76 77 79
## --------------------------------------------------------------------------------
## d.motor
## n missing distinct Info Mean Gmd
## 2159 0 6 0.919 3.991 1.226
##
## lowest : 1 2 3 4 5, highest: 2 3 4 5 6
##
## Value 1 2 3 4 5 6
## Frequency 14 279 369 627 790 80
## Proportion 0.006 0.129 0.171 0.290 0.366 0.037
## --------------------------------------------------------------------------------
## d.pupil
## n missing distinct Info Mean Gmd
## 2036 123 3 0.647 1.458 0.6881
##
## Value 1 2 3
## Frequency 1430 279 327
## Proportion 0.702 0.137 0.161
## --------------------------------------------------------------------------------
## pupil.i
## n missing distinct Info Mean Gmd
## 2159 0 3 0.65 1.462 0.6915
##
## Value 1 2 3
## Frequency 1511 299 349
## Proportion 0.700 0.138 0.162
## --------------------------------------------------------------------------------
## hypoxia
## n missing distinct Info Sum Mean Gmd
## 1910 249 2 0.509 414 0.2168 0.3397
##
## --------------------------------------------------------------------------------
## hypotens
## n missing distinct Info Sum Mean Gmd
## 2100 59 2 0.442 377 0.1795 0.2947
##
## --------------------------------------------------------------------------------
## ctclass
## n missing distinct Info Mean Gmd
## 2134 25 6 0.928 3.281 1.689
##
## lowest : 1 2 3 4 5, highest: 2 3 4 5 6
##
## Value 1 2 3 4 5 6
## Frequency 149 781 414 85 521 184
## Proportion 0.070 0.366 0.194 0.040 0.244 0.086
## --------------------------------------------------------------------------------
## tsah
## n missing distinct Info Sum Mean Gmd
## 2063 96 2 0.749 986 0.4779 0.4993
##
## --------------------------------------------------------------------------------
## edh
## n missing distinct Info Sum Mean Gmd
## 2121 38 2 0.332 269 0.1268 0.2216
##
## --------------------------------------------------------------------------------
## cisterns
## n missing distinct Info Mean Gmd
## 1902 257 3 0.716 1.455 0.6367
##
## Value 1 2 3
## Frequency 1223 492 187
## Proportion 0.643 0.259 0.098
## --------------------------------------------------------------------------------
## shift
## n missing distinct Info Sum Mean Gmd
## 1907 252 2 0.478 379 0.1987 0.3187
##
## --------------------------------------------------------------------------------
## d.sysbpt
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 1566 1 131.4 17.61 107.0 112.2
## .25 .50 .75 .90 .95
## 121.1 131.3 142.1 151.1 156.2
##
## lowest : 60.00 64.19 65.60 71.33 73.71, highest: 182.92 184.16 184.70 188.29 207.40
## --------------------------------------------------------------------------------
## glucose
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 431 1 8.908 3.56 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 0.5555556 1.0000000 1.0300000 1.0400000 1.2200000
## highest: 29.2222222 30.8888889 31.1111111 32.5555556 41.4000000
## --------------------------------------------------------------------------------
## glucoset
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 381 1 8.863 3.405 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 3.000000 3.030000 3.055556 3.200000 3.444444
## highest: 19.500000 19.555556 19.777778 19.800000 20.000000
## --------------------------------------------------------------------------------
## ph
## n missing distinct Info Mean Gmd .05 .10
## 1702 457 112 0.999 7.376 0.1183 7.19 7.24
## .25 .50 .75 .90 .95
## 7.32 7.38 7.45 7.50 7.54
##
## lowest : 6.79 6.83 6.87 6.90 6.91, highest: 7.62 7.63 7.64 7.65 7.66
## --------------------------------------------------------------------------------
## sodium
## n missing distinct Info Mean Gmd .05 .10
## 2097 62 62 0.993 139.3 4.448 132 134
## .25 .50 .75 .90 .95
## 137 140 142 144 145
##
## lowest : 114.0 117.0 118.0 121.0 123.0, highest: 150.0 151.0 151.7 153.0 154.0
## --------------------------------------------------------------------------------
## sodiumt
## n missing distinct Info Mean Gmd .05 .10
## 2097 62 55 0.993 139.3 4.408 132 134
## .25 .50 .75 .90 .95
## 137 140 142 144 145
##
## lowest : 125.0 126.0 126.5 127.0 127.4, highest: 150.0 151.0 151.7 153.0 154.0
## --------------------------------------------------------------------------------
## hb
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 156 1 12.45 2.846 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 2.5 2.8 3.4 3.6 4.0, highest: 17.3 17.4 17.6 17.7 18.6
## --------------------------------------------------------------------------------
## hbt
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 130 1 12.47 2.809 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 6.0 6.1 6.2 6.3 6.4, highest: 16.6 16.7 16.8 16.9 17.0
## --------------------------------------------------------------------------------
##################################
# Performing re-categorization and re-grouping
# of factor variables
##################################
TBI$pupil <- ifelse(TBI$d.pupil==9,NA,TBI$d.pupil)
TBI$motor <- ifelse(is.na(TBI$d.motor),9,TBI$d.motor)
TBI$motor1 <- ifelse(TBI$motor==1,1,0)
TBI$motor2 <- ifelse(TBI$motor==2,1,0)
TBI$motor3 <- ifelse(TBI$motor==3,1,0)
TBI$motor4 <- ifelse(TBI$motor==4,1,0)
TBI$motor56 <- ifelse(TBI$motor==5 | TBI$motor==6,1,0)
TBI$motor9 <- ifelse(TBI$motor==9,1,0)
TBI$motorG <- ifelse(TBI$motor1==1,5,
ifelse(TBI$motor2==1,4,
ifelse(TBI$motor3==1,3,
ifelse(TBI$motor4==1,2,
ifelse(TBI$motor56==1,1,
ifelse(TBI$motor9==1,9,NA))))))
TBI$motorG <- as.factor(TBI$motorG)
TBI$mort <- ifelse(TBI$d.gos==1,1,ifelse(TBI$d.gos==6,NA,0))
TBI$deadveg <- ifelse(TBI$d.gos<3,1,ifelse(TBI$d.gos==6,NA,0))
TBI$unfav <- ifelse(TBI$d.gos<4,1,ifelse(TBI$d.gos==6,NA,0))
TBI$good <- ifelse(TBI$d.gos<5,1,ifelse(TBI$d.gos==6,NA,0))
TBI$ctclass2 <- ifelse(TBI$ctclass==2,1,0)
TBI$ctclass1 <- ifelse(TBI$ctclass==1,1,0)
TBI$ctclass34 <- ifelse(TBI$ctclass==3 | TBI$ctclass==4,1,0)
TBI$ctclass56 <- ifelse(TBI$ctclass==5 | TBI$ctclass==6,1,0)
TBI$ctclassr4 <- ifelse(TBI$ctclass1==1,2,
ifelse(TBI$ctclass2==1,1,
ifelse(TBI$ctclass34==1,3,
ifelse(TBI$ctclass56==1,4,NA))))
TBI$ctclassr4 <- as.factor(TBI$ctclassr4)
TBI$ctclassr3 <- ifelse(TBI$ctclass1==1 | TBI$ctclass2==1,1,
ifelse(TBI$ctclass34==1,2,
ifelse(TBI$ctclass56==1,3,NA)))
TBI$ctclassr3 <- as.factor(TBI$ctclassr3)
TBI.Analysis <- TBI[,Cs(trial,
age,
hypoxia,
hypotens,
cisterns,
shift,
tsah,
edh,
pupil,
motorG,
ctclassr3,
d.sysbpt,
hbt,
glucoset,
unfav,
mort)]
names(TBI.Analysis) <- Cs(trial,
age,
hypoxia,
hypotens,
cisterns,
shift,
tsah,
edh,
pupil,
motor,
ctclass,
d.sysbp,
hb,
glucose,
unfav,
mort )
TBI.Analysis$cisterns <- ifelse(TBI.Analysis$cisterns>1,1,0)
TBI.Analysis$trial <- as.factor(TBI.Analysis$trial)
TBI.Analysis$age <- as.numeric(TBI.Analysis$age)
TBI.Analysis$hypoxia <-as.factor(TBI.Analysis$hypoxia)
TBI.Analysis$hypotens <-as.factor(TBI.Analysis$hypotens)
TBI.Analysis$cisterns <-as.factor(TBI.Analysis$cisterns)
TBI.Analysis$shift <-as.factor(TBI.Analysis$shift)
TBI.Analysis$tsah <-as.factor(TBI.Analysis$tsah)
TBI.Analysis$edh <-as.factor(TBI.Analysis$edh)
TBI.Analysis$pupil <-as.factor(TBI.Analysis$pupil)
TBI.Analysis$motor <-as.factor(TBI.Analysis$motor)
TBI.Analysis$ctclass <-as.factor(TBI.Analysis$ctclass)
TBI.Analysis$d.sysbp <-as.numeric(TBI.Analysis$d.sysbp)
TBI.Analysis$hb <-as.numeric(TBI.Analysis$hb)
TBI.Analysis$glucose <-as.numeric(TBI.Analysis$glucose)
TBI.Analysis$unfav <-as.factor(TBI.Analysis$unfav)
TBI.Analysis$mort <-as.factor(TBI.Analysis$mort)
dim(TBI.Analysis)
## [1] 2159 16
## TBI.Analysis
##
## 16 Variables 2159 Observations
## --------------------------------------------------------------------------------
## trial
## n missing distinct
## 2159 0 2
##
## Value 74 75
## Frequency 1118 1041
## Proportion 0.518 0.482
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 64 0.999 33.21 15.17 17 18
## .25 .50 .75 .90 .95
## 22 30 43 54 59
##
## lowest : 14 15 16 17 18, highest: 74 75 76 77 79
## --------------------------------------------------------------------------------
## hypoxia
## n missing distinct
## 1910 249 2
##
## Value 0 1
## Frequency 1496 414
## Proportion 0.783 0.217
## --------------------------------------------------------------------------------
## hypotens
## n missing distinct
## 2100 59 2
##
## Value 0 1
## Frequency 1723 377
## Proportion 0.82 0.18
## --------------------------------------------------------------------------------
## cisterns
## n missing distinct
## 1902 257 2
##
## Value 0 1
## Frequency 1223 679
## Proportion 0.643 0.357
## --------------------------------------------------------------------------------
## shift
## n missing distinct
## 1907 252 2
##
## Value 0 1
## Frequency 1528 379
## Proportion 0.801 0.199
## --------------------------------------------------------------------------------
## tsah
## n missing distinct
## 2063 96 2
##
## Value 0 1
## Frequency 1077 986
## Proportion 0.522 0.478
## --------------------------------------------------------------------------------
## edh
## n missing distinct
## 2121 38 2
##
## Value 0 1
## Frequency 1852 269
## Proportion 0.873 0.127
## --------------------------------------------------------------------------------
## pupil
## n missing distinct
## 2036 123 3
##
## Value 1 2 3
## Frequency 1430 279 327
## Proportion 0.702 0.137 0.161
## --------------------------------------------------------------------------------
## motor
## n missing distinct
## 2159 0 5
##
## lowest : 1 2 3 4 5, highest: 1 2 3 4 5
##
## Value 1 2 3 4 5
## Frequency 870 627 369 279 14
## Proportion 0.403 0.290 0.171 0.129 0.006
## --------------------------------------------------------------------------------
## ctclass
## n missing distinct
## 2134 25 3
##
## Value 1 2 3
## Frequency 930 499 705
## Proportion 0.436 0.234 0.330
## --------------------------------------------------------------------------------
## d.sysbp
## n missing distinct Info Mean Gmd .05 .10
## 2159 0 1566 1 131.4 17.61 107.0 112.2
## .25 .50 .75 .90 .95
## 121.1 131.3 142.1 151.1 156.2
##
## lowest : 60.00 64.19 65.60 71.33 73.71, highest: 182.92 184.16 184.70 188.29 207.40
## --------------------------------------------------------------------------------
## hb
## n missing distinct Info Mean Gmd .05 .10
## 2135 24 130 1 12.47 2.809 7.7 8.8
## .25 .50 .75 .90 .95
## 10.9 12.9 14.3 15.3 15.9
##
## lowest : 6.0 6.1 6.2 6.3 6.4, highest: 16.6 16.7 16.8 16.9 17.0
## --------------------------------------------------------------------------------
## glucose
## n missing distinct Info Mean Gmd .05 .10
## 2095 64 381 1 8.863 3.405 5.000 5.633
## .25 .50 .75 .90 .95
## 6.700 8.200 10.400 13.056 15.350
##
## lowest : 3.000000 3.030000 3.055556 3.200000 3.444444
## highest: 19.500000 19.555556 19.777778 19.800000 20.000000
## --------------------------------------------------------------------------------
## unfav
## n missing distinct
## 2159 0 2
##
## Value 0 1
## Frequency 1308 851
## Proportion 0.606 0.394
## --------------------------------------------------------------------------------
## mort
## n missing distinct
## 2159 0 2
##
## Value 0 1
## Frequency 1656 503
## Proportion 0.767 0.233
## --------------------------------------------------------------------------------
## trial age hypoxia hypotens cisterns shift
## 74:1118 Min. :14.00 0 :1496 0 :1723 0 :1223 0 :1528
## 75:1041 1st Qu.:22.00 1 : 414 1 : 377 1 : 679 1 : 379
## Median :30.00 NA's: 249 NA's: 59 NA's: 257 NA's: 252
## Mean :33.21
## 3rd Qu.:43.00
## Max. :79.00
##
## tsah edh pupil motor ctclass d.sysbp
## 0 :1077 0 :1852 1 :1430 1:870 1 :930 Min. : 60.0
## 1 : 986 1 : 269 2 : 279 2:627 2 :499 1st Qu.:121.1
## NA's: 96 NA's: 38 3 : 327 3:369 3 :705 Median :131.3
## NA's: 123 4:279 NA's: 25 Mean :131.4
## 5: 14 3rd Qu.:142.1
## Max. :207.4
##
## hb glucose unfav mort
## Min. : 6.00 Min. : 3.000 0:1308 0:1656
## 1st Qu.:10.90 1st Qu.: 6.700 1: 851 1: 503
## Median :12.90 Median : 8.200
## Mean :12.47 Mean : 8.863
## 3rd Qu.:14.30 3rd Qu.:10.400
## Max. :17.00 Max. :20.000
## NA's :24 NA's :64
##################################
# Formulating a data type assessment summary
##################################
PDA <- TBI.Analysis
(PDA.Summary <- data.frame(
Column.Index=c(1:length(names(PDA))),
Column.Name= names(PDA),
Column.Type=sapply(PDA, function(x) class(x)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type
## 1 1 trial factor
## 2 2 age numeric
## 3 3 hypoxia factor
## 4 4 hypotens factor
## 5 5 cisterns factor
## 6 6 shift factor
## 7 7 tsah factor
## 8 8 edh factor
## 9 9 pupil factor
## 10 10 motor factor
## 11 11 ctclass factor
## 12 12 d.sysbp numeric
## 13 13 hb numeric
## 14 14 glucose numeric
## 15 15 unfav factor
## 16 16 mort factor
1.2 Data Quality Assessment
Data quality assessment:
[A] Missing observations noted for 10 variables with
NA.Count>0 and Fill.Rate<1.0.
[A.1] hypoxia
variable (factor) with NA.Count=249 and Fill.Rate=0.885
[A.2] hypotens
variable (factor) with NA.Count=59 and Fill.Rate=0.973
[A.3] cisterns
variable (factor) with NA.Count=257 and Fill.Rate=0.881
[A.4] shift
variable (factor) with NA.Count=252 and Fill.Rate=0.883
[A.5] tsah variable
(factor) with NA.Count=96 and Fill.Rate=0.956
[A.6] edh variable
(factor) with NA.Count=38 and Fill.Rate=0.982
[A.7] pupil
variable (factor) with NA.Count=123 and Fill.Rate=0.943
[A.8] ctclass
variable (factor) with NA.Count=25 and Fill.Rate=0.988
[A.9] hb variable
(numeric) with NA.Count=24 and Fill.Rate=0.989
[A.10] glucose
variable (numeric) with NA.Count=64 and Fill.Rate=0.970
[B] Low variance noted for 1 variable with
First.Second.Mode.Ratio>5 or Unique.Count.Ratio<0.01.
[B.1] edh variable
(factor) with First.Second.Mode.Ratio=6.885
[C] No high skewness (Skewness>3 or
Skewness<(-3)) noted for any variable.
##################################
# Loading dataset
##################################
DQA <- TBI.Analysis
##################################
# Listing all predictors
##################################
DQA.Predictors <- DQA[,names(DQA)]
##################################
# Formulating an overall data quality assessment summary
##################################
(DQA.Summary <- data.frame(
Column.Index=c(1:length(names(DQA))),
Column.Name= names(DQA),
Column.Type=sapply(DQA, function(x) class(x)),
Row.Count=sapply(DQA, function(x) nrow(DQA)),
NA.Count=sapply(DQA,function(x)sum(is.na(x))),
Fill.Rate=sapply(DQA,function(x)format(round((sum(!is.na(x))/nrow(DQA)),3),nsmall=3)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 trial factor 2159 0 1.000
## 2 2 age numeric 2159 0 1.000
## 3 3 hypoxia factor 2159 249 0.885
## 4 4 hypotens factor 2159 59 0.973
## 5 5 cisterns factor 2159 257 0.881
## 6 6 shift factor 2159 252 0.883
## 7 7 tsah factor 2159 96 0.956
## 8 8 edh factor 2159 38 0.982
## 9 9 pupil factor 2159 123 0.943
## 10 10 motor factor 2159 0 1.000
## 11 11 ctclass factor 2159 25 0.988
## 12 12 d.sysbp numeric 2159 0 1.000
## 13 13 hb numeric 2159 24 0.989
## 14 14 glucose numeric 2159 64 0.970
## 15 15 unfav factor 2159 0 1.000
## 16 16 mort factor 2159 0 1.000
##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[,sapply(DQA.Predictors, is.numeric)]
if (length(names(DQA.Predictors.Numeric))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Numeric))),
" numeric predictor variable(s)."))
} else {
print("There are no numeric predictor variables.")
}
## [1] "There are 4 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
DQA.Predictors.Factor <- DQA.Predictors[,sapply(DQA.Predictors, is.factor)]
if (length(names(DQA.Predictors.Factor))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Factor))),
" factor predictor variable(s)."))
} else {
print("There are no factor predictor variables.")
}
## [1] "There are 12 factor predictor variable(s)."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = x[!(x %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
usm[tabsm == max(tabsm)]
}
(DQA.Predictors.Factor.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Factor),
Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)),3), nsmall=3)),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 1 trial factor 2 74 75
## 2 hypoxia factor 3 0 1
## 3 hypotens factor 3 0 1
## 4 cisterns factor 3 0 1
## 5 shift factor 3 0 1
## 6 tsah factor 3 0 1
## 7 edh factor 3 0 1
## 8 pupil factor 4 1 3
## 9 motor factor 5 1 2
## 10 ctclass factor 4 1 3
## 11 unfav factor 2 0 1
## 12 mort factor 2 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio
## 1 1118 1041 0.001
## 2 1496 414 0.001
## 3 1723 377 0.001
## 4 1223 679 0.001
## 5 1528 379 0.001
## 6 1077 986 0.001
## 7 1852 269 0.001
## 8 1430 327 0.002
## 9 870 627 0.002
## 10 930 705 0.002
## 11 1308 851 0.001
## 12 1656 503 0.001
## First.Second.Mode.Ratio
## 1 1.074
## 2 3.614
## 3 4.570
## 4 1.801
## 5 4.032
## 6 1.092
## 7 6.885
## 8 4.373
## 9 1.388
## 10 1.319
## 11 1.537
## 12 3.292
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = na.omit(x)[!(na.omit(x) %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
usm[tabsm == max(tabsm)]
}
(DQA.Predictors.Numeric.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Numeric),
Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)),3), nsmall=3)),
First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]),3),nsmall=3)),
Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]),3),nsmall=3)),
First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x,na.rm = TRUE),3), nsmall=3)),
Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x,na.rm = TRUE),3), nsmall=3)),
Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x,na.rm = TRUE),3), nsmall=3)),
Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x,na.rm = TRUE),3), nsmall=3)),
Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(moments::skewness(x,na.rm = TRUE),3), nsmall=3)),
Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(moments::kurtosis(x,na.rm = TRUE),3), nsmall=3)),
Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.25,na.rm = TRUE),3), nsmall=3)),
Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.75,na.rm = TRUE),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 age numeric 64 0.030 21.000
## 2 d.sysbp numeric 1566 0.725 130.000
## 3 hb numeric 131 0.061 13.900
## 4 glucose numeric 382 0.177 20.000
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 19.000 104 100 1.040
## 2 120.000 9 8 1.125
## 3 13.600 47 43 1.093
## 4 6.500 29 27 1.074
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th
## 1 14.000 33.211 30.000 79.000 0.721 2.585 22.000
## 2 60.000 131.430 131.280 207.400 -0.022 3.982 121.085
## 3 6.000 12.468 12.900 17.000 -0.619 2.786 10.900
## 4 3.000 8.863 8.200 20.000 1.192 4.727 6.700
## Percentile75th
## 1 43.000
## 2 142.060
## 3 14.300
## 4 10.400
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if ((nrow(DQA.Summary[DQA.Summary$NA.Count>0,]))>0){
print(paste0("Missing observations noted for ",
(nrow(DQA.Summary[DQA.Summary$NA.Count>0,])),
" variable(s) with NA.Count>0 and Fill.Rate<1.0."))
DQA.Summary[DQA.Summary$NA.Count>0,]
} else {
print("No missing observations noted.")
}
## [1] "Missing observations noted for 10 variable(s) with NA.Count>0 and Fill.Rate<1.0."
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 3 3 hypoxia factor 2159 249 0.885
## 4 4 hypotens factor 2159 59 0.973
## 5 5 cisterns factor 2159 257 0.881
## 6 6 shift factor 2159 252 0.883
## 7 7 tsah factor 2159 96 0.956
## 8 8 edh factor 2159 38 0.982
## 9 9 pupil factor 2159 123 0.943
## 11 11 ctclass factor 2159 25 0.988
## 13 13 hb numeric 2159 24 0.989
## 14 14 glucose numeric 2159 64 0.970
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor))==0) {
print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])),
" factor variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "Low variance observed for 1 factor variable(s) with First.Second.Mode.Ratio>5."
## Column.Name Column.Type Unique.Count First.Mode.Value Second.Mode.Value
## 7 edh factor 3 0 1
## First.Mode.Count Second.Mode.Count Unique.Count.Ratio First.Second.Mode.Ratio
## 7 1852 269 0.001 6.885
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])),
" numeric variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "No low variance numeric predictors due to high first-second mode ratio noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])),
" numeric variable(s) with Unique.Count.Ratio<0.01."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]
} else {
print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])>0){
print(paste0("High skewness observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])),
" numeric variable(s) with Skewness>3 or Skewness<(-3)."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]
} else {
print("No skewed numeric predictors noted.")
}
## [1] "No skewed numeric predictors noted."
1.3 Missing Data Pattern Analysis
1.3.1 Assessment using MISSCOMPARE
Missing data pattern assessment:
[A] The original dataset was reconstructed using the
clean method from the
missCompare
package.
[A.1] All observations and variables retained.
[A.1.1] All variables were within the defined 50%
observation fill rate or missingness ratio criterion.
[A.1.2] All observation were within the defined 80%
variable fill rate or missingness ratio criterion.
[A.2] All 12 factor variables were converted to numeric
variables.
[A.3] All NA labels retained. No NA labels were
re-coded.
[B] The missing data patterns were summarized using the
get_data method from the
missCompare
package.
[B.1] There were 1431/2159 (0.663) observations
assessed as complete.
[B.2] There were 1187/34544 (0.034) instances assessed
as missing.
[B.3] The cisterns
variable (factor), shift variable
(factor) and hypoxia variable
(factor) contributed the most missing data.
[B.4] There are 59 missing data patterns observed
employing various combinations of the 16 variables.
[B.5] The missing data patterns are clustered between
the following variables:
[B.5.1] cisterns
variable (factor) and shift variable
(factor) variables.
[B.5.2] hb variable
(numeric) and glucose variable
(numeric) variables.
[B.5.3] trash
variable (factor) and edh variable
(factor) variables.
[B.5.4] hypoxia
variable (factor) and hypotens
(factor) variables.
[B.5.5] pupil
variable (factor) and ctclass
variable (factor) variables.
[B.6] Thresholding the minimum number of observations
per distinct missing data pattern showed different percentages of
observations retained:
[B.6.1] With the minimum number of observations per
distinct missing data pattern set to 5, 89% of observations are
retained.
[B.6.2] With the minimum number of observations per
distinct missing data pattern set to 10, 81% of observations are
retained.
[B.6.3] With the minimum number of observations per
distinct missing data pattern set to 20, 74% of observations are
retained.
##################################
# Loading dataset
##################################
MDPA <- TBI.Analysis
##################################
# Reconstructing the dataset by :
# removing variables with fill rate > 0.50
# removing observations with NA ratio > 0.80
# converting factor variables to numeric variables
##################################
MDPA.Cleaned <- missCompare::clean(MDPA,
var_removal_threshold=0.50,
ind_removal_threshold=0.80)
##################################
# Gathering the metadata using the reconstructed dataset
##################################
MDPA.Cleaned.Metadata <- missCompare::get_data(MDPA.Cleaned,
matrixplot_sort = T,
plot_transform = T)
##################################
# Gathering the number of complete cases
##################################
MDPA.Cleaned.Metadata$Complete_cases
## [1] 1431
MDPA.Cleaned.Metadata$Rows
## [1] 2159
MDPA.Cleaned.Metadata$Columns
## [1] 16
MDPA.Cleaned.Metadata$Complete_cases/MDPA.Cleaned.Metadata$Rows
## [1] 0.6628069
##################################
# Gathering the number of incomplete cases
##################################
MDPA.Cleaned.Metadata$Total_NA
## [1] 1187
MDPA.Cleaned.Metadata$NA_per_variable
## trial age hypoxia hypotens cisterns shift tsah edh
## 0 0 249 59 257 252 96 38
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 123 0 25 0 24 64 0 0
MDPA.Cleaned.Metadata$Fraction_missingness
## [1] 0.03436197
MDPA.Cleaned.Metadata$Fraction_missingness_per_variable
## trial age hypoxia hypotens cisterns shift tsah
## 0.00000000 0.00000000 0.11533117 0.02732747 0.11903659 0.11672070 0.04446503
## edh pupil motor ctclass d.sysbp hb glucose
## 0.01760074 0.05697082 0.00000000 0.01157943 0.00000000 0.01111626 0.02964335
## unfav mort
## 0.00000000 0.00000000
##################################
# Gathering the missing data patterns
##################################
MDPA.Cleaned.Metadata$Matrix_plot

MDPA.Cleaned.Metadata$Cluster_plot

MDPA.Cleaned.Metadata$Corr_matrix
## trial age hypoxia hypotens cisterns
## trial 1.000000000 -0.030651416 1.656750e-01 0.1050912318 0.0090874012
## age -0.030651416 1.000000000 1.031147e-03 0.0797683643 0.0513284240
## hypoxia 0.165675039 0.001031147 1.000000e+00 0.2323622569 0.0460174467
## hypotens 0.105091232 0.079768364 2.323623e-01 1.0000000000 0.0004308286
## cisterns 0.009087401 0.051328424 4.601745e-02 0.0004308286 1.0000000000
## shift -0.065358546 0.104812735 4.487131e-20 -0.0549581210 0.4585598379
## tsah -0.097721950 0.174014741 1.627862e-02 -0.0075673746 0.0329649255
## edh -0.119425686 0.002670862 -4.682238e-02 -0.0770647036 0.0237954129
## pupil 0.088381902 0.016622292 1.438070e-01 0.1137020246 0.1464712963
## motor -0.051699973 -0.008694701 1.885604e-01 0.0656417934 0.1168144834
## ctclass -0.015770576 0.153012007 1.850571e-02 -0.0194182715 0.2788307306
## d.sysbp 0.087643656 0.012554957 -1.094494e-01 -0.2634324171 -0.0114882127
## hb 0.227845969 -0.031832004 -5.386741e-02 -0.2705358063 -0.0417687110
## glucose 0.090165100 0.069076332 1.619970e-01 0.1994708499 0.0498702817
## unfav -0.029068950 0.208375143 1.742552e-01 0.1776370570 0.1396093897
## mort -0.038439923 0.153665440 1.414791e-01 0.1666659119 0.1424181948
## shift tsah edh pupil motor
## trial -6.535855e-02 -0.097721950 -0.119425686 0.08838190 -0.051699973
## age 1.048127e-01 0.174014741 0.002670862 0.01662229 -0.008694701
## hypoxia 4.487131e-20 0.016278621 -0.046822378 0.14380701 0.188560393
## hypotens -5.495812e-02 -0.007567375 -0.077064704 0.11370202 0.065641793
## cisterns 4.585598e-01 0.032964926 0.023795413 0.14647130 0.116814483
## shift 1.000000e+00 0.090218463 0.041278708 0.09772835 0.088389352
## tsah 9.021846e-02 1.000000000 -0.010786454 0.14788861 0.116778650
## edh 4.127871e-02 -0.010786454 1.000000000 -0.02558442 -0.046758976
## pupil 9.772835e-02 0.147888607 -0.025584420 1.00000000 0.381055618
## motor 8.838935e-02 0.116778650 -0.046758976 0.38105562 1.000000000
## ctclass 2.793684e-01 0.191225179 0.296787667 0.21786366 0.167952008
## d.sysbp 9.615297e-03 -0.022296366 -0.015739545 -0.08779228 -0.123171876
## hb -3.378804e-02 0.030632398 -0.019653199 -0.08586573 -0.064731488
## glucose 2.573764e-02 0.102781495 -0.001029329 0.18136723 0.128402858
## unfav 1.340827e-01 0.249226529 -0.065700956 0.31905926 0.353979268
## mort 1.612055e-01 0.229655718 -0.061375715 0.25297999 0.267972519
## ctclass d.sysbp hb glucose unfav
## trial -0.015770576 0.087643656 0.227845969 0.090165100 -0.02906895
## age 0.153012007 0.012554957 -0.031832004 0.069076332 0.20837514
## hypoxia 0.018505712 -0.109449414 -0.053867408 0.161996976 0.17425522
## hypotens -0.019418271 -0.263432417 -0.270535806 0.199470850 0.17763706
## cisterns 0.278830731 -0.011488213 -0.041768711 0.049870282 0.13960939
## shift 0.279368362 0.009615297 -0.033788043 0.025737640 0.13408269
## tsah 0.191225179 -0.022296366 0.030632398 0.102781495 0.24922653
## edh 0.296787667 -0.015739545 -0.019653199 -0.001029329 -0.06570096
## pupil 0.217863665 -0.087792278 -0.085865732 0.181367225 0.31905926
## motor 0.167952008 -0.123171876 -0.064731488 0.128402858 0.35397927
## ctclass 1.000000000 -0.015114873 -0.000441974 0.123201094 0.24639544
## d.sysbp -0.015114873 1.000000000 0.195736194 -0.125362570 -0.09119451
## hb -0.000441974 0.195736194 1.000000000 -0.134613174 -0.15508193
## glucose 0.123201094 -0.125362570 -0.134613174 1.000000000 0.22878822
## unfav 0.246395438 -0.091194506 -0.155081925 0.228788217 1.00000000
## mort 0.222675184 -0.178138387 -0.162844394 0.229159186 0.68327089
## mort
## trial -0.03843992
## age 0.15366544
## hypoxia 0.14147910
## hypotens 0.16666591
## cisterns 0.14241819
## shift 0.16120554
## tsah 0.22965572
## edh -0.06137571
## pupil 0.25297999
## motor 0.26797252
## ctclass 0.22267518
## d.sysbp -0.17813839
## hb -0.16284439
## glucose 0.22915919
## unfav 0.68327089
## mort 1.00000000
MDPA.Cleaned.Metadata$MD_Pattern
## trial age hypoxia hypotens cisterns shift tsah edh pupil motor ctclass
## 1431 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1 1
## 200 1 1 1 1 0 0 1 1 1 1 1
## 157 1 1 0 1 1 1 1 1 1 1 1
## 12 1 1 0 1 0 0 1 1 1 1 1
## 82 1 1 1 1 1 1 1 1 0 1 1
## 12 1 1 1 1 0 0 1 1 0 1 1
## 10 1 1 0 1 1 1 1 1 0 1 1
## 1 1 1 0 1 0 1 1 1 0 1 1
## 42 1 1 1 1 1 1 0 1 1 1 1
## 9 1 1 1 1 0 0 0 1 1 1 1
## 11 1 1 0 1 1 1 0 1 1 1 1
## 5 1 1 1 1 1 1 0 1 0 1 1
## 34 1 1 1 1 1 1 1 1 1 1 1
## 8 1 1 0 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 0 1 1
## 2 1 1 1 1 1 1 0 1 1 1 1
## 10 1 1 1 0 1 1 1 1 1 1 1
## 27 1 1 0 0 1 1 1 1 1 1 1
## 2 1 1 0 0 0 0 1 1 1 1 1
## 3 1 1 0 0 1 1 1 1 0 1 1
## 2 1 1 1 0 1 1 0 1 1 1 1
## 2 1 1 0 0 1 1 0 1 1 1 1
## 3 1 1 1 0 1 1 1 1 1 1 1
## 1 1 1 0 0 1 1 1 1 1 1 1
## 1 1 1 0 0 1 1 0 1 1 1 1
## 7 1 1 1 1 1 1 1 0 1 1 1
## 3 1 1 1 1 0 0 1 0 1 1 1
## 3 1 1 0 1 1 1 1 0 1 1 1
## 1 1 1 1 1 1 1 1 0 0 1 1
## 1 1 1 0 1 1 1 1 0 0 1 1
## 10 1 1 1 1 1 1 0 0 1 1 1
## 4 1 1 1 1 0 0 0 0 1 1 1
## 1 1 1 0 1 1 1 0 0 1 1 1
## 1 1 1 1 1 1 1 0 0 0 1 1
## 1 1 1 0 1 1 1 0 0 1 1 1
## 1 1 1 1 0 1 1 1 0 1 1 1
## 1 1 1 0 0 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1 0 0 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 0
## 2 1 1 1 1 0 0 1 1 1 1 0
## 4 1 1 0 1 1 1 1 1 1 1 0
## 3 1 1 1 1 1 1 1 1 0 1 0
## 1 1 1 1 0 1 1 1 1 1 1 0
## 1 1 1 1 0 0 0 1 1 1 1 0
## 1 1 1 1 1 1 1 1 0 1 1 0
## 2 1 1 1 1 1 1 0 0 1 1 0
## 5 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 0 0 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1
## 7 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 0 1 1 1 1 1
## 1 1 1 0 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 0 1 1 0 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1
## 2 1 1 0 0 1 1 1 1 1 1 1
## 1 1 1 1 0 1 1 0 1 1 1 1
## 0 0 249 59 257 252 96 38 123 0 25
## d.sysbp hb glucose unfav mort
## 1431 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 1 1 1 1
## 200 1 1 1 1 1
## 157 1 1 1 1 1
## 12 1 1 1 1 1
## 82 1 1 1 1 1
## 12 1 1 1 1 1
## 10 1 1 1 1 1
## 1 1 1 1 1 1
## 42 1 1 1 1 1
## 9 1 1 1 1 1
## 11 1 1 1 1 1
## 5 1 1 1 1 1
## 34 1 1 0 1 1
## 8 1 1 0 1 1
## 2 1 1 0 1 1
## 2 1 1 0 1 1
## 10 1 1 1 1 1
## 27 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 1 1 1
## 2 1 1 1 1 1
## 2 1 1 1 1 1
## 3 1 1 0 1 1
## 1 1 1 0 1 1
## 1 1 1 0 1 1
## 7 1 1 1 1 1
## 3 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 10 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 0 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 11 1 1 1 1 1
## 2 1 1 1 1 1
## 4 1 1 1 1 1
## 3 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 0 1 1 1
## 4 1 0 1 1 1
## 1 1 0 1 1 1
## 7 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 1 1 0 0 1 1
## 2 1 0 1 1 1
## 1 1 0 0 1 1
## 0 24 64 0 0
dim(MDPA.Cleaned.Metadata$MD_Pattern)
## [1] 59 16
MDPA.Cleaned.Metadata$NA_Correlations
## trial age hypoxia hypotens
## trial_is.na NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN
## hypoxia_is.na 0.003075342 -1.991261e-02 NaN -0.050871031
## hypotens_is.na -0.003139022 9.723747e-03 -0.070715627 NaN
## cisterns_is.na -0.020274316 5.403308e-03 -0.036463860 0.021305289
## shift_is.na -0.012972969 2.893106e-03 -0.040060700 0.018044005
## tsah_is.na -0.048171238 1.924787e-02 -0.067614722 -0.098680649
## edh_is.na 0.023420299 5.196470e-03 -0.043045790 -0.065097712
## pupil_is.na 0.037218347 -4.342998e-02 -0.030756342 -0.018479920
## motor_is.na NaN NaN NaN NaN
## ctclass_is.na -0.008194874 -8.522036e-03 -0.005461126 -0.046146806
## d.sysbp_is.na NaN NaN NaN NaN
## hb_is.na -0.039147773 2.245985e-05 -0.029832037 -0.002868117
## glucose_is.na 0.092140222 1.110012e-03 -0.005690478 0.003122214
## unfav_is.na NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN
## cisterns shift tsah edh
## trial_is.na NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN
## hypoxia_is.na 0.005132467 -0.037162241 -8.090368e-03 0.0209094526
## hypotens_is.na 0.019420226 -0.014558767 -9.092611e-04 0.0009040846
## cisterns_is.na NaN 0.027979664 5.895326e-02 -0.0364386527
## shift_is.na -0.030781323 NaN 4.920710e-02 -0.0351378312
## tsah_is.na 0.035616071 -0.016129896 NaN 0.0201143139
## edh_is.na 0.017911716 0.012064215 3.759542e-02 NaN
## pupil_is.na 0.032634364 -0.017689375 -2.970642e-02 0.0320063995
## motor_is.na NaN NaN NaN NaN
## ctclass_is.na 0.008762896 0.004581525 -6.719949e-05 0.0390176196
## d.sysbp_is.na NaN NaN NaN NaN
## hb_is.na -0.029175385 0.007846099 4.863460e-03 -0.0128075625
## glucose_is.na -0.042431688 -0.027251502 -1.925087e-02 -0.0167735791
## unfav_is.na NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN
## pupil motor ctclass d.sysbp hb
## trial_is.na NaN NaN NaN NaN NaN
## age_is.na NaN NaN NaN NaN NaN
## hypoxia_is.na -0.0383020501 -0.077961939 -0.018323675 0.06582073 0.02059243
## hypotens_is.na -0.0252221782 -0.029960226 -0.013411691 0.02261856 -0.02056744
## cisterns_is.na 0.0268112302 0.111893054 -0.006296335 -0.06654302 0.11436064
## shift_is.na 0.0192540664 0.105806117 -0.017223821 -0.06403836 0.12099795
## tsah_is.na -0.0308960246 -0.041118410 0.002862034 0.07287370 0.06924227
## edh_is.na -0.0298496078 -0.023869160 0.018298843 0.04278551 0.06563305
## pupil_is.na NaN -0.028658751 -0.020254575 0.02318343 0.03887609
## motor_is.na NaN NaN NaN NaN NaN
## ctclass_is.na -0.0750312122 -0.023662963 NaN 0.01626310 0.06052439
## d.sysbp_is.na NaN NaN NaN NaN NaN
## hb_is.na 0.0005132771 -0.003710246 -0.012940904 0.02107206 NaN
## glucose_is.na 0.0150930949 -0.020565988 -0.049791976 0.04466416 0.01725871
## unfav_is.na NaN NaN NaN NaN NaN
## mort_is.na NaN NaN NaN NaN NaN
## glucose unfav mort
## trial_is.na NaN NaN NaN
## age_is.na NaN NaN NaN
## hypoxia_is.na -0.004860708 -0.002531656 -0.02054161
## hypotens_is.na 0.011223556 0.013113682 0.01173212
## cisterns_is.na -0.046590315 0.033074703 0.03341347
## shift_is.na -0.045520597 0.024587698 0.02631154
## tsah_is.na -0.088678630 -0.055916708 -0.04589547
## edh_is.na -0.036218129 -0.021781268 -0.02622100
## pupil_is.na 0.010353762 -0.018474498 -0.02998735
## motor_is.na NaN NaN NaN
## ctclass_is.na -0.074101567 -0.063310212 -0.07348964
## d.sysbp_is.na NaN NaN NaN
## hb_is.na -0.008761202 -0.022962768 -0.03562051
## glucose_is.na NaN -0.015499980 -0.01349823
## unfav_is.na NaN NaN NaN
## mort_is.na NaN NaN NaN
MDPA.Cleaned.Metadata$NA_Correlation_plot

MDPA.Cleaned.Metadata$min_PDM_thresholds
## min_PDM_threshold perc_obs_retained
## 1 5 89
## 2 10 81
## 3 20 74
## 4 50 60
## 5 100 49
## 6 200 0
## 7 500 0
## 8 1000 0
MDPA.Cleaned.Metadata$Vars_above_half
## character(0)
1.3.2 Assessment using RBTEST
Missing data mechanism assessment:
[A] The missing data mechanisms were summarized using
the RBtest method from the
RBtest
package.
[A.1] Variables assessed with Missing at Random (MAR)
mechanism as evaluated against the trial variable (factor), age variable (numeric), motor variable (factor), d.sysbp variable (numeric), unfav variable (factor) and mort variable (factor) with complete
observations :
[A.1.1] hypoxia
variable (factor)
[A.1.2] cisterns
variable (factor)
[A.1.3] ctclass
variable (factor)
[A.2] Variables assessed with Missing Completely at
Random (MCAR) mechanism as evaluated against the trial variable (factor), age variable (numeric), motor variable (factor), d.sysbp variable (numeric), unfav variable (factor) and mort variable (factor) with complete
observations :
[A.2.1] hypotens
variable (factor)
[A.2.2] shift
variable (factor)
[A.2.3] tsah
variable (factor)
[A.2.4] edh
variable (factor)
[A.2.5] pupil
variable (factor)
[A.2.6] hb variable
(numeric)
[A.2.7] glucose
variable (numeric)
##################################
# Loading dataset
##################################
MDPA <- TBI.Analysis
##################################
# Testing the missingness mechanism using a regression-based approach
# MCAR (Missing Completely at Random) versus MAR (Missing at Random)
# for variables with missing data as evaluated against variables with complete observations
##################################
RBtest(TBI.Analysis)
## $abs.nbrMD
## trial age hypoxia hypotens cisterns shift tsah edh
## 0 0 249 59 257 252 96 38
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 123 0 25 0 24 64 0 0
##
## $rel.nbrMD
## trial age hypoxia hypotens cisterns shift tsah edh
## 0.00 0.00 0.12 0.03 0.12 0.12 0.04 0.02
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 0.06 0.00 0.01 0.00 0.01 0.03 0.00 0.00
##
## $type
## trial age hypoxia hypotens cisterns shift tsah edh
## -1 -1 1 0 1 0 0 0
## pupil motor ctclass d.sysbp hb glucose unfav mort
## 0 -1 1 -1 0 0 -1 -1
1.5 Missing Data Imputation and Post-Imputation Diagnostics
1.5.1 Imputation using MICE MIXED
Missing data imputation method implementation:
[A] The missing data imputation method from the
mice
package as implemented in the
missCompare
package showed:
[A.1] Relatively poor data plausibility for the numeric
variables in the imputed dataset as compared to the original
dataset.
[A.2] Relatively good data plausibility for the
categorical variables in the imputed dataset as compared to the original
dataset.
[A.3] Consistent numeric variable clustering in the
imputed dataset as compared to the original dataset.
##################################
# Loading dataset
##################################
MDIPID <- TBI.Analysis
##################################
# Performing missing data imputation
# on the original dataset using
# Method 11 : mice mixed
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
##################################
MDIPID.Imputed.MiceMixed <- missCompare::impute_data(MDIPID,
scale=F,
n.iter=3,
sel_method=c(11))
## [1] "mice mixed imputation - in progress"
##################################
# Performing post-imputation diagnostics
# using Method 11 : mice mixed
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
##################################
MDIPID.Imputed.MiceMixed.Diagnostics <- missCompare::post_imp_diag(MDIPID,
MDIPID.Imputed.MiceMixed$mice_mixed_imputation[[1]],
scale=F,
n.boot=5)
##################################
# Comparing the histograms of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
MDIPID.Imputed.MiceMixed.Diagnostics$Histograms
## $age

##
## $d.sysbp

##
## $hb

##
## $glucose

##################################
# Comparing the box plots of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
MDIPID.Imputed.MiceMixed.Diagnostics$Boxplots
## $age

##
## $d.sysbp

##
## $hb

##
## $glucose

##################################
# Comparing the summary statistics of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
MDIPID.Imputed.MiceMixed.Diagnostics$Statistics
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.4676112 2.5033293 12.3083333 3.0329949 0.7999705
## KS_test.D
## 0.1403981
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.86326120 3.19075643 9.58932292 3.32802492 0.08983496
## KS_test.D
## 0.15085770
##################################
# Comparing the correlation plots of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
MDIPID.Imputed.MiceMixed.Diagnostics$Correlation_plot

##################################
# Comparing the bar charts of the numeric variables
# between the imputed and original datasets
# from Method 11 : mice mixed
##################################
MDIPID.Imputed.MiceMixed.Diagnostics$Barcharts
## $trial

##
## $hypoxia

##
## $hypotens

##
## $cisterns

##
## $shift

##
## $tsah

##
## $edh

##
## $pupil

##
## $motor

##
## $ctclass

##
## $unfav

##
## $mort

##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 11 : mice mixed
###############################
MDIPID.Imputed.MiceMixed.Diagnostics$Variable_clusters_imp

MDIPID.Imputed.MiceMixed.Diagnostics$Variable_clusters_orig

1.5.2 Imputation using HMISC AREGIMPUTE
Missing data imputation method implementation:
[A] The missing data imputation method from the
Hmisc
package as implemented in the
missCompare
package showed:
[A.1] Relatively poor data plausibility for the numeric
variables in the imputed dataset as compared to the original
dataset.
[A.2] Relatively good data plausibility for the
categorical variables in the imputed dataset as compared to the original
dataset.
[A.3] Consistent numeric variable clustering in the
imputed dataset as compared to the original dataset.
##################################
# Loading dataset
##################################
MDIPID <- TBI.Analysis
##################################
# Performing missing data imputation
# on the original dataset using
# Method 15 : Hmisc aregImpute
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
##################################
MDIPID.Imputed.HmiscAregImpute <- missCompare::impute_data(MDIPID,
scale=F,
n.iter=3,
sel_method=c(15))
## [1] "Hmisc aregImpute imputation - in progress"
##################################
# Performing post-imputation diagnostics
# Method 15 : Hmisc aregImpute
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics <- missCompare::post_imp_diag(MDIPID,
MDIPID.Imputed.HmiscAregImpute$Hmisc_aregImpute_imputation[[1]],
scale=F,
n.boot=5)
##################################
# Comparing the histograms of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Histograms
## $age

##
## $d.sysbp

##
## $hb

##
## $glucose

##################################
# Comparing the box plots of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Boxplots
## $age

##
## $d.sysbp

##
## $hb

##
## $glucose

##################################
# Comparing the summary statistics of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Statistics
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.4676112 2.5033293 12.8333333 1.7102420 0.3108353
## KS_test.D
## 0.2188329
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.8632612 3.1907564 9.1427951 3.3199221 0.5088016
## KS_test.D
## 0.1510740
##################################
# Comparing the correlation plots of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Correlation_plot

##################################
# Comparing the bar charts of the numeric variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
##################################
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Barcharts
## $trial

##
## $hypoxia

##
## $hypotens

##
## $cisterns

##
## $shift

##
## $tsah

##
## $edh

##
## $pupil

##
## $motor

##
## $ctclass

##
## $unfav

##
## $mort

##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 15 : Hmisc aregImpute
###############################
MDIPID.Imputed.HmiscAregImpute.Diagnostics$Variable_clusters_imp

MDIPID.Imputed.HmiscAregImpute.Diagnostics$Variable_clusters_orig

1.5.3 Imputation using VIM KNN
Missing data imputation method implementation:
[A] The missing data imputation method from the
VIM
package as implemented in the
missCompare
package showed:
[A.1] Relatively good data plausibility for the numeric
variables in the imputed dataset as compared to the original
dataset.
[A.2] Relatively poor data plausibility for the
categorical variables in the imputed dataset as compared to the original
dataset.
[A.3] Consistent numeric variable clustering in the
imputed dataset as compared to the original dataset.
##################################
# Loading dataset
##################################
MDIPID <- TBI.Analysis
##################################
# Performing missing data imputation
# on the original dataset using
# Method 16 : VIM kNN
##################################
##################################
# Process parameters defined as follows :
# Number of iterations = 3 (for illustration only, actual number should be set high in real applications)
##################################
MDIPID.Imputed.VIMkNN <- missCompare::impute_data(MDIPID,
scale=F,
n.iter=3,
sel_method=c(16))
## [1] "VIM kNN imputation - in progress"
##################################
# Performing post-imputation diagnostics
# using Method 16 : VIM kNN
##################################
##################################
# Process parameters defined as follows :
# Number of bootstrap cycles = 5 (for illustration only, actual number should be set high in real applications)
##################################
MDIPID.Imputed.VIMkNN.Diagnostics <- missCompare::post_imp_diag(MDIPID,
MDIPID.Imputed.VIMkNN$VIM_kNN_imputation[[1]],
scale=F,
n.boot=5)
##################################
# Comparing the histograms of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
MDIPID.Imputed.VIMkNN.Diagnostics$Histograms
## $age

##
## $d.sysbp

##
## $hb

##
## $glucose

##################################
# Comparing the box plots of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
MDIPID.Imputed.VIMkNN.Diagnostics$Boxplots
## $age

##
## $d.sysbp

##
## $hb

##
## $glucose

##################################
# Comparing the summary statistics of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
MDIPID.Imputed.VIMkNN.Diagnostics$Statistics
## $age
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 33.21121 13.57760 NA NA NA
## KS_test
## NA
##
## $d.sysbp
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 131.42962 15.85943 NA NA NA
## KS_test
## NA
##
## $hb
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 12.46761124 2.50332935 12.93750000 1.27528769 0.08938506
## KS_test.D
## 0.20618657
##
## $glucose
## Mean_original SD_original Mean_imputed SD_imputed Welch_ttest_P
## 8.8632612039 3.1907564349 8.1872829861 1.2668232797 0.0001809788
## KS_test.D
## 0.2043556086
##################################
# Comparing the correlation plots of the numeric variables
# between the imputed and original datasets
# from Method 12 : VIM kNN
##################################
MDIPID.Imputed.VIMkNN.Diagnostics$Correlation_plot

##################################
# Comparing the bar charts of the numeric variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
##################################
MDIPID.Imputed.VIMkNN.Diagnostics$Barcharts
## $trial

##
## $hypoxia

##
## $hypotens

##
## $cisterns

##
## $shift

##
## $tsah

##
## $edh

##
## $pupil

##
## $motor

##
## $ctclass

##
## $unfav

##
## $mort

##################################
# Comparing the cluster plot of the variables
# between the imputed and original datasets
# from Method 16 : VIM kNN
###############################
MDIPID.Imputed.VIMkNN.Diagnostics$Variable_clusters_imp

MDIPID.Imputed.VIMkNN.Diagnostics$Variable_clusters_orig
