Libraries

library(ggplot2)
library(readxl)
library(dplyr)
library(writexl)
library(data.table)
library(xlsx)
library(knitr)
library(reactable)

Functions

source("../scripts/useful-functions/get_column_position.R")
# In a normal script it will be:  source("./scripts/useful-functions/get_column_position.R")

1. Reading Data

df1 has information about the Descriptive Variables.

# Load the monitoring workbook and keep it as a plain data frame.
df <- read_excel("../data/raw-data/00MONITORIZACION_READ_R.xlsx") %>%
  as.data.frame()
# cat(colnames(df), "\t")

# The workbook is split into separate data frames by kind of variable.
# df1 holds the descriptive (explanatory) variables: every column from the
# first one up to and including "ANTIBIOTICO".
# NOTE(review): get_column_position() is sourced from another file; it is
# presumably the index of the named column - confirm.
last_descriptive_col <- get_column_position(df, "ANTIBIOTICO")
df1 <- df[, 1:last_descriptive_col]
cat(names(df1), "\t")
## ID NHC INICIALES EM EDAD PESO SEXO ENFERMEDAD_BASE PREMATURIDAD EG PALIVIZUMAB LM DERMATITIS ALERGIAS TABACO DESNUTRICION ETIOLOGIA RADIOGRAFIA ANALITICA LEUCOCITOS NEUTROFILOS LINFOCITOS PCR PCT SAPI_0_8h SAPI_8_16h SAPI_16_24h FR_0_8h FR_8_16h FR_16_24h SCORE_CRUCES_INGRESO SCORE_WOOD_DOWNES_INGRESO SCORE_WOOD_DOWNES_24H ALIMENTACION SUERO SNG FLUJO2_0_8H FLUJO2_8_16h FLUJO2_16_24h OAF FLUJOAF_0_8h FLUJOAF_8_16h FLUJOAF_16-24h FiO2_0_8h FiO2_8_16h FiO2_16_24h DIAS_O2_TOTAL DIAS_GN DIAS_OAF OAF si1 no0 UCIP DETERIORO APNEA BRONCODILATADORES CORTICOIDES ANTIBIOTICO  
reactable(df1)

df2 holds the hourly time-series variables (FC and SatO2 for hours 1 to 24); these hourly values will be recalculated below.

# df2 keeps the three leading columns plus the run of columns that starts
# right after "ANTIBIOTICO" and ends at "SatO2_24" (the FC and SatO2 series).
first_series_col <- get_column_position(df, "ANTIBIOTICO") + 1
last_series_col <- get_column_position(df, "SatO2_24")
df2 <- df[, c(1:3, first_series_col:last_series_col)]
cat(names(df2), "\t")
## ID NHC INICIALES FC_1 FC_2 FC_3 FC_4 FC_5 FC_6 FC_7 FC_8 FC_9 FC_10 FC_11 FC_12 FC_13 FC_14 FC_15 FC_16 FC_17 FC_18 FC_19 FC_20 FC_21 FC_22 FC_23 FC_24 SatO2_1 SatO2_2 SatO2_3 SatO2_4 SatO2_5 SatO2_6 SatO2_7 SatO2_8 SatO2_9 SatO2_10 SatO2_11 SatO2_12 SatO2_13 SatO2_14 SatO2_15 SatO2_16 SatO2_17 SatO2_18 SatO2_19 SatO2_20 SatO2_21 SatO2_22 SatO2_23 SatO2_24   
reactable(df2)

1.2 Reading all the data frames

Now the work consists of reading all the different files that are in the folder and seeing if all of them match with the descriptive variables that are in MONITORIZACION_READ_R.xlsx

1.2.1 Reading all patients time series files

2. Cleaning the Data

2.1 Initial checking

  1. Checking if all the file names meet the requirements

# Print every expected name in `file_names` that matches none of the files
# found in ../data/raw-data, i.e. the names that FAIL the check.
# The directory is listed once, outside the loop.
raw_data_files <- list.files(path = "../data/raw-data")
for (name in file_names) {
  # fixed = TRUE: the name is matched as literal text, not as a regex.
  if (!any(grepl(name, raw_data_files, fixed = TRUE))) {
    print(name)
  }
}
  1. Cleaning all the data frames

It is important to convert the values in the time column into something we can work with, so the first column is reformatted to keep only the time of day (HH:MM:SS).

# For every patient object: standardise the three column names and reduce
# the first column from a full timestamp to its time of day.
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  colnames(data) <- c("Time", "FC", "SatO2")
  # Parse the timestamp, then render it again as hours:minutes:seconds only.
  timestamps <- as.POSIXct(data[, 1], format = '%Y/%m/%d %H:%M:%S')
  data[, 1] <- format(timestamps, format = '%H:%M:%S')
  # Store the cleaned table back under the patient's name.
  assign(name_variable, data)
}
# Preview of the last patient processed.
kable(head(data))
Time FC SatO2
03:50:00 134 94
03:51:00 125 100
03:52:00 127 100
03:53:00 133 100
03:54:00 129 100
03:55:00 149 100
  1. Adding one column with information about the hour
# Tag every sample of every patient with its hour of day (two-digit string).
# When an hour label that was already seen comes back later in the series,
# the rows of that later run are relabelled with a "_1" suffix so the two
# runs remain separate groups.
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  data$hour <- format(as.POSIXct(data$Time, format = "%H:%M:%S"), format = "%H")
  unique_values <- c()
  j <- 1
  # seq_along() instead of 1:length(): a table with zero rows is skipped
  # instead of iterating over c(1, 0).
  for (i in seq_along(data$hour)) {
    if (!(data$hour[i] %in% unique_values)) {
      # First time this hour label appears.
      unique_values[j] <- data$hour[i]
      j <- j + 1
    } else if (data$hour[i - 1] != data$hour[i]) {
      # Label already seen and the previous row carries a different label
      # (another hour, or this hour already suffixed): this row belongs to
      # the repeated run.
      # NOTE(review): two consecutive NA hours make this comparison NA and
      # `if` stops with an error - confirm Time has no missing values here.
      data$hour[i] <- paste0(data$hour[i], "_1")
    }
  }
  assign(name_variable, data)
}

# Preview of the last patient processed.
kable(head(data))
Time FC SatO2 hour
03:50:00 134 94 03
03:51:00 125 100 03
03:52:00 127 100 03
03:53:00 133 100 03
03:54:00 129 100 03
03:55:00 149 100 03

Let's compute, for each patient, the mean per hour together with the number of missing values and the number of values recorded in that hour. The resulting table shows several important things for each patient:

  1. Hour: Time referring the recollection of the data.

  2. N: Amount of time values.

  3. Missing_FC: Missing FC values.

  4. Missing_SatO2: Missing SatO2 values.

  5. avg_FC_with_NA: Mean calculated using the available values of FC.

  6. avg_SatO2_with_NA: Mean calculated using the available values of SatO2

As many .xlsx files are created as there are patients, each file will be as follows:

Each file will be named <INICIALES>_<NHC>_info.xlsx and will be stored in ../data/info-patients

# Build an hourly summary table for every patient and store it in the
# global environment as "<patient>_info". One row per hour label, in order
# of first appearance in the recording, with:
#   hour, n                           - hour label and number of samples
#   Missing_FC, Missing_SatO2         - number of NA samples per signal
#   plain mean of FC and of SatO2     - NA as soon as one sample is NA
#   avg_FC_with_NA, avg_SatO2_with_NA - mean of the available samples
# NOTE(review): the two plain-mean columns get their names from the
# deparsed subsetting expression; left unchanged in case later code reads
# them by name.
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  # Hour labels in order of appearance; used as factor levels to restore
  # chronological order after each grouped summary.
  valores_unicos <- unique(data$hour)

  # Number of samples per hour
  count_values <- data %>%
    group_by(hour) %>%
    count() %>%
    mutate(valor_orden = factor(hour, levels = valores_unicos)) %>%
    arrange(valor_orden)
  count_values <- data.frame(count_values[, c("hour", "n")])

  # Missing values in FC
  Missing_FC <- data %>%
    group_by(hour) %>%
    dplyr::summarize(Missing_FC = sum(is.na(FC))) %>%
    mutate(valor_orden = factor(hour, levels = valores_unicos)) %>%
    arrange(valor_orden)
  Missing_FC <- data.frame(Missing_FC[, c("Missing_FC")])

  # Missing values in SatO2
  Missing_SatO2 <- data %>%
    group_by(hour) %>%
    dplyr::summarize(Missing_SatO2 = sum(is.na(SatO2))) %>%
    mutate(valor_orden = factor(hour, levels = valores_unicos)) %>%
    arrange(valor_orden)
  Missing_SatO2 <- data.frame(Missing_SatO2[, c("Missing_SatO2")])

  # Plain means (NA propagates). One data.table conversion serves both.
  data <- data.table(data)
  Mean_FC <- data.frame(data[, list(avg_SatFC = mean(FC)), by = hour])
  Mean_FC <- data.frame(Mean_FC[, c("avg_SatFC")])

  Mean_SatO2 <- data.frame(data[, list(avg_SatO2 = mean(SatO2)), by = hour])
  Mean_SatO2 <- data.frame(Mean_SatO2[, c("avg_SatO2")])

  # Mean of the non-missing SatO2 values
  Mean_SatO2_with_NA <- data %>%
    group_by(hour) %>%
    summarise(avg_SatO2_with_NA = mean(SatO2, na.rm = TRUE)) %>%
    mutate(valor_orden = factor(hour, levels = valores_unicos)) %>%
    arrange(valor_orden)
  Mean_SatO2_with_NA <- data.frame(Mean_SatO2_with_NA[, c("avg_SatO2_with_NA")])

  # Mean of the non-missing FC values
  Mean_FC_with_NA <- data %>%
    group_by(hour) %>%
    summarise(avg_FC_with_NA = mean(FC, na.rm = TRUE)) %>%
    mutate(valor_orden = factor(hour, levels = valores_unicos)) %>%
    arrange(valor_orden)
  Mean_FC_with_NA <- data.frame(Mean_FC_with_NA[, c("avg_FC_with_NA")])

  # Side-by-side bind: every piece has one row per hour in the same order.
  merged_df <- data.frame(cbind(
    count_values, Missing_FC, Missing_SatO2,
    Mean_FC, Mean_SatO2, Mean_FC_with_NA, Mean_SatO2_with_NA
  ))
  assign(paste0(name_variable, "_info"), merged_df)
  # To write the tables to Excel files:
  # write_xlsx(merged_df, paste0("../data/info-patients/", name_variable, "_info", ".xlsx"))
}
# Preview of the last patient processed.
kable(head(merged_df))
hour n Missing_FC Missing_SatO2 Mean_FC…c..avg_SatFC… Mean_SatO2…c..avg_SatO2… avg_FC_with_NA avg_SatO2_with_NA
03 10 0 0 139.7000 99.20000 139.7000 99.20000
04 60 0 0 113.7167 97.68333 113.7167 97.68333
05 60 0 0 111.7000 97.86667 111.7000 97.86667
06 60 0 0 105.6833 98.23333 105.6833 98.23333
07 60 0 0 112.3167 97.33333 112.3167 97.33333
08 60 12 12 NA NA 141.4167 86.95833

Let's check whether all the files have the same length; if not, I will impute NA values so that the same methodology can be applied to all patients.

A complete file should have values for each minute in the 24 h of study. So 60 mins * 24 h = 1440 minutes + 1 min to close the circle

# One row per patient: number of samples in the recording ("Time") and
# whether it equals the expected length ("1441?" = "YES"/"NO").
# 60 mins * 24 h = 1440 minutes + 1 min to close the circle.
expected_length <- 1441
M <- length(file_patient_name)
N <- 2
matrix_time_length <- as.data.frame(
  matrix(data = NA, nrow = M, ncol = N),
  row.names = file_patient_name
)
colnames(matrix_time_length) <- c("Time", "1441?")

for (name_file in file_patient_name) {
  data <- get(name_file)
  n_rows <- length(data$Time)
  matrix_time_length[name_file, 1] <- n_rows
  matrix_time_length[name_file, 2] <- if (n_rows == expected_length) "YES" else "NO"
}

kable(head(matrix_time_length))
Time 1441?
ACR_11231843 1441 YES
ADAO_11159808 1441 YES
AGG_11236448 1441 YES
AHL_11239959 1441 YES
AJGD_11119689 1441 YES
AJJ_11233049 1441 YES
# Print the patients flagged "NO" in matrix_time_length: the patient name
# followed by the stored number of samples.
for (name_file in file_patient_name) {
  is_incomplete <- matrix_time_length[paste0(name_file), 2] == "NO"
  if (is_incomplete) {
    print(name_file)
    print(matrix_time_length[paste0(name_file), 1])
  }
}

Those patients who do not meet the requirements have been manipulated. NA values have been added to the Time, FC, and SatO2 rows. In some files, it is indicated that missing values are from the head, and by default, NA values have been added to the tail.

2.2 Full data-frames of Heart Rate and SatO2

# Gather all patients into two wide data frames (one per signal):
# M rows (samples) by N columns (one column per patient).
M <- 1441
N <- length(file_patient_name)

FC_all_patients <- as.data.frame(matrix(data = NA, nrow = M, ncol = N))
colnames(FC_all_patients) <- file_patient_name

SatO2_all_patients <- as.data.frame(matrix(data = NA, nrow = M, ncol = N))
colnames(SatO2_all_patients) <- file_patient_name

# Copy each patient's series into the column that carries its name.
for (name_file in file_patient_name) {
  data <- get(name_file)
  FC_all_patients[, paste0(name_file)] <- data$FC
  SatO2_all_patients[, paste0(name_file)] <- data$SatO2
}

# Sample index 1..M used as the time axis of every series.
time_index <- c(1:M)
FC_all_patients$time <- time_index
SatO2_all_patients$time <- time_index

# Export both wide tables.
write_xlsx(FC_all_patients, "../data/FC_&_SatO2/FC_all_patients.xlsx")
write_xlsx(SatO2_all_patients, "../data/FC_&_SatO2/SatO2_all_patients.xlsx")

Merged dataframe of Heart Rate values FC_all_patients (all the patients have the same length 1441 values)

reactable(FC_all_patients)

Merged dataframe of SatO2 values SatO2_all_patients (all the patients have the same length 1441 values)

reactable(SatO2_all_patients)

2.2.1 Plotting data-frames

Plotting the first 20 values of Heart Rate and SatO2 of all the patients.

# Reshape the first N samples of each wide table to long format
# (columns: time, time_series = patient, value) and draw one line per patient.
N <- 20
first_rows <- 1:N

gather_FC_all_patients <- tidyr::gather(
  data = FC_all_patients[first_rows, ],
  key = "time_series",
  value = "value",
  -time
)

gather_SatO2_all_patients <- tidyr::gather(
  data = SatO2_all_patients[first_rows, ],
  key = "time_series",
  value = "value",
  -time
)

# Heart Rate, legend hidden because there is one colour per patient.
ggplot(
  gather_FC_all_patients,
  mapping = aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  theme_bw() +
  labs(title = "Heart Rate\n by patient", x = "Time", y = "BPM") +
  theme(legend.position = "none")

# SatO2, same layout.
ggplot(
  gather_SatO2_all_patients,
  mapping = aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  theme_bw() +
  labs(title = "SatO2\n by patient", x = "Time", y = "sO2") +
  theme(legend.position = "none")

Plotting all the Heart Rate and SatO2 values of all the patients.

# Long-format version of the complete series (all rows, not a subset).
# These tables are also exported for the Graphic Interface.
gather_FC_all_patients <- tidyr::gather(
  data = FC_all_patients,
  key = "time_series",
  value = "value",
  -time
)

gather_SatO2_all_patients <- tidyr::gather(
  data = SatO2_all_patients,
  key = "time_series",
  value = "value",
  -time
)

write_xlsx(gather_FC_all_patients, "../deploying/data/gather_FC_all_patients.xlsx")
write_xlsx(gather_SatO2_all_patients, "../deploying/data/gather_SatO2_all_patients.xlsx")

# Heart Rate, one line per patient, legend hidden.
ggplot(
  gather_FC_all_patients,
  mapping = aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  theme_bw() +
  labs(title = "Heart Rate\n by patient", x = "Time", y = "BPM") +
  theme(legend.position = "none")

# SatO2, same layout.
ggplot(
  gather_SatO2_all_patients,
  mapping = aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  theme_bw() +
  labs(title = "SatO2\n by patient", x = "Time", y = "sO2") +
  theme(legend.position = "none")

Graphic Interface for visualizing each patient isolated: GraphicInterface

3 Missing Data

This table shows the amount of missing data to work with:

kable(rbind(total_of_values, total_of_missing_values, PCT_of_missing_values), col.names = c("Heart Rate","SatO2"))
Heart Rate SatO2
total_of_values 1.152800e+05 1.152800e+05
total_of_missing_values 5.745000e+03 5.609000e+03
PCT_of_missing_values 4.983518e+00 4.865545e+00

Heart Rate each patient

# Missingness map of the wide Heart Rate table, columns sorted by amount of
# missing data; x labels rotated so the patient names stay readable.
vis_miss(FC_all_patients, sort_miss = TRUE) +
  labs(title = "Missing values in Heart Rate") +
  theme(axis.text.x = element_text(angle = 90))

SatO2 each patient

# Missingness map of the wide SatO2 table, columns sorted by amount of
# missing data; x labels rotated so the patient names stay readable.
vis_miss(SatO2_all_patients, sort_miss = TRUE) +
  labs(title = "Missing values in SatO2") +
  theme(axis.text.x = element_text(angle = 90))

Common values

# Common variables
# Missingness map of the descriptive variables in df1, columns sorted by
# amount of missing data; x labels rotated for readability.
vis_miss(df1, sort_miss = TRUE) +
  labs(title = "Missing values in common variables") +
  theme(axis.text.x = element_text(angle = 90))

Patients that show DETERIORO

# Identifiers ("<INICIALES>_<NHC>") of the patients with DETERIORO == 1.
# which() keeps only the rows where the comparison is TRUE, so a missing
# DETERIORO value cannot produce a spurious "NA_NA" identifier.
deterioro_rows <- which(df$DETERIORO == 1)
file_patient_name_DETERIORO <- paste0(
  df$INICIALES[deterioro_rows], "_", df$NHC[deterioro_rows]
)
DETERIORO PATIENTS [1] "AJJ_11233049" "APA_11204819" "ARR_11228585" [4] "DGS_11215248" "ECP_11169795" "HDBG_11139366" [7] "ILG_11229582" "JJB_11182744" "JJS2_11218322" [10] "JPT_11236205" "LMF_11116324" "MMB_11205362" [13] "MPF_11185698" "NSM_11223800" "PGF_11242386" [16] "PMR_11230016" "SMG_11123019" "VCR_11203302"

Patients that don't show DETERIORO

# Identifiers ("<INICIALES>_<NHC>") of the patients with DETERIORO == 0.
# which() keeps only the rows where the comparison is TRUE, so a missing
# DETERIORO value cannot produce a spurious "NA_NA" identifier.
no_deterioro_rows <- which(df$DETERIORO == 0)
file_patient_name_NO_DETERIORO <- paste0(
  df$INICIALES[no_deterioro_rows], "_", df$NHC[no_deterioro_rows]
)
NOT DETERIORO PATIENTS [1] "ACR_11231843" "ADAO_11159808" "AGG_11236448" [4] "AHL_11239959" "AJGD_11119689" "AMF_11220011" [7] "AMP_11228639" "AMT_11120363" "ASN_11226885" [10] "AZM_11047760" "CBA_11124187" "CGN_11234482" [13] "DEA_11243504" "DIPDLH_11241649" "DJSD_11178309" [16] "DVS_11231268" "DZL_11227036" "FNMM_11174240" [19] "GGG_11156716" "GGT_11208499" "GHP_11229529" [22] "GLR_11225596" "HCC_11203216" "HGSDA_11233118" [25] "IGC_11229255" "IGC2_11229255" "IPA_11147550" [28] "IRL_11034760" "JFM_11233223" "JJS_11218322" [31] "JNM_11242584" "JPD_11209658" "JQA_11091598" [34] "KSBS_11201840" "LMM_11139982" "LMP_11060996" [37] "LMP2_11060996" "LMS_11228310" "LVBB_11135653" [40] "MA_11216747" "MFEH_11191624" "MPF_11185697" [43] "MSMM_11239970" "MSPJ_11164541" "MTG_11220400" [46] "NEH_11181855" "PBO_11129516" "PGA_11180136" [49] "RBJ_11163775" "RGFGM_11156248" "RVS_11034257" [52] "SGG_11181506" "SLF_11214212" "SPB_11241570" [55] "SPM_11222444" "SSF_11207023" "TCD_11245595" [58] "TGJ_11200052" "VAM_11160159" "YPR_11188252" [61] "YVR_11188465"
## Persist both patient-name vectors with the analysis data ...
write.xlsx(
  x = file_patient_name_NO_DETERIORO,
  file = "../data/clean-data/file_patient_name_NO_DETERIORO.xlsx"
)
write.xlsx(
  x = file_patient_name_DETERIORO,
  file = "../data/clean-data/file_patient_name_DETERIORO.xlsx"
)

## ... and a second copy under the deploying data folder.
write.xlsx(
  x = file_patient_name_NO_DETERIORO,
  file = "../deploying/data/file_patient_name_NO_DETERIORO.xlsx"
)
write.xlsx(
  x = file_patient_name_DETERIORO,
  file = "../deploying/data/file_patient_name_DETERIORO.xlsx"
)

3.1 Deterioro Patients

# FC DETERIORO Missing 
# Heart Rate missingness map restricted to the DETERIORO patients.
vis_miss(FC_all_patients[, file_patient_name_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in Heart Rate",
    subtitle = "DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

# SatO2 missingness map restricted to the DETERIORO patients.
vis_miss(SatO2_all_patients[, file_patient_name_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in SatO2",
    subtitle = "DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

Detailed information on the first two patients with DETERIORO

library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.2.2
# Two side-by-side panels showing, for one patient each, the proportion of
# missing Heart Rate values over consecutive spans of 60 samples.
# Each panel title is the patient whose column is plotted.
plot <- ggarrange(
  gg_miss_span(
    FC_all_patients[, c("SMG_11123019", "time")],
    var = SMG_11123019,
    span_every = 60
  ) + labs(
    title = "SMG_11123019",
    subtitle = " ",
    caption = "Over a repeating span of 60"
  ),

  gg_miss_span(
    FC_all_patients[, c("HDBG_11139366", "time")],
    var = HDBG_11139366,
    span_every = 60
  ) + labs(
    title = "HDBG_11139366",
    subtitle = " ",
    caption = "Over a repeating span of 60"
  ),
  common.legend = TRUE,
  legend = "bottom"
)

# Shared heading above both panels.
annotate_figure(plot, top = text_grob("Proportion of missing values",
               color = "black", face = "bold", size = 14))

3.2 NO Deterioro Patients

# Heart Rate missingness map restricted to the NO DETERIORO patients.
vis_miss(FC_all_patients[, file_patient_name_NO_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in Heart Rate",
    subtitle = "NO DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

# SatO2 missingness map restricted to the NO DETERIORO patients.
vis_miss(SatO2_all_patients[, file_patient_name_NO_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in SatO2",
    subtitle = "NO DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

Patients will be studied individually to see if they can be counted in the study. An admissible amount of missing data cannot exceed 5%.

3.3 Quality of patients

All patients

Heart Rate
# Missing Heart Rate samples per patient. The wide table is transposed so
# that each column becomes a row, the per-row summary is taken, and the
# numeric `case` index is mapped back to the row (patient) name.
# NOTE(review): the `time` column is transposed too and shows up as a row.
fc_by_patient <- data.frame(t(FC_all_patients))
row_names_info <- rownames(fc_by_patient)
missing_FC <- as.data.frame(miss_case_summary(fc_by_patient))
missing_FC$case <- row_names_info[missing_FC$case]
colnames(missing_FC) <- c("Patient", "N_Miss", "PCT_Miss")
write.xlsx(missing_FC, "../data/missing-info/missing_FC.xlsx")
kable(head(missing_FC))
Patient N_Miss PCT_Miss
SMG_11123019 782 54.26787
HDBG_11139366 706 48.99375
JFM_11233223 659 45.73213
MSPJ_11164541 616 42.74809
IGC_11229255 281 19.50035
MPF_11185698 234 16.23872
SatO2
# Missing SatO2 samples per patient. The wide table is transposed so that
# each column becomes a row, the per-row summary is taken, and the numeric
# `case` index is mapped back to the row (patient) name.
# NOTE(review): the `time` column is transposed too and shows up as a row.
sat_by_patient <- data.frame(t(SatO2_all_patients))
row_names_info <- rownames(sat_by_patient)
missing_SatO2 <- as.data.frame(miss_case_summary(sat_by_patient))
missing_SatO2$case <- row_names_info[missing_SatO2$case]
colnames(missing_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
write.xlsx(missing_SatO2, "../data/missing-info/missing_SatO2.xlsx")
kable(head(missing_SatO2))
Patient N_Miss PCT_Miss
SMG_11123019 781 54.19847
HDBG_11139366 706 48.99375
JFM_11233223 659 45.73213
MSPJ_11164541 616 42.74809
IGC_11229255 273 18.94518
MPF_11185698 220 15.26718

DETERIORO

Heart Rate
# Missing Heart Rate samples per DETERIORO patient: transpose the selected
# columns, summarise per row, then map the `case` index back to the name.
fc_deterioro <- data.frame(t(FC_all_patients[, file_patient_name_DETERIORO]))
row_names_info <- rownames(fc_deterioro)
missing_DETERIORO_FC <- as.data.frame(miss_case_summary(fc_deterioro))
missing_DETERIORO_FC$case <- row_names_info[missing_DETERIORO_FC$case]
colnames(missing_DETERIORO_FC) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_DETERIORO_FC))
Patient N_Miss PCT_Miss
SMG_11123019 782 54.267870
HDBG_11139366 706 48.993754
MPF_11185698 234 16.238723
DGS_11215248 182 12.630118
JJS2_11218322 62 4.302568
PGF_11242386 61 4.233171
SatO2
# Missing SatO2 samples per DETERIORO patient: transpose the selected
# columns, summarise per row, then map the `case` index back to the name.
sat_deterioro <- data.frame(t(SatO2_all_patients[, file_patient_name_DETERIORO]))
row_names_info <- rownames(sat_deterioro)
missing_DETERIORO_SatO2 <- as.data.frame(miss_case_summary(sat_deterioro))
missing_DETERIORO_SatO2$case <- row_names_info[missing_DETERIORO_SatO2$case]
colnames(missing_DETERIORO_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_DETERIORO_SatO2))
Patient N_Miss PCT_Miss
SMG_11123019 781 54.198473
HDBG_11139366 706 48.993754
MPF_11185698 220 15.267176
DGS_11215248 181 12.560722
JJS2_11218322 61 4.233171
PGF_11242386 61 4.233171

No DETERIORO

Heart Rate
# Missing Heart Rate samples per NO DETERIORO patient: transpose the
# selected columns, summarise per row, then map the `case` index back to
# the name.
fc_no_deterioro <- data.frame(t(FC_all_patients[, file_patient_name_NO_DETERIORO]))
row_names_info <- rownames(fc_no_deterioro)
missing_NO_DETERIORO_FC <- as.data.frame(miss_case_summary(fc_no_deterioro))
missing_NO_DETERIORO_FC$case <- row_names_info[missing_NO_DETERIORO_FC$case]
colnames(missing_NO_DETERIORO_FC) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_NO_DETERIORO_FC))
Patient N_Miss PCT_Miss
JFM_11233223 659 45.73213
MSPJ_11164541 616 42.74809
IGC_11229255 281 19.50035
MPF_11185697 215 14.92019
HGSDA_11233118 204 14.15684
SPB_11241570 163 11.31159
SatO2
# Missing SatO2 samples per NO DETERIORO patient: transpose the selected
# columns, summarise per row, then map the `case` index back to the name.
sat_no_deterioro <- data.frame(t(SatO2_all_patients[, file_patient_name_NO_DETERIORO]))
row_names_info <- rownames(sat_no_deterioro)
missing_NO_DETERIORO_SatO2 <- as.data.frame(miss_case_summary(sat_no_deterioro))
missing_NO_DETERIORO_SatO2$case <- row_names_info[missing_NO_DETERIORO_SatO2$case]
colnames(missing_NO_DETERIORO_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_NO_DETERIORO_SatO2))
Patient N_Miss PCT_Miss
JFM_11233223 659 45.73213
MSPJ_11164541 616 42.74809
IGC_11229255 273 18.94518
HGSDA_11233118 203 14.08744
MPF_11185697 188 13.04650
SPB_11241570 156 10.82582

Common values

# Missing values per descriptive variable of df1.
# df1 is transposed so that every variable becomes a row; the per-row
# summary is taken once and its numeric `case` index is mapped back to the
# variable name.
df1_names <- as.data.frame(colnames(df1))
variable_names <- paste0(df1_names[, 1])
row_names_info <- rownames(t(df1))
missing_df1 <- as.data.frame(miss_case_summary(as.data.frame(t(df1))))
missing_df1$case <- row_names_info[missing_df1$case]
colnames(missing_df1) <- c("Variable", "N_Miss", "PCT_Miss")
# Adding the class of the variable
#' Classify the columns of a data frame
#'
#' A column whose values are all 0 or 1 is reported as "factor"; any other
#' column is reported with the first element of its class vector. A missing
#' value is not in c(0, 1), so a 0/1 column containing NA keeps its own class.
#'
#' @param df A data frame.
#' @return An unnamed character vector with one entry per column of `df`
#'   (`character(0)` when `df` has no columns).
column_classes <- function(df) {
  vapply(
    df,
    function(column) {
      if (all(column %in% c(0, 1))) {
        "factor"
      } else {
        # class() can return several entries (e.g. ordered factors);
        # keep the first, most specific one.
        class(column)[1]
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
}
# One row per variable of df1 with its detected type.
column_info <- data.frame(
  Variable = names(df1),
  Type = column_classes(df1)
)

# Attach the missing-value counts and sort by percentage of missing
# values, highest first.
df_merge <- merge(column_info, missing_df1, by = "Variable")
miss_order <- order(df_merge$PCT_Miss, decreasing = TRUE)
df_merge <- df_merge[miss_order, ]

# Show the ordered table.
reactable(df_merge)

# Export the table as Excel and as CSV.
write.xlsx(df_merge, "../data/info-variables/variables-info.xlsx")
write.csv(df_merge, "../data/info-variables/variables-info.csv", row.names = FALSE)

4 Descriptive analysis

rownames(df1) <- file_patient_name
reactable(df1)

To handle these variables effectively, we aim to exclude those with missing values from our analysis. We encounter the same issue as before, which highlights the importance of differentiating between a priori and a posteriori variables.

I will only perform a descriptive analysis on the variables that have no missing values.

# summary(df1)
# Keep only the variables of df1 that have no missing value at all.
complete_variables <- missing_df1$Variable[missing_df1$N_Miss == 0]
df1_NO_NA <- df1[, complete_variables]
# The identifier columns ("ID", "NHC", "INICIALES") only name the patient,
# so they are removed as well.
df1_NO_NA <- subset(df1_NO_NA, select = -c(ID, NHC, INICIALES))
Variables With NO NA [1] "EDAD" "PESO" [3] "SEXO" "ENFERMEDAD_BASE" [5] "PREMATURIDAD" "EG" [7] "PALIVIZUMAB" "LM" [9] "DERMATITIS" "ALERGIAS" [11] "DESNUTRICION" "ETIOLOGIA" [13] "RADIOGRAFIA" "ANALITICA" [15] "SAPI_0_8h" "FR_0_8h" [17] "SCORE_WOOD_DOWNES_INGRESO" "SCORE_WOOD_DOWNES_24H" [19] "ALIMENTACION" "SUERO" [21] "SNG" "OAF" [23] "DIAS_OAF" "OAF si1 no0" [25] "UCIP" "DETERIORO" [27] "APNEA" "BRONCODILATADORES" [29] "CORTICOIDES" "ANTIBIOTICO"
reactable(df1_NO_NA)
#names(df1_NO_NA)
#head(df1_NO_NA)

4.1 Correlation Matrix

library(corrplot)
## corrplot 0.92 loaded
# Correlation plot of the complete variables. Columns are renamed to their
# position (1, 2, ...) so the tick labels stay short; seq_along() yields
# exactly one label per column, including the zero-column case.
df1_NO_NA_cor <- df1_NO_NA
colnames(df1_NO_NA_cor) <- seq_along(df1_NO_NA_cor)
corrplot(
  cor(df1_NO_NA_cor),
  tl.col = "black", tl.cex = 0.8, tl.srt = 70, order = "hclust"
)