Libraries

library(ggplot2)
library(readxl)
library(dplyr)
library(writexl)
library(data.table)
library(xlsx)
library(knitr)
library(reactable)

Functions

source("../scripts/useful-functions/get_column_position.R")
# In a normal script it will be:  source("./scripts/useful-functions/get_column_position.R")

1. Reading Data

df1 has information about the Descriptive Variables.

# Load the monitoring workbook and coerce the result to a base data.frame
df <- read_excel("../data/raw-data/00MONITORIZACION_READ_R.xlsx") %>%
  as.data.frame()
# cat(names(df), "\t")

# Splitting the sheet into smaller data frames keeps later steps simpler.
# df1: descriptive (explanatory) variables, i.e. every column from the first
# one up to and including "ANTIBIOTICO".
df1 <- df[, 1:get_column_position(df, "ANTIBIOTICO")]
cat(names(df1), "\t")

## ID NHC INICIALES EM EDAD PESO SEXO ENFERMEDAD_BASE PREMATURIDAD EG PALIVIZUMAB LM DERMATITIS ALERGIAS TABACO DESNUTRICION ETIOLOGIA RADIOGRAFIA ANALITICA LEUCOCITOS NEUTROFILOS LINFOCITOS PCR PCT SAPI_0_8h SAPI_8_16h SAPI_16_24h FR_0_8h FR_8_16h FR_16_24h SCORE_CRUCES_INGRESO SCORE_WOOD_DOWNES_INGRESO SCORE_WOOD_DOWNES_24H ALIMENTACION SUERO SNG FLUJO2_0_8H FLUJO2_8_16h FLUJO2_16_24h OAF FLUJOAF_0_8h FLUJOAF_8_16h FLUJOAF_16-24h FiO2_0_8h FiO2_8_16h FiO2_16_24h DIAS_O2_TOTAL DIAS_GN DIAS_OAF OAF si1 no0 UCIP DETERIORO APNEA BRONCODILATADORES CORTICOIDES ANTIBIOTICO

reactable(df1)

df2 has information about the time series variables each hour; I will calculate these values again.

# df2: the three identifier columns plus every column located after
# "ANTIBIOTICO" up to and including "SatO2_24" (hourly FC_* and SatO2_* values)
first_hourly_col <- get_column_position(df, "ANTIBIOTICO") + 1
last_hourly_col <- get_column_position(df, "SatO2_24")
df2 <- df[, c(1:3, first_hourly_col:last_hourly_col)]
cat(names(df2), "\t")

## ID NHC INICIALES FC_1 FC_2 FC_3 FC_4 FC_5 FC_6 FC_7 FC_8 FC_9 FC_10 FC_11 FC_12 FC_13 FC_14 FC_15 FC_16 FC_17 FC_18 FC_19 FC_20 FC_21 FC_22 FC_23 FC_24 SatO2_1 SatO2_2 SatO2_3 SatO2_4 SatO2_5 SatO2_6 SatO2_7 SatO2_8 SatO2_9 SatO2_10 SatO2_11 SatO2_12 SatO2_13 SatO2_14 SatO2_15 SatO2_16 SatO2_17 SatO2_18 SatO2_19 SatO2_20 SatO2_21 SatO2_22 SatO2_23 SatO2_24

reactable(df2)

1.2 Reading all the data frames

Now the work consists of reading all the different files that are in the folder and seeing if all of them match with the descriptive variables that are in MONITORIZACION_READ_R.xlsx

1.2.1 Reading all patients time series files

2. Cleaning the Data

2. 1 Initial checking

Checking if all the file names meet the requirements

# Print every expected name that has NO match among the files found in
# ../data/raw-data, i.e. the names that do not meet the requirements.
# The directory is listed once instead of once per name.
raw_data_files <- list.files(path = "../data/raw-data")
for (name in file_names) {
  # fixed = TRUE: the name is matched literally, not as a regular expression,
  # so characters such as "." or "(" cannot produce false matches.
  if (!any(grepl(name, raw_data_files, fixed = TRUE))) {
    print(name)
  }
}

Cleaning all the data frames

The timestamps in the first column must be converted into a format we can work with, so the first column of every patient table is reduced to its clock time (HH:MM:SS).

# Standardise every patient table: fixed column names and a first column that
# only keeps the clock time (HH:MM:SS) of the original timestamp.
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  colnames(data) <- c("Time", "FC", "SatO2")
  # Parse the full timestamp first, then re-format it without the date part
  timestamps <- as.POSIXct(data[, 1], format = '%Y/%m/%d %H:%M:%S')
  data[, 1] <- format(timestamps, format = '%H:%M:%S')
  assign(name_variable, data)
}
# Preview of the last patient processed
kable(head(data))

Time	FC	SatO2
03:50:00	134	94
03:51:00	125	100
03:52:00	127	100
03:53:00	133	100
03:54:00	129	100
03:55:00	149	100

Adding one column with information about the hour

# Tag every row of every patient table with its clock hour ("00".."23").
# When an hour label shows up again after a different hour has been seen
# (e.g. the recording returns to the hour it started in), the repeated rows
# are relabelled "<hour>_1" so they form their own group.
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  data$hour <- format(as.POSIXct(data$Time, format = "%H:%M:%S"), format = "%H")
  seen_hours <- character(0)
  # seq_along() instead of 1:length(): an empty table is skipped rather than
  # being indexed at positions 1 and 0.
  for (i in seq_along(data$hour)) {
    if (!(data$hour[i] %in% seen_hours)) {
      # First time this hour label appears
      seen_hours <- c(seen_hours, data$hour[i])
    } else if (data$hour[i - 1] != data$hour[i]) {
      # The previous row is either a different hour or was already relabelled
      # "<hour>_1", so the relabelling cascades through the whole repeated run.
      # (i is never 1 here: the first row always takes the branch above.)
      data$hour[i] <- paste0(data$hour[i], "_1")
    }
  }
  assign(name_variable, data)
}

# Preview of the last patient processed
kable(head(data))

Time	FC	SatO2	hour
03:50:00	134	94	03
03:51:00	125	100	03
03:52:00	127	100	03
03:53:00	133	100	03
03:54:00	129	100	03
03:55:00	149	100	03

Let's compute, for each patient and each hour, the mean value together with the number of values recorded and the number of missing values in that hour. This summary shows several important things for each patient:

Hour: Hour in which the data were collected.

N: Amount of time values.

Missing_FC: Missing FC values.

Missing_SatO2: Missing SatO2 values.

avg_FC_with_NA: Mean calculated using the available values of FC.

avg_SatO2_with_NA: Mean calculated using the available values of SatO2

As many .xlsx files are created as there are patients, each file will be as follows:

Each file will be named NHC-ID.xlsx and will be stored in ../data/info-patients

# Per-patient, per-hour quality summary. For every patient table this builds
# one row per `hour` label, in order of first appearance, with the columns:
#   hour               hour label
#   n                  number of rows recorded in that hour
#   Missing_FC         number of NA values in FC
#   Missing_SatO2      number of NA values in SatO2
#   avg_FC             mean of FC (NA if any value in the hour is missing)
#   avg_SatO2          mean of SatO2 (NA if any value in the hour is missing)
#   avg_FC_with_NA     mean of the available FC values (NAs removed)
#   avg_SatO2_with_NA  mean of the available SatO2 values (NAs removed)
# Column positions are unchanged; columns 5 and 6 now carry readable names
# instead of the deparsed `Mean_FC...c..avg_SatFC...` style names.
# The result is stored in the workspace as "<patient>_info".
for (name_variable in file_patient_name) {
  data <- get(name_variable)

  merged_df <- data %>%
    # Factor levels follow first appearance, so the summary keeps the
    # recording order instead of sorting the hour labels alphabetically.
    mutate(hour = factor(hour, levels = unique(hour))) %>%
    group_by(hour) %>%
    # A single grouped pass guarantees every statistic is aligned on the
    # same hour (no positional cbind of separately ordered tables).
    dplyr::summarise(
      n = dplyr::n(),
      Missing_FC = sum(is.na(FC)),
      Missing_SatO2 = sum(is.na(SatO2)),
      avg_FC = mean(FC),
      avg_SatO2 = mean(SatO2),
      avg_FC_with_NA = mean(FC, na.rm = TRUE),
      avg_SatO2_with_NA = mean(SatO2, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    mutate(hour = as.character(hour)) %>%
    as.data.frame()

  assign(paste0(name_variable, "_info"), merged_df)
  # To write the tables to Excel files:
  # write_xlsx(merged_df, paste0("../data/info-patients/", name_variable, "_info", ".xlsx"))
}
# Preview of the last patient processed
kable(head(merged_df))

hour	n	Missing_FC	Missing_SatO2	Mean_FC…c..avg_SatFC…	Mean_SatO2…c..avg_SatO2…	avg_FC_with_NA	avg_SatO2_with_NA
03	10	0	0	139.7000	99.20000	139.7000	99.20000
04	60	0	0	113.7167	97.68333	113.7167	97.68333
05	60	0	0	111.7000	97.86667	111.7000	97.86667
06	60	0	0	105.6833	98.23333	105.6833	98.23333
07	60	0	0	112.3167	97.33333	112.3167	97.33333
08	60	12	12	NA	NA	141.4167	86.95833

Let's check whether all the files have the same length; if not, I will impute NA values so that the same methodology can be applied to all patients.

A complete file should have values for each minute in the 24 h of study. So 60 mins * 24 h = 1440 minutes + 1 min to close the circle

# A complete recording has one row per minute over 24 h plus one closing
# minute: 60 * 24 + 1 = 1441 rows.
expected_length <- 1441L

# Number of rows of every patient table, in the order of file_patient_name
time_lengths <- vapply(
  file_patient_name,
  function(name_file) length(get(name_file)$Time),
  integer(1),
  USE.NAMES = FALSE
)

# One row per patient: observed length and whether it equals the expected one
matrix_time_length <- data.frame(
  Time = time_lengths,
  `1441?` = ifelse(time_lengths == expected_length, "YES", "NO"),
  row.names = file_patient_name,
  check.names = FALSE,      # keep the literal column name "1441?"
  stringsAsFactors = FALSE  # the flag column stays character on any R version
)

kable(head(matrix_time_length))

	Time	1441?
ACR_11231843	1441	YES
ADAO_11159808	1441	YES
AGG_11236448	1441	YES
AHL_11239959	1441	YES
AJGD_11119689	1441	YES
AJJ_11233049	1441	YES

# Report every patient whose recording length is flagged "NO", followed by the
# number of rows it actually has.
incomplete_files <- Filter(
  function(patient) matrix_time_length[patient, 2] == "NO",
  file_patient_name
)
for (name_file in incomplete_files) {
  print(name_file)
  print(matrix_time_length[name_file, 1])
}

Those patients who do not meet the requirements have been manipulated. NA values have been added to the Time, FC, and SatO2 rows. In some files, it is indicated that missing values are from the head, and by default, NA values have been added to the tail.

2.2 Full data-frames of Heart Rate and SatO2

# Gather all patients into two wide tables (one column per patient):
# FC_all_patients for heart rate and SatO2_all_patients for saturation.
M <- 1441                        # rows: expected length of every recording
N <- length(file_patient_name)   # columns: one per patient

# Start from an all-NA frame with the patients as column names
FC_all_patients <- as.data.frame(matrix(NA, nrow = M, ncol = N))
colnames(FC_all_patients) <- file_patient_name
SatO2_all_patients <- FC_all_patients

## Fill each patient's column with the values of its own table
for (name_file in file_patient_name) {
  data <- get(name_file)
  FC_all_patients[, name_file] <- data$FC
  SatO2_all_patients[, name_file] <- data$SatO2
}

# Extra column used as the time-series index (1..M)
FC_all_patients$time <- 1:M
SatO2_all_patients$time <- 1:M

write_xlsx(FC_all_patients, "../data/FC_&_SatO2/FC_all_patients.xlsx")
write_xlsx(SatO2_all_patients, "../data/FC_&_SatO2/SatO2_all_patients.xlsx")

Merged dataframe of Heart Rate values FC_all_patients (all the patients have the same length 1441 values)

reactable(FC_all_patients)

Merged dataframe of SatO2 values SatO2_all_patients (all the patients have the same length 1441 values)

reactable(SatO2_all_patients)

2.2.1 Plotting `data-frames`

Plotting the first 20 values of Heart Rate and SatO2 of all the patients.

# Number of leading time points to plot
N <- 20

# Long format (time, time_series, value) restricted to the first N rows
gather_FC_all_patients <- FC_all_patients[1:N, ] %>%
  tidyr::gather(key = "time_series", value = "value", -time)

gather_SatO2_all_patients <- SatO2_all_patients[1:N, ] %>%
  tidyr::gather(key = "time_series", value = "value", -time)

# One line per patient; the legend is hidden because there are too many series
ggplot(
  gather_FC_all_patients,
  aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  labs(title = "Heart Rate\n by patient", x = "Time", y = "BPM") +
  theme_bw() +
  theme(legend.position = "none")

ggplot(
  gather_SatO2_all_patients,
  aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  labs(title = "SatO2\n by patient", x = "Time", y = "sO2") +
  theme_bw() +
  theme(legend.position = "none")

Plotting all the Heart Rate and SatO2 values of all the patients.

# Long format (time, time_series, value) of the complete series.
# These tables are also exported for the Graphic Interface.
gather_FC_all_patients <- FC_all_patients %>%
  tidyr::gather(key = "time_series", value = "value", -time)  # all rows

gather_SatO2_all_patients <- SatO2_all_patients %>%
  tidyr::gather(key = "time_series", value = "value", -time)  # all rows

write_xlsx(gather_FC_all_patients, "../deploying/data/gather_FC_all_patients.xlsx")
write_xlsx(gather_SatO2_all_patients, "../deploying/data/gather_SatO2_all_patients.xlsx")

# One line per patient; the legend is hidden because there are too many series
ggplot(
  gather_FC_all_patients,
  aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  labs(title = "Heart Rate\n by patient", x = "Time", y = "BPM") +
  theme_bw() +
  theme(legend.position = "none")

ggplot(
  gather_SatO2_all_patients,
  aes(x = time, y = value, color = time_series)
) +
  geom_line() +
  labs(title = "SatO2\n by patient", x = "Time", y = "sO2") +
  theme_bw() +
  theme(legend.position = "none")

Graphic Interface for visualizing each patient isolated: GraphicInterface

3 Missing Data

This table shows the amount of missing data to work with:

kable(rbind(total_of_values, total_of_missing_values, PCT_of_missing_values), col.names = c("Heart Rate","SatO2"))

	Heart Rate	SatO2
total_of_values	1.152800e+05	1.152800e+05
total_of_missing_values	5.745000e+03	5.609000e+03
PCT_of_missing_values	4.983518e+00	4.865545e+00

Heart Rate each patient

# Missingness map of the heart-rate table, columns sorted by amount missing
vis_miss(FC_all_patients, sort_miss = TRUE) +
  labs(title = "Missing values in Heart Rate") +
  theme(axis.text.x = element_text(angle = 90))

SatO2 each patient

# Missingness map of the SatO2 table, columns sorted by amount missing
vis_miss(SatO2_all_patients, sort_miss = TRUE) +
  labs(title = "Missing values in SatO2") +
  theme(axis.text.x = element_text(angle = 90))

Common values

# Missingness map of the descriptive (common) variables in df1
vis_miss(df1, sort_miss = TRUE) +
  labs(title = "Missing values in common variables") +
  theme(axis.text.x = element_text(angle = 90))

Patients that show DETERIORO

# "<INICIALES>_<NHC>" identifier of every patient whose DETERIORO flag is 1
file_patient_name_DETERIORO <- paste0(
  df$INICIALES[df$DETERIORO == 1],
  "_",
  df$NHC[df$DETERIORO == 1]
)

DETERIORO PATIENTS

[1] "AJJ_11233049" "APA_11204819" "ARR_11228585" [4] "DGS_11215248" "ECP_11169795" "HDBG_11139366" [7] "ILG_11229582" "JJB_11182744" "JJS2_11218322" [10] "JPT_11236205" "LMF_11116324" "MMB_11205362" [13] "MPF_11185698" "NSM_11223800" "PGF_11242386" [16] "PMR_11230016" "SMG_11123019" "VCR_11203302"

Patients that don't show DETERIORO

# "<INICIALES>_<NHC>" identifier of every patient whose DETERIORO flag is 0
file_patient_name_NO_DETERIORO <- paste0(
  df$INICIALES[df$DETERIORO == 0],
  "_",
  df$NHC[df$DETERIORO == 0]
)

NOT DETERIORO PATIENTS

[1] "ACR_11231843" "ADAO_11159808" "AGG_11236448" [4] "AHL_11239959" "AJGD_11119689" "AMF_11220011" [7] "AMP_11228639" "AMT_11120363" "ASN_11226885" [10] "AZM_11047760" "CBA_11124187" "CGN_11234482" [13] "DEA_11243504" "DIPDLH_11241649" "DJSD_11178309" [16] "DVS_11231268" "DZL_11227036" "FNMM_11174240" [19] "GGG_11156716" "GGT_11208499" "GHP_11229529" [22] "GLR_11225596" "HCC_11203216" "HGSDA_11233118" [25] "IGC_11229255" "IGC2_11229255" "IPA_11147550" [28] "IRL_11034760" "JFM_11233223" "JJS_11218322" [31] "JNM_11242584" "JPD_11209658" "JQA_11091598" [34] "KSBS_11201840" "LMM_11139982" "LMP_11060996" [37] "LMP2_11060996" "LMS_11228310" "LVBB_11135653" [40] "MA_11216747" "MFEH_11191624" "MPF_11185697" [43] "MSMM_11239970" "MSPJ_11164541" "MTG_11220400" [46] "NEH_11181855" "PBO_11129516" "PGA_11180136" [49] "RBJ_11163775" "RGFGM_11156248" "RVS_11034257" [52] "SGG_11181506" "SLF_11214212" "SPB_11241570" [55] "SPM_11222444" "SSF_11207023" "TCD_11245595" [58] "TGJ_11200052" "VAM_11160159" "YPR_11188252" [61] "YVR_11188465"

## Save both patient-name vectors (with / without DETERIORO) as Excel files
## in the clean-data folder.
## NOTE(review): write.xlsx (with a dot) is presumably xlsx::write.xlsx, since
## library(xlsx) is attached at the top of the file - confirm.
write.xlsx(file_patient_name_NO_DETERIORO, "../data/clean-data/file_patient_name_NO_DETERIORO.xlsx")
write.xlsx(file_patient_name_DETERIORO, "../data/clean-data/file_patient_name_DETERIORO.xlsx")


## Same two files, copied to the folder read by the deployed application
write.xlsx(file_patient_name_NO_DETERIORO, "../deploying/data/file_patient_name_NO_DETERIORO.xlsx")
write.xlsx(file_patient_name_DETERIORO, "../deploying/data/file_patient_name_DETERIORO.xlsx")

3.1 Deterioro Patients

# Heart-rate missingness, restricted to the DETERIORO patients
vis_miss(FC_all_patients[, file_patient_name_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in Heart Rate",
    subtitle = "DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

# SatO2 missingness, restricted to the DETERIORO patients
vis_miss(SatO2_all_patients[, file_patient_name_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in SatO2",
    subtitle = "DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

Detailed information on the first two patients with DETERIORO

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.2.2

# Side-by-side view of the proportion of missing heart-rate values, computed
# over consecutive spans of 60 rows, for two DETERIORO patients.
# Each panel title is the patient actually plotted in that panel.
plot <- ggarrange(
  gg_miss_span(
    FC_all_patients[, c("SMG_11123019", "time")],
    var = SMG_11123019,
    span_every = 60
  ) +
    labs(
      title = "SMG_11123019",
      subtitle = " ",
      caption = "Over a repeating span of 60"
    ),

  gg_miss_span(
    FC_all_patients[, c("HDBG_11139366", "time")],
    var = HDBG_11139366,
    span_every = 60
  ) +
    labs(
      title = "HDBG_11139366",
      subtitle = " ",
      caption = "Over a repeating span of 60"
    ),
  common.legend = TRUE,
  legend = "bottom"

  # gg_miss_span(FC_all_patients[,c("MPF_11185697", "time")], var = HDBG_11139366, span_every = 60) + labs(title = "Proportion of missing values",
  #        subtitle = "Over a repeating span of 60",
  #        caption = "MPF_11185697")
)

# Shared headline above both panels
annotate_figure(
  plot,
  top = text_grob(
    "Proportion of missing values",
    color = "black", face = "bold", size = 14
  )
)

3.2 NO Deterioro Patients

# Heart-rate missingness, restricted to the patients WITHOUT DETERIORO
vis_miss(FC_all_patients[, file_patient_name_NO_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in Heart Rate",
    subtitle = "NO DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

# SatO2 missingness, restricted to the patients WITHOUT DETERIORO
vis_miss(SatO2_all_patients[, file_patient_name_NO_DETERIORO], sort_miss = TRUE) +
  labs(
    title = "Missing values in SatO2",
    subtitle = "NO DETERIORO"
  ) +
  theme(axis.text.x = element_text(angle = 90))

Patients will be studied individually to see if they can be counted in the study. An admissible amount of missing data cannot exceed 5%.

3.3 Quality of patients

All patients

Heart Rate

# Missing heart-rate values per patient. Patients are the columns of
# FC_all_patients, so the table is transposed to turn each patient into a row
# ("case"). The helper `time` index column is dropped first so that it is not
# reported as if it were a patient.
FC_patient_columns <- setdiff(names(FC_all_patients), "time")
FC_by_patient <- data.frame(t(FC_all_patients[, FC_patient_columns, drop = FALSE]))
row_names_info <- rownames(FC_by_patient)
missing_FC <- as.data.frame(miss_case_summary(FC_by_patient))
# The summary identifies cases by row number; map them back to patient names
missing_FC$case <- row_names_info[missing_FC$case]
colnames(missing_FC) <- c("Patient", "N_Miss", "PCT_Miss")
write.xlsx(missing_FC, "../data/missing-info/missing_FC.xlsx")
kable(head(missing_FC))

Patient	N_Miss	PCT_Miss
SMG_11123019	782	54.26787
HDBG_11139366	706	48.99375
JFM_11233223	659	45.73213
MSPJ_11164541	616	42.74809
IGC_11229255	281	19.50035
MPF_11185698	234	16.23872

SatO2

# Missing SatO2 values per patient. Patients are the columns of
# SatO2_all_patients, so the table is transposed to turn each patient into a
# row ("case"). The helper `time` index column is dropped first so that it is
# not reported as if it were a patient.
SatO2_patient_columns <- setdiff(names(SatO2_all_patients), "time")
SatO2_by_patient <- data.frame(t(SatO2_all_patients[, SatO2_patient_columns, drop = FALSE]))
row_names_info <- rownames(SatO2_by_patient)
missing_SatO2 <- as.data.frame(miss_case_summary(SatO2_by_patient))
# The summary identifies cases by row number; map them back to patient names
missing_SatO2$case <- row_names_info[missing_SatO2$case]
colnames(missing_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
write.xlsx(missing_SatO2, "../data/missing-info/missing_SatO2.xlsx")
kable(head(missing_SatO2))

Patient	N_Miss	PCT_Miss
SMG_11123019	781	54.19847
HDBG_11139366	706	48.99375
JFM_11233223	659	45.73213
MSPJ_11164541	616	42.74809
IGC_11229255	273	18.94518
MPF_11185698	220	15.26718

DETERIORO

Heart Rate

# Missing heart-rate values per DETERIORO patient (one row per patient)
FC_deterioro_by_patient <- data.frame(t(FC_all_patients[, file_patient_name_DETERIORO]))
row_names_info <- rownames(FC_deterioro_by_patient)
missing_DETERIORO_FC <- as.data.frame(miss_case_summary(FC_deterioro_by_patient))
# Replace the case (row) number with the patient name
missing_DETERIORO_FC$case <- row_names_info[missing_DETERIORO_FC$case]
colnames(missing_DETERIORO_FC) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_DETERIORO_FC))

Patient	N_Miss	PCT_Miss
SMG_11123019	782	54.267870
HDBG_11139366	706	48.993754
MPF_11185698	234	16.238723
DGS_11215248	182	12.630118
JJS2_11218322	62	4.302568
PGF_11242386	61	4.233171

SatO2

# Missing SatO2 values per DETERIORO patient (one row per patient)
SatO2_deterioro_by_patient <- data.frame(t(SatO2_all_patients[, file_patient_name_DETERIORO]))
row_names_info <- rownames(SatO2_deterioro_by_patient)
missing_DETERIORO_SatO2 <- as.data.frame(miss_case_summary(SatO2_deterioro_by_patient))
# Replace the case (row) number with the patient name
missing_DETERIORO_SatO2$case <- row_names_info[missing_DETERIORO_SatO2$case]
colnames(missing_DETERIORO_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_DETERIORO_SatO2))

Patient	N_Miss	PCT_Miss
SMG_11123019	781	54.198473
HDBG_11139366	706	48.993754
MPF_11185698	220	15.267176
DGS_11215248	181	12.560722
JJS2_11218322	61	4.233171
PGF_11242386	61	4.233171

No DETERIORO

Heart Rate

# Missing heart-rate values per patient WITHOUT DETERIORO (one row per patient)
FC_no_deterioro_by_patient <- data.frame(t(FC_all_patients[, file_patient_name_NO_DETERIORO]))
row_names_info <- rownames(FC_no_deterioro_by_patient)

missing_NO_DETERIORO_FC <- as.data.frame(miss_case_summary(FC_no_deterioro_by_patient))
# Replace the case (row) number with the patient name
missing_NO_DETERIORO_FC$case <- row_names_info[missing_NO_DETERIORO_FC$case]
colnames(missing_NO_DETERIORO_FC) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_NO_DETERIORO_FC))

Patient	N_Miss	PCT_Miss
JFM_11233223	659	45.73213
MSPJ_11164541	616	42.74809
IGC_11229255	281	19.50035
MPF_11185697	215	14.92019
HGSDA_11233118	204	14.15684
SPB_11241570	163	11.31159

SatO2

# Missing SatO2 values per patient WITHOUT DETERIORO (one row per patient)
SatO2_no_deterioro_by_patient <- data.frame(t(SatO2_all_patients[, file_patient_name_NO_DETERIORO]))
row_names_info <- rownames(SatO2_no_deterioro_by_patient)
missing_NO_DETERIORO_SatO2 <- as.data.frame(miss_case_summary(SatO2_no_deterioro_by_patient))
# Replace the case (row) number with the patient name
missing_NO_DETERIORO_SatO2$case <- row_names_info[missing_NO_DETERIORO_SatO2$case]
colnames(missing_NO_DETERIORO_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_NO_DETERIORO_SatO2))

Patient	N_Miss	PCT_Miss
JFM_11233223	659	45.73213
MSPJ_11164541	616	42.74809
IGC_11229255	273	18.94518
HGSDA_11233118	203	14.08744
MPF_11185697	188	13.04650
SPB_11241570	156	10.82582

Common values

# Names of the descriptive variables (columns of df1)
# colnames(df1)
df1_names <- as.data.frame(colnames(df1))
variable_names <- paste0(df1_names[, 1])

# Missing values per variable. df1 is transposed so that every variable
# becomes a row ("case") of the summary. The summary is computed once; the
# earlier duplicate computation, whose result was overwritten, is removed.
df1_transposed <- t(df1)
row_names_info <- rownames(df1_transposed)
missing_df1 <- as.data.frame(miss_case_summary(as.data.frame(df1_transposed)))
# Replace the case (row) number with the variable name
missing_df1$case <- row_names_info[missing_df1$case]
colnames(missing_df1) <- c("Variable", "N_Miss", "PCT_Miss")

# Adding the class of the variable
#' Describe the type of every column of a data frame
#'
#' A column whose values are all 0 or 1 is reported as "factor" (a binary
#' indicator); any other column is reported with its first class. A column
#' containing NA is never reported as "factor", because NA is not in c(0, 1).
#'
#' @param df A data frame.
#' @return An unnamed character vector with one entry per column of `df`
#'   (`character(0)` when `df` has no columns).
column_classes <- function(df) {
  vapply(
    df,
    function(column) {
      if (all(column %in% c(0, 1))) {
        "factor"
      } else {
        # class() may return several classes (e.g. "POSIXct" "POSIXt");
        # keep the first so every column yields exactly one label.
        class(column)[1]
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
}

# One row per variable of df1: its name and the type reported by column_classes()
column_info <- data.frame(
  Variable = names(df1),
  Type = column_classes(df1)
)

# Attach the missing-value counts and show the variables with the highest
# percentage of missing values first
df_merge <- merge(column_info, missing_df1, by = "Variable")
df_merge <- df_merge[order(df_merge$PCT_Miss, decreasing = TRUE), ]
reactable(df_merge)

# Export the table as Excel and CSV
write.xlsx(df_merge, "../data/info-variables/variables-info.xlsx")
write.csv(df_merge, "../data/info-variables/variables-info.csv", row.names = FALSE)

4 Descriptive analysis

# Label each row of df1 with its patient identifier.
# NOTE(review): assumes the rows of df1 are in the same order as
# file_patient_name - confirm.
row.names(df1) <- file_patient_name
reactable(df1)

To handle these variables effectively, we aim to exclude those with missing values from our analysis. We encounter the same issue as before, which highlights the importance of differentiating between a priori and a posteriori variables.

I will only perform a descriptive analysis on the variables that have no missing values.

# summary(df1)
# Keep only the variables that have no missing value at all
complete_variables <- missing_df1$Variable[missing_df1$N_Miss == 0]
df1_NO_NA <- df1[, complete_variables]
# The identifier columns carry no analytical information, so they are dropped:
# "ID", "NHC", "INICIALES"
df1_NO_NA <- subset(df1_NO_NA, select = -c(ID, NHC, INICIALES))

Variables With NO NA

[1] "EDAD" "PESO" [3] "SEXO" "ENFERMEDAD_BASE" [5] "PREMATURIDAD" "EG" [7] "PALIVIZUMAB" "LM" [9] "DERMATITIS" "ALERGIAS" [11] "DESNUTRICION" "ETIOLOGIA" [13] "RADIOGRAFIA" "ANALITICA" [15] "SAPI_0_8h" "FR_0_8h" [17] "SCORE_WOOD_DOWNES_INGRESO" "SCORE_WOOD_DOWNES_24H" [19] "ALIMENTACION" "SUERO" [21] "SNG" "OAF" [23] "DIAS_OAF" "OAF si1 no0" [25] "UCIP" "DETERIORO" [27] "APNEA" "BRONCODILATADORES" [29] "CORTICOIDES" "ANTIBIOTICO"

reactable(df1_NO_NA)

#names(df1_NO_NA)
#head(df1_NO_NA)

4.1 Correlation Matrix

library(corrplot)

## corrplot 0.92 loaded

# Copy of the complete-case variables whose columns are renamed to their
# position (1, 2, ...) so the axis labels of the plot stay short
df1_NO_NA_cor <- setNames(df1_NO_NA, 1:ncol(df1_NO_NA))

# Correlation matrix, variables ordered by hierarchical clustering
correlations <- cor(df1_NO_NA_cor)
corrplot(
  correlations,
  order = "hclust",
  tl.col = "black",
  tl.cex = 0.8,
  tl.srt = 70
)

TFM Insights

Gonzalo Aris 16021

2023-02-11

Libraries

Functions

1. Reading Data

1.2 Reading all the data frames

1.2.1 Reading all patients time series files

2. Cleaning the Data

2. 1 Initial checking

2.2 Full data-frames of Heart Rate and SatO2

2.2.1 Plotting `data-frames`

3 Missing Data

Heart Rate each patient

SatO2 each patient

Common values

3.1 Deterioro Patients

3.2 NO Deterioro Patients

3.3 Quality of patients

Heart Rate

SatO2

Heart Rate

SatO2

Heart Rate

SatO2

4 Descriptive analysis

4.1 Correlation Matrix

TFM Insights

Gonzalo Aris 16021

2023-02-11

Libraries

Functions

1. Reading Data

1.2 Reading all the data frames

1.2.1 Reading all patients time series files

2. Cleaning the Data

2. 1 Initial checking

2.2 Full data-frames of Heart Rate and SatO2

2.2.1 Plotting data-frames

3 Missing Data

Heart Rate each patient

SatO2 each patient

Common values

3.1 Deterioro Patients

3.2 NO Deterioro Patients

3.3 Quality of patients

Heart Rate

SatO2

Heart Rate

SatO2

Heart Rate

SatO2

4 Descriptive analysis

4.1 Correlation Matrix

2.2.1 Plotting `data-frames`