library(ggplot2)
library(readxl)
library(dplyr)
library(writexl)
library(data.table)
library(xlsx)
library(knitr)
library(reactable)
source("../scripts/useful-functions/get_column_position.R")
# In a normal script it will be: source("./scripts/useful-functions/get_column_position.R")
df1
has information about the Descriptive Variables.
# Load the raw monitoring workbook as a plain data.frame.
raw_path <- "../data/raw-data/00MONITORIZACION_READ_R.xlsx"
df <- as.data.frame(read_excel(raw_path))
# Working with several focused data frames keeps each analysis simpler.
# df1: the explanatory variables, i.e. columns 1 up to the position that
# get_column_position() reports for "ANTIBIOTICO".
last_descriptive_col <- get_column_position(df, "ANTIBIOTICO")
df1 <- df[, 1:last_descriptive_col]
cat(colnames(df1), "\t")
## ID NHC INICIALES EM EDAD PESO SEXO ENFERMEDAD_BASE PREMATURIDAD EG PALIVIZUMAB LM DERMATITIS ALERGIAS TABACO DESNUTRICION ETIOLOGIA RADIOGRAFIA ANALITICA LEUCOCITOS NEUTROFILOS LINFOCITOS PCR PCT SAPI_0_8h SAPI_8_16h SAPI_16_24h FR_0_8h FR_8_16h FR_16_24h SCORE_CRUCES_INGRESO SCORE_WOOD_DOWNES_INGRESO SCORE_WOOD_DOWNES_24H ALIMENTACION SUERO SNG FLUJO2_0_8H FLUJO2_8_16h FLUJO2_16_24h OAF FLUJOAF_0_8h FLUJOAF_8_16h FLUJOAF_16-24h FiO2_0_8h FiO2_8_16h FiO2_16_24h DIAS_O2_TOTAL DIAS_GN DIAS_OAF OAF si1 no0 UCIP DETERIORO APNEA BRONCODILATADORES CORTICOIDES ANTIBIOTICO
reactable(df1)
df2
has information about the time series variables each hour; I will calculate these values again.
# df2: the three identifier columns plus the hourly FC and SatO2 columns, which
# run from the column after "ANTIBIOTICO" through "SatO2_24".
first_series_col <- get_column_position(df, "ANTIBIOTICO") + 1
last_series_col <- get_column_position(df, "SatO2_24")
df2 <- df[, c(1:3, first_series_col:last_series_col)]
cat(colnames(df2), "\t")
## ID NHC INICIALES FC_1 FC_2 FC_3 FC_4 FC_5 FC_6 FC_7 FC_8 FC_9 FC_10 FC_11 FC_12 FC_13 FC_14 FC_15 FC_16 FC_17 FC_18 FC_19 FC_20 FC_21 FC_22 FC_23 FC_24 SatO2_1 SatO2_2 SatO2_3 SatO2_4 SatO2_5 SatO2_6 SatO2_7 SatO2_8 SatO2_9 SatO2_10 SatO2_11 SatO2_12 SatO2_13 SatO2_14 SatO2_15 SatO2_16 SatO2_17 SatO2_18 SatO2_19 SatO2_20 SatO2_21 SatO2_22 SatO2_23 SatO2_24
reactable(df2)
Now the work consists of reading all the different files that are in
the folder and seeing if all of them match with the descriptive
variables that are in MONITORIZACION_READ_R.xlsx
# Print every expected name in file_names for which no file in the raw-data
# folder contains that name.
# The folder listing does not change inside the loop, so it is read once.
raw_files <- list.files(path = "../data/raw-data")
for (name in file_names) {
  # fixed = TRUE: the name is matched literally, not as a regular expression.
  if (!any(grepl(name, raw_files, fixed = TRUE))) {
    print(name)
  }
}
The values in the time column must be converted into a format we can work with, so the first column of each patient file is reduced to its clock time (HH:MM:SS).
# Give every patient table the same column names and reduce its first column
# to the clock time only (HH:MM:SS).
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  colnames(data) <- c("Time", "FC", "SatO2")
  timestamps <- as.POSIXct(data[, 1], format = "%Y/%m/%d %H:%M:%S")
  data[, 1] <- format(timestamps, format = "%H:%M:%S")
  # Store the modified table back under the patient's name.
  assign(name_variable, data)
}
kable(head(data))
Time | FC | SatO2 |
---|---|---|
03:50:00 | 134 | 94 |
03:51:00 | 125 | 100 |
03:52:00 | 127 | 100 |
03:53:00 | 133 | 100 |
03:54:00 | 129 | 100 |
03:55:00 | 149 | 100 |
# Add an `hour` label (HH) to every row of every patient table.
# When an hour value appears again in a later, separate run of rows (the
# recording wraps around the clock), that later run is labelled "<HH>_1" so it
# is not pooled with the first run of the same hour.
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  data$hour <- format(as.POSIXct(data$Time, format = "%H:%M:%S"), format = "%H")
  # Runs of consecutive identical hours; rle() keeps every NA as its own run.
  hour_runs <- rle(data$hour)
  # A run is a repeat when its hour was already seen in an earlier run.
  # NA hours are left untouched, and an empty table simply yields no runs.
  is_repeat_run <- duplicated(hour_runs$values) & !is.na(hour_runs$values)
  is_repeat_row <- rep(is_repeat_run, times = hour_runs$lengths)
  data$hour[is_repeat_row] <- paste0(data$hour[is_repeat_row], "_1")
  assign(name_variable, data)
}
kable(head(data))
Time | FC | SatO2 | hour |
---|---|---|---|
03:50:00 | 134 | 94 | 03 |
03:51:00 | 125 | 100 | 03 |
03:52:00 | 127 | 100 | 03 |
03:53:00 | 133 | 100 | 03 |
03:54:00 | 129 | 100 | 03 |
03:55:00 | 149 | 100 | 03 |
Let's write a routine that calculates the mean per hour and reports the number of missing values and the number of values recorded in that hour. It shows several important things for each patient:
Hour
: Time referring the recollection of the data.
N
: Amount of time values.
Missing_FC
: Missing FC values.
Missing_SatO2
: Missing SatO2 values.
avg_FC_with_NA
: Mean calculated using the available values of FC.
avg_SatO2_with_NA
: Mean calculated using the available values of SatO2
One .xlsx file is created per patient;
each file will be as follows:
Each file will be named NHC-ID.xlsx
and it will be stored
in ../data/info-patients
# Per-patient hourly summary. For every patient table this builds one row per
# `hour` label, in order of first appearance, with:
#   n                  rows recorded under that hour label
#   Missing_FC         NA count in FC
#   Missing_SatO2      NA count in SatO2
#   (strict means)     mean of FC / SatO2; NA as soon as one value is missing
#   avg_FC_with_NA     mean of the available FC values
#   avg_SatO2_with_NA  mean of the available SatO2 values
# The table is stored as `<patient>_info`.
# The two strict-mean columns keep their historical auto-generated names so
# that anything already reading the `_info` tables by name keeps working.
legacy_mean_names <- c(
  "Mean_FC...c..avg_SatFC...",
  "Mean_SatO2...c..avg_SatO2..."
)
for (name_variable in file_patient_name) {
  data <- get(name_variable)
  hour_order <- unique(data$hour)
  # All statistics come from a single grouped summary, so every column is
  # row-aligned by construction.
  hourly <- data %>%
    dplyr::group_by(hour) %>%
    dplyr::summarise(
      n = dplyr::n(),
      Missing_FC = sum(is.na(FC)),
      Missing_SatO2 = sum(is.na(SatO2)),
      strict_avg_FC = mean(FC),
      strict_avg_SatO2 = mean(SatO2),
      avg_FC_with_NA = mean(FC, na.rm = TRUE),
      avg_SatO2_with_NA = mean(SatO2, na.rm = TRUE)
    )
  # Restore the order in which the hour labels first appear in the recording.
  hourly <- hourly[order(match(hourly$hour, hour_order)), ]
  merged_df <- as.data.frame(hourly)
  strict_cols <- match(c("strict_avg_FC", "strict_avg_SatO2"), names(merged_df))
  names(merged_df)[strict_cols] <- legacy_mean_names
  assign(paste0(name_variable, "_info"), merged_df)
  # To write the tables to Excel files:
  #write_xlsx(merged_df,paste0("../data/info-patients/",name_variable,"_info",".xlsx"))
}
kable(head(merged_df))
hour | n | Missing_FC | Missing_SatO2 | Mean_FC…c..avg_SatFC… | Mean_SatO2…c..avg_SatO2… | avg_FC_with_NA | avg_SatO2_with_NA |
---|---|---|---|---|---|---|---|
03 | 10 | 0 | 0 | 139.7000 | 99.20000 | 139.7000 | 99.20000 |
04 | 60 | 0 | 0 | 113.7167 | 97.68333 | 113.7167 | 97.68333 |
05 | 60 | 0 | 0 | 111.7000 | 97.86667 | 111.7000 | 97.86667 |
06 | 60 | 0 | 0 | 105.6833 | 98.23333 | 105.6833 | 98.23333 |
07 | 60 | 0 | 0 | 112.3167 | 97.33333 | 112.3167 | 97.33333 |
08 | 60 | 12 | 12 | NA | NA | 141.4167 | 86.95833 |
Let's check whether all the files have the same length; if not, I will impute NA values so that the same methodology can be applied to all patients.
A complete file should have values for each minute in the 24 h of study. So 60 mins * 24 h = 1440 minutes + 1 min to close the circle
# For each patient, record how many time points were read and whether that
# number equals the 1441 expected for a complete recording.
M <- length(file_patient_name)
N <- 2
empty_cells <- matrix(data = NA, nrow = M, ncol = N)
matrix_time_length <- as.data.frame(x = empty_cells, row.names = file_patient_name)
colnames(matrix_time_length) <- c("Time", "1441?")
# 60 mins * 24 h = 1440 minutes + 1 min to close the circle
for (name_file in file_patient_name) {
  data <- get(name_file)
  matrix_time_length[name_file, 1] <- length(data$Time)
  is_complete <- matrix_time_length[name_file, 1] == 1441
  matrix_time_length[name_file, 2] <- if (is_complete) "YES" else "NO"
}
kable(head(matrix_time_length))
Time | 1441? | |
---|---|---|
ACR_11231843 | 1441 | YES |
ADAO_11159808 | 1441 | YES |
AGG_11236448 | 1441 | YES |
AHL_11239959 | 1441 | YES |
AJGD_11119689 | 1441 | YES |
AJJ_11233049 | 1441 | YES |
# Print the patients whose recording does not have the expected length,
# followed by the length that was found.
for (name_file in file_patient_name) {
  if (matrix_time_length[name_file, 2] != "NO") {
    next
  }
  print(name_file)
  print(matrix_time_length[name_file, 1])
}
Those patients who do not meet the requirements have been manipulated. NA values have been added to the Time, FC, and SatO2 rows. In some files, it is indicated that missing values are from the head, and by default, NA values have been added to the tail.
# Gather all patients into two wide tables (one column per patient, one row per
# time point): FC_all_patients and SatO2_all_patients.
M <- 1441
N <- length(file_patient_name)
# Both tables start from the same all-NA template with one named column per
# patient.
empty_template <- as.data.frame(x = matrix(data = NA, nrow = M, ncol = N))
colnames(empty_template) <- file_patient_name
FC_all_patients <- empty_template
SatO2_all_patients <- empty_template
# Copy each patient's series into the column carrying the patient's name.
for (name_file in file_patient_name) {
  data <- get(name_file)
  FC_all_patients[, name_file] <- data$FC
  SatO2_all_patients[, name_file] <- data$SatO2
}
# Running index 1..M used as the time-series reference.
FC_all_patients$time <- seq_len(M)
SatO2_all_patients$time <- seq_len(M)
write_xlsx(FC_all_patients, "../data/FC_&_SatO2/FC_all_patients.xlsx")
write_xlsx(SatO2_all_patients, "../data/FC_&_SatO2/SatO2_all_patients.xlsx")
Merged dataframe of Heart Rate values
FC_all_patients
(all the patients have the same length 1441 values)
# Show the wide FC table with reactable().
reactable(FC_all_patients)
Merged dataframe of SatO2 values
SatO2_all_patients
(all the patients have the same length 1441 values)
# Show the wide SatO2 table with reactable().
reactable(SatO2_all_patients)
data-frames
Plotting the first 20 values of Heart Rate and SatO2 of all the patients.
# Long-format copies (time, time_series, value) of the first N rows of each
# wide table, then one line plot per signal with one line per patient.
N <- 20
first_rows <- seq_len(N)
gather_FC_all_patients <- tidyr::gather(
  FC_all_patients[first_rows, ],
  key = "time_series", value = "value", -time
)
gather_SatO2_all_patients <- tidyr::gather(
  SatO2_all_patients[first_rows, ],
  key = "time_series", value = "value", -time
)
# The legend is hidden: there is one colour per patient.
ggplot(gather_FC_all_patients, aes(x = time, y = value, color = time_series)) +
  geom_line() +
  theme_bw() +
  labs(title = "Heart Rate\n by patient", x = "Time", y = "BPM") +
  theme(legend.position = "none")
ggplot(gather_SatO2_all_patients, aes(x = time, y = value, color = time_series)) +
  geom_line() +
  theme_bw() +
  labs(title = "SatO2\n by patient", x = "Time", y = "sO2") +
  theme(legend.position = "none")
Plotting all the Heart Rate and SatO2 values of all the patients.
# Long-format copies of the complete wide tables (every row, not a subset).
# They are written to ../deploying/data for the Graphic Interface and plotted.
gather_FC_all_patients <- tidyr::gather(
  FC_all_patients,
  key = "time_series", value = "value", -time
)
gather_SatO2_all_patients <- tidyr::gather(
  SatO2_all_patients,
  key = "time_series", value = "value", -time
)
write_xlsx(gather_FC_all_patients, "../deploying/data/gather_FC_all_patients.xlsx")
write_xlsx(gather_SatO2_all_patients, "../deploying/data/gather_SatO2_all_patients.xlsx")
# The legend is hidden: there is one colour per patient.
ggplot(gather_FC_all_patients, aes(x = time, y = value, color = time_series)) +
  geom_line() +
  theme_bw() +
  labs(title = "Heart Rate\n by patient", x = "Time", y = "BPM") +
  theme(legend.position = "none")
ggplot(gather_SatO2_all_patients, aes(x = time, y = value, color = time_series)) +
  geom_line() +
  theme_bw() +
  labs(title = "SatO2\n by patient", x = "Time", y = "sO2") +
  theme(legend.position = "none")
Graphic Interface for visualizing each patient isolated: GraphicInterface
This table shows the amount of missing data to work with:
# Stack the three summary objects as rows and label the two columns
# "Heart Rate" and "SatO2". The three objects are defined outside this view.
kable(rbind(total_of_values, total_of_missing_values, PCT_of_missing_values), col.names = c("Heart Rate","SatO2"))
Heart Rate | SatO2 | |
---|---|---|
total_of_values | 1.152800e+05 | 1.152800e+05 |
total_of_missing_values | 5.745000e+03 | 5.609000e+03 |
PCT_of_missing_values | 4.983518e+00 | 4.865545e+00 |
# Missingness maps for the two wide tables and for the common variables.
# The x-axis labels are rotated so the column names stay readable.
vertical_x_labels <- theme(axis.text.x = element_text(angle = 90))
vis_miss(FC_all_patients, sort_miss = TRUE) +
  vertical_x_labels +
  labs(title = "Missing values in Heart Rate")
vis_miss(SatO2_all_patients, sort_miss = TRUE) +
  vertical_x_labels +
  labs(title = "Missing values in SatO2")
# Common variables
vis_miss(df1, sort_miss = TRUE) +
  vertical_x_labels +
  labs(title = "Missing values in common variables")
Patients that show DETERIORO
# Names ("<INICIALES>_<NHC>") of the patients with DETERIORO == 1.
# which() skips rows where DETERIORO is NA; plain logical indexing would turn
# each of them into a spurious "NA_NA" entry.
deterioro_rows <- which(df$DETERIORO == 1)
file_patient_name_DETERIORO <- paste0(df[deterioro_rows, "INICIALES"], "_", df[deterioro_rows, "NHC"])
Patients that don't show DETERIORO
# Names ("<INICIALES>_<NHC>") of the patients with DETERIORO == 0.
# which() skips rows where DETERIORO is NA; plain logical indexing would turn
# each of them into a spurious "NA_NA" entry.
no_deterioro_rows <- which(df$DETERIORO == 0)
file_patient_name_NO_DETERIORO <- paste0(df[no_deterioro_rows, "INICIALES"], "_", df[no_deterioro_rows, "NHC"])
## Saving the info
write.xlsx(file_patient_name_NO_DETERIORO, "../data/clean-data/file_patient_name_NO_DETERIORO.xlsx")
write.xlsx(file_patient_name_DETERIORO, "../data/clean-data/file_patient_name_DETERIORO.xlsx")
## For Deploying
write.xlsx(file_patient_name_NO_DETERIORO, "../deploying/data/file_patient_name_NO_DETERIORO.xlsx")
write.xlsx(file_patient_name_DETERIORO, "../deploying/data/file_patient_name_DETERIORO.xlsx")
# Missingness maps restricted to the DETERIORO patients.
# FC
FC_deterioro <- FC_all_patients[, file_patient_name_DETERIORO]
vis_miss(FC_deterioro, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in Heart Rate", subtitle = "DETERIORO")
# SatO2
SatO2_deterioro <- SatO2_all_patients[, file_patient_name_DETERIORO]
vis_miss(SatO2_deterioro, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in SatO2", subtitle = "DETERIORO")
Detailed information on the first two patients with DETERIORO
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.2.2
# Missing FC values over repeating spans of 60 rows, for patients
# SMG_11123019 and HDBG_11139366, arranged in one figure with a shared legend.
plot <- ggarrange(
  gg_miss_span(FC_all_patients[, c("SMG_11123019", "time")], var = SMG_11123019, span_every = 60) + labs(
    # The title must name the patient actually plotted in this panel.
    title = "SMG_11123019",
    subtitle = " ",
    caption = "Over a repeating span of 60"
  ),
  gg_miss_span(FC_all_patients[, c("HDBG_11139366", "time")], var = HDBG_11139366, span_every = 60) + labs(
    title = "HDBG_11139366",
    subtitle = " ",
    caption = "Over a repeating span of 60"
  ),
  common.legend = TRUE,
  legend = "bottom"
)
annotate_figure(plot, top = text_grob("Proportion of missing values",
                                      color = "black", face = "bold", size = 14))
# Missingness maps restricted to the NO DETERIORO patients.
# FC
FC_no_deterioro <- FC_all_patients[, file_patient_name_NO_DETERIORO]
vis_miss(FC_no_deterioro, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in Heart Rate", subtitle = "NO DETERIORO")
# SatO2
SatO2_no_deterioro <- SatO2_all_patients[, file_patient_name_NO_DETERIORO]
vis_miss(SatO2_no_deterioro, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in SatO2", subtitle = "NO DETERIORO")
Patients will be studied individually to see if they can be counted in the study. An admissible amount of missing data cannot exceed 5%.
All patients
# Missing-value summary of FC per patient. Patients are columns of
# FC_all_patients, so the table is transposed to let miss_case_summary() treat
# each column as a case; the case index is then mapped back to its name.
# NOTE(review): FC_all_patients also holds the `time` column, so `time` is
# summarised as one of the rows here - confirm this is intended.
FC_by_patient <- data.frame(t(FC_all_patients))
row_names_info <- rownames(FC_by_patient)
missing_FC <- as.data.frame(miss_case_summary(FC_by_patient))
missing_FC$case <- row_names_info[missing_FC$case]
colnames(missing_FC) <- c("Patient", "N_Miss", "PCT_Miss")
write.xlsx(missing_FC, "../data/missing-info/missing_FC.xlsx")
kable(head(missing_FC))
Patient | N_Miss | PCT_Miss |
---|---|---|
SMG_11123019 | 782 | 54.26787 |
HDBG_11139366 | 706 | 48.99375 |
JFM_11233223 | 659 | 45.73213 |
MSPJ_11164541 | 616 | 42.74809 |
IGC_11229255 | 281 | 19.50035 |
MPF_11185698 | 234 | 16.23872 |
# Missing-value summary of SatO2 per patient. Patients are columns of
# SatO2_all_patients, so the table is transposed to let miss_case_summary()
# treat each column as a case; the case index is then mapped back to its name.
# NOTE(review): SatO2_all_patients also holds the `time` column, so `time` is
# summarised as one of the rows here - confirm this is intended.
SatO2_by_patient <- data.frame(t(SatO2_all_patients))
row_names_info <- rownames(SatO2_by_patient)
missing_SatO2 <- as.data.frame(miss_case_summary(SatO2_by_patient))
missing_SatO2$case <- row_names_info[missing_SatO2$case]
colnames(missing_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
write.xlsx(missing_SatO2, "../data/missing-info/missing_SatO2.xlsx")
kable(head(missing_SatO2))
Patient | N_Miss | PCT_Miss |
---|---|---|
SMG_11123019 | 781 | 54.19847 |
HDBG_11139366 | 706 | 48.99375 |
JFM_11233223 | 659 | 45.73213 |
MSPJ_11164541 | 616 | 42.74809 |
IGC_11229255 | 273 | 18.94518 |
MPF_11185698 | 220 | 15.26718 |
DETERIORO
# Missing-value summary of FC for the DETERIORO patients only. The selected
# columns are transposed so each patient becomes one case.
FC_deterioro_by_patient <- data.frame(t(FC_all_patients[, file_patient_name_DETERIORO]))
row_names_info <- rownames(FC_deterioro_by_patient)
missing_DETERIORO_FC <- as.data.frame(miss_case_summary(FC_deterioro_by_patient))
missing_DETERIORO_FC$case <- row_names_info[missing_DETERIORO_FC$case]
colnames(missing_DETERIORO_FC) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_DETERIORO_FC))
Patient | N_Miss | PCT_Miss |
---|---|---|
SMG_11123019 | 782 | 54.267870 |
HDBG_11139366 | 706 | 48.993754 |
MPF_11185698 | 234 | 16.238723 |
DGS_11215248 | 182 | 12.630118 |
JJS2_11218322 | 62 | 4.302568 |
PGF_11242386 | 61 | 4.233171 |
# Missing-value summary of SatO2 for the DETERIORO patients only. The selected
# columns are transposed so each patient becomes one case.
SatO2_deterioro_by_patient <- data.frame(t(SatO2_all_patients[, file_patient_name_DETERIORO]))
row_names_info <- rownames(SatO2_deterioro_by_patient)
missing_DETERIORO_SatO2 <- as.data.frame(miss_case_summary(SatO2_deterioro_by_patient))
missing_DETERIORO_SatO2$case <- row_names_info[missing_DETERIORO_SatO2$case]
colnames(missing_DETERIORO_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_DETERIORO_SatO2))
Patient | N_Miss | PCT_Miss |
---|---|---|
SMG_11123019 | 781 | 54.198473 |
HDBG_11139366 | 706 | 48.993754 |
MPF_11185698 | 220 | 15.267176 |
DGS_11215248 | 181 | 12.560722 |
JJS2_11218322 | 61 | 4.233171 |
PGF_11242386 | 61 | 4.233171 |
No DETERIORO
# Missing-value summary of FC for the NO DETERIORO patients only. The selected
# columns are transposed so each patient becomes one case.
FC_no_deterioro_by_patient <- data.frame(t(FC_all_patients[, file_patient_name_NO_DETERIORO]))
row_names_info <- rownames(FC_no_deterioro_by_patient)
missing_NO_DETERIORO_FC <- as.data.frame(miss_case_summary(FC_no_deterioro_by_patient))
missing_NO_DETERIORO_FC$case <- row_names_info[missing_NO_DETERIORO_FC$case]
colnames(missing_NO_DETERIORO_FC) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_NO_DETERIORO_FC))
Patient | N_Miss | PCT_Miss |
---|---|---|
JFM_11233223 | 659 | 45.73213 |
MSPJ_11164541 | 616 | 42.74809 |
IGC_11229255 | 281 | 19.50035 |
MPF_11185697 | 215 | 14.92019 |
HGSDA_11233118 | 204 | 14.15684 |
SPB_11241570 | 163 | 11.31159 |
# Missing-value summary of SatO2 for the NO DETERIORO patients only. The
# selected columns are transposed so each patient becomes one case.
SatO2_no_deterioro_by_patient <- data.frame(t(SatO2_all_patients[, file_patient_name_NO_DETERIORO]))
row_names_info <- rownames(SatO2_no_deterioro_by_patient)
missing_NO_DETERIORO_SatO2 <- as.data.frame(miss_case_summary(SatO2_no_deterioro_by_patient))
missing_NO_DETERIORO_SatO2$case <- row_names_info[missing_NO_DETERIORO_SatO2$case]
colnames(missing_NO_DETERIORO_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")
kable(head(missing_NO_DETERIORO_SatO2))
Patient | N_Miss | PCT_Miss |
---|---|---|
JFM_11233223 | 659 | 45.73213 |
MSPJ_11164541 | 616 | 42.74809 |
IGC_11229255 | 273 | 18.94518 |
HGSDA_11233118 | 203 | 14.08744 |
MPF_11185697 | 188 | 13.04650 |
SPB_11241570 | 156 | 10.82582 |
Common values
# Missing-value summary per descriptive variable. df1 is transposed so that
# each variable becomes one case for miss_case_summary(); the case index is
# then mapped back to the variable name.
# The summary is computed once; an earlier identical computation whose result
# was immediately overwritten has been removed.
df1_names <- as.data.frame(colnames(df1))
variable_names <- paste0(df1_names[, 1])
df1_by_variable <- as.data.frame(t(df1))
row_names_info <- rownames(df1_by_variable)
missing_df1 <- as.data.frame(miss_case_summary(df1_by_variable))
missing_df1$case <- row_names_info[missing_df1$case]
colnames(missing_df1) <- c("Variable", "N_Miss", "PCT_Miss")
# Adding the class of the variable
# Label the type of every column of a data frame.
#
# df: a data frame.
# Returns an unnamed character vector with one entry per column:
#   "factor" when every value of the column is 0 or 1 (an NA value prevents
#   this, because NA is not in c(0, 1)); otherwise the first element of the
#   column's class(), so multi-class columns still yield a single label.
# A data frame without columns yields character(0).
column_classes <- function(df) {
  vapply(
    df,
    function(column) {
      if (all(column %in% c(0, 1))) "factor" else class(column)[1]
    },
    character(1),
    USE.NAMES = FALSE
  )
}
# One row per descriptive variable: its type label and its missing-value
# summary, joined on the variable name.
column_info <- data.frame(Variable = names(df1), Type = column_classes(df1))
df_merge <- merge(column_info, missing_df1, by = "Variable")
# Variables with the highest percentage of missing values come first.
df_merge <- df_merge[order(df_merge$PCT_Miss, decreasing = TRUE), ]
reactable(df_merge)
# Write information
write.xlsx(df_merge, "../data/info-variables/variables-info.xlsx")
write.csv(df_merge, "../data/info-variables/variables-info.csv", row.names = FALSE)
# Label the rows of df1 with the patient names.
rownames(df1) <- file_patient_name
reactable(df1)
To handle these variables effectively, we aim to exclude those with missing values from our analysis. We encounter the same issue as before, which highlights the importance of differentiating between a priori and a posteriori variables.
I will only perform a descriptive analysis on the variables that have no missing values.
# Keep only the descriptive variables that have no missing values.
complete_variables <- missing_df1$Variable[missing_df1$N_Miss == 0]
df1_NO_NA <- df1[, complete_variables]
# The identifier columns "ID", "NHC" and "INICIALES" are dropped as well.
df1_NO_NA <- subset(df1_NO_NA, select = -c(ID, NHC, INICIALES))
reactable(df1_NO_NA)
library(corrplot)
## corrplot 0.92 loaded
# Correlation plot of the complete variables. The columns are renamed to their
# position numbers for the plot labels; hclust ordering is used.
df1_NO_NA_cor <- df1_NO_NA
colnames(df1_NO_NA_cor) <- seq_len(ncol(df1_NO_NA_cor))
correlations <- cor(df1_NO_NA_cor)
corrplot(correlations, tl.col = "black", tl.cex = 0.8, tl.srt = 70, order = "hclust")