It will be necessary to establish the criteria that I will follow to choose the patients for the study. This situation arises from the fact that most patients have missing values. After choosing the patients with whom I will work, data imputation will be done with the KNN method.
[The imputation of data will be done by groups, DETERIORO and NO DETERIORO]
# Placeholders for the 5 % selections; both are filled in later via assign().
less_5_PCT_missing_FC <- NULL
less_5_PCT_missing_SatO2 <- NULL

# Per-patient missing-value summaries (columns used later: Patient, PCT_Miss).
missing_FC <- data.frame(
  read_xlsx("../data/missing-info/missing_FC.xlsx")
)
missing_SatO2 <- data.frame(
  read_xlsx("../data/missing-info/missing_SatO2.xlsx")
)

# Heart-rate (FC) and SatO2 recordings for all patients.
FC_all_patients <- data.frame(
  read_xlsx("../data/Fc_&_SatO2/FC_all_patients.xlsx")
)
SatO2_all_patients <- data.frame(
  read_xlsx("../data/Fc_&_SatO2/SatO2_all_patients.xlsx")
)
## Info Patients
# Patient identifiers are stored in column `x` of each CSV file.
# Patients of the NO DETERIORO group.
file_patient_name_NO_DETERIORO <- data.frame(
  read.csv("../data/info-patients/file_patient_name_NO_DETERIORO.csv")
)$x

# Patients of the DETERIORO group.
file_patient_name_DETERIORO <- data.frame(
  read.csv("../data/info-patients/file_patient_name_DETERIORO.csv")
)$x

# All patients.
file_patient_name <- data.frame(
  read.csv("../data/clean-data/file_patient_name.csv")
)$x
# Missingness map of the heart-rate (FC) recordings, columns sorted by
# amount of missing data.
vis_miss(FC_all_patients, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in Heart Rate")

# Same map for the SatO2 recordings. The title previously said
# "Heart Rate" although the data plotted here is SatO2.
vis_miss(SatO2_all_patients, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in SatO2")
# One-row summary: cohort size, size of each group and the
# DETERIORO / NO DETERIORO ratio.
total_patients_info <- data.frame(
  cbind(
    length(file_patient_name),
    length(file_patient_name_NO_DETERIORO),
    length(file_patient_name_DETERIORO),
    length(file_patient_name_DETERIORO) / length(file_patient_name_NO_DETERIORO)
  )
)
names(total_patients_info) <- c(
  "TOTAL PATIENTS",
  "NO DETERIORO",
  "DETERIORO",
  "RATIO [DETERIORO / NO DETERIORO]"
)
kable(total_patients_info)
TOTAL PATIENTS | NO DETERIORO | DETERIORO | RATIO [DETERIORO / NO DETERIORO] |
---|---|---|---|
79 | 61 | 18 | 0.295082 |
The first criterion is to select only those patients that have at most n % of missing values.
It will also be checked whether patients who have more than n % of missing values in SatO2 also have them in Heart Rate, and vice versa. For that purpose, the patients for whom the two variables disagree will be stored in the patients_NaN_diverge list, and patients_NaN_same will indicate whether both variables select the same patients for a given percentage of missing values.
Missing values will be studied between 5 % and 20 % in the following values:
5.0 7.5 10.0 12.5 15.0 17.5 20.0
# Criterion 1: for every threshold in `missing_PCT`, collect the patients
# whose percentage of missing values (PCT_Miss) is <= the threshold,
# separately for heart rate (FC) and SatO2.
#
# Results:
#   less_<pct>_PCT_missing_FC / _SatO2  one vector per threshold (via assign)
#   patients_NaN_same      TRUE when both variables select the same patients
#   patients_NaN_diverge   patients selected by only one of the two variables
#   less_PCT_missing_list  union of both selections, one element per threshold
missing_PCT <- seq(5, 20, by = 2.5)
patients_NaN_same <- c()
patients_NaN_diverge <- list()
less_PCT_missing_list <- list()

for (i in missing_PCT) {
  vec_NaN_FC <- c()
  vec_NaN_SatO2 <- c()

  for (name_patient in file_patient_name) {
    pos <- match(name_patient, missing_FC$Patient)
    if (is.na(pos)) {
      # match() returns NA (never FALSE/0) for an unknown patient, so the
      # result must be tested with is.na(); `if (match(...))` would stop
      # with "missing value where TRUE/FALSE needed" instead of reporting.
      print("Error")
    } else if (missing_FC$PCT_Miss[pos] <= i) {
      vec_NaN_FC <- append(vec_NaN_FC, name_patient)
    }
  }

  for (name_patient in file_patient_name) {
    pos <- match(name_patient, missing_SatO2$Patient)
    if (is.na(pos)) {
      print("Error")
    } else if (missing_SatO2$PCT_Miss[pos] <= i) {
      vec_NaN_SatO2 <- append(vec_NaN_SatO2, name_patient)
    }
  }

  # Keep the per-threshold selections under their own names; later code
  # refers to e.g. less_5_PCT_missing_FC.
  assign(paste0("less_", i, "_PCT_missing_FC"), vec_NaN_FC)
  assign(paste0("less_", i, "_PCT_missing_SatO2"), vec_NaN_SatO2)

  patients_NaN_same <- append(
    patients_NaN_same,
    setequal(vec_NaN_FC, vec_NaN_SatO2)
  )
  patients_NaN_diverge <- list.append(
    patients_NaN_diverge,
    symdiff(vec_NaN_FC, vec_NaN_SatO2)
  )
  less_PCT_missing_list <- list.append(
    less_PCT_missing_list,
    union(vec_NaN_FC, vec_NaN_SatO2)
  )
}
# Long-format table for the plot: one row per (threshold, group).
#
# The labels must follow the order in which the counts are appended in the
# loop below (TOTAL, NO_DETERIORO, DETERIORO); otherwise the two groups are
# swapped in the plot. Exactly one label triple and three x values are
# needed per threshold, so all three columns have the same length.
Type_NaN <- rep(
  c("All Patients", "No Deterioro", "Deterioro"),
  times = length(missing_PCT)
)
# `each` must be a single number; `3:1` only used its first element and
# raised a warning.
X_NaN_values <- rep(missing_PCT, each = 3)

Values_NaN <- c()
for (i in seq_along(missing_PCT)) {
  TOTAL <- length(less_PCT_missing_list[[i]])
  NO_DETERIORO <- length(
    intersect(less_PCT_missing_list[[i]], file_patient_name_NO_DETERIORO)
  )
  DETERIORO <- length(
    intersect(less_PCT_missing_list[[i]], file_patient_name_DETERIORO)
  )
  Values_NaN <- append(Values_NaN, c(TOTAL, NO_DETERIORO, DETERIORO))
}

Plot_NaN_values <- cbind.data.frame(Values_NaN, Type_NaN, X_NaN_values)
# Number of patients kept in the study (y) for each missing-value
# threshold (x), one line per patient group (Type_NaN).
ggplot(
  Plot_NaN_values,
  aes(x = X_NaN_values, y = Values_NaN, colour = Type_NaN)
) +
  geom_line() +
  geom_point() +
  # Print each count next to its point; no extra legend entry for the text.
  geom_text(
    aes(label = round(Values_NaN, 1)),
    hjust = "inward",
    vjust = "inward",
    show.legend = FALSE
  ) +
  xlab("% of NaN") +
  ylab("Patients in the study") +
  ggtitle("Plot of the % NaN based on the amout of patients studied")
Best first criterion: I will use the 5 % threshold for imputing data.
So the valid patients that satisfy the first criterion of 5 % are:
# Patients accepted by criterion 1: selected at the 5 % threshold by SatO2
# or by heart rate (union of both selections).
valid_patients_P1 <- union(less_5_PCT_missing_SatO2, less_5_PCT_missing_FC)

# Split the accepted patients by group.
file_patient_name_NO_DETERIORO_P1 <- intersect(
  valid_patients_P1,
  file_patient_name_NO_DETERIORO
)
file_patient_name_DETERIORO_P1 <- intersect(
  valid_patients_P1,
  file_patient_name_DETERIORO
)

# One-row summary of the P1 cohort and its group ratio.
total_patients_info_P1 <- data.frame(
  cbind(
    length(valid_patients_P1),
    length(file_patient_name_NO_DETERIORO_P1),
    length(file_patient_name_DETERIORO_P1),
    length(file_patient_name_DETERIORO_P1) /
      length(file_patient_name_NO_DETERIORO_P1)
  )
)
names(total_patients_info_P1) <- c(
  "TOTAL PATIENTS P1",
  "NO DETERIORO P1",
  "DETERIORO P1",
  "RATIO [DETERIORO / NO DETERIORO] P1"
)
kable(total_patients_info_P1)
TOTAL PATIENTS P1 | NO DETERIORO P1 | DETERIORO P1 | RATIO [DETERIORO / NO DETERIORO] P1 |
---|---|---|---|
68 | 54 | 14 | 0.2592593 |
# Persist the patients accepted by criterion 1 (valid_patients_P1).
write.xlsx(valid_patients_P1,"../data/clean-data/valid_patients_P1.xlsx")
The second criterion is to select only those patients that have at most 5 % of missing values within the first 8 h, 8.5 h, …, 15 h.
It will also be checked whether patients who have more than 5 % of missing values in SatO2 also have them in Heart Rate, and vice versa.
# Criterion 2, step 1: missing-value summary per patient, computed on the
# first `missing_hours[i] * 60` rows of each recording.
# (presumably one row per minute -- TODO confirm against the data files)
#
# Results:
#   missing_hours_list_FC / _SatO2  one summary data frame per window
#   missing_FC_<i> / missing_SatO2_<i>  the same data frames, via assign()
missing_hours <- seq(8, 15, by = 0.5)
missing_hours_list_FC <- list()
missing_hours_list_SatO2 <- list()

# Patient identifiers (row names of the transposed recordings). They do not
# depend on the window, so they are computed once instead of per iteration.
row_names_info_FC <- rownames(data.frame(t(FC_all_patients)))
row_names_info_SatO2 <- rownames(data.frame(t(SatO2_all_patients)))

# seq_along() is safe for an empty `missing_hours`; 1:length() would
# iterate over c(1, 0).
for (i in seq_along(missing_hours)) {
  window_rows <- seq_len(missing_hours[i] * 60)

  # NOTE(review): `missing_FC` and `missing_SatO2` are reassigned here, which
  # replaces the whole-recording summaries loaded from the xlsx files; after
  # the loop they hold the summaries of the last window.
  missing_FC <- as.data.frame(
    miss_case_summary(data.frame(t(FC_all_patients[window_rows, ])))
  )
  # miss_case_summary() reports cases by index; map them back to patients.
  missing_FC$case <- row_names_info_FC[missing_FC$case]
  colnames(missing_FC) <- c("Patient", "N_Miss", "PCT_Miss")

  missing_SatO2 <- as.data.frame(
    miss_case_summary(data.frame(t(SatO2_all_patients[window_rows, ])))
  )
  missing_SatO2$case <- row_names_info_SatO2[missing_SatO2$case]
  colnames(missing_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")

  assign(paste0("missing_SatO2_", i), missing_SatO2)
  assign(paste0("missing_FC_", i), missing_FC)

  missing_hours_list_FC <- list.append(missing_hours_list_FC, missing_FC)
  missing_hours_list_SatO2 <- list.append(
    missing_hours_list_SatO2,
    missing_SatO2
  )
}
# Criterion 2, step 2: for every time window in `missing_hours`, collect the
# patients whose PCT_Miss within that window is <= 5, separately for heart
# rate (FC) and SatO2.
#
# Results:
#   less_PCT_missing_FC_in_<h> / _SatO2_in_<h>  one vector per window
#   patients_NaN_same_p2      TRUE when both variables select the same patients
#   patients_NaN_diverge_p2   patients selected by only one of the variables
#   less_PCT_missing_list_p2  union of both selections, one element per window
patients_NaN_same_p2 <- c()
patients_NaN_diverge_p2 <- list()
less_PCT_missing_list_p2 <- list()

for (i in seq_along(missing_hours)) {
  vec_NaN_FC <- c()
  vec_NaN_SatO2 <- c()

  # NOTE(review): these assignments overwrite the globals `missing_FC` and
  # `missing_SatO2` with the summaries of window i.
  missing_FC <- missing_hours_list_FC[[i]]
  missing_SatO2 <- missing_hours_list_SatO2[[i]]

  for (name_patient in file_patient_name) {
    pos <- match(name_patient, missing_FC$Patient)
    if (is.na(pos)) {
      # match() returns NA (never FALSE/0) for an unknown patient, so the
      # result must be tested with is.na(); `if (match(...))` would stop
      # with "missing value where TRUE/FALSE needed" instead of reporting.
      print("Error")
    } else if (missing_FC$PCT_Miss[pos] <= 5) {
      vec_NaN_FC <- append(vec_NaN_FC, name_patient)
    }
  }

  for (name_patient in file_patient_name) {
    pos <- match(name_patient, missing_SatO2$Patient)
    if (is.na(pos)) {
      print("Error")
    } else if (missing_SatO2$PCT_Miss[pos] <= 5) {
      vec_NaN_SatO2 <- append(vec_NaN_SatO2, name_patient)
    }
  }

  # Keep the per-window selections under their own names; later code refers
  # to e.g. less_PCT_missing_FC_in_10.
  assign(paste0("less_PCT_missing_FC_in_", missing_hours[i]), vec_NaN_FC)
  assign(paste0("less_PCT_missing_SatO2_in_", missing_hours[i]), vec_NaN_SatO2)

  patients_NaN_same_p2 <- append(
    patients_NaN_same_p2,
    setequal(vec_NaN_FC, vec_NaN_SatO2)
  )
  patients_NaN_diverge_p2 <- list.append(
    patients_NaN_diverge_p2,
    symdiff(vec_NaN_FC, vec_NaN_SatO2)
  )
  less_PCT_missing_list_p2 <- list.append(
    less_PCT_missing_list_p2,
    union(vec_NaN_FC, vec_NaN_SatO2)
  )
}
# Long-format table for the plot: one row per (time window, group).
#
# The labels must follow the order in which the counts are appended in the
# loop below (TOTAL, NO_DETERIORO, DETERIORO); otherwise the two groups are
# swapped in the plot. Exactly one label triple and three x values are
# needed per window, so all three columns have the same length.
Type_NaN_p2 <- rep(
  c("All Patients", "No Deterioro", "Deterioro"),
  times = length(missing_hours)
)
# `each` must be a single number; `3:1` only uses its first element and
# raises a warning.
X_NaN_values_p2 <- rep(missing_hours, each = 3)

Values_NaN_p2 <- c()
for (i in seq_along(missing_hours)) {
  TOTAL <- length(less_PCT_missing_list_p2[[i]])
  NO_DETERIORO <- length(
    intersect(less_PCT_missing_list_p2[[i]], file_patient_name_NO_DETERIORO)
  )
  DETERIORO <- length(
    intersect(less_PCT_missing_list_p2[[i]], file_patient_name_DETERIORO)
  )
  Values_NaN_p2 <- append(Values_NaN_p2, c(TOTAL, NO_DETERIORO, DETERIORO))
}

Plot_NaN_values_p2 <- cbind.data.frame(
  Values_NaN_p2,
  Type_NaN_p2,
  X_NaN_values_p2
)
# Number of patients kept in the study (y) for each time window (x), one
# line per patient group (Type_NaN_p2).
# The x axis holds the window lengths from `missing_hours` (hours), not a
# percentage, so the axis label and title describe the window instead of
# repeating the text of the criterion-1 plot.
ggplot(
  data = Plot_NaN_values_p2,
  aes(y = Values_NaN_p2, x = X_NaN_values_p2, color = Type_NaN_p2)
) +
  geom_line() +
  geom_point() +
  # Print each count next to its point; no extra legend entry for the text.
  geom_text(
    aes(label = round(Values_NaN_p2, 1)),
    vjust = "inward", hjust = "inward",
    show.legend = FALSE
  ) +
  labs(
    x = "First hours considered [h]",
    y = "Patients in the study",
    title = "Patients with at most 5 % NaN by length of the time window"
  )
Second best criterion: I will use the 5 % threshold in the first 10 h for imputing data.
So the valid patients that satisfy the criterion of 5 % in the first 10 h are:
# Patients accepted by criterion 2: selected in the first 10 h window by
# SatO2 or by heart rate (union of both selections).
valid_patients_P2 <- union(
  less_PCT_missing_SatO2_in_10,
  less_PCT_missing_FC_in_10
)

# Split the accepted patients by group.
file_patient_name_NO_DETERIORO_P2 <- intersect(
  valid_patients_P2,
  file_patient_name_NO_DETERIORO
)
file_patient_name_DETERIORO_P2 <- intersect(
  valid_patients_P2,
  file_patient_name_DETERIORO
)

# One-row summary of the P2 cohort. The NO DETERIORO count and its column
# header must use the P2 selection; the P1 vector was used here before, so
# the counts did not add up to the total and did not match the ratio.
total_patients_info_P2 <- data.frame(
  cbind(
    length(valid_patients_P2),
    length(file_patient_name_NO_DETERIORO_P2),
    length(file_patient_name_DETERIORO_P2),
    length(file_patient_name_DETERIORO_P2) /
      length(file_patient_name_NO_DETERIORO_P2)
  )
)
colnames(total_patients_info_P2) <- c(
  "TOTAL PATIENTS P2",
  "NO DETERIORO P2",
  "DETERIORO P2",
  "RATIO [DETERIORO / NO DETERIORO] P2"
)
kable(total_patients_info_P2)
TOTAL PATIENTS P2 | NO DETERIORO P1 | DETERIORO P2 | RATIO [DETERIORO / NO DETERIORO] P2 |
---|---|---|---|
69 | 54 | 16 | 0.3018868 |
# Persist the patients accepted by criterion 2 (valid_patients_P2).
write.xlsx(valid_patients_P2,"../data/clean-data/valid_patients_P2.xlsx")
Let's study the discrepancy between patients by examining
patients_NaN_diverge
# One flag per threshold in missing_PCT: TRUE when FC and SatO2 select the
# same set of patients.
patients_NaN_same
## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE
There is at least one patient who has more than 5 % of missing values in one variable but at most 5 % in the other. Let's find out who it is.
# Patients selected at the 5 % threshold by only one of the two variables.
# symdiff() covers both directions (FC-only and SatO2-only), matching how
# patients_NaN_diverge is built; setdiff() would miss a patient that passes
# for FC but not for SatO2.
symdiff(less_5_PCT_missing_FC, less_5_PCT_missing_SatO2)
## [1] "VAM_11160159"
# Two data sets diverge in VAM_11160159
missing_SatO2
# Whole-recording SatO2 missing percentage of VAM_11160159.
# Read the summary file directly so the lookup does not depend on the
# current contents of the `missing_SatO2` global.
with(
  data.frame(read_xlsx("../data/missing-info/missing_SatO2.xlsx")),
  PCT_Miss[match("VAM_11160159", Patient)]
)
## [1] 7.666667
missing_FC
# Whole-recording heart-rate missing percentage of VAM_11160159.
# Read the summary file directly so the lookup does not depend on the
# current contents of the `missing_FC` global.
with(
  data.frame(read_xlsx("../data/missing-info/missing_FC.xlsx")),
  PCT_Miss[match("VAM_11160159", Patient)]
)
## [1] 8.555556
I am going to use it since (5.343 + 4.788)/2 ≈ 5.07, i.e. approximately 5 %.
It is important to study
patients_NaN_same_p2
so I can see in which situation a patient has less than 5% NaN values in one variable [SatO2 or Heart Rate] but not in the other.
# One flag per window in missing_hours: TRUE when FC and SatO2 select the
# same set of patients.
print(patients_NaN_same_p2)
## [1] TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [13] TRUE FALSE FALSE
The positions where a discrepancy exists are:
# Indices into missing_hours of the windows where the two selections differ.
which(!patients_NaN_same_p2)
[1] 5 8 9 10 11 14 15
# For every window where the FC and SatO2 selections differ, print one line
# naming the patients selected by only one of the two variables.
for (i in which(!patients_NaN_same_p2)) {
  cat(
    "- In the first ", missing_hours[i],
    " h, the patients that have more than 5 % of NaN in one variable",
    " but not in the other are: ",
    # Collapse so several patients share one line instead of repeating the
    # whole sentence once per patient.
    paste(patients_NaN_diverge_p2[[i]], collapse = ", "),
    ".\n",
    sep = ""
  )
}
# Header for the 10 h window (missing_hours[5]), followed by the patients
# selected by only one of the two variables in that window.
cat(sprintf("- In the first %s h", missing_hours[5]))
symdiff(less_PCT_missing_FC_in_10, less_PCT_missing_SatO2_in_10)
## [1] "ASN_11226885"
# Two data sets diverge in ASN_11226885
missing_SatO2
# SatO2 missing percentage of ASN_11226885 in window 5 (first 10 h).
with(missing_hours_list_SatO2[[5]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 5
missing_Heart_Rate
# Heart-rate missing percentage of ASN_11226885 in window 5 (first 10 h).
with(missing_hours_list_FC[[5]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 6
# Header for the 11.5 h window (missing_hours[8]), followed by the patients
# selected by only one of the two variables in that window.
cat(sprintf("- In the first %s h", missing_hours[8]))
symdiff(less_PCT_missing_FC_in_11.5, less_PCT_missing_SatO2_in_11.5)
## [1] "ASN_11226885"
# Two data sets diverge in ASN_11226885
missing_SatO2
# SatO2 missing percentage of ASN_11226885 in window 8 (first 11.5 h).
with(missing_hours_list_SatO2[[8]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 4.927536
missing_Heart_Rate
# Heart-rate missing percentage of ASN_11226885 in window 8 (first 11.5 h).
with(missing_hours_list_FC[[8]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 5.797101
# Header for the 12 h window (missing_hours[9]), followed by the patients
# selected by only one of the two variables in that window.
cat(sprintf("- In the first %s h", missing_hours[9]))
symdiff(less_PCT_missing_FC_in_12, less_PCT_missing_SatO2_in_12)
## [1] "ASN_11226885"
# Two data sets diverge in ASN_11226885
missing_SatO2
# SatO2 missing percentage of ASN_11226885 in window 9 (first 12 h).
with(missing_hours_list_SatO2[[9]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 4.722222
missing_Heart_Rate
# Heart-rate missing percentage of ASN_11226885 in window 9 (first 12 h).
with(missing_hours_list_FC[[9]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 5.555556
# Header for the 12.5 h window (missing_hours[10]), followed by the patients
# selected by only one of the two variables in that window.
cat(sprintf("- In the first %s h", missing_hours[10]))
symdiff(less_PCT_missing_FC_in_12.5, less_PCT_missing_SatO2_in_12.5)
## [1] "ASN_11226885"
# Two data sets diverge in ASN_11226885
missing_SatO2
# SatO2 missing percentage of ASN_11226885 in window 10 (first 12.5 h).
with(missing_hours_list_SatO2[[10]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 4.533333
missing_Heart_Rate
# Heart-rate missing percentage of ASN_11226885 in window 10 (first 12.5 h).
with(missing_hours_list_FC[[10]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 5.333333
# Header for the 13 h window (missing_hours[11]), followed by the patients
# selected by only one of the two variables in that window.
cat(sprintf("- In the first %s h", missing_hours[11]))
symdiff(less_PCT_missing_FC_in_13, less_PCT_missing_SatO2_in_13)
## [1] "ASN_11226885"
# Two data sets diverge in ASN_11226885
missing_SatO2
# SatO2 missing percentage of ASN_11226885 in window 11 (first 13 h).
with(missing_hours_list_SatO2[[11]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 4.358974
missing_Heart_Rate
# Heart-rate missing percentage of ASN_11226885 in window 11 (first 13 h).
with(missing_hours_list_FC[[11]], PCT_Miss[match("ASN_11226885", Patient)])
## [1] 5.128205
# Header for the 14.5 h window (missing_hours[14]), followed by the patients
# selected by only one of the two variables in that window.
cat(sprintf("- In the first %s h", missing_hours[14]))
symdiff(less_PCT_missing_FC_in_14.5, less_PCT_missing_SatO2_in_14.5)
## [1] "MSMM_11239970"
# Two data sets diverge in MSMM_11239970
missing_SatO2
# SatO2 missing percentage of MSMM_11239970 in window 14 (first 14.5 h).
with(missing_hours_list_SatO2[[14]], PCT_Miss[match("MSMM_11239970", Patient)])
## [1] 4.942529
missing_Heart_Rate
# Heart-rate missing percentage of MSMM_11239970 in window 14 (first 14.5 h).
with(missing_hours_list_FC[[14]], PCT_Miss[match("MSMM_11239970", Patient)])
## [1] 5.862069
# Header for the 15 h window (missing_hours[15]), followed by the patients
# selected by only one of the two variables in that window.
cat(sprintf("- In the first %s h", missing_hours[15]))
symdiff(less_PCT_missing_FC_in_15, less_PCT_missing_SatO2_in_15)
## [1] "MSMM_11239970"
# Two data sets diverge in MSMM_11239970
missing_SatO2
# SatO2 missing percentage of MSMM_11239970 in window 15 (first 15 h).
with(missing_hours_list_SatO2[[15]], PCT_Miss[match("MSMM_11239970", Patient)])
## [1] 4.777778
missing_Heart_Rate
# Heart-rate missing percentage of MSMM_11239970 in window 15 (first 15 h).
with(missing_hours_list_FC[[15]], PCT_Miss[match("MSMM_11239970", Patient)])
## [1] 5.666667