It will be necessary to establish the criteria that I will follow to choose the patients for the study. This situation arises from the fact that most patients have missing values. After choosing the patients with whom I will work, data imputation will be done with the KNN method.

[The imputation of data will be done by groups, DETERIORO and NO DETERIORO]

# Missing-value summaries per signal; later code reads their `Patient` and
# `PCT_Miss` columns.
missing_SatO2 <- data.frame(read_xlsx("../data/missing-info/missing_SatO2.xlsx"))
missing_FC <- data.frame(read_xlsx("../data/missing-info/missing_FC.xlsx"))

# Heart-rate (FC) and SatO2 recordings of all patients.
SatO2_all_patients <- data.frame(read_xlsx("../data/Fc_&_SatO2/SatO2_all_patients.xlsx"))
FC_all_patients <- data.frame(read_xlsx("../data/Fc_&_SatO2/FC_all_patients.xlsx"))

# Placeholders; the first-criteria loop overwrites them through assign().
less_5_PCT_missing_SatO2 <- NULL
less_5_PCT_missing_FC <- NULL



## Info Patients
# Each CSV stores the patient identifiers in its `x` column; only that column
# is kept.
file_patient_name <-
  data.frame(read.csv("../data/clean-data/file_patient_name.csv"))$x
file_patient_name_DETERIORO <-
  data.frame(read.csv("../data/info-patients/file_patient_name_DETERIORO.csv"))$x
file_patient_name_NO_DETERIORO <-
  data.frame(read.csv("../data/info-patients/file_patient_name_NO_DETERIORO.csv"))$x

0.0.0.0.1 Missing values in Heart Rate

# Missingness map of the heart-rate data, columns sorted by amount missing;
# x-axis labels are rotated so the column names stay readable.
vis_miss(FC_all_patients, sort_miss = TRUE) +
  labs(title = "Missing values in Heart Rate") +
  theme(axis.text.x = element_text(angle = 90))

0.0.0.0.2 Missing values in SatO2

# Missingness map of the SatO2 data, columns sorted by amount missing;
# x-axis labels are rotated so the column names stay readable.
# The title now names the signal actually plotted (it used to say
# "Heart Rate", copied from the previous plot).
vis_miss(SatO2_all_patients, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in SatO2")

# Cohort sizes in the order: all patients, NO DETERIORO, DETERIORO.
group_sizes <- c(
  length(file_patient_name),
  length(file_patient_name_NO_DETERIORO),
  length(file_patient_name_DETERIORO)
)

# One-row summary table; the last cell is the DETERIORO / NO DETERIORO ratio.
total_patients_info <- data.frame(
  matrix(c(group_sizes, group_sizes[3] / group_sizes[2]), nrow = 1)
)
colnames(total_patients_info) <- c("TOTAL PATIENTS","NO DETERIORO","DETERIORO","RATIO [DETERIORO / NO DETERIORO]")
kable(total_patients_info)

TOTAL PATIENTS	NO DETERIORO	DETERIORO	RATIO [DETERIORO / NO DETERIORO]
79	61	18	0.295082

1 First criteria

The first criterion is to select only those patients that have at most n % of missing values.

It will also be checked whether patients who have more than n % of missing values in SatO2 also have them in Heart Rate, and vice versa. For that purpose, the patients for whom the two signals disagree will be stored in the patients_NaN_diverge list, and patients_NaN_same will indicate, for each percentage of missing values, whether both signals select exactly the same patients.

Missing values will be studied between 5 % and 20 % in the following values:

5.0 7.5 10.0 12.5 15.0 17.5 20.0

# Thresholds (in % of missing values) to evaluate: 5, 7.5, ..., 20.
missing_PCT <- seq(5, 20, by = 2.5)

# One slot per threshold:
#   patients_NaN_same     - TRUE when FC and SatO2 select the same patients
#   patients_NaN_diverge  - patients selected by only one of the two signals
#   less_PCT_missing_list - patients selected by at least one signal
patients_NaN_same <- logical(length(missing_PCT))
patients_NaN_diverge <- vector("list", length(missing_PCT))
less_PCT_missing_list <- vector("list", length(missing_PCT))

# Row of every patient in each missing-value summary. match() returns NA for
# a patient that is absent; the previous `if (match(...))` test stopped with
# an error in that case instead of reaching its "Error" branch, so absent
# patients are now reported explicitly and simply not selected.
pos_FC <- match(file_patient_name, missing_FC$Patient)
pos_SatO2 <- match(file_patient_name, missing_SatO2$Patient)
not_found <- file_patient_name[is.na(pos_FC) | is.na(pos_SatO2)]
if (length(not_found) > 0) {
  warning(
    "Patients without missing-value information: ",
    paste(not_found, collapse = ", "),
    call. = FALSE
  )
}

for (k in seq_along(missing_PCT)) {
  threshold <- missing_PCT[k]

  # Patients whose share of missing values does not exceed the threshold.
  # which() drops the NA comparisons produced by patients that were not found.
  vec_NaN_FC <- file_patient_name[which(missing_FC$PCT_Miss[pos_FC] <= threshold)]
  vec_NaN_SatO2 <- file_patient_name[which(missing_SatO2$PCT_Miss[pos_SatO2] <= threshold)]

  # Later chunks read these by name, e.g. less_5_PCT_missing_FC.
  assign(paste0("less_", threshold, "_PCT_missing_FC"), vec_NaN_FC)
  assign(paste0("less_", threshold, "_PCT_missing_SatO2"), vec_NaN_SatO2)

  patients_NaN_same[k] <- setequal(vec_NaN_FC, vec_NaN_SatO2)
  # `[k] <- list(...)` keeps the slot even when the value is empty/NULL, so
  # list positions always line up with missing_PCT.
  patients_NaN_diverge[k] <- list(symdiff(vec_NaN_FC, vec_NaN_SatO2))
  less_PCT_missing_list[k] <- list(union(vec_NaN_FC, vec_NaN_SatO2))
}

1.1 Preparing the plot: First Criteria

# Group label and threshold for every entry of Values_NaN, which is filled per
# threshold in the order TOTAL, NO_DETERIORO, DETERIORO. The labels follow
# that same order (they used to swap the two groups) and are repeated once per
# threshold, so the three plotting vectors have equal length and no rows are
# duplicated by recycling.
Type_NaN <- rep(c("All Patients", "No Deterioro", "Deterioro"), times = length(missing_PCT))
# `each` takes a single number; `3:1` only used its first element and warned.
X_NaN_values <- rep(missing_PCT, each = 3)

# For every threshold, count the selected patients: overall, in the
# NO DETERIORO group and in the DETERIORO group (in that order). vapply()
# returns one column per threshold, so flattening the matrix yields the counts
# threshold by threshold.
Values_NaN <- as.vector(vapply(
  seq_along(missing_PCT),
  function(k) {
    selected <- less_PCT_missing_list[[k]]
    c(
      length(selected),
      length(intersect(selected, file_patient_name_NO_DETERIORO)),
      length(intersect(selected, file_patient_name_DETERIORO))
    )
  },
  integer(3)
))

# Long-format table consumed by the plot below.
Plot_NaN_values <- data.frame(Values_NaN, Type_NaN, X_NaN_values, check.names = FALSE)

# y axis: number of patients in the study; colour: patient group (type).
# One line per group, with the count printed next to every point.
ggplot(
  Plot_NaN_values,
  aes(x = X_NaN_values, y = Values_NaN, colour = Type_NaN)
) +
  geom_line() +
  geom_point() +
  geom_text(
    aes(label = round(Values_NaN, 1)),
    hjust = "inward",
    vjust = "inward",
    show.legend = FALSE
  ) +
  labs(
    title = "Plot of the % NaN based on the amout of patients studied",
    x = "% of NaN",
    y = "Patients in the study"
  )

Best first criteria: I will use 5% criteria for imputing data

So the valid patients that follow the First Criteria of 5 % are:

CRITERIA 1 PATIENTS:

[1] "ACR_11231843" "ADAO_11159808" "AGG_11236448" "AHL_11239959" [5] "AJGD_11119689" "AJJ_11233049" "AMP_11228639" "AMT_11120363" [9] "APA_11204819" "ARR_11228585" "ASN_11226885" "AZM_11047760" [13] "CBA_11124187" "CGN_11234482" "DEA_11243504" "DIPDLH_11241649" [17] "DJSD_11178309" "DVS_11231268" "DZL_11227036" "ECP_11169795" [21] "FNMM_11174240" "GGG_11156716" "GGT_11208499" "GHP_11229529" [25] "GLR_11225596" "HCC_11203216" "IGC2_11229255" "ILG_11229582" [29] "IPA_11147550" "IRL_11034760" "JJB_11182744" "JJS_11218322" [33] "JJS2_11218322" "JNM_11242584" "JPD_11209658" "JPT_11236205" [37] "JQA_11091598" "KSBS_11201840" "LMF_11116324" "LMM_11139982" [41] "LMP_11060996" "LMP2_11060996" "LMS_11228310" "LVBB_11135653" [45] "MA_11216747" "MFEH_11191624" "MMB_11205362" "MSMM_11239970" [49] "MTG_11220400" "NEH_11181855" "NSM_11223800" "PBO_11129516" [53] "PGA_11180136" "PGF_11242386" "PMR_11230016" "RBJ_11163775" [57] "RGFGM_11156248" "RVS_11034257" "SGG_11181506" "SLF_11214212" [61] "SPM_11222444" "SSF_11207023" "TCD_11245595" "TGJ_11200052" [65] "VAM_11160159" "VCR_11203302" "YPR_11188252" "YVR_11188465"

# Patients accepted by the first criterion at 5 %: selected for SatO2, for
# heart rate (FC), or for both.
valid_patients_P1 <- union(less_5_PCT_missing_SatO2, less_5_PCT_missing_FC)

# Accepted patients split by group.
file_patient_name_DETERIORO_P1 <- intersect(valid_patients_P1, file_patient_name_DETERIORO)
file_patient_name_NO_DETERIORO_P1 <- intersect(valid_patients_P1, file_patient_name_NO_DETERIORO)

# Sizes after the first criterion: all accepted, NO DETERIORO, DETERIORO.
group_sizes_P1 <- c(
  length(valid_patients_P1),
  length(file_patient_name_NO_DETERIORO_P1),
  length(file_patient_name_DETERIORO_P1)
)

# One-row summary table; the last cell is the DETERIORO / NO DETERIORO ratio.
total_patients_info_P1 <- data.frame(
  matrix(c(group_sizes_P1, group_sizes_P1[3] / group_sizes_P1[2]), nrow = 1)
)
colnames(total_patients_info_P1) <- c("TOTAL PATIENTS P1","NO DETERIORO P1","DETERIORO P1", "RATIO [DETERIORO / NO DETERIORO] P1")
kable(total_patients_info_P1)

TOTAL PATIENTS P1	NO DETERIORO P1	DETERIORO P1	RATIO [DETERIORO / NO DETERIORO] P1
68	54	14	0.2592593

# Save the patients accepted by the first criterion for later steps.
write.xlsx(valid_patients_P1,"../data/clean-data/valid_patients_P1.xlsx")

2 Second Criteria

The second criterion is to select only those patients that have at most 5 % of missing values within the first 8 h, 8.5 h, …, 15 h of the recording.

It will also be checked if patients who have more than 5% of missing values in SatO2 also have them in Heart Rate, and vice versa.

# Window lengths (in hours, counted from the first row) to evaluate:
# 8, 8.5, ..., 15.
missing_hours <- seq(8, 15, by = 0.5)
n_windows <- length(missing_hours)

# One slot per window:
#   missing_hours_list_FC / _SatO2 - per-patient missing-value summaries
#   patients_NaN_same_p2           - TRUE when FC and SatO2 select the same patients
#   patients_NaN_diverge_p2        - patients selected by only one signal
#   less_PCT_missing_list_p2       - patients selected by at least one signal
missing_hours_list_FC <- vector("list", n_windows)
missing_hours_list_SatO2 <- vector("list", n_windows)
patients_NaN_same_p2 <- logical(n_windows)
patients_NaN_diverge_p2 <- vector("list", n_windows)
less_PCT_missing_list_p2 <- vector("list", n_windows)

# Row names of the transposed recordings (one row per column of the original
# tables); used to turn the case index returned by miss_case_summary() back
# into a patient identifier. They do not depend on the window, so they are
# computed once.
patient_ids_FC <- rownames(data.frame(t(FC_all_patients)))
patient_ids_SatO2 <- rownames(data.frame(t(SatO2_all_patients)))

for (i in seq_along(missing_hours)) {
  # 60 rows are taken per hour.
  # NOTE(review): this assumes one sample per minute - confirm.
  window_rows <- seq_len(missing_hours[i] * 60)

  # Summaries are kept in window_* variables on purpose: writing them to
  # `missing_FC` / `missing_SatO2` (as before) replaced the summaries loaded
  # from disk, so later look-ups on those tables silently reported the values
  # of the last (15 h) window.
  window_FC <- as.data.frame(
    miss_case_summary(data.frame(t(FC_all_patients[window_rows, ])))
  )
  window_FC$case <- patient_ids_FC[window_FC$case]
  colnames(window_FC) <- c("Patient", "N_Miss", "PCT_Miss")

  window_SatO2 <- as.data.frame(
    miss_case_summary(data.frame(t(SatO2_all_patients[window_rows, ])))
  )
  window_SatO2$case <- patient_ids_SatO2[window_SatO2$case]
  colnames(window_SatO2) <- c("Patient", "N_Miss", "PCT_Miss")

  assign(paste0("missing_SatO2_", i), window_SatO2)
  assign(paste0("missing_FC_", i), window_FC)
  missing_hours_list_FC[[i]] <- window_FC
  missing_hours_list_SatO2[[i]] <- window_SatO2

  # Row of every patient in the window summaries. match() returns NA for an
  # absent patient; the previous `if (match(...))` test stopped with an error
  # in that case, so absent patients are now reported and not selected.
  pos_FC <- match(file_patient_name, window_FC$Patient)
  pos_SatO2 <- match(file_patient_name, window_SatO2$Patient)
  not_found <- file_patient_name[is.na(pos_FC) | is.na(pos_SatO2)]
  if (length(not_found) > 0) {
    warning(
      "Patients without missing-value information in the first ",
      missing_hours[i], " h: ", paste(not_found, collapse = ", "),
      call. = FALSE
    )
  }

  # Patients with at most 5 % of missing values in this window; which() drops
  # the NA comparisons produced by patients that were not found.
  vec_NaN_FC <- file_patient_name[which(window_FC$PCT_Miss[pos_FC] <= 5)]
  vec_NaN_SatO2 <- file_patient_name[which(window_SatO2$PCT_Miss[pos_SatO2] <= 5)]

  # Later chunks read these by name, e.g. less_PCT_missing_FC_in_10.
  assign(paste0("less_PCT_missing_FC_in_", missing_hours[i]), vec_NaN_FC)
  assign(paste0("less_PCT_missing_SatO2_in_", missing_hours[i]), vec_NaN_SatO2)

  patients_NaN_same_p2[i] <- setequal(vec_NaN_FC, vec_NaN_SatO2)
  # `[i] <- list(...)` keeps the slot even when the value is empty/NULL, so
  # list positions always line up with missing_hours.
  patients_NaN_diverge_p2[i] <- list(symdiff(vec_NaN_FC, vec_NaN_SatO2))
  less_PCT_missing_list_p2[i] <- list(union(vec_NaN_FC, vec_NaN_SatO2))
}

2.1 Preparing the plot: Second Criteria

# Group label and window length for every entry of Values_NaN_p2, which is
# filled per window in the order TOTAL, NO_DETERIORO, DETERIORO. The labels
# follow that same order (they used to swap the two groups) and are repeated
# once per window, so the three plotting vectors have equal length and no rows
# are duplicated by recycling.
Type_NaN_p2 <- rep(c("All Patients", "No Deterioro", "Deterioro"), times = length(missing_hours))
# `each` takes a single number; `3:1` only used its first element and warned.
X_NaN_values_p2 <- rep(missing_hours, each = 3)
# For every window, count the selected patients: overall, in the NO DETERIORO
# group and in the DETERIORO group (in that order). vapply() returns one
# column per window, so flattening the matrix yields the counts window by
# window.
Values_NaN_p2 <- as.vector(vapply(
  seq_along(missing_hours),
  function(k) {
    selected <- less_PCT_missing_list_p2[[k]]
    c(
      length(selected),
      length(intersect(selected, file_patient_name_NO_DETERIORO)),
      length(intersect(selected, file_patient_name_DETERIORO))
    )
  },
  integer(3)
))

# Long-format table consumed by the plot below.
Plot_NaN_values_p2 <- data.frame(Values_NaN_p2, Type_NaN_p2, X_NaN_values_p2, check.names = FALSE)

# y axis: number of patients in the study; colour: patient group (type).
# The x axis holds the window lengths from missing_hours, so it is labelled in
# hours (the label and title used to be copied from the first-criteria plot
# and described the axis as a percentage of NaN).
ggplot(data = Plot_NaN_values_p2, aes(y = Values_NaN_p2, x = X_NaN_values_p2, color = Type_NaN_p2)) +
  geom_line() +
  geom_point() +
  geom_text(aes(label = round(Values_NaN_p2, 1)),
            vjust = "inward", hjust = "inward",
            show.legend = FALSE) +
  labs(x = "First hours considered (h)", y = "Patients in the study",
       title = "Patients with at most 5 % of NaN by number of hours considered")

Best second criteria: I will use the 5 % criteria in the first 10 h for imputing data

So the valid patients that follow the Second Criteria of 5 % in the first 10 h are:

CRITERIA 2 PATIENTS:

[1] "ACR_11231843" "ADAO_11159808" "AGG_11236448" "AHL_11239959" [5] "AJGD_11119689" "AJJ_11233049" "AMP_11228639" "AMT_11120363" [9] "APA_11204819" "ARR_11228585" "ASN_11226885" "AZM_11047760" [13] "CBA_11124187" "CGN_11234482" "DEA_11243504" "DGS_11215248" [17] "DIPDLH_11241649" "DJSD_11178309" "DVS_11231268" "DZL_11227036" [21] "ECP_11169795" "FNMM_11174240" "GGG_11156716" "GGT_11208499" [25] "GHP_11229529" "GLR_11225596" "HCC_11203216" "HDBG_11139366" [29] "IGC2_11229255" "ILG_11229582" "IPA_11147550" "IRL_11034760" [33] "JJB_11182744" "JNM_11242584" "JPD_11209658" "JPT_11236205" [37] "JQA_11091598" "KSBS_11201840" "LMF_11116324" "LMM_11139982" [41] "LMP_11060996" "LMP2_11060996" "LMS_11228310" "LVBB_11135653" [45] "MA_11216747" "MFEH_11191624" "MMB_11205362" "MSPJ_11164541" [49] "MTG_11220400" "NEH_11181855" "NSM_11223800" "PBO_11129516" [53] "PGA_11180136" "PGF_11242386" "PMR_11230016" "RBJ_11163775" [57] "RGFGM_11156248" "RVS_11034257" "SGG_11181506" "SLF_11214212" [61] "SMG_11123019" "SPB_11241570" "SPM_11222444" "SSF_11207023" [65] "TCD_11245595" "TGJ_11200052" "VCR_11203302" "YPR_11188252" [69] "YVR_11188465"

# Patients accepted by the second criterion (5 % in the first 10 h): selected
# for SatO2, for heart rate (FC), or for both.
valid_patients_P2 <- union(less_PCT_missing_SatO2_in_10, less_PCT_missing_FC_in_10)

# Accepted patients split by group.
file_patient_name_DETERIORO_P2 <- intersect(valid_patients_P2, file_patient_name_DETERIORO)
file_patient_name_NO_DETERIORO_P2 <- intersect(valid_patients_P2, file_patient_name_NO_DETERIORO)

# Sizes after the second criterion: all accepted, NO DETERIORO, DETERIORO.
# The NO DETERIORO count now comes from the P2 split; it used to reuse the P1
# vector (and a "P1" header), so the table disagreed with its own ratio.
group_sizes_P2 <- c(
  length(valid_patients_P2),
  length(file_patient_name_NO_DETERIORO_P2),
  length(file_patient_name_DETERIORO_P2)
)

# One-row summary table; the last cell is the DETERIORO / NO DETERIORO ratio.
total_patients_info_P2 <- data.frame(
  matrix(c(group_sizes_P2, group_sizes_P2[3] / group_sizes_P2[2]), nrow = 1)
)
colnames(total_patients_info_P2) <- c("TOTAL PATIENTS P2", "NO DETERIORO P2", "DETERIORO P2", "RATIO [DETERIORO / NO DETERIORO] P2")
kable(total_patients_info_P2)

TOTAL PATIENTS P2	NO DETERIORO P1	DETERIORO P2	RATIO [DETERIORO / NO DETERIORO] P2
69	54	16	0.3018868

# Save the patients accepted by the second criterion for later steps.
write.xlsx(valid_patients_P2,"../data/clean-data/valid_patients_P2.xlsx")

3 Let's study the discrepancy between patients

3.1 First Criteria

Let's study the discrepancy between patients by inspecting patients_NaN_same and patients_NaN_diverge.

# One flag per threshold in missing_PCT: TRUE when heart rate and SatO2 select
# exactly the same patients.
print(patients_NaN_same)

## [1] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

There is at least one patient that has more than 5 % of missing values in one variable but no more than 5 % in the other. Let's find out who it is.

# Patients selected at 5 % by only one of the two signals. symdiff() looks in
# both directions, as the second-criteria checks do; the previous setdiff()
# could only reveal patients selected for SatO2 but not for heart rate.
symdiff(less_5_PCT_missing_SatO2, less_5_PCT_missing_FC)

## [1] "VAM_11160159"

# Two data sets diverge in VAM_11160159

missing_SatO2

# PCT_Miss recorded for VAM_11160159 in the missing_SatO2 table.
with(missing_SatO2, PCT_Miss[match("VAM_11160159", Patient)])

## [1] 7.666667

missing_FC

# PCT_Miss recorded for VAM_11160159 in the missing_FC table.
with(missing_FC, PCT_Miss[match("VAM_11160159", Patient)])

## [1] 8.555556

I am going to use it since (5.343 + 4.788)/2 ≈ 5.07, i.e. approximately 5 %.

3.2 Second Criteria

It is important to study patients_NaN_same_p2 so I can see in which situation a patient has less than 5% NaN values in one variable [SatO2 or Heart Rate] but not in the other.

# One flag per window in missing_hours: TRUE when heart rate and SatO2 select
# exactly the same patients.
patients_NaN_same_p2

##  [1]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [13]  TRUE FALSE FALSE

The positions where exists a discrepancy are:

# Indices into missing_hours of the windows where the two signals disagree.
print(which(!patients_NaN_same_p2))

[1] 5 8 9 10 11 14 15

# One sentence per window in which heart rate and SatO2 disagree, naming the
# patients selected by only one of the two signals. The patients are collapsed
# into a single comma-separated list so that a window with several diverging
# patients still yields one sentence, and the wording now states the actual
# condition (at most 5 % in one variable, more in the other).
for (i in which(!patients_NaN_same_p2)) {
  cat(
    "- In the first ", missing_hours[i],
    " h, the patients that have at most 5 % of NaN in one variable but more in the other are: ",
    paste(patients_NaN_diverge_p2[[i]], collapse = ", "),
    ".\n",
    sep = ""
  )
}

In the first: 10h, the patients that have more than 5 % of NaN in one varible but more in the other are: ASN_11226885.
In the first: 11.5h, the patients that have more than 5 % of NaN in one varible but more in the other are: ASN_11226885.
In the first: 12h, the patients that have more than 5 % of NaN in one varible but more in the other are: ASN_11226885.
In the first: 12.5h, the patients that have more than 5 % of NaN in one varible but more in the other are: ASN_11226885.
In the first: 13h, the patients that have more than 5 % of NaN in one varible but more in the other are: ASN_11226885.
In the first: 14.5h, the patients that have more than 5 % of NaN in one varible but more in the other are: MSMM_11239970.
In the first: 15h, the patients that have more than 5 % of NaN in one varible but more in the other are: MSMM_11239970.

# Header naming the window stored at position 5 of missing_hours.
cat("- In the first ", missing_hours[5], " h", sep = "")

In the first 10 h

# Compare the heart-rate and SatO2 selections for the first 10 h.
# symdiff() is not defined in this file; presumably it returns the patients
# present in only one of the two vectors (symmetric difference) - confirm.
symdiff(less_PCT_missing_FC_in_10, less_PCT_missing_SatO2_in_10)

## [1] "ASN_11226885"

# Two data sets diverge in ASN_11226885

missing_SatO2

# PCT_Miss of ASN_11226885 in the SatO2 summary of window 5 (missing_hours[5]).
with(missing_hours_list_SatO2[[5]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 5

missing_Heart_Rate

# PCT_Miss of ASN_11226885 in the heart-rate summary of window 5 (missing_hours[5]).
with(missing_hours_list_FC[[5]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 6

# Header naming the window stored at position 8 of missing_hours.
cat("- In the first ", missing_hours[8], " h", sep = "")

In the first 11.5 h

# Compare the heart-rate and SatO2 selections for the first 11.5 h.
# symdiff() is not defined in this file; presumably it returns the patients
# present in only one of the two vectors (symmetric difference) - confirm.
symdiff(less_PCT_missing_FC_in_11.5, less_PCT_missing_SatO2_in_11.5)

## [1] "ASN_11226885"

# Two data sets diverge in ASN_11226885

missing_SatO2

# PCT_Miss of ASN_11226885 in the SatO2 summary of window 8 (missing_hours[8]).
with(missing_hours_list_SatO2[[8]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 4.927536

missing_Heart_Rate

# PCT_Miss of ASN_11226885 in the heart-rate summary of window 8 (missing_hours[8]).
with(missing_hours_list_FC[[8]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 5.797101

# Header naming the window stored at position 9 of missing_hours.
cat("- In the first ", missing_hours[9], " h", sep = "")

In the first 12 h

# Compare the heart-rate and SatO2 selections for the first 12 h.
# symdiff() is not defined in this file; presumably it returns the patients
# present in only one of the two vectors (symmetric difference) - confirm.
symdiff(less_PCT_missing_FC_in_12, less_PCT_missing_SatO2_in_12)

## [1] "ASN_11226885"

# Two data sets diverge in ASN_11226885

missing_SatO2

# PCT_Miss of ASN_11226885 in the SatO2 summary of window 9 (missing_hours[9]).
with(missing_hours_list_SatO2[[9]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 4.722222

missing_Heart_Rate

# PCT_Miss of ASN_11226885 in the heart-rate summary of window 9 (missing_hours[9]).
with(missing_hours_list_FC[[9]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 5.555556

# Header naming the window stored at position 10 of missing_hours.
cat("- In the first ", missing_hours[10], " h", sep = "")

In the first 12.5 h

# Compare the heart-rate and SatO2 selections for the first 12.5 h.
# symdiff() is not defined in this file; presumably it returns the patients
# present in only one of the two vectors (symmetric difference) - confirm.
symdiff(less_PCT_missing_FC_in_12.5, less_PCT_missing_SatO2_in_12.5)

## [1] "ASN_11226885"

# Two data sets diverge in ASN_11226885

missing_SatO2

# PCT_Miss of ASN_11226885 in the SatO2 summary of window 10 (missing_hours[10]).
with(missing_hours_list_SatO2[[10]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 4.533333

missing_Heart_Rate

# PCT_Miss of ASN_11226885 in the heart-rate summary of window 10 (missing_hours[10]).
with(missing_hours_list_FC[[10]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 5.333333

# Header naming the window stored at position 11 of missing_hours.
cat("- In the first ", missing_hours[11], " h", sep = "")

In the first 13 h

# Compare the heart-rate and SatO2 selections for the first 13 h.
# symdiff() is not defined in this file; presumably it returns the patients
# present in only one of the two vectors (symmetric difference) - confirm.
symdiff(less_PCT_missing_FC_in_13, less_PCT_missing_SatO2_in_13)

## [1] "ASN_11226885"

# Two data sets diverge in ASN_11226885

missing_SatO2

# PCT_Miss of ASN_11226885 in the SatO2 summary of window 11 (missing_hours[11]).
with(missing_hours_list_SatO2[[11]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 4.358974

missing_Heart_Rate

# PCT_Miss of ASN_11226885 in the heart-rate summary of window 11 (missing_hours[11]).
with(missing_hours_list_FC[[11]], PCT_Miss[match("ASN_11226885", Patient)])

## [1] 5.128205

# Header naming the window stored at position 14 of missing_hours.
cat("- In the first ", missing_hours[14], " h", sep = "")

In the first 14.5 h

# Compare the heart-rate and SatO2 selections for the first 14.5 h.
# symdiff() is not defined in this file; presumably it returns the patients
# present in only one of the two vectors (symmetric difference) - confirm.
symdiff(less_PCT_missing_FC_in_14.5, less_PCT_missing_SatO2_in_14.5)

## [1] "MSMM_11239970"

# Two data sets diverge in MSMM_11239970

missing_SatO2

# PCT_Miss of MSMM_11239970 in the SatO2 summary of window 14 (missing_hours[14]).
with(missing_hours_list_SatO2[[14]], PCT_Miss[match("MSMM_11239970", Patient)])

## [1] 4.942529

missing_Heart_Rate

# PCT_Miss of MSMM_11239970 in the heart-rate summary of window 14 (missing_hours[14]).
with(missing_hours_list_FC[[14]], PCT_Miss[match("MSMM_11239970", Patient)])

## [1] 5.862069

# Header naming the window stored at position 15 of missing_hours.
cat("- In the first ", missing_hours[15], " h", sep = "")

In the first 15 h

# Compare the heart-rate and SatO2 selections for the first 15 h.
# symdiff() is not defined in this file; presumably it returns the patients
# present in only one of the two vectors (symmetric difference) - confirm.
symdiff(less_PCT_missing_FC_in_15, less_PCT_missing_SatO2_in_15)

## [1] "MSMM_11239970"

# Two data sets diverge in MSMM_11239970

missing_SatO2

# PCT_Miss of MSMM_11239970 in the SatO2 summary of window 15 (missing_hours[15]).
with(missing_hours_list_SatO2[[15]], PCT_Miss[match("MSMM_11239970", Patient)])

## [1] 4.777778

missing_Heart_Rate

# PCT_Miss of MSMM_11239970 in the heart-rate summary of window 15 (missing_hours[15]).
with(missing_hours_list_FC[[15]], PCT_Miss[match("MSMM_11239970", Patient)])

## [1] 5.666667

Generating different Data Sets For Clustering Purposes

Gonzalo Aris 16021

2023-02-11

0.0.0.0.1 Missing values in Heart Rate

0.0.0.0.2 Missing values in SatO2

1 First criteria

1.1 Preparing the plot: First Criteria

2 Second Criteria

2.1 Preparing the plot: Second Criteria

3 Let's study the discrepancy between patients

3.1 First Criteria

3.2 Second Criteria