Libraries

library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)

## Knn imputation
library(caret)
library(RANN)

Imputation based on: https://rpubs.com/garisj98/TimeSeriesImputation

Reading Data

# Accumulators for the patients that pass the missing-data threshold. They
# start out empty (NULL) and are filled further down.
less_5_PCT_missing_FC <- NULL
less_5_PCT_missing_SatO2 <- NULL

# Missing-value summaries, one workbook per variable.
missing_FC <- data.frame(
  read_xlsx("../data/missing-info/missing_FC.xlsx")
)
missing_SatO2 <- data.frame(
  read_xlsx("../data/missing-info/missing_SatO2.xlsx")
)

# Heart-rate (FC) and SatO2 workbooks covering all patients.
FC_all_patients <- data.frame(
  read_xlsx("../data/Fc_&_SatO2/FC_all_patients.xlsx")
)
SatO2_all_patients <- data.frame(
  read_xlsx("../data/Fc_&_SatO2/SatO2_all_patients.xlsx")
)

# Patient names: only the `x` column of the CSV is kept, as a plain vector.
file_patient_name <- read.csv("../deploying/data/file_patient_name.csv")$x

# Patient names split by group (NO_DETERIORO / DETERIORO); again only the `x`
# column of each workbook is kept.
file_patient_name_NO_DETERIORO <- data.frame(
  read_xlsx("../data/clean-data/file_patient_name_NO_DETERIORO.xlsx")
)$x
file_patient_name_DETERIORO <- data.frame(
  read_xlsx("../data/clean-data/file_patient_name_DETERIORO.xlsx")
)$x

1. Previous Study

Let's determine for which patients it is possible to impute data. The condition is to have no more than 5% missing data.

The 5% missing-value condition is checked separately for Heart Rate and for SatO2. Afterwards we check whether the patients who exceed 5% missing values in one variable also exceed it in the other.

# Collect the patients whose heart-rate (FC) entry in missing_FC reports at
# most 5% missing values (PCT_Miss <= 5).
for (name_patient in file_patient_name) {
  # match() returns NA (never 0 or FALSE) when the patient is not listed, so
  # the lookup must be tested with is.na(). Using the raw result as an `if`
  # condition stops with an error instead of reaching the fallback branch.
  pos <- match(name_patient, missing_FC$Patient)

  if (is.na(pos)) {
    print("Error")
    next
  }

  if (missing_FC$PCT_Miss[pos] <= 5) {
    less_5_PCT_missing_FC <- append(less_5_PCT_missing_FC, name_patient)
  }
}

# Collect the patients whose SatO2 entry in missing_SatO2 reports at most 5%
# missing values (PCT_Miss <= 5).
for (name_patient in file_patient_name) {
  # match() returns NA (never 0 or FALSE) when the patient is not listed, so
  # the lookup must be tested with is.na(). Using the raw result as an `if`
  # condition stops with an error instead of reaching the fallback branch.
  pos <- match(name_patient, missing_SatO2$Patient)

  if (is.na(pos)) {
    print("Error")
    next
  }

  if (missing_SatO2$PCT_Miss[pos] <= 5) {
    less_5_PCT_missing_SatO2 <- append(less_5_PCT_missing_SatO2, name_patient)
  }
}

Do patients who have less than 5% of missing values in Heart Rate also have them in SatO2 and vice versa?

# Do both variables select exactly the same set of patients?
# (setequal() is symmetric, so the argument order does not matter.)
setequal(
  less_5_PCT_missing_FC,
  less_5_PCT_missing_SatO2
)
## [1] FALSE

There is at least one patient who has more than 5% missing values in one variable but less than 5% in the other. Let's find out who it is.

# Patients selected for SatO2 that are absent from the heart-rate selection.
unique(
  less_5_PCT_missing_SatO2[
    !(less_5_PCT_missing_SatO2 %in% less_5_PCT_missing_FC)
  ]
)
## [1] "VAM_11160159"
# The two selections differ in patient VAM_11160159.

# Show the full SatO2 missing-value table, then the percentage of missing
# values recorded for the patient on which the two selections disagree.
missing_SatO2

local({
  row_SatO2 <- match("VAM_11160159", missing_SatO2$Patient)
  missing_SatO2$PCT_Miss[row_SatO2]
})
## [1] 4.788341

# Same inspection for the heart-rate table.
missing_FC

local({
  row_FC <- match("VAM_11160159", missing_FC$Patient)
  missing_FC$PCT_Miss[row_FC]
})
## [1] 5.343511

I am going to include this patient anyway, since the average of both percentages, (5.343 + 4.788)/2 ≈ 5.07, is approximately 5%.

# Keep every patient that stays under the threshold in at least one of the two
# variables: the SatO2 selection first, then any additional heart-rate entries.
valid_patients_for_imputing_data <- union(
  less_5_PCT_missing_SatO2,
  less_5_PCT_missing_FC
)

Valid DETERIORO PATIENTS

# Valid patients that also appear in the DETERIORO name list, shown as an
# interactive table.
valid_patients_for_imputing_data_DETERIORO <- intersect(
  valid_patients_for_imputing_data,
  file_patient_name_DETERIORO
)
reactable(data.frame(valid_patients_for_imputing_data_DETERIORO))

Valid NO DETERIORO PATIENTS

# Valid patients that also appear in the NO_DETERIORO name list, shown as an
# interactive table.
valid_patients_for_imputing_data_NO_DETERIORO <- intersect(
  valid_patients_for_imputing_data,
  file_patient_name_NO_DETERIORO
)
reactable(data.frame(valid_patients_for_imputing_data_NO_DETERIORO))

2. Imputing Data: KNN

For imputation, I will distinguish between DETERIORO and NOT DETERIORO dataframes.

Deterioro imputation

## Heart Rate
FC_DETERIORO_KNN_input = preProcess(FC_all_patients[,valid_patients_for_imputing_data_DETERIORO], "knnImpute")
FC_DETERIORO_KNN_input_pred = predict(FC_DETERIORO_KNN_input, FC_all_patients[,valid_patients_for_imputing_data_DETERIORO])
## SatO2
SatO2_DETERIORO_KNN_input = preProcess(SatO2_all_patients[,valid_patients_for_imputing_data_DETERIORO], "knnImpute")
SatO2_DETERIORO_KNN_input_pred = predict(SatO2_DETERIORO_KNN_input, SatO2_all_patients[,valid_patients_for_imputing_data_DETERIORO])

No Deterioro imputation

## Heart Rate
FC_NO_DETERIORO_KNN_input = preProcess(FC_all_patients[,valid_patients_for_imputing_data_NO_DETERIORO], "knnImpute")
FC_NO_DETERIORO_KNN_input_pred = predict(FC_NO_DETERIORO_KNN_input, FC_all_patients[,valid_patients_for_imputing_data_NO_DETERIORO])
## SatO2
SatO2_NO_DETERIORO_KNN_input = preProcess(SatO2_all_patients[,valid_patients_for_imputing_data_NO_DETERIORO], "knnImpute")
SatO2_NO_DETERIORO_KNN_input_pred = predict(SatO2_NO_DETERIORO_KNN_input, SatO2_all_patients[,valid_patients_for_imputing_data_NO_DETERIORO])

Merge in one dataframe

# Put the DETERIORO columns and the NO_DETERIORO columns side by side, one
# combined data frame per variable.
FC_all_valid_patients_input <- data.frame(
  FC_DETERIORO_KNN_input_pred,
  FC_NO_DETERIORO_KNN_input_pred
)
SatO2_all_valid_patients_input <- data.frame(
  SatO2_DETERIORO_KNN_input_pred,
  SatO2_NO_DETERIORO_KNN_input_pred
)

2.1 Binary Mask

Let's create a data frame identifying which patients have missing (NA) values, and in which rows.

# Heart rate: TRUE wherever the original (pre-imputation) value was NA,
# restricted to the columns present in the imputed data frame.
FC_all_valid_patients_Binary_Mask_NA <- local({
  observed_FC <- FC_all_patients[, colnames(FC_all_valid_patients_input)]
  mask_FC <- data.frame(is.na(observed_FC))
  # Re-apply the column names of the selected subset to the mask.
  colnames(mask_FC) <- colnames(observed_FC)
  mask_FC
})
reactable(FC_all_valid_patients_Binary_Mask_NA)

# SatO2: same construction.
SatO2_all_valid_patients_Binary_Mask_NA <- local({
  observed_SatO2 <-
    SatO2_all_patients[, colnames(SatO2_all_valid_patients_input)]
  mask_SatO2 <- data.frame(is.na(observed_SatO2))
  colnames(mask_SatO2) <- colnames(observed_SatO2)
  mask_SatO2
})
reactable(SatO2_all_valid_patients_Binary_Mask_NA)

Are the patients in the Heart Rate and SatO2 data frames the same?

# Heart-rate columns that have no counterpart among the SatO2 columns.
local({
  fc_patients <- colnames(FC_all_valid_patients_input)
  sato2_patients <- colnames(SatO2_all_valid_patients_input)
  unique(fc_patients[!(fc_patients %in% sato2_patients)])
})
## character(0)
# Write imputed data for later use.
# Each workbook is built from a named list with two entries: the imputed
# values and the matching NA mask.
list_of_datasets1 <- list(
  "FC_all_valid_patients_input" = FC_all_valid_patients_input,
  "Binary_Mask_NA" = FC_all_valid_patients_Binary_Mask_NA
)

list_of_datasets2 <- list(
  "SatO2_all_valid_patients_input" = SatO2_all_valid_patients_input,
  "Binary_Mask_NA" = SatO2_all_valid_patients_Binary_Mask_NA
)

# Both `xlsx` and `openxlsx` are attached and both export write.xlsx(), so an
# unqualified call depends on the order of the library() calls. Only the
# openxlsx version accepts a named list and the `colNames` argument, hence the
# explicit namespace.
# The same two workbooks go to the analysis folder and to the deploying folder.
for (output_dir in c("../data/clean-data", "../deploying/data")) {
  openxlsx::write.xlsx(
    list_of_datasets1,
    file.path(output_dir, "FC_all_valid_patients_input.xlsx"),
    colNames = TRUE
  )
  openxlsx::write.xlsx(
    list_of_datasets2,
    file.path(output_dir, "SatO2_all_valid_patients_input.xlsx"),
    colNames = TRUE
  )
}

Graphic interface

Example with one patient: YPR_11188252

# Graph
# Plot the imputed series of one patient and highlight the points that were
# missing in the original data.
patient_name <- "YPR_11188252"

# Build the plotting frames column by column. Going through cbind() first
# creates a matrix, which forces the values, the NA flags and the time column
# into one common type.
graph_data_FC <- data.frame(
  FC = FC_all_valid_patients_input[, patient_name],
  Is_na = FC_all_valid_patients_Binary_Mask_NA[, patient_name],
  time = FC_all_patients$time
)

graph_data_SatO2 <- data.frame(
  SatO2 = SatO2_all_valid_patients_input[, patient_name],
  Is_na = SatO2_all_valid_patients_Binary_Mask_NA[, patient_name],
  time = SatO2_all_patients$time
)

# `group = 1` keeps a single connected line even if `time` is not numeric.
# Imputed points are drawn in green ('#69b3a2'), observed points in black.
# The values come from the KNN imputation above, so the labels say so.
ggplot(graph_data_FC, aes(x = time, y = FC, group = 1)) +
  geom_line(color = "black") +
  xlab("") +
  geom_point(color = ifelse(graph_data_FC$Is_na, "#69b3a2", "black")) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Imputation - FC", subtitle = "KNN (caret knnImpute)")

ggplot(graph_data_SatO2, aes(x = time, y = SatO2, group = 1)) +
  geom_line(color = "black") +
  xlab("") +
  geom_point(color = ifelse(graph_data_SatO2$Is_na, "#69b3a2", "black")) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Imputation - SatO2", subtitle = "KNN (caret knnImpute)")