Libraries
library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)
## Knn imputation
library(caret)
library(RANN)
Imputation based on: https://rpubs.com/garisj98/TimeSeriesImputation
Reading Data
# Accumulators for the patients whose series have at most 5 % missing values;
# they are filled by the threshold loops further down.
less_5_PCT_missing_FC <- c()
less_5_PCT_missing_SatO2 <- c()

# Missing-value summaries (columns Patient and PCT_Miss are used below) for
# heart rate (FC) and SatO2.
missing_FC <- data.frame(read_xlsx("../data/missing-info/missing_FC.xlsx"))
missing_SatO2 <- data.frame(read_xlsx("../data/missing-info/missing_SatO2.xlsx"))

# Measurement tables; they are indexed by patient name as column further down.
FC_all_patients <- data.frame(
  read_xlsx("../data/Fc_&_SatO2/FC_all_patients.xlsx")
)
SatO2_all_patients <- data.frame(
  read_xlsx("../data/Fc_&_SatO2/SatO2_all_patients.xlsx")
)

# Reading patient names (kept as a plain vector taken from column `x`)
file_patient_name <- read.csv("../deploying/data/file_patient_name.csv")
file_patient_name <- file_patient_name$x

## Reading the patient names of the NO_DETERIORO and DETERIORO groups
file_patient_name_NO_DETERIORO <- data.frame(
  read_xlsx("../data/clean-data/file_patient_name_NO_DETERIORO.xlsx")
)
file_patient_name_NO_DETERIORO <- file_patient_name_NO_DETERIORO$x
file_patient_name_DETERIORO <- data.frame(
  read_xlsx("../data/clean-data/file_patient_name_DETERIORO.xlsx")
)
file_patient_name_DETERIORO <- file_patient_name_DETERIORO$x
Let's study for which patients it is possible to impute data. The condition is to have less than 5% of missing data.
The 5% missing-value condition is checked separately for Heart Rate and SatO2. Afterwards, it is checked whether patients who have more than 5% missing values in one variable also exceed 5% in the other.
# Collect the patients whose heart-rate (FC) series has at most 5 % missing
# values, appending them to less_5_PCT_missing_FC.
for (name_patient in file_patient_name) {
  # Look the patient up once; match() returns NA (not FALSE) when the name is
  # absent, so the lookup must be tested with is.na() rather than used
  # directly as an if() condition.
  pos <- match(name_patient, missing_FC$Patient)
  if (is.na(pos)) {
    # Patient has no entry in the FC missing-value summary.
    print("Error")
  } else if (isTRUE(missing_FC$PCT_Miss[pos] <= 5)) {
    # isTRUE() treats an NA percentage as "not under the threshold" instead of
    # aborting the loop.
    less_5_PCT_missing_FC <- append(less_5_PCT_missing_FC, name_patient)
  }
}
# Collect the patients whose SatO2 series has at most 5 % missing values,
# appending them to less_5_PCT_missing_SatO2.
for (name_patient in file_patient_name) {
  # Look the patient up once; match() returns NA (not FALSE) when the name is
  # absent, so the lookup must be tested with is.na() rather than used
  # directly as an if() condition.
  pos <- match(name_patient, missing_SatO2$Patient)
  if (is.na(pos)) {
    # Patient has no entry in the SatO2 missing-value summary.
    print("Error")
  } else if (isTRUE(missing_SatO2$PCT_Miss[pos] <= 5)) {
    # isTRUE() treats an NA percentage as "not under the threshold" instead of
    # aborting the loop.
    less_5_PCT_missing_SatO2 <- append(less_5_PCT_missing_SatO2, name_patient)
  }
}
Do patients who have less than 5% of missing values in Heart Rate also have less than 5% in SatO2, and vice versa?
# TRUE only when both variables select exactly the same set of patients.
setequal(less_5_PCT_missing_FC, less_5_PCT_missing_SatO2)
## [1] FALSE
# [1] FALSE
There is at least one patient who has more than 5% missing values in one variable but less than 5% in the other. Let's find out who it is.
# Patients under the threshold for SatO2 but not for heart rate (FC).
setdiff(less_5_PCT_missing_SatO2, less_5_PCT_missing_FC)
## [1] "VAM_11160159"
# The two lists diverge in VAM_11160159; look up its missing percentage in
# each summary table.
divergent_patient <- "VAM_11160159"
missing_SatO2
missing_SatO2$PCT_Miss[match(divergent_patient, missing_SatO2$Patient)]
## [1] 4.788341
missing_FC
missing_FC$PCT_Miss[match(divergent_patient, missing_FC$Patient)]
## [1] 5.343511
I am going to keep this patient, since the average of the two percentages, (5.343 + 4.788)/2 ≈ 5.07, is approximately 5%.
# A patient is eligible for imputation when at least one of the two variables
# passed the 5 % threshold (union of both lists).
valid_patients_for_imputing_data <- union(
  less_5_PCT_missing_SatO2,
  less_5_PCT_missing_FC
)

# Split the eligible patients by group.
valid_patients_for_imputing_data_DETERIORO <- intersect(
  valid_patients_for_imputing_data,
  file_patient_name_DETERIORO
)
valid_patients_for_imputing_data_NO_DETERIORO <- intersect(
  valid_patients_for_imputing_data,
  file_patient_name_NO_DETERIORO
)

# Show both groups as interactive tables (DETERIORO first).
reactable(data.frame(valid_patients_for_imputing_data_DETERIORO))
reactable(data.frame(valid_patients_for_imputing_data_NO_DETERIORO))
For imputation, I will distinguish between DETERIORO and NOT DETERIORO dataframes.
# kNN imputation with caret, fitted separately for the DETERIORO and
# NO_DETERIORO groups and for each variable. Each preProcess() model is fitted
# on, and then applied to, the same patient columns.
# NOTE(review): per the caret documentation, method = "knnImpute" also centers
# and scales the data, so the predicted values are presumably standardized
# rather than in the original measurement units -- confirm this is what the
# downstream consumers of the imputed tables expect.

## Heart Rate (DETERIORO)
FC_DETERIORO_KNN_input <- preProcess(
  FC_all_patients[, valid_patients_for_imputing_data_DETERIORO],
  method = "knnImpute"
)
FC_DETERIORO_KNN_input_pred <- predict(
  FC_DETERIORO_KNN_input,
  FC_all_patients[, valid_patients_for_imputing_data_DETERIORO]
)

## SatO2 (DETERIORO)
SatO2_DETERIORO_KNN_input <- preProcess(
  SatO2_all_patients[, valid_patients_for_imputing_data_DETERIORO],
  method = "knnImpute"
)
SatO2_DETERIORO_KNN_input_pred <- predict(
  SatO2_DETERIORO_KNN_input,
  SatO2_all_patients[, valid_patients_for_imputing_data_DETERIORO]
)

## Heart Rate (NO_DETERIORO)
FC_NO_DETERIORO_KNN_input <- preProcess(
  FC_all_patients[, valid_patients_for_imputing_data_NO_DETERIORO],
  method = "knnImpute"
)
FC_NO_DETERIORO_KNN_input_pred <- predict(
  FC_NO_DETERIORO_KNN_input,
  FC_all_patients[, valid_patients_for_imputing_data_NO_DETERIORO]
)

## SatO2 (NO_DETERIORO)
SatO2_NO_DETERIORO_KNN_input <- preProcess(
  SatO2_all_patients[, valid_patients_for_imputing_data_NO_DETERIORO],
  method = "knnImpute"
)
SatO2_NO_DETERIORO_KNN_input_pred <- predict(
  SatO2_NO_DETERIORO_KNN_input,
  SatO2_all_patients[, valid_patients_for_imputing_data_NO_DETERIORO]
)

# Put both groups side by side again: DETERIORO columns first, then
# NO_DETERIORO columns.
FC_all_valid_patients_input <- data.frame(
  FC_DETERIORO_KNN_input_pred,
  FC_NO_DETERIORO_KNN_input_pred
)
SatO2_all_valid_patients_input <- data.frame(
  SatO2_DETERIORO_KNN_input_pred,
  SatO2_NO_DETERIORO_KNN_input_pred
)
Let's create a data frame that identifies, for each patient, which observations were originally missing (NA).
# Logical masks marking which of the original (pre-imputation) values were NA,
# restricted to the patients present in the imputed tables.

# Heart rate (FC)
FC_mask_columns <- colnames(FC_all_valid_patients_input)
FC_all_valid_patients_Binary_Mask_NA <- data.frame(
  is.na(FC_all_patients[, FC_mask_columns])
)
# Reset the column names to those of the original table.
colnames(FC_all_valid_patients_Binary_Mask_NA) <-
  colnames(FC_all_patients[, FC_mask_columns])
reactable(FC_all_valid_patients_Binary_Mask_NA)

# SatO2
SatO2_mask_columns <- colnames(SatO2_all_valid_patients_input)
SatO2_all_valid_patients_Binary_Mask_NA <- data.frame(
  is.na(SatO2_all_patients[, SatO2_mask_columns])
)
colnames(SatO2_all_valid_patients_Binary_Mask_NA) <-
  colnames(SatO2_all_patients[, SatO2_mask_columns])
reactable(SatO2_all_valid_patients_Binary_Mask_NA)
Are the patients in Heart Rate and SatO2 the same?
# Patients present in the imputed heart-rate (FC) table but absent from the
# imputed SatO2 table; character(0) means there are none.
FC_imputed_patients <- colnames(FC_all_valid_patients_input)
SatO2_imputed_patients <- colnames(SatO2_all_valid_patients_input)
setdiff(FC_imputed_patients, SatO2_imputed_patients)
## character(0)
# Write imputed data for later use
# Each workbook is built from a named list: the imputed values plus the
# logical mask of originally missing entries.
# write.xlsx() is namespace-qualified because both `xlsx` and `openxlsx` are
# attached and each exports a function with this name; only the openxlsx
# version accepts a named list of data frames, so the call must not depend on
# the order of the library() calls.
list_of_datasets1 <- list(
  "FC_all_valid_patients_input" = FC_all_valid_patients_input,
  "Binary_Mask_NA" = FC_all_valid_patients_Binary_Mask_NA
)
openxlsx::write.xlsx(
  list_of_datasets1,
  "../data/clean-data/FC_all_valid_patients_input.xlsx",
  colNames = TRUE
)

list_of_datasets2 <- list(
  "SatO2_all_valid_patients_input" = SatO2_all_valid_patients_input,
  "Binary_Mask_NA" = SatO2_all_valid_patients_Binary_Mask_NA
)
openxlsx::write.xlsx(
  list_of_datasets2,
  "../data/clean-data/SatO2_all_valid_patients_input.xlsx",
  colNames = TRUE
)

## For deploying
# Same two workbooks, written a second time into the deployment data folder.
openxlsx::write.xlsx(
  list_of_datasets1,
  "../deploying/data/FC_all_valid_patients_input.xlsx",
  colNames = TRUE
)
openxlsx::write.xlsx(
  list_of_datasets2,
  "../deploying/data/SatO2_all_valid_patients_input.xlsx",
  colNames = TRUE
)
YPR_11188252
# Graph
# Assemble, for one patient, the imputed series, the mask of originally
# missing points and the time column used for plotting.
patient_name <- "YPR_11188252"

# The data frames are built column by column instead of through cbind():
# cbind() first creates a matrix, which forces the values, the logical mask
# and the time column into one common type (e.g. everything becomes character
# when time is text, or a date-time loses its class).
graph_data_FC <- data.frame(
  FC = FC_all_valid_patients_input[, patient_name],
  Is_na = FC_all_valid_patients_Binary_Mask_NA[, patient_name],
  time = FC_all_patients$time
)
graph_data_SatO2 <- data.frame(
  SatO2 = SatO2_all_valid_patients_input[, patient_name],
  Is_na = SatO2_all_valid_patients_Binary_Mask_NA[, patient_name],
  time = SatO2_all_patients$time
)
# Heart-rate (FC) series of the selected patient. Points that were missing in
# the original data (Is_na) are drawn in green, observed points in black.
# The labels name the method actually used to fill the gaps in this document
# (caret's kNN imputation), not an interpolation method.
ggplot(graph_data_FC, aes(x = time, y = FC)) +
  geom_line(color = "black") +
  xlab("") +
  geom_point(
    color = ifelse(graph_data_FC$Is_na == TRUE, "#69b3a2", "black")
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Imputation - FC", subtitle = "kNN imputation (caret)")
# SatO2 series of the selected patient. Points that were missing in the
# original data (Is_na) are drawn in green, observed points in black.
# The labels name the method actually used to fill the gaps in this document
# (caret's kNN imputation), not an interpolation method.
ggplot(graph_data_SatO2, aes(x = time, y = SatO2)) +
  geom_line(color = "black") +
  xlab("") +
  geom_point(
    color = ifelse(graph_data_SatO2$Is_na == TRUE, "#69b3a2", "black")
  ) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Imputation - SatO2", subtitle = "kNN imputation (caret)")