Libraries

library(tidyr)
library(readr)
library(ggplot2) # ggplot graphs
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)
library(naniar) # miss_case_summary

library(dplyr)

## Knn imputation
library(caret)
library(RANN)

# CLustering 
library(factoextra)    #clustering visualization
library(cluster)       #clustering algorithms
library(dendextend)    #for comparing two dendrograms
library(corrplot)      #corelation between dendrograms
library(tidyverse)     #data manupulation
library(NbClust)       #determine optimal no. of clusters  [not working...]
library(TSclust)

#RandomForest
library(randomForest) # RandomForest Discrete Classification
library(imbalance) # To create a more balanced dataset

Functions

source("../scripts/useful-functions/get_column_position.R")
# In a normal script it will be:  source("./scripts/useful-functions/get_column_position.R")

Reading Data

Time Series Data: Heart Rate and SatO2

cuantiles_TS_HR_P1 = data.frame(read_xlsx("../data/clean-data/BoxBasedImputation/cuantiles_TS_HR_valid_patients_input_P1.xlsx", sheet = "FC_valid_patients_input_P1" ))

## Deterioro and Not deterioro
file_patient_name_NO_DETERIORO <- data.frame(read_csv("../data/info-patients/file_patient_name_NO_DETERIORO.csv"))
file_patient_name_NO_DETERIORO <- file_patient_name_NO_DETERIORO$x
file_patient_name_DETERIORO <- data.frame(read_csv("../data/info-patients/file_patient_name_DETERIORO.csv"))
file_patient_name_DETERIORO <- file_patient_name_DETERIORO$x

Descriptive Data for Discriminant analysis

valid_patients_P1 <- data.frame(read_xlsx("../data/clean-data/valid_patients_P1.xlsx"))
valid_patients_P1 <- valid_patients_P1$x

file_patient_name <- data.frame(read_csv("../data/clean-data/file_patient_name.csv", show_col_types = FALSE))
file_patient_name <- file_patient_name$x
  
df1 <- data.frame(read_xlsx("../data/clean-data/descriptive-data/descriptive_data.xlsx"))
rownames(df1) <- file_patient_name
df1 <- df1[valid_patients_P1,]
df_descriptive <- data.frame(read_xlsx("../data/clean-data/descriptive-data/descriptive_data_imputed.xlsx"))
rownames(df_descriptive) <- file_patient_name
# pos_1 = get_column_position(df_descriptive, "SEXO")
# pos_2 = get_column_position(df_descriptive, "SNG")
# 
# col_names_factor <- names(df_descriptive[pos_1:pos_2])
# df_descriptive[col_names_factor] <- lapply(df_descriptive[col_names_factor] , factor)


df_descriptive <- df_descriptive[valid_patients_P1,]

df_descriptive <- df_descriptive[,-c(1:3)]

Create a dataframe with ACF [Heart Rate ]

dimension_col <- dim(cuantiles_TS_HR_P1)[2]
dimension_row <- 1440 #lag.max -1

# Heart Rate
cuantiles_TS_HR_P1_ACF <- data.frame(matrix(nrow = dimension_row, ncol = dimension_col))
colnames(cuantiles_TS_HR_P1_ACF) <- names(cuantiles_TS_HR_P1)[1:dimension_col]
for (i in names(cuantiles_TS_HR_P1_ACF)) {
  acf_result_FC <- acf(cuantiles_TS_HR_P1[[i]], lag.max = (dimension_row - 1), plot = FALSE)
  cuantiles_TS_HR_P1_ACF[, i] <- acf_result_FC$acf
}

Create a dataframe with peridiogram

library(TSA)
# Generar un dataset con varias series temporales
df <- cuantiles_TS_HR_P1

# Crear una matriz para almacenar los periodogramas
pg_mat <- data.frame(matrix(nrow = nrow(df), ncol =  ncol(df)))
colnames(pg_mat) = colnames(cuantiles_TS_HR_P1)

# Calcular el periodograma de cada serie temporal y almacenarlo en la matriz
for (i in colnames(cuantiles_TS_HR_P1)) {
  pg <- periodogram(ts(cuantiles_TS_HR_P1[,i]), plot = FALSE, normalize = TRUE, B = 500)
  pg_mat[,i] <- pg$spec
}

TsClust Comprobation

datos <- cuantiles_TS_HR_P1

diss.ACF Computes the dissimilarity between two time series as the distance between their estimated simple (ACF) or partial (PACF) autocorrelation coefficients

DD_ACF <- as.matrix(diss(datos, "ACF", lag.max = 50))

diss.EUCL

DD_EUCL <- as.matrix(diss(datos, "EUCL"))

diss.PER

DD_PER <- as.matrix(diss(datos, "PER"))

Euclidean Distance first 50 ACF

distance <- dist(t(cuantiles_TS_HR_P1_ACF[c(1:51),]), method = "euclidean")
distance_matrix_ACF <- as.matrix(distance)

Euclidean Distance

distance <- dist(datos, method = "euclidean")
distance_matrix_EUCL <- as.matrix(distance)

Eculidean PER Distance

distance <- dist(t(pg_mat), method = "euclidean")
distance_matrix_PER <- as.matrix(distance)

TSCLust in Action

ACF TSclust

DD_ACF <- diss(datos, "ACF", lag.max = 50)
hcintper_ACF <- hclust(DD_ACF, "ward.D2")
fviz_dend(hcintper_ACF, palette = "jco",
          rect = TRUE, show_labels = FALSE, k = 2)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.

DDclust_ACF <- cutree( hclust(DD_ACF, "ward.D2"), k = 2)
fviz_cluster(list(data = t(datos), cluster = DDclust_ACF))

fviz_silhouette(silhouette(DDclust_ACF, DD_ACF))
##   cluster size ave.sil.width
## 1       1   52          0.48
## 2       2   16          0.74

Contingency ACF lag.max = 50

DETERIORO_CLUST <- union(intersect(file_patient_name_DETERIORO,names_1),intersect(file_patient_name_DETERIORO,names_2))
NO_DETERIORO_CLUST <- union(intersect(file_patient_name_NO_DETERIORO,names_1),intersect(file_patient_name_NO_DETERIORO,names_2))
#DETERIORO
DETERIORO_patients = data.frame(t(rep("#4A235A", length(DETERIORO_CLUST))))
colnames(DETERIORO_patients)<- DETERIORO_CLUST


#NO DETERIORO
NO_DETERIORO_patients = data.frame(t(rep("#117864", length(NO_DETERIORO_CLUST))))
colnames(NO_DETERIORO_patients)<- NO_DETERIORO_CLUST

COLOR_ACF <- cbind(DETERIORO_patients,NO_DETERIORO_patients)
order_ACF <- union(names(DDclust_ACF[DDclust_ACF == 2]),names(DDclust_ACF[DDclust_ACF == 1]))
fviz_dend(hcintper_ACF, k = 2,  
          k_colors = c("blue", "green3"),
          label_cols =   as.vector(COLOR_ACF[,order_ACF]), cex = 0.6) 

n1 = length(intersect(file_patient_name_DETERIORO,names_1))
n2 = length(intersect(file_patient_name_DETERIORO,names_2))
n3 = length(intersect(file_patient_name_NO_DETERIORO,names_1))
n4 = length(intersect(file_patient_name_NO_DETERIORO,names_2))

conttingency_table <- data.frame("CLust1" = c(n1,n3), "Clust2" = c(n2,n4))
rownames(conttingency_table) <- c("DETERIORO","NO DETERIORO")


knitr::kable(conttingency_table, align = "lccrr")
CLust1 Clust2
DETERIORO 12 2
NO DETERIORO 40 14
conttingency_table_prop <- data.frame(c(n1,n3)/(n1+n3),c(n2,n4)/(n2+n4))
rownames(conttingency_table_prop) <- c("DETERIORO","NO DETERIORO")
colnames(conttingency_table_prop) <- c("Clust1","Clust2")

knitr::kable(conttingency_table_prop, align = "lccrr")
Clust1 Clust2
DETERIORO 0.2307692 0.125
NO DETERIORO 0.7692308 0.875

Random Forest: Discriminant TSCLust ACF

data_frame1_ACF = data.frame("CLUSTER" = DDclust_ACF)
data_frame2 = df_descriptive
data_frame_merge_ACF <-
  merge(data_frame1_ACF, data_frame2,                      by = 'row.names', all = TRUE)
data_frame_merge_ACF <- data_frame_merge_ACF[, 2:dim(data_frame_merge_ACF)[2]]
table(data_frame_merge_ACF$CLUSTER)
## 
##  1  2 
## 52 16
newMWMOTE_ACF <- oversample(data_frame_merge_ACF, ratio = 0.85, method = "MWMOTE", classAttr = "CLUSTER")
table(newMWMOTE_ACF$CLUSTER)
## 
##  1  2 
## 52 45
set.seed(123)
pos_1 = get_column_position(newMWMOTE_ACF, "CLUSTER")
pos_2 = get_column_position(newMWMOTE_ACF, "SNG")
col_names_factor <- names(newMWMOTE_ACF[pos_1:pos_2])
newMWMOTE_ACF[col_names_factor] <- lapply(newMWMOTE_ACF[col_names_factor] , factor)

RF_ACF <- randomForest(CLUSTER ~ ., data = newMWMOTE_ACF)
print(RF_ACF)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = newMWMOTE_ACF) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 11.34%
## Confusion matrix:
##    1  2 class.error
## 1 46  6   0.1153846
## 2  5 40   0.1111111

Importance

kable(importance(RF_ACF, type =2)[order(importance(RF_ACF, type =2), decreasing = TRUE),])
x
PESO 9.0982434
EDAD 8.1365895
SCORE_WOOD_DOWNES_24H 2.7665812
FR_8_16h 2.3716961
SCORE_CRUCES_INGRESO 2.3154848
SAPI_0_8h 2.1863742
LM 1.9513197
SEXO 1.8128066
ETIOLOGÍA 1.8077405
FLUJO2_0_8H 1.7721020
SUERO 1.6146053
SCORE_WOOD_DOWNES_INGRESO 1.5280813
FR_0_8h 1.5150593
FR_16_24h 1.3981804
FLUJO2_8_16h 1.1365348
EG 1.1326540
SAPI_16_24h 1.0171987
ENFERMEDAD_BASE 0.9563347
PREMATURIDAD 0.8635321
DERMATITIS 0.6792933
SAPI_8_16h 0.6783079
SNG 0.2822430
PALIVIZUMAB 0.2609165
ALERGIAS 0.1630979
DESNUTRICION 0.1508200
ALIMENTACION 0.1242604

Importance of first 50 ACF

data_frame1_ACF = data.frame("CLUSTER" = DDclust_ACF)
data_frame2 = data.frame(t(cuantiles_TS_HR_P1_ACF[c(2:51),]))
data_frame_merge_ACF <-
  merge(data_frame1_ACF, data_frame2,                      by = 'row.names', all = TRUE)
data_frame_merge_ACF <- data_frame_merge_ACF[, 2:dim(data_frame_merge_ACF)[2]]
set.seed(123)
data_frame_merge_ACF$CLUSTER <- as.factor(data_frame_merge_ACF$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge_ACF)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge_ACF) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 1.47%
## Confusion matrix:
##    1  2 class.error
## 1 52  0      0.0000
## 2  1 15      0.0625
plot(importance(RF_0, type =2), type = "h")

EUCL TSclust

DD_EUCL <- diss(datos, "EUCL")
hcintper_EUCL <- hclust(DD_EUCL, "ward.D2")
fviz_dend(hcintper_EUCL, palette = "jco",
          rect = TRUE, show_labels = FALSE, k = 2)

DDclust_EUCL <- cutree( hclust(DD_EUCL, "ward.D2"), k = 2)
fviz_cluster(list(data = t(datos), cluster = DDclust_EUCL))

fviz_silhouette(silhouette(DDclust_EUCL, DD_EUCL))
##   cluster size ave.sil.width
## 1       1   40          0.06
## 2       2   28          0.06

Contingency EUCL

DETERIORO_CLUST <- union(intersect(file_patient_name_DETERIORO,names_1),intersect(file_patient_name_DETERIORO,names_2))
NO_DETERIORO_CLUST <- union(intersect(file_patient_name_NO_DETERIORO,names_1),intersect(file_patient_name_NO_DETERIORO,names_2))
#DETERIORO
DETERIORO_patients = data.frame(t(rep("#4A235A", length(DETERIORO_CLUST))))
colnames(DETERIORO_patients)<- DETERIORO_CLUST


#NO DETERIORO
NO_DETERIORO_patients = data.frame(t(rep("#117864", length(NO_DETERIORO_CLUST))))
colnames(NO_DETERIORO_patients)<- NO_DETERIORO_CLUST

COLOR_EUCL <- cbind(DETERIORO_patients,NO_DETERIORO_patients)
order_EUCL <- union(names(DDclust_EUCL[DDclust_EUCL == 2]),names(DDclust_EUCL[DDclust_EUCL == 1]))
fviz_dend(hcintper_EUCL, k = 2,  
          k_colors = c("blue", "green3"),
          label_cols =   as.vector(COLOR_EUCL[,order_EUCL]), cex = 0.6) 

n1 = length(intersect(file_patient_name_DETERIORO,names_1))
n2 = length(intersect(file_patient_name_DETERIORO,names_2))
n3 = length(intersect(file_patient_name_NO_DETERIORO,names_1))
n4 = length(intersect(file_patient_name_NO_DETERIORO,names_2))

conttingency_table <- data.frame("CLust1" = c(n1,n3), "Clust2" = c(n2,n4))
rownames(conttingency_table) <- c("DETERIORO","NO DETERIORO")


knitr::kable(conttingency_table, align = "lccrr")
CLust1 Clust2
DETERIORO 9 5
NO DETERIORO 31 23
conttingency_table_prop <- data.frame(c(n1,n3)/(n1+n3),c(n2,n4)/(n2+n4))
rownames(conttingency_table_prop) <- c("DETERIORO","NO DETERIORO")
colnames(conttingency_table_prop) <- c("Clust1","Clust2")

knitr::kable(conttingency_table_prop, align = "lccrr")
Clust1 Clust2
DETERIORO 0.225 0.1785714
NO DETERIORO 0.775 0.8214286

Random Forest: Discriminant TSCLust EUCL

data_frame1_EUCL = data.frame("CLUSTER" = DDclust_EUCL)
data_frame2 = df_descriptive
data_frame_merge_EUCL <-
  merge(data_frame1_EUCL, data_frame2,                      by = 'row.names', all = TRUE)
data_frame_merge_EUCL <- data_frame_merge_EUCL[, 2:dim(data_frame_merge_EUCL)[2]]
table(data_frame_merge_EUCL$CLUSTER)
## 
##  1  2 
## 40 28
newMWMOTE_EUCL <- oversample(data_frame_merge_EUCL, ratio = 0.85, method = "MWMOTE", classAttr = "CLUSTER")
table(newMWMOTE_EUCL$CLUSTER)
## 
##  1  2 
## 40 34
set.seed(123)
pos_1 = get_column_position(newMWMOTE_EUCL, "CLUSTER")
pos_2 = get_column_position(newMWMOTE_EUCL, "SNG")
col_names_factor <- names(newMWMOTE_EUCL[pos_1:pos_2])
newMWMOTE_EUCL[col_names_factor] <- lapply(newMWMOTE_EUCL[col_names_factor] , factor)

RF_EUCL <- randomForest(CLUSTER ~ ., data = newMWMOTE_EUCL)
print(RF_EUCL)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = newMWMOTE_EUCL) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 40.54%
## Confusion matrix:
##    1  2 class.error
## 1 26 14   0.3500000
## 2 16 18   0.4705882

Importance

kable(importance(RF_EUCL, type =2)[order(importance(RF_EUCL, type =2), decreasing = TRUE),])
x
PESO 3.0341244
EDAD 2.9404749
FR_8_16h 2.9348776
FR_0_8h 2.9160816
SCORE_CRUCES_INGRESO 2.6767961
SAPI_16_24h 2.4723817
SAPI_0_8h 2.1726419
SCORE_WOOD_DOWNES_INGRESO 2.0431108
SCORE_WOOD_DOWNES_24H 2.0388233
FLUJO2_8_16h 1.8820088
FR_16_24h 1.8161756
EG 1.6809786
FLUJO2_0_8H 1.5951335
SAPI_8_16h 0.9923468
LM 0.8885485
ETIOLOGÍA 0.6693241
SEXO 0.5875554
SNG 0.4416946
SUERO 0.3896287
ENFERMEDAD_BASE 0.3696867
ALIMENTACION 0.3623640
DESNUTRICION 0.3403335
PREMATURIDAD 0.3370544
ALERGIAS 0.3032729
PALIVIZUMAB 0.2000761
DERMATITIS 0.1698389

Importance of the TS-data

data_frame1_EUCL = data.frame("CLUSTER" = DDclust_EUCL)
data_frame2 = data.frame(t(datos))
data_frame_merge_EUCL <-
  merge(data_frame1_EUCL, data_frame2,                      by = 'row.names', all = TRUE)
data_frame_merge_EUCL <- data_frame_merge_EUCL[, 2:dim(data_frame_merge_ACF)[2]]
set.seed(123)
data_frame_merge_EUCL$CLUSTER <- as.factor(data_frame_merge_EUCL$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge_EUCL)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge_EUCL) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 36.76%
## Confusion matrix:
##    1  2 class.error
## 1 30 10   0.2500000
## 2 15 13   0.5357143
plot(importance(RF_0, type =2), type = "h")

PER TSclust

DD_PER <- diss(datos, "PER")
hcintper_PER <- hclust(DD_PER, "ward.D2")
fviz_dend(hcintper_PER, palette = "jco",
          rect = TRUE, show_labels = FALSE, k = 2)

DDclust_PER <- cutree( hclust(DD_PER, "ward.D2"), k = 2)
fviz_cluster(list(data = t(datos), cluster = DDclust_PER))

fviz_silhouette(silhouette(DDclust_PER, DD_PER))
##   cluster size ave.sil.width
## 1       1   54          0.28
## 2       2   14          0.28

Contingency PER

DETERIORO_CLUST <- union(intersect(file_patient_name_DETERIORO,names_1),intersect(file_patient_name_DETERIORO,names_2))
NO_DETERIORO_CLUST <- union(intersect(file_patient_name_NO_DETERIORO,names_1),intersect(file_patient_name_NO_DETERIORO,names_2))
#DETERIORO
DETERIORO_patients = data.frame(t(rep("#4A235A", length(DETERIORO_CLUST))))
colnames(DETERIORO_patients)<- DETERIORO_CLUST


#NO DETERIORO
NO_DETERIORO_patients = data.frame(t(rep("#117864", length(NO_DETERIORO_CLUST))))
colnames(NO_DETERIORO_patients)<- NO_DETERIORO_CLUST

COLOR_PER <- cbind(NO_DETERIORO_patients,DETERIORO_patients)
order_PER <- union(names(DDclust_PER[DDclust_PER == 2]),names(DDclust_PER[DDclust_PER == 1]))
fviz_dend(hcintper_PER, k = 2,  
          k_colors = c("blue", "green3"),
          label_cols =   as.vector(COLOR_PER[,order_PER]), cex = 0.6) 

n1 = length(intersect(file_patient_name_DETERIORO,names_1))
n2 = length(intersect(file_patient_name_DETERIORO,names_2))
n3 = length(intersect(file_patient_name_NO_DETERIORO,names_1))
n4 = length(intersect(file_patient_name_NO_DETERIORO,names_2))

conttingency_table <- data.frame("CLust1" = c(n1,n3), "Clust2" = c(n2,n4))
rownames(conttingency_table) <- c("DETERIORO","NO DETERIORO")


knitr::kable(conttingency_table, align = "lccrr")
CLust1 Clust2
DETERIORO 13 1
NO DETERIORO 41 13
conttingency_table_prop <- data.frame(c(n1,n3)/(n1+n3),c(n2,n4)/(n2+n4))
rownames(conttingency_table_prop) <- c("DETERIORO","NO DETERIORO")
colnames(conttingency_table_prop) <- c("Clust1","Clust2")

knitr::kable(conttingency_table_prop, align = "lccrr")
Clust1 Clust2
DETERIORO 0.2407407 0.0714286
NO DETERIORO 0.7592593 0.9285714

Random Forest: Discriminant TSCLust PER

data_frame1_PER = data.frame("CLUSTER" = DDclust_PER)
data_frame2 = df_descriptive
data_frame_merge_PER <-
  merge(data_frame1_PER, data_frame2,                      by = 'row.names', all = TRUE)
data_frame_merge_PER <- data_frame_merge_PER[, 2:dim(data_frame_merge_PER)[2]]
table(data_frame_merge_PER$CLUSTER)
## 
##  1  2 
## 54 14
newMWMOTE_PER <- oversample(data_frame_merge_PER, ratio = 0.85, method = "MWMOTE", classAttr = "CLUSTER")
table(newMWMOTE_PER$CLUSTER)
## 
##  1  2 
## 54 46
set.seed(123)
pos_1 = get_column_position(newMWMOTE_PER, "CLUSTER")
pos_2 = get_column_position(newMWMOTE_PER, "SNG")
col_names_factor <- names(newMWMOTE_PER[pos_1:pos_2])
newMWMOTE_PER[col_names_factor] <- lapply(newMWMOTE_PER[col_names_factor] , factor)
RF_PER <- randomForest(CLUSTER ~ ., data = newMWMOTE_PER)
print(RF_PER)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = newMWMOTE_PER) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 13%
## Confusion matrix:
##    1  2 class.error
## 1 49  5  0.09259259
## 2  8 38  0.17391304

Importance

kable(importance(RF_PER, type =2)[order(importance(RF_PER, type =2), decreasing = TRUE),])
x
PESO 6.5652544
EDAD 5.9253091
SCORE_WOOD_DOWNES_INGRESO 3.4485809
SAPI_0_8h 3.2804061
SEXO 3.1184644
SUERO 2.6352030
SAPI_8_16h 2.3921213
FR_0_8h 2.2575836
ETIOLOGÍA 2.1633623
FR_16_24h 2.1364423
FR_8_16h 1.9913936
SCORE_WOOD_DOWNES_24H 1.8440330
FLUJO2_0_8H 1.6464318
FLUJO2_8_16h 1.6095209
LM 1.6067669
SCORE_CRUCES_INGRESO 1.2979480
EG 1.1161027
SAPI_16_24h 0.9595157
ENFERMEDAD_BASE 0.6635130
PREMATURIDAD 0.6338663
SNG 0.4191274
DERMATITIS 0.3744220
DESNUTRICION 0.3711575
ALERGIAS 0.3305712
ALIMENTACION 0.2160089
PALIVIZUMAB 0.1788668