Libraries

library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)
library(dplyr)
library(naniar)
library(randomForest) # Random forest classification
library(TSclust)

## KNN imputation
library(caret)
library(RANN)

# Clustering
library(factoextra)    # clustering visualization
library(cluster)       # clustering algorithms
library(dendextend)    # for comparing two dendrograms
library(corrplot)      # correlation between dendrograms
library(tidyverse)     # data manipulation
#library(NbClust)       #determine optimal no. of clusters  [not working...]

Reading Data

FC_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/FC_all_valid_patients_input_P1.xlsx", sheet = "FC_all_valid_patients_input_P1" ))
SatO2_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/SatO2_all_valid_patients_input_P1.xlsx", sheet = "St2_all_valid_patients_input_P1" ))

## Deterioration (DETERIORO) and non-deterioration (NO_DETERIORO) groups
file_patient_name_NO_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_NO_DETERIORO.xlsx"))
file_patient_name_NO_DETERIORO <- file_patient_name_NO_DETERIORO$x
file_patient_name_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_DETERIORO.xlsx"))
file_patient_name_DETERIORO <- file_patient_name_DETERIORO$x

Data creation

The variables are standardized: each is transformed to have mean zero and standard deviation one.
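
As a quick illustration, base R's scale() performs exactly this column-wise transformation. This is a minimal sketch on a made-up matrix, not the study data:

# Toy example (simulated values, for illustration only)
set.seed(1)
toy <- matrix(rnorm(20, mean = 50, sd = 10), nrow = 5)
toy_std <- scale(toy)        # centre each column to mean 0, scale to sd 1
round(colMeans(toy_std), 10) # all (numerically) zero
apply(toy_std, 2, sd)        # all equal to 1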

# We add the variable time
M = dim(FC_all_valid_patients_input_P1)[1] # 1441
FC_all_valid_patients_input_P1$time <- c(1:M)
SatO2_all_valid_patients_input_P1$time <- c(1:M)

Create data frames with the ACF and CCF values

dimension_col <- dim(FC_all_valid_patients_input_P1)[2] - 1 # exclude the time column
dimension_row <- 1441 # lag.max + 1 (ACF values for lags 0..lag.max)

# Heart Rate
FC_all_valid_patients_input_P1_ACF <- data.frame(matrix(nrow = dimension_row, ncol = dimension_col))
colnames(FC_all_valid_patients_input_P1_ACF) <- names(FC_all_valid_patients_input_P1)[1:dimension_col]
# SatO2
SatO2_all_valid_patients_input_P1_ACF <- data.frame(matrix(nrow = dimension_row, ncol = dimension_col))
colnames(SatO2_all_valid_patients_input_P1_ACF) <- names(SatO2_all_valid_patients_input_P1)[1:dimension_col]
# CCF SatO2 and Heart Rate
All_valid_patients_input_P1_CCF <- data.frame(matrix(nrow = 2*dimension_row-1, ncol = dimension_col))
colnames(All_valid_patients_input_P1_CCF) <- names(SatO2_all_valid_patients_input_P1)[1:dimension_col]
for (i in names(FC_all_valid_patients_input_P1_ACF)) {
  acf_result_FC <- acf(FC_all_valid_patients_input_P1[[i]], lag.max = (dimension_row - 1), plot = FALSE)
  FC_all_valid_patients_input_P1_ACF[, i] <- acf_result_FC$acf
  
  acf_result_SatO2 <- acf(SatO2_all_valid_patients_input_P1[[i]], lag.max = (dimension_row - 1), plot = FALSE)
  SatO2_all_valid_patients_input_P1_ACF[, i] <- acf_result_SatO2$acf
  
  
  ccf_result <- ccf(FC_all_valid_patients_input_P1[,i],SatO2_all_valid_patients_input_P1[,i],lag.max = (dimension_row - 1), plot = FALSE)
  All_valid_patients_input_P1_CCF[,i] <- ccf_result$acf
  
}
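
As a sanity check of these dimensions (a sketch on simulated series, not the patient data): acf() returns lag.max + 1 values (lags 0 to lag.max), while ccf() returns 2 * lag.max + 1 values (lags -lag.max to lag.max), which is why the CCF data frame needs 2*dimension_row - 1 rows:

# Simulated series of the same length, used only to check output sizes
x <- rnorm(1441)
y <- rnorm(1441)
length(acf(x, lag.max = 1440, plot = FALSE)$acf)    # 1441
length(ccf(x, y, lag.max = 1440, plot = FALSE)$acf) # 2881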

Descriptive Data for Discriminant Analysis

valid_patients_P1 <- data.frame(read_xlsx("../data/clean-data/valid_patients_P1.xlsx"))
valid_patients_P1 <- valid_patients_P1$x

file_patient_name <- data.frame(read_csv("../data/clean-data/file_patient_name.csv", show_col_types = FALSE))
file_patient_name <- file_patient_name$x
  
df1 <- data.frame(read_xlsx("../data/clean-data/descriptive-data/descriptive_data.xlsx"))
rownames(df1) <- file_patient_name
df1 <- df1[valid_patients_P1,]

Selecting only the complete variables

row_names_info <- rownames(t(df1))
missing_df1= as.data.frame(miss_case_summary(as.data.frame(t(df1))))
missing_df1$case = row_names_info[missing_df1$case]
colnames(missing_df1) <- c("Variable", "N_Miss","PCT_Miss")

# summary(df1)
df1_NO_NA = df1[,missing_df1[missing_df1$N_Miss == 0,]$Variable]
# It is also important to drop the identifier variables:
# "ID", "NHC", "INICIALES"
df1_NO_NA <- subset(df1_NO_NA, select = - c(ID,NHC,INICIALES))

Clustering Methods

ACF Heart Rate

datos <- t(FC_all_valid_patients_input_P1_ACF)
datos_norm <- scale(datos)
Distance matrix
distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)

Visualizing the distance matrix

fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) + 
   theme(axis.text.x=element_blank(), #remove x axis labels
        axis.ticks.x=element_blank(), #remove x axis ticks
        axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
        )

Hclust

We first check which hierarchical clustering method identifies the strongest clustering structure. Ward's method gives the strongest structure of the four linkage methods assessed, so we use it and visualize the clustered data with a red rectangle border for k = 2.
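
The linkage comparison itself is not shown here; a minimal sketch of how it is typically done (agglomerative coefficients via cluster::agnes, assuming these are the four linkage methods meant) would be:

# Agglomerative coefficient per linkage method:
# values closer to 1 indicate a stronger clustering structure.
linkage_methods <- c("average", "single", "complete", "ward")
sapply(linkage_methods, function(m) cluster::agnes(datos_norm, method = m)$ac)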

res.hc <- datos_norm %>%
          eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
          rect = TRUE, show_labels = FALSE)

To evaluate the result, we compute the silhouette index.
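
As a cross-check (assuming the same Euclidean distance on datos_norm used above), the index can also be computed directly with cluster::silhouette():

# Average silhouette width per cluster, computed directly
sil <- cluster::silhouette(res.hc$cluster, dist(datos_norm))
summary(sil)$clus.avg.widths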

fviz_silhouette(res.hc)
##   cluster size ave.sil.width
## 1       1    4          0.54
## 2       2   64          0.30

clust_1 <- hclust(distance, method = "ward.D2")
fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))

Random Forest: Hierarchical clustering

data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 5.88%
## Confusion matrix:
##   1  2 class.error
## 1 0  4           1
## 2 0 64           0

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
EDAD 1.1949823
PESO 0.8982389
FR_0_8h 0.5938739
SCORE_WOOD_DOWNES_24H 0.5852357
RADIOGRAFIA 0.3907127
BRONCODILATADORES 0.3893944
EG 0.3750563
SCORE_WOOD_DOWNES_INGRESO 0.3619181
ETIOLOGIA 0.2618195
SAPI_0_8h 0.2545372
DIAS_OAF 0.2285301
CORTICOIDES 0.2140864
ANTIBIOTICO 0.1934095
ANALITICA 0.1931057
SUERO 0.1417103
ALERGIAS 0.1257025
SEXO 0.1245906
DETERIORO 0.0989267
OAF.si1.no0 0.0965132
OAF 0.0930055
PALIVIZUMAB 0.0872935
PREMATURIDAD 0.0869605
ENFERMEDAD_BASE 0.0860095
LM 0.0846746
DESNUTRICION 0.0736334
SNG 0.0573906
DERMATITIS 0.0547714
ALIMENTACION 0.0132810
APNEA 0.0081667
UCIP 0.0064788

K-means

k2_kmeans <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)

# k-means does not produce a dendrogram
fviz_silhouette(k2_kmeans)
##   cluster size ave.sil.width
## 1       1   14          0.18
## 2       2   54          0.20

fviz_cluster(list(data = datos_norm, cluster = k2_kmeans$cluster))

k2 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 5, graph = FALSE)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

set.seed(123)
fviz_nbclust(datos_norm, kmeans, method = "wss")

Average Silhouette Method

fviz_nbclust(datos_norm, kmeans, method = "silhouette")

Random Forest: Discriminant K-means

data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 23.53%
## Confusion matrix:
##   1  2 class.error
## 1 0 14  1.00000000
## 2 2 52  0.03703704

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
EDAD 3.3839230
FR_0_8h 2.6277133
PESO 2.2223809
SCORE_WOOD_DOWNES_INGRESO 2.1795198
SCORE_WOOD_DOWNES_24H 1.5009898
EG 1.1637664
SAPI_0_8h 1.1305895
LM 0.8675757
SUERO 0.7895493
SEXO 0.7156952
DIAS_OAF 0.4217511
ETIOLOGIA 0.4152356
RADIOGRAFIA 0.4104377
ANALITICA 0.4042494
ENFERMEDAD_BASE 0.3928956
BRONCODILATADORES 0.3591701
PREMATURIDAD 0.3368451
ALIMENTACION 0.3060370
ANTIBIOTICO 0.2392297
OAF.si1.no0 0.2089416
PALIVIZUMAB 0.1976535
DESNUTRICION 0.1926680
ALERGIAS 0.1757072
DETERIORO 0.1561031
SNG 0.1558060
APNEA 0.1552365
CORTICOIDES 0.1359795
OAF 0.1301501
DERMATITIS 0.0882573
UCIP 0.0759790

TSclust

DD <- diss(datos, "INT.PER", normalize=TRUE)
hcintper <- hclust(DD, "ward.D2")
fviz_dend(hcintper, palette = "jco",
          rect = TRUE, show_labels = FALSE, k=2)

DDclust <- cutree(hcintper, k = 2)
fviz_cluster(list(data = datos_norm, cluster = DDclust))

Random Forest: Discriminant TSclust

data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 29.41%
## Confusion matrix:
##    1  2 class.error
## 1 38  7   0.1555556
## 2 13 10   0.5652174

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
EDAD 5.3634741
PESO 4.2317585
SCORE_WOOD_DOWNES_INGRESO 2.4385988
FR_0_8h 2.2780070
EG 2.1902793
SCORE_WOOD_DOWNES_24H 1.7794489
SAPI_0_8h 0.9796765
DIAS_OAF 0.9270895
ETIOLOGIA 0.7591922
PREMATURIDAD 0.6799956
SEXO 0.6456424
SUERO 0.6346292
APNEA 0.5650904
LM 0.5436275
RADIOGRAFIA 0.5408403
SNG 0.5376492
ANTIBIOTICO 0.4984785
ENFERMEDAD_BASE 0.4465248
ANALITICA 0.4376758
DESNUTRICION 0.3809640
OAF.si1.no0 0.3523257
DETERIORO 0.3342896
BRONCODILATADORES 0.3041642
OAF 0.2585900
ALERGIAS 0.2293939
DERMATITIS 0.1738660
PALIVIZUMAB 0.1712616
UCIP 0.1412915
ALIMENTACION 0.1313147
CORTICOIDES 0.1148715

Imp_ACF_FC

Hclust

data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 37
## 
##         OOB estimate of  error rate: 1.47%
## Confusion matrix:
##   1  2 class.error
## 1 3  1        0.25
## 2 0 64        0.00
plot(importance(RF_0, type =2), type = "h" )

k-means

data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 37
## 
##         OOB estimate of  error rate: 7.35%
## Confusion matrix:
##   1  2 class.error
## 1 9  5   0.3571429
## 2 0 54   0.0000000
plot(importance(RF_0, type =2), type = "h" )

TSclust

data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 37
## 
##         OOB estimate of  error rate: 4.41%
## Confusion matrix:
##    1  2 class.error
## 1 43  2  0.04444444
## 2  1 22  0.04347826
plot(importance(RF_0, type =2), type = "h" )

ACF SatO2

datos <- t(SatO2_all_valid_patients_input_P1_ACF)
datos_norm <- scale(datos)
Distance matrix
distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)

Visualizing the distance matrix

fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) + 
   theme(axis.text.x=element_blank(), #remove x axis labels
        axis.ticks.x=element_blank(), #remove x axis ticks
        axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
        )

Hclust

We first check which hierarchical clustering method identifies the strongest clustering structure; Ward's method gives the strongest structure of the four linkage methods assessed.

res.hc <- datos_norm %>%
          eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
          rect = TRUE, show_labels = FALSE)

The clustered data are visualized with a red rectangle border for k = 2.

To evaluate the result, we compute the silhouette index.

fviz_silhouette(res.hc)
##   cluster size ave.sil.width
## 1       1   64          0.47
## 2       2    4          0.48

clust_1 <- hclust(distance, method = "ward.D2")
fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))

Random Forest: Hierarchical clustering

data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 5.88%
## Confusion matrix:
##    1 2 class.error
## 1 64 0           0
## 2  4 0           1

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
EG 1.5012645
SCORE_WOOD_DOWNES_24H 0.9403636
DIAS_OAF 0.6428180
PESO 0.5593814
EDAD 0.4503682
FR_0_8h 0.3248087
CORTICOIDES 0.3087738
ALERGIAS 0.2687718
ENFERMEDAD_BASE 0.2305884
SCORE_WOOD_DOWNES_INGRESO 0.1861729
SAPI_0_8h 0.1831349
DETERIORO 0.1796514
ALIMENTACION 0.1790532
SEXO 0.1627955
BRONCODILATADORES 0.1568883
OAF.si1.no0 0.1521014
PREMATURIDAD 0.1449579
PALIVIZUMAB 0.1393624
UCIP 0.1363409
ETIOLOGIA 0.1174646
LM 0.1053120
SUERO 0.0915257
ANALITICA 0.0845391
OAF 0.0831304
DESNUTRICION 0.0819021
RADIOGRAFIA 0.0696723
SNG 0.0428662
ANTIBIOTICO 0.0307846
DERMATITIS 0.0171378
APNEA 0.0077333

Importance of the ACF SatO2

data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 37
## 
##         OOB estimate of  error rate: 5.88%
## Confusion matrix:
##    1 2 class.error
## 1 64 0           0
## 2  4 0           1
plot(importance(RF_0, type =2), type = "h" )

K-means

k2_kmeans <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)

# k-means does not produce a dendrogram
fviz_silhouette(k2_kmeans)
##   cluster size ave.sil.width
## 1       1   59          0.44
## 2       2    9          0.04

fviz_cluster(list(data = datos_norm, cluster = k2_kmeans$cluster))

k2 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 5, graph = FALSE)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

set.seed(123)
fviz_nbclust(datos_norm, kmeans, method = "wss")

Average Silhouette Method

fviz_nbclust(datos_norm, kmeans, method = "silhouette")

Random Forest: Discriminant K-means

data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 13.24%
## Confusion matrix:
##    1 2 class.error
## 1 59 0           0
## 2  9 0           1

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
PESO 1.8637411
EG 1.5617451
SCORE_WOOD_DOWNES_INGRESO 1.4687705
EDAD 1.4116161
SCORE_WOOD_DOWNES_24H 0.9236763
FR_0_8h 0.9144307
DESNUTRICION 0.8350052
SEXO 0.7414143
ENFERMEDAD_BASE 0.6706701
BRONCODILATADORES 0.5543442
SAPI_0_8h 0.5116409
DIAS_OAF 0.5000478
CORTICOIDES 0.2905115
PALIVIZUMAB 0.2834190
ANALITICA 0.2603768
PREMATURIDAD 0.2564485
SNG 0.2382675
ALERGIAS 0.2196853
LM 0.2117268
APNEA 0.2068648
ETIOLOGIA 0.2063904
RADIOGRAFIA 0.1686067
SUERO 0.1666663
DETERIORO 0.1610429
ANTIBIOTICO 0.1590619
OAF.si1.no0 0.1344170
ALIMENTACION 0.1247312
UCIP 0.1215508
OAF 0.0761842
DERMATITIS 0.0546917

TSclust

DD <- diss(datos, "INT.PER", normalize=TRUE)
hcintper <- hclust(DD, "ward.D2")
fviz_dend(hcintper, palette = "jco",
          rect = TRUE, show_labels = FALSE, k=2)

DDclust <- cutree(hcintper, k = 2)
fviz_cluster(list(data = datos_norm, cluster = DDclust))

Random Forest: Discriminant TSclust

data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 39.71%
## Confusion matrix:
##    1  2 class.error
## 1 24 14   0.3684211
## 2 13 17   0.4333333

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
PESO 5.5962954
EDAD 4.6522326
FR_0_8h 3.6358800
SCORE_WOOD_DOWNES_24H 2.4266129
EG 1.8058148
SCORE_WOOD_DOWNES_INGRESO 1.7246427
SAPI_0_8h 1.6643948
DIAS_OAF 1.1632593
SEXO 1.0733961
LM 0.9782990
ETIOLOGIA 0.9413451
DESNUTRICION 0.6427934
BRONCODILATADORES 0.6379262
ENFERMEDAD_BASE 0.5818366
RADIOGRAFIA 0.5577426
SUERO 0.4173942
ANTIBIOTICO 0.3757280
PREMATURIDAD 0.3555245
ALERGIAS 0.3546761
DERMATITIS 0.3505808
ANALITICA 0.3231879
OAF 0.2962828
OAF.si1.no0 0.2857509
DETERIORO 0.2747706
SNG 0.2633840
APNEA 0.2409242
ALIMENTACION 0.2031993
PALIVIZUMAB 0.1994680
UCIP 0.1566229
CORTICOIDES 0.0895820

Imp_ACF_SatO2

Hclust

data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 37
## 
##         OOB estimate of  error rate: 5.88%
## Confusion matrix:
##    1 2 class.error
## 1 64 0           0
## 2  4 0           1
plot(importance(RF_0, type =2), type = "h" )

k-means

data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 37
## 
##         OOB estimate of  error rate: 13.24%
## Confusion matrix:
##    1 2 class.error
## 1 59 0           0
## 2  9 0           1
plot(importance(RF_0, type =2), type = "h" )

TSclust

data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = t(SatO2_all_valid_patients_input_P1_ACF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 37
## 
##         OOB estimate of  error rate: 16.18%
## Confusion matrix:
##    1  2 class.error
## 1 32  6   0.1578947
## 2  5 25   0.1666667
plot(importance(RF_0, type =2), type = "h" )

CCF

datos <- t(All_valid_patients_input_P1_CCF)
datos_norm <- scale(datos)
Distance matrix
distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)

Visualizing the distance matrix

fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) + 
   theme(axis.text.x=element_blank(), #remove x axis labels
        axis.ticks.x=element_blank(), #remove x axis ticks
        axis.text.y=element_blank(),  #remove y axis labels
        axis.ticks.y=element_blank()  #remove y axis ticks
        )

Hclust

We first check which hierarchical clustering method identifies the strongest clustering structure; Ward's method gives the strongest structure of the four linkage methods assessed.

res.hc <- datos_norm %>%
          eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
          rect = TRUE, show_labels = FALSE)

The clustered data are visualized with a red rectangle border for k = 2.

To evaluate the result, we compute the silhouette index.

fviz_silhouette(res.hc)
##   cluster size ave.sil.width
## 1       1   21          0.13
## 2       2   47          0.02

clust_1 <- hclust(distance, method = "ward.D2")
fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))

Random Forest: Hierarchical clustering

data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 38.24%
## Confusion matrix:
##   1  2 class.error
## 1 0 21    1.000000
## 2 5 42    0.106383

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
EDAD 3.3047062
FR_0_8h 3.2736285
PESO 2.8852493
EG 2.1664523
SCORE_WOOD_DOWNES_INGRESO 2.0145034
SCORE_WOOD_DOWNES_24H 2.0127606
SAPI_0_8h 1.3711743
SEXO 1.3051146
ETIOLOGIA 0.9854514
LM 0.7822436
SUERO 0.7786464
RADIOGRAFIA 0.7207818
ENFERMEDAD_BASE 0.7177285
PREMATURIDAD 0.5016050
DIAS_OAF 0.4802313
ANALITICA 0.4282661
DESNUTRICION 0.3967778
SNG 0.3736544
ANTIBIOTICO 0.3525368
APNEA 0.3450846
OAF 0.3350206
OAF.si1.no0 0.3121337
DERMATITIS 0.3045369
BRONCODILATADORES 0.2984464
DETERIORO 0.2714084
ALERGIAS 0.2613869
ALIMENTACION 0.2092739
PALIVIZUMAB 0.2014913
UCIP 0.1578926
CORTICOIDES 0.1460778

K-means

k2_kmeans <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)

# k-means does not produce a dendrogram
fviz_silhouette(k2_kmeans)
##   cluster size ave.sil.width
## 1       1   27          0.05
## 2       2   41          0.09

fviz_cluster(list(data = datos_norm, cluster = k2_kmeans$cluster))

k2 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>%
          eclust(FUNcluster = "kmeans", k = 5, graph = FALSE)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

set.seed(123)
fviz_nbclust(datos_norm, kmeans, method = "wss")

Average Silhouette Method

fviz_nbclust(datos_norm, kmeans, method = "silhouette")

Random Forest: Discriminant K-means

data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge, importance = TRUE)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 44.12%
## Confusion matrix:
##   1  2 class.error
## 1 5 22   0.8148148
## 2 8 33   0.1951220

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
PESO 3.8807871
EDAD 3.5691942
SCORE_WOOD_DOWNES_INGRESO 3.2405318
FR_0_8h 2.7851198
SCORE_WOOD_DOWNES_24H 2.7350443
EG 1.8709235
SAPI_0_8h 1.4447660
ENFERMEDAD_BASE 1.1207018
LM 1.0322701
RADIOGRAFIA 0.8331254
DIAS_OAF 0.7664236
ANALITICA 0.7369749
SEXO 0.7120522
ETIOLOGIA 0.6825440
SUERO 0.6081001
ANTIBIOTICO 0.5786599
APNEA 0.5530846
SNG 0.4987985
DESNUTRICION 0.4672756
BRONCODILATADORES 0.4225544
PREMATURIDAD 0.3525914
DETERIORO 0.3421964
ALERGIAS 0.3303123
DERMATITIS 0.3130308
OAF.si1.no0 0.2736918
ALIMENTACION 0.2460817
CORTICOIDES 0.2129243
UCIP 0.1981695
OAF 0.1870470
PALIVIZUMAB 0.1698971

TSclust

DD <- diss(datos, "INT.PER", normalize=TRUE)
hcintper <- hclust(DD, "ward.D2")
fviz_dend(hcintper, palette = "jco",
          rect = TRUE, show_labels = FALSE, k = 2)

DDclust <- cutree(hcintper, k = 2)
fviz_cluster(list(data = datos_norm, cluster = DDclust))

Random Forest: Discriminant TSclust

data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = df1_NO_NA
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 36.76%
## Confusion matrix:
##    1  2 class.error
## 1 23 13   0.3611111
## 2 12 20   0.3750000

Importance

kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable MeanDecreaseGini
PESO 5.8487435
EDAD 5.6466244
FR_0_8h 2.6797139
EG 2.1614781
SCORE_WOOD_DOWNES_INGRESO 2.1283307
SEXO 1.8802123
SCORE_WOOD_DOWNES_24H 1.6708889
SAPI_0_8h 1.0632000
ETIOLOGIA 0.7724720
DIAS_OAF 0.6940631
LM 0.6578771
RADIOGRAFIA 0.5472899
OAF.si1.no0 0.5429020
ANALITICA 0.5324897
ENFERMEDAD_BASE 0.5303762
DETERIORO 0.5155704
PREMATURIDAD 0.4883636
OAF 0.4759788
SUERO 0.4710068
DESNUTRICION 0.4434190
BRONCODILATADORES 0.3946798
ALIMENTACION 0.3864052
SNG 0.3381893
ALERGIAS 0.3280936
ANTIBIOTICO 0.3235838
APNEA 0.2758255
CORTICOIDES 0.1959470
PALIVIZUMAB 0.1850103
DERMATITIS 0.1725256
UCIP 0.1054525

Imp_CCF

Hclust

data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(All_valid_patients_input_P1_CCF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 53
## 
##         OOB estimate of  error rate: 14.71%
## Confusion matrix:
##    1  2 class.error
## 1 14  7  0.33333333
## 2  3 44  0.06382979
plot(importance(RF_0, type =2), type = "h" )

k-means

data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = t(All_valid_patients_input_P1_CCF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 53
## 
##         OOB estimate of  error rate: 10.29%
## Confusion matrix:
##    1  2 class.error
## 1 22  5  0.18518519
## 2  2 39  0.04878049
plot(importance(RF_0, type =2), type = "h" )

TSclust

data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = t(All_valid_patients_input_P1_CCF)
data_frame_merge <- merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
## 
## Call:
##  randomForest(formula = CLUSTER ~ ., data = data_frame_merge) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 53
## 
##         OOB estimate of  error rate: 23.53%
## Confusion matrix:
##    1  2 class.error
## 1 25 11   0.3055556
## 2  5 27   0.1562500
plot(importance(RF_0, type =2), type = "h" )