Libraries
library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)
library(dplyr)
library(naniar)
library(randomForest) # Random forest classification
library(TSclust)
## Knn imputation
library(caret)
library(RANN)
# Clustering
library(factoextra)  # clustering visualization
library(cluster)     # clustering algorithms
library(dendextend)  # for comparing two dendrograms
library(corrplot)    # correlation between dendrograms
library(tidyverse)   # data manipulation
#library(NbClust) #determine optimal no. of clusters [not working...]
Reading Data
FC_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/FC_all_valid_patients_input_P1.xlsx", sheet = "FC_all_valid_patients_input_P1" ))
SatO2_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/SatO2_all_valid_patients_input_P1.xlsx", sheet = "St2_all_valid_patients_input_P1" ))
## Patients with deterioration (DETERIORO) and without (NO_DETERIORO)
file_patient_name_NO_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_NO_DETERIORO.xlsx"))
file_patient_name_NO_DETERIORO <- file_patient_name_NO_DETERIORO$x
file_patient_name_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_DETERIORO.xlsx"))
file_patient_name_DETERIORO <- file_patient_name_DETERIORO$x
We want each variable to have mean 0 and standard deviation 1. Standardization consists of transforming each variable by subtracting its mean and dividing by its standard deviation.
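As a minimal illustration (the vector x below is hypothetical), base R's scale() performs exactly this transformation:
x <- c(120, 135, 150, 160, 180)           # hypothetical heart-rate values
manual <- (x - mean(x)) / sd(x)           # subtract the mean, divide by the standard deviation
all.equal(as.numeric(scale(x)), manual)   # TRUE: scale() gives the same result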
# We add the variable time
M = dim(FC_all_valid_patients_input_P1)[1] # 1441
FC_all_valid_patients_input_P1$time <- c(1:M)
SatO2_all_valid_patients_input_P1$time <- c(1:M)
dimension_col <- dim(FC_all_valid_patients_input_P1)[2] - 1
dimension_row <- 1441 # number of ACF values per series: lags 0 to lag.max, with lag.max = 1440
# Heart Rate
FC_all_valid_patients_input_P1_ACF <- data.frame(matrix(nrow = dimension_row, ncol = dimension_col))
colnames(FC_all_valid_patients_input_P1_ACF) <- names(FC_all_valid_patients_input_P1)[1:dimension_col]
# SatO2
SatO2_all_valid_patients_input_P1_ACF <- data.frame(matrix(nrow = dimension_row, ncol = dimension_col))
colnames(SatO2_all_valid_patients_input_P1_ACF) <- names(SatO2_all_valid_patients_input_P1)[1:dimension_col]
# CCF SatO2 and Heart Rate
All_valid_patients_input_P1_CCF <- data.frame(matrix(nrow = 2*dimension_row-1, ncol = dimension_col))
colnames(All_valid_patients_input_P1_CCF) <- names(SatO2_all_valid_patients_input_P1)[1:dimension_col]
for (i in names(FC_all_valid_patients_input_P1_ACF)) {
  acf_result_FC <- acf(FC_all_valid_patients_input_P1[[i]], lag.max = (dimension_row - 1), plot = FALSE)
  FC_all_valid_patients_input_P1_ACF[, i] <- acf_result_FC$acf
  acf_result_SatO2 <- acf(SatO2_all_valid_patients_input_P1[[i]], lag.max = (dimension_row - 1), plot = FALSE)
  SatO2_all_valid_patients_input_P1_ACF[, i] <- acf_result_SatO2$acf
  ccf_result <- ccf(FC_all_valid_patients_input_P1[, i], SatO2_all_valid_patients_input_P1[, i], lag.max = (dimension_row - 1), plot = FALSE)
  All_valid_patients_input_P1_CCF[, i] <- ccf_result$acf
}
valid_patients_P1 <- data.frame(read_xlsx("../data/clean-data/valid_patients_P1.xlsx"))
valid_patients_P1 <- valid_patients_P1$x
file_patient_name <- data.frame(read_csv("../data/clean-data/file_patient_name.csv", show_col_types = FALSE))
file_patient_name <- file_patient_name$x
df1 <- data.frame(read_xlsx("../data/clean-data/descriptive-data/descriptive_data.xlsx"))
rownames(df1) <- file_patient_name
df1 <- df1[valid_patients_P1,]
Selecting only the complete variables
row_names_info <- rownames(t(df1))
missing_df1= as.data.frame(miss_case_summary(as.data.frame(t(df1))))
missing_df1$case = row_names_info[missing_df1$case]
colnames(missing_df1) <- c("Variable", "N_Miss","PCT_Miss")
# summary(df1)
df1_NO_NA = df1[,missing_df1[missing_df1$N_Miss == 0,]$Variable]
# It is also important to drop the identifier variables:
# "ID", "NHC", "INICIALES"
df1_NO_NA <- subset(df1_NO_NA, select = - c(ID,NHC,INICIALES))
datos <- t(FC_all_valid_patients_input_P1_ACF)
datos_norm <- scale(datos)
distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
theme(axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
)
We assess which hierarchical clustering method identifies the strongest clustering structure; of the four methods assessed, Ward’s method identifies the strongest structure. The clustered data are then visualized with a red rectangle border for k = 2.
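The comparison of the four linkage methods is not reproduced here; a minimal sketch of how their agglomerative coefficients could be compared with cluster::agnes on the standardized ACF matrix (coefficients closer to 1 indicate a stronger clustering structure):
linkages <- c("average", "single", "complete", "ward")
sapply(linkages, function(m) agnes(datos_norm, method = m)$ac)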
res.hc <- datos_norm %>%
  eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
rect = TRUE, show_labels = FALSE)
To evaluate the result, we calculate the silhouette index.
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 4 0.54
## 2 2 64 0.30
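For reference, a minimal sketch of how the same average silhouette widths can be obtained directly with cluster::silhouette from the Ward partition and the Euclidean distance matrix:
sil <- silhouette(res.hc$cluster, distance)
summary(sil)$clus.avg.widths  # average silhouette width per cluster
summary(sil)$avg.width        # overall average silhouette width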
clust_1 <- hclust(distance, method = "ward.D2")
fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))
data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 5.88%
## Confusion matrix:
## 1 2 class.error
## 1 0 4 1
## 2 0 64 0
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
EDAD | 1.1949823 |
PESO | 0.8982389 |
FR_0_8h | 0.5938739 |
SCORE_WOOD_DOWNES_24H | 0.5852357 |
RADIOGRAFIA | 0.3907127 |
BRONCODILATADORES | 0.3893944 |
EG | 0.3750563 |
SCORE_WOOD_DOWNES_INGRESO | 0.3619181 |
ETIOLOGIA | 0.2618195 |
SAPI_0_8h | 0.2545372 |
DIAS_OAF | 0.2285301 |
CORTICOIDES | 0.2140864 |
ANTIBIOTICO | 0.1934095 |
ANALITICA | 0.1931057 |
SUERO | 0.1417103 |
ALERGIAS | 0.1257025 |
SEXO | 0.1245906 |
DETERIORO | 0.0989267 |
OAF.si1.no0 | 0.0965132 |
OAF | 0.0930055 |
PALIVIZUMAB | 0.0872935 |
PREMATURIDAD | 0.0869605 |
ENFERMEDAD_BASE | 0.0860095 |
LM | 0.0846746 |
DESNUTRICION | 0.0736334 |
SNG | 0.0573906 |
DERMATITIS | 0.0547714 |
ALIMENTACION | 0.0132810 |
APNEA | 0.0081667 |
UCIP | 0.0064788 |
k2_kmeans <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
# k-means does not produce a dendrogram
fviz_silhouette(k2_kmeans)
## cluster size ave.sil.width
## 1 1 14 0.18
## 2 2 54 0.20
fviz_cluster(list(data = datos_norm, cluster = k2_kmeans$cluster))
k2 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 5, graph = FALSE)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
set.seed(123)
fviz_nbclust(datos_norm, kmeans, method = "wss")
fviz_nbclust(datos_norm, kmeans, method = "silhouette")
data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 23.53%
## Confusion matrix:
## 1 2 class.error
## 1 0 14 1.00000000
## 2 2 52 0.03703704
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
EDAD | 3.3839230 |
FR_0_8h | 2.6277133 |
PESO | 2.2223809 |
SCORE_WOOD_DOWNES_INGRESO | 2.1795198 |
SCORE_WOOD_DOWNES_24H | 1.5009898 |
EG | 1.1637664 |
SAPI_0_8h | 1.1305895 |
LM | 0.8675757 |
SUERO | 0.7895493 |
SEXO | 0.7156952 |
DIAS_OAF | 0.4217511 |
ETIOLOGIA | 0.4152356 |
RADIOGRAFIA | 0.4104377 |
ANALITICA | 0.4042494 |
ENFERMEDAD_BASE | 0.3928956 |
BRONCODILATADORES | 0.3591701 |
PREMATURIDAD | 0.3368451 |
ALIMENTACION | 0.3060370 |
ANTIBIOTICO | 0.2392297 |
OAF.si1.no0 | 0.2089416 |
PALIVIZUMAB | 0.1976535 |
DESNUTRICION | 0.1926680 |
ALERGIAS | 0.1757072 |
DETERIORO | 0.1561031 |
SNG | 0.1558060 |
APNEA | 0.1552365 |
CORTICOIDES | 0.1359795 |
OAF | 0.1301501 |
DERMATITIS | 0.0882573 |
UCIP | 0.0759790 |
DD <- diss(datos, "INT.PER", normalize=TRUE)
hcintper <- hclust(DD, "ward.D2")
fviz_dend(hcintper, palette = "jco",
rect = TRUE, show_labels = FALSE, k=2)
DDclust <- cutree(hcintper, k = 2)
fviz_cluster(list(data = datos_norm, cluster = DDclust))
data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 29.41%
## Confusion matrix:
## 1 2 class.error
## 1 38 7 0.1555556
## 2 13 10 0.5652174
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
EDAD | 5.3634741 |
PESO | 4.2317585 |
SCORE_WOOD_DOWNES_INGRESO | 2.4385988 |
FR_0_8h | 2.2780070 |
EG | 2.1902793 |
SCORE_WOOD_DOWNES_24H | 1.7794489 |
SAPI_0_8h | 0.9796765 |
DIAS_OAF | 0.9270895 |
ETIOLOGIA | 0.7591922 |
PREMATURIDAD | 0.6799956 |
SEXO | 0.6456424 |
SUERO | 0.6346292 |
APNEA | 0.5650904 |
LM | 0.5436275 |
RADIOGRAFIA | 0.5408403 |
SNG | 0.5376492 |
ANTIBIOTICO | 0.4984785 |
ENFERMEDAD_BASE | 0.4465248 |
ANALITICA | 0.4376758 |
DESNUTRICION | 0.3809640 |
OAF.si1.no0 | 0.3523257 |
DETERIORO | 0.3342896 |
BRONCODILATADORES | 0.3041642 |
OAF | 0.2585900 |
ALERGIAS | 0.2293939 |
DERMATITIS | 0.1738660 |
PALIVIZUMAB | 0.1712616 |
UCIP | 0.1412915 |
ALIMENTACION | 0.1313147 |
CORTICOIDES | 0.1148715 |
data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 37
##
## OOB estimate of error rate: 1.47%
## Confusion matrix:
## 1 2 class.error
## 1 3 1 0.25
## 2 0 64 0.00
plot(importance(RF_0, type =2), type = "h" )
data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 37
##
## OOB estimate of error rate: 7.35%
## Confusion matrix:
## 1 2 class.error
## 1 9 5 0.3571429
## 2 0 54 0.0000000
plot(importance(RF_0, type =2), type = "h" )
#### TSclust
data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = t(FC_all_valid_patients_input_P1_ACF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 37
##
## OOB estimate of error rate: 4.41%
## Confusion matrix:
## 1 2 class.error
## 1 43 2 0.04444444
## 2 1 22 0.04347826
plot(importance(RF_0, type =2), type = "h" )
datos <- t(SatO2_all_valid_patients_input_P1_ACF)
datos_norm <- scale(datos)
distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
theme(axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
)
We assess which hierarchical clustering method identifies the strongest clustering structure; of the four methods assessed, Ward’s method identifies the strongest structure.
res.hc <- datos_norm %>%
  eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
rect = TRUE, show_labels = FALSE)
The clustered data are visualized with a red rectangle border for k = 2. To evaluate the result, we calculate the silhouette index.
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 64 0.47
## 2 2 4 0.48
clust_1 <- hclust(distance, method = "ward.D2")
fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))
#### Random Forest: Hierarchical clustering
data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 5.88%
## Confusion matrix:
## 1 2 class.error
## 1 64 0 0
## 2 4 0 1
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
EG | 1.5012645 |
SCORE_WOOD_DOWNES_24H | 0.9403636 |
DIAS_OAF | 0.6428180 |
PESO | 0.5593814 |
EDAD | 0.4503682 |
FR_0_8h | 0.3248087 |
CORTICOIDES | 0.3087738 |
ALERGIAS | 0.2687718 |
ENFERMEDAD_BASE | 0.2305884 |
SCORE_WOOD_DOWNES_INGRESO | 0.1861729 |
SAPI_0_8h | 0.1831349 |
DETERIORO | 0.1796514 |
ALIMENTACION | 0.1790532 |
SEXO | 0.1627955 |
BRONCODILATADORES | 0.1568883 |
OAF.si1.no0 | 0.1521014 |
PREMATURIDAD | 0.1449579 |
PALIVIZUMAB | 0.1393624 |
UCIP | 0.1363409 |
ETIOLOGIA | 0.1174646 |
LM | 0.1053120 |
SUERO | 0.0915257 |
ANALITICA | 0.0845391 |
OAF | 0.0831304 |
DESNUTRICION | 0.0819021 |
RADIOGRAFIA | 0.0696723 |
SNG | 0.0428662 |
ANTIBIOTICO | 0.0307846 |
DERMATITIS | 0.0171378 |
APNEA | 0.0077333 |
Importance of the SatO2 ACF
data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(SatO2_all_valid_patients_input_P1_ACF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 37
##
## OOB estimate of error rate: 5.88%
## Confusion matrix:
## 1 2 class.error
## 1 64 0 0
## 2 4 0 1
plot(importance(RF_0, type =2), type = "h" )
k2_kmeans <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
# k-means does not produce a dendrogram
fviz_silhouette(k2_kmeans)
## cluster size ave.sil.width
## 1 1 59 0.44
## 2 2 9 0.04
fviz_cluster(list(data = datos_norm, cluster = k2_kmeans$cluster))
k2 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 5, graph = FALSE)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
set.seed(123)
fviz_nbclust(datos_norm, kmeans, method = "wss")
fviz_nbclust(datos_norm, kmeans, method = "silhouette")
data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 13.24%
## Confusion matrix:
## 1 2 class.error
## 1 59 0 0
## 2 9 0 1
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
PESO | 1.8637411 |
EG | 1.5617451 |
SCORE_WOOD_DOWNES_INGRESO | 1.4687705 |
EDAD | 1.4116161 |
SCORE_WOOD_DOWNES_24H | 0.9236763 |
FR_0_8h | 0.9144307 |
DESNUTRICION | 0.8350052 |
SEXO | 0.7414143 |
ENFERMEDAD_BASE | 0.6706701 |
BRONCODILATADORES | 0.5543442 |
SAPI_0_8h | 0.5116409 |
DIAS_OAF | 0.5000478 |
CORTICOIDES | 0.2905115 |
PALIVIZUMAB | 0.2834190 |
ANALITICA | 0.2603768 |
PREMATURIDAD | 0.2564485 |
SNG | 0.2382675 |
ALERGIAS | 0.2196853 |
LM | 0.2117268 |
APNEA | 0.2068648 |
ETIOLOGIA | 0.2063904 |
RADIOGRAFIA | 0.1686067 |
SUERO | 0.1666663 |
DETERIORO | 0.1610429 |
ANTIBIOTICO | 0.1590619 |
OAF.si1.no0 | 0.1344170 |
ALIMENTACION | 0.1247312 |
UCIP | 0.1215508 |
OAF | 0.0761842 |
DERMATITIS | 0.0546917 |
DD <- diss(datos, "INT.PER", normalize=TRUE)
hcintper <- hclust(DD, "ward.D2")
fviz_dend(hcintper, palette = "jco",
rect = TRUE, show_labels = FALSE, k=2)
DDclust <- cutree(hcintper, k = 2)
fviz_cluster(list(data = datos_norm, cluster = DDclust))
data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 39.71%
## Confusion matrix:
## 1 2 class.error
## 1 24 14 0.3684211
## 2 13 17 0.4333333
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
PESO | 5.5962954 |
EDAD | 4.6522326 |
FR_0_8h | 3.6358800 |
SCORE_WOOD_DOWNES_24H | 2.4266129 |
EG | 1.8058148 |
SCORE_WOOD_DOWNES_INGRESO | 1.7246427 |
SAPI_0_8h | 1.6643948 |
DIAS_OAF | 1.1632593 |
SEXO | 1.0733961 |
LM | 0.9782990 |
ETIOLOGIA | 0.9413451 |
DESNUTRICION | 0.6427934 |
BRONCODILATADORES | 0.6379262 |
ENFERMEDAD_BASE | 0.5818366 |
RADIOGRAFIA | 0.5577426 |
SUERO | 0.4173942 |
ANTIBIOTICO | 0.3757280 |
PREMATURIDAD | 0.3555245 |
ALERGIAS | 0.3546761 |
DERMATITIS | 0.3505808 |
ANALITICA | 0.3231879 |
OAF | 0.2962828 |
OAF.si1.no0 | 0.2857509 |
DETERIORO | 0.2747706 |
SNG | 0.2633840 |
APNEA | 0.2409242 |
ALIMENTACION | 0.2031993 |
PALIVIZUMAB | 0.1994680 |
UCIP | 0.1566229 |
CORTICOIDES | 0.0895820 |
data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(SatO2_all_valid_patients_input_P1_ACF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 37
##
## OOB estimate of error rate: 5.88%
## Confusion matrix:
## 1 2 class.error
## 1 64 0 0
## 2 4 0 1
plot(importance(RF_0, type =2), type = "h" )
data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = t(SatO2_all_valid_patients_input_P1_ACF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 37
##
## OOB estimate of error rate: 13.24%
## Confusion matrix:
## 1 2 class.error
## 1 59 0 0
## 2 9 0 1
plot(importance(RF_0, type =2), type = "h" )
data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = t(SatO2_all_valid_patients_input_P1_ACF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 37
##
## OOB estimate of error rate: 16.18%
## Confusion matrix:
## 1 2 class.error
## 1 32 6 0.1578947
## 2 5 25 0.1666667
plot(importance(RF_0, type =2), type = "h" )
datos <- t(All_valid_patients_input_P1_CCF)
datos_norm <- scale(datos)
distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
theme(axis.text.x=element_blank(), #remove x axis labels
axis.ticks.x=element_blank(), #remove x axis ticks
axis.text.y=element_blank(), #remove y axis labels
axis.ticks.y=element_blank() #remove y axis ticks
)
We assess which hierarchical clustering method identifies the strongest clustering structure; of the four methods assessed, Ward’s method identifies the strongest structure.
res.hc <- datos_norm %>%
  eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
rect = TRUE, show_labels = FALSE)
The clustered data are visualized with a red rectangle border for k = 2. To evaluate the result, we calculate the silhouette index.
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 21 0.13
## 2 2 47 0.02
clust_1 <- hclust(distance, method = "ward.D2")
fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))
#### Random Forest: Hierarchical clustering
data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 38.24%
## Confusion matrix:
## 1 2 class.error
## 1 0 21 1.000000
## 2 5 42 0.106383
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
EDAD | 3.3047062 |
FR_0_8h | 3.2736285 |
PESO | 2.8852493 |
EG | 2.1664523 |
SCORE_WOOD_DOWNES_INGRESO | 2.0145034 |
SCORE_WOOD_DOWNES_24H | 2.0127606 |
SAPI_0_8h | 1.3711743 |
SEXO | 1.3051146 |
ETIOLOGIA | 0.9854514 |
LM | 0.7822436 |
SUERO | 0.7786464 |
RADIOGRAFIA | 0.7207818 |
ENFERMEDAD_BASE | 0.7177285 |
PREMATURIDAD | 0.5016050 |
DIAS_OAF | 0.4802313 |
ANALITICA | 0.4282661 |
DESNUTRICION | 0.3967778 |
SNG | 0.3736544 |
ANTIBIOTICO | 0.3525368 |
APNEA | 0.3450846 |
OAF | 0.3350206 |
OAF.si1.no0 | 0.3121337 |
DERMATITIS | 0.3045369 |
BRONCODILATADORES | 0.2984464 |
DETERIORO | 0.2714084 |
ALERGIAS | 0.2613869 |
ALIMENTACION | 0.2092739 |
PALIVIZUMAB | 0.2014913 |
UCIP | 0.1578926 |
CORTICOIDES | 0.1460778 |
k2_kmeans <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
# k-means does not produce a dendrogram
fviz_silhouette(k2_kmeans)
## cluster size ave.sil.width
## 1 1 27 0.05
## 2 2 41 0.09
fviz_cluster(list(data = datos_norm, cluster = k2_kmeans$cluster))
k2 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>%
  eclust(FUNcluster = "kmeans", k = 5, graph = FALSE)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
set.seed(123)
fviz_nbclust(datos_norm, kmeans, method = "wss")
fviz_nbclust(datos_norm, kmeans, method = "silhouette")
data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge, importance = TRUE)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 44.12%
## Confusion matrix:
## 1 2 class.error
## 1 5 22 0.8148148
## 2 8 33 0.1951220
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
PESO | 3.8807871 |
EDAD | 3.5691942 |
SCORE_WOOD_DOWNES_INGRESO | 3.2405318 |
FR_0_8h | 2.7851198 |
SCORE_WOOD_DOWNES_24H | 2.7350443 |
EG | 1.8709235 |
SAPI_0_8h | 1.4447660 |
ENFERMEDAD_BASE | 1.1207018 |
LM | 1.0322701 |
RADIOGRAFIA | 0.8331254 |
DIAS_OAF | 0.7664236 |
ANALITICA | 0.7369749 |
SEXO | 0.7120522 |
ETIOLOGIA | 0.6825440 |
SUERO | 0.6081001 |
ANTIBIOTICO | 0.5786599 |
APNEA | 0.5530846 |
SNG | 0.4987985 |
DESNUTRICION | 0.4672756 |
BRONCODILATADORES | 0.4225544 |
PREMATURIDAD | 0.3525914 |
DETERIORO | 0.3421964 |
ALERGIAS | 0.3303123 |
DERMATITIS | 0.3130308 |
OAF.si1.no0 | 0.2736918 |
ALIMENTACION | 0.2460817 |
CORTICOIDES | 0.2129243 |
UCIP | 0.1981695 |
OAF | 0.1870470 |
PALIVIZUMAB | 0.1698971 |
DD <- diss(datos, "INT.PER", normalize=TRUE)
hcintper <- hclust(DD, "ward.D2")
fviz_dend(hcintper, palette = "jco",
rect = TRUE, show_labels = FALSE, k = 2)
DDclust <- cutree(hcintper, k = 2)
fviz_cluster(list(data = datos_norm, cluster = DDclust))
data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = df1_NO_NA
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 36.76%
## Confusion matrix:
## 1 2 class.error
## 1 23 13 0.3611111
## 2 12 20 0.3750000
Importance
kable(importance(RF_0, type = 2)[order(importance(RF_0, type = 2), decreasing = TRUE),])
Variable | MeanDecreaseGini |
---|---|
PESO | 5.8487435 |
EDAD | 5.6466244 |
FR_0_8h | 2.6797139 |
EG | 2.1614781 |
SCORE_WOOD_DOWNES_INGRESO | 2.1283307 |
SEXO | 1.8802123 |
SCORE_WOOD_DOWNES_24H | 1.6708889 |
SAPI_0_8h | 1.0632000 |
ETIOLOGIA | 0.7724720 |
DIAS_OAF | 0.6940631 |
LM | 0.6578771 |
RADIOGRAFIA | 0.5472899 |
OAF.si1.no0 | 0.5429020 |
ANALITICA | 0.5324897 |
ENFERMEDAD_BASE | 0.5303762 |
DETERIORO | 0.5155704 |
PREMATURIDAD | 0.4883636 |
OAF | 0.4759788 |
SUERO | 0.4710068 |
DESNUTRICION | 0.4434190 |
BRONCODILATADORES | 0.3946798 |
ALIMENTACION | 0.3864052 |
SNG | 0.3381893 |
ALERGIAS | 0.3280936 |
ANTIBIOTICO | 0.3235838 |
APNEA | 0.2758255 |
CORTICOIDES | 0.1959470 |
PALIVIZUMAB | 0.1850103 |
DERMATITIS | 0.1725256 |
UCIP | 0.1054525 |
data_frame1 = data.frame("CLUSTER" = res.hc$cluster)
data_frame2 = t(All_valid_patients_input_P1_CCF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 53
##
## OOB estimate of error rate: 14.71%
## Confusion matrix:
## 1 2 class.error
## 1 14 7 0.33333333
## 2 3 44 0.06382979
plot(importance(RF_0, type =2), type = "h" )
data_frame1 = data.frame("CLUSTER" = k2_kmeans$cluster)
data_frame2 = t(All_valid_patients_input_P1_CCF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 53
##
## OOB estimate of error rate: 10.29%
## Confusion matrix:
## 1 2 class.error
## 1 22 5 0.18518519
## 2 2 39 0.04878049
plot(importance(RF_0, type =2), type = "h" )
#### TSclust
data_frame1 = data.frame("CLUSTER" = DDclust)
data_frame2 = t(All_valid_patients_input_P1_CCF)
data_frame_merge <-
merge(data_frame1, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge <- data_frame_merge[, 2:dim(data_frame_merge)[2]]
set.seed(123)
data_frame_merge$CLUSTER <- as.factor(data_frame_merge$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 53
##
## OOB estimate of error rate: 23.53%
## Confusion matrix:
## 1 2 class.error
## 1 25 11 0.3055556
## 2 5 27 0.1562500
plot(importance(RF_0, type =2), type = "h" )