Libraries
library(tidyr)
library(readr)
library(ggplot2) # ggplot graphs
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)
library(naniar) # miss_case_summary
library(dplyr)
## Knn imputation
library(caret)
library(RANN)
# CLustering
library(factoextra) #clustering visualization
library(cluster) #clustering algorithms
library(dendextend) #for comparing two dendrograms
library(corrplot) #corelation between dendrograms
library(tidyverse) #data manupulation
library(NbClust) #determine optimal no. of clusters [not working...]
library(TSclust)
#RandomForest
library(randomForest) # RandomForest Discrete Classification
library(imbalance) # To create a more balanced dataset
Functions
source("../scripts/useful-functions/get_column_position.R")
# In a normal script it will be: source("./scripts/useful-functions/get_column_position.R")
Reading Data
cuantiles_TS_HR_P1 = data.frame(read_xlsx("../data/clean-data/BoxBasedImputation/cuantiles_TS_HR_valid_patients_input_P1.xlsx", sheet = "FC_valid_patients_input_P1" ))
## Deterioro and Not deterioro
file_patient_name_NO_DETERIORO <- data.frame(read_csv("../data/info-patients/file_patient_name_NO_DETERIORO.csv"))
file_patient_name_NO_DETERIORO <- file_patient_name_NO_DETERIORO$x
file_patient_name_DETERIORO <- data.frame(read_csv("../data/info-patients/file_patient_name_DETERIORO.csv"))
file_patient_name_DETERIORO <- file_patient_name_DETERIORO$x
valid_patients_P1 <- data.frame(read_xlsx("../data/clean-data/valid_patients_P1.xlsx"))
valid_patients_P1 <- valid_patients_P1$x
file_patient_name <- data.frame(read_csv("../data/clean-data/file_patient_name.csv", show_col_types = FALSE))
file_patient_name <- file_patient_name$x
df1 <- data.frame(read_xlsx("../data/clean-data/descriptive-data/descriptive_data.xlsx"))
rownames(df1) <- file_patient_name
df1 <- df1[valid_patients_P1,]
df_descriptive <- data.frame(read_xlsx("../data/clean-data/descriptive-data/descriptive_data_imputed.xlsx"))
rownames(df_descriptive) <- file_patient_name
# pos_1 = get_column_position(df_descriptive, "SEXO")
# pos_2 = get_column_position(df_descriptive, "SNG")
#
# col_names_factor <- names(df_descriptive[pos_1:pos_2])
# df_descriptive[col_names_factor] <- lapply(df_descriptive[col_names_factor] , factor)
df_descriptive <- df_descriptive[valid_patients_P1,]
df_descriptive <- df_descriptive[,-c(1:3)]
Create a dataframe with ACF [Heart Rate ]
dimension_col <- dim(cuantiles_TS_HR_P1)[2]
dimension_row <- 1440 #lag.max -1
# Heart Rate
cuantiles_TS_HR_P1_ACF <- data.frame(matrix(nrow = dimension_row, ncol = dimension_col))
colnames(cuantiles_TS_HR_P1_ACF) <- names(cuantiles_TS_HR_P1)[1:dimension_col]
for (i in names(cuantiles_TS_HR_P1_ACF)) {
acf_result_FC <- acf(cuantiles_TS_HR_P1[[i]], lag.max = (dimension_row - 1), plot = FALSE)
cuantiles_TS_HR_P1_ACF[, i] <- acf_result_FC$acf
}
Create a dataframe with peridiogram
library(TSA)
# Generar un dataset con varias series temporales
df <- cuantiles_TS_HR_P1
# Crear una matriz para almacenar los periodogramas
pg_mat <- data.frame(matrix(nrow = nrow(df), ncol = ncol(df)))
colnames(pg_mat) = colnames(cuantiles_TS_HR_P1)
# Calcular el periodograma de cada serie temporal y almacenarlo en la matriz
for (i in colnames(cuantiles_TS_HR_P1)) {
pg <- periodogram(ts(cuantiles_TS_HR_P1[,i]), plot = FALSE, normalize = TRUE, B = 500)
pg_mat[,i] <- pg$spec
}
datos <- cuantiles_TS_HR_P1
diss.ACF
Computes the dissimilarity between two time
series as the distance between their estimated simple (ACF) or partial
(PACF) autocorrelation coefficients
DD_ACF <- as.matrix(diss(datos, "ACF", lag.max = 50))
diss.EUCL
DD_EUCL <- as.matrix(diss(datos, "EUCL"))
diss.PER
DD_PER <- as.matrix(diss(datos, "PER"))
distance <- dist(t(cuantiles_TS_HR_P1_ACF[c(1:51),]), method = "euclidean")
distance_matrix_ACF <- as.matrix(distance)
distance <- dist(datos, method = "euclidean")
distance_matrix_EUCL <- as.matrix(distance)
distance <- dist(t(pg_mat), method = "euclidean")
distance_matrix_PER <- as.matrix(distance)
DD_ACF <- diss(datos, "ACF", lag.max = 50)
hcintper_ACF <- hclust(DD_ACF, "ward.D2")
fviz_dend(hcintper_ACF, palette = "jco",
rect = TRUE, show_labels = FALSE, k = 2)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
DDclust_ACF <- cutree( hclust(DD_ACF, "ward.D2"), k = 2)
fviz_cluster(list(data = t(datos), cluster = DDclust_ACF))
fviz_silhouette(silhouette(DDclust_ACF, DD_ACF))
## cluster size ave.sil.width
## 1 1 52 0.48
## 2 2 16 0.74
DETERIORO_CLUST <- union(intersect(file_patient_name_DETERIORO,names_1),intersect(file_patient_name_DETERIORO,names_2))
NO_DETERIORO_CLUST <- union(intersect(file_patient_name_NO_DETERIORO,names_1),intersect(file_patient_name_NO_DETERIORO,names_2))
#DETERIORO
DETERIORO_patients = data.frame(t(rep("#4A235A", length(DETERIORO_CLUST))))
colnames(DETERIORO_patients)<- DETERIORO_CLUST
#NO DETERIORO
NO_DETERIORO_patients = data.frame(t(rep("#117864", length(NO_DETERIORO_CLUST))))
colnames(NO_DETERIORO_patients)<- NO_DETERIORO_CLUST
COLOR_ACF <- cbind(DETERIORO_patients,NO_DETERIORO_patients)
order_ACF <- union(names(DDclust_ACF[DDclust_ACF == 2]),names(DDclust_ACF[DDclust_ACF == 1]))
fviz_dend(hcintper_ACF, k = 2,
k_colors = c("blue", "green3"),
label_cols = as.vector(COLOR_ACF[,order_ACF]), cex = 0.6)
n1 = length(intersect(file_patient_name_DETERIORO,names_1))
n2 = length(intersect(file_patient_name_DETERIORO,names_2))
n3 = length(intersect(file_patient_name_NO_DETERIORO,names_1))
n4 = length(intersect(file_patient_name_NO_DETERIORO,names_2))
conttingency_table <- data.frame("CLust1" = c(n1,n3), "Clust2" = c(n2,n4))
rownames(conttingency_table) <- c("DETERIORO","NO DETERIORO")
knitr::kable(conttingency_table, align = "lccrr")
CLust1 | Clust2 | |
---|---|---|
DETERIORO | 12 | 2 |
NO DETERIORO | 40 | 14 |
conttingency_table_prop <- data.frame(c(n1,n3)/(n1+n3),c(n2,n4)/(n2+n4))
rownames(conttingency_table_prop) <- c("DETERIORO","NO DETERIORO")
colnames(conttingency_table_prop) <- c("Clust1","Clust2")
knitr::kable(conttingency_table_prop, align = "lccrr")
Clust1 | Clust2 | |
---|---|---|
DETERIORO | 0.2307692 | 0.125 |
NO DETERIORO | 0.7692308 | 0.875 |
data_frame1_ACF = data.frame("CLUSTER" = DDclust_ACF)
data_frame2 = df_descriptive
data_frame_merge_ACF <-
merge(data_frame1_ACF, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge_ACF <- data_frame_merge_ACF[, 2:dim(data_frame_merge_ACF)[2]]
table(data_frame_merge_ACF$CLUSTER)
##
## 1 2
## 52 16
newMWMOTE_ACF <- oversample(data_frame_merge_ACF, ratio = 0.85, method = "MWMOTE", classAttr = "CLUSTER")
table(newMWMOTE_ACF$CLUSTER)
##
## 1 2
## 52 45
set.seed(123)
pos_1 = get_column_position(newMWMOTE_ACF, "CLUSTER")
pos_2 = get_column_position(newMWMOTE_ACF, "SNG")
col_names_factor <- names(newMWMOTE_ACF[pos_1:pos_2])
newMWMOTE_ACF[col_names_factor] <- lapply(newMWMOTE_ACF[col_names_factor] , factor)
RF_ACF <- randomForest(CLUSTER ~ ., data = newMWMOTE_ACF)
print(RF_ACF)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = newMWMOTE_ACF)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 11.34%
## Confusion matrix:
## 1 2 class.error
## 1 46 6 0.1153846
## 2 5 40 0.1111111
Importance
kable(importance(RF_ACF, type =2)[order(importance(RF_ACF, type =2), decreasing = TRUE),])
x | |
---|---|
PESO | 9.0982434 |
EDAD | 8.1365895 |
SCORE_WOOD_DOWNES_24H | 2.7665812 |
FR_8_16h | 2.3716961 |
SCORE_CRUCES_INGRESO | 2.3154848 |
SAPI_0_8h | 2.1863742 |
LM | 1.9513197 |
SEXO | 1.8128066 |
ETIOLOGÍA | 1.8077405 |
FLUJO2_0_8H | 1.7721020 |
SUERO | 1.6146053 |
SCORE_WOOD_DOWNES_INGRESO | 1.5280813 |
FR_0_8h | 1.5150593 |
FR_16_24h | 1.3981804 |
FLUJO2_8_16h | 1.1365348 |
EG | 1.1326540 |
SAPI_16_24h | 1.0171987 |
ENFERMEDAD_BASE | 0.9563347 |
PREMATURIDAD | 0.8635321 |
DERMATITIS | 0.6792933 |
SAPI_8_16h | 0.6783079 |
SNG | 0.2822430 |
PALIVIZUMAB | 0.2609165 |
ALERGIAS | 0.1630979 |
DESNUTRICION | 0.1508200 |
ALIMENTACION | 0.1242604 |
data_frame1_ACF = data.frame("CLUSTER" = DDclust_ACF)
data_frame2 = data.frame(t(cuantiles_TS_HR_P1_ACF[c(2:51),]))
data_frame_merge_ACF <-
merge(data_frame1_ACF, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge_ACF <- data_frame_merge_ACF[, 2:dim(data_frame_merge_ACF)[2]]
set.seed(123)
data_frame_merge_ACF$CLUSTER <- as.factor(data_frame_merge_ACF$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge_ACF)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge_ACF)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 1.47%
## Confusion matrix:
## 1 2 class.error
## 1 52 0 0.0000
## 2 1 15 0.0625
plot(importance(RF_0, type =2), type = "h")
DD_EUCL <- diss(datos, "EUCL")
hcintper_EUCL <- hclust(DD_EUCL, "ward.D2")
fviz_dend(hcintper_EUCL, palette = "jco",
rect = TRUE, show_labels = FALSE, k = 2)
DDclust_EUCL <- cutree( hclust(DD_EUCL, "ward.D2"), k = 2)
fviz_cluster(list(data = t(datos), cluster = DDclust_EUCL))
fviz_silhouette(silhouette(DDclust_EUCL, DD_EUCL))
## cluster size ave.sil.width
## 1 1 40 0.06
## 2 2 28 0.06
DETERIORO_CLUST <- union(intersect(file_patient_name_DETERIORO,names_1),intersect(file_patient_name_DETERIORO,names_2))
NO_DETERIORO_CLUST <- union(intersect(file_patient_name_NO_DETERIORO,names_1),intersect(file_patient_name_NO_DETERIORO,names_2))
#DETERIORO
DETERIORO_patients = data.frame(t(rep("#4A235A", length(DETERIORO_CLUST))))
colnames(DETERIORO_patients)<- DETERIORO_CLUST
#NO DETERIORO
NO_DETERIORO_patients = data.frame(t(rep("#117864", length(NO_DETERIORO_CLUST))))
colnames(NO_DETERIORO_patients)<- NO_DETERIORO_CLUST
COLOR_EUCL <- cbind(DETERIORO_patients,NO_DETERIORO_patients)
order_EUCL <- union(names(DDclust_EUCL[DDclust_EUCL == 2]),names(DDclust_EUCL[DDclust_EUCL == 1]))
fviz_dend(hcintper_EUCL, k = 2,
k_colors = c("blue", "green3"),
label_cols = as.vector(COLOR_EUCL[,order_EUCL]), cex = 0.6)
n1 = length(intersect(file_patient_name_DETERIORO,names_1))
n2 = length(intersect(file_patient_name_DETERIORO,names_2))
n3 = length(intersect(file_patient_name_NO_DETERIORO,names_1))
n4 = length(intersect(file_patient_name_NO_DETERIORO,names_2))
conttingency_table <- data.frame("CLust1" = c(n1,n3), "Clust2" = c(n2,n4))
rownames(conttingency_table) <- c("DETERIORO","NO DETERIORO")
knitr::kable(conttingency_table, align = "lccrr")
CLust1 | Clust2 | |
---|---|---|
DETERIORO | 9 | 5 |
NO DETERIORO | 31 | 23 |
conttingency_table_prop <- data.frame(c(n1,n3)/(n1+n3),c(n2,n4)/(n2+n4))
rownames(conttingency_table_prop) <- c("DETERIORO","NO DETERIORO")
colnames(conttingency_table_prop) <- c("Clust1","Clust2")
knitr::kable(conttingency_table_prop, align = "lccrr")
Clust1 | Clust2 | |
---|---|---|
DETERIORO | 0.225 | 0.1785714 |
NO DETERIORO | 0.775 | 0.8214286 |
data_frame1_EUCL = data.frame("CLUSTER" = DDclust_EUCL)
data_frame2 = df_descriptive
data_frame_merge_EUCL <-
merge(data_frame1_EUCL, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge_EUCL <- data_frame_merge_EUCL[, 2:dim(data_frame_merge_EUCL)[2]]
table(data_frame_merge_EUCL$CLUSTER)
##
## 1 2
## 40 28
newMWMOTE_EUCL <- oversample(data_frame_merge_EUCL, ratio = 0.85, method = "MWMOTE", classAttr = "CLUSTER")
table(newMWMOTE_EUCL$CLUSTER)
##
## 1 2
## 40 34
set.seed(123)
pos_1 = get_column_position(newMWMOTE_EUCL, "CLUSTER")
pos_2 = get_column_position(newMWMOTE_EUCL, "SNG")
col_names_factor <- names(newMWMOTE_EUCL[pos_1:pos_2])
newMWMOTE_EUCL[col_names_factor] <- lapply(newMWMOTE_EUCL[col_names_factor] , factor)
RF_EUCL <- randomForest(CLUSTER ~ ., data = newMWMOTE_EUCL)
print(RF_EUCL)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = newMWMOTE_EUCL)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 40.54%
## Confusion matrix:
## 1 2 class.error
## 1 26 14 0.3500000
## 2 16 18 0.4705882
Importance
kable(importance(RF_EUCL, type =2)[order(importance(RF_EUCL, type =2), decreasing = TRUE),])
x | |
---|---|
PESO | 3.0341244 |
EDAD | 2.9404749 |
FR_8_16h | 2.9348776 |
FR_0_8h | 2.9160816 |
SCORE_CRUCES_INGRESO | 2.6767961 |
SAPI_16_24h | 2.4723817 |
SAPI_0_8h | 2.1726419 |
SCORE_WOOD_DOWNES_INGRESO | 2.0431108 |
SCORE_WOOD_DOWNES_24H | 2.0388233 |
FLUJO2_8_16h | 1.8820088 |
FR_16_24h | 1.8161756 |
EG | 1.6809786 |
FLUJO2_0_8H | 1.5951335 |
SAPI_8_16h | 0.9923468 |
LM | 0.8885485 |
ETIOLOGÍA | 0.6693241 |
SEXO | 0.5875554 |
SNG | 0.4416946 |
SUERO | 0.3896287 |
ENFERMEDAD_BASE | 0.3696867 |
ALIMENTACION | 0.3623640 |
DESNUTRICION | 0.3403335 |
PREMATURIDAD | 0.3370544 |
ALERGIAS | 0.3032729 |
PALIVIZUMAB | 0.2000761 |
DERMATITIS | 0.1698389 |
data_frame1_EUCL = data.frame("CLUSTER" = DDclust_EUCL)
data_frame2 = data.frame(t(datos))
data_frame_merge_EUCL <-
merge(data_frame1_EUCL, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge_EUCL <- data_frame_merge_EUCL[, 2:dim(data_frame_merge_ACF)[2]]
set.seed(123)
data_frame_merge_EUCL$CLUSTER <- as.factor(data_frame_merge_EUCL$CLUSTER)
RF_0 <- randomForest(CLUSTER ~ ., data = data_frame_merge_EUCL)
print(RF_0)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = data_frame_merge_EUCL)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 36.76%
## Confusion matrix:
## 1 2 class.error
## 1 30 10 0.2500000
## 2 15 13 0.5357143
plot(importance(RF_0, type =2), type = "h")
DD_PER <- diss(datos, "PER")
hcintper_PER <- hclust(DD_PER, "ward.D2")
fviz_dend(hcintper_PER, palette = "jco",
rect = TRUE, show_labels = FALSE, k = 2)
DDclust_PER <- cutree( hclust(DD_PER, "ward.D2"), k = 2)
fviz_cluster(list(data = t(datos), cluster = DDclust_PER))
fviz_silhouette(silhouette(DDclust_PER, DD_PER))
## cluster size ave.sil.width
## 1 1 54 0.28
## 2 2 14 0.28
DETERIORO_CLUST <- union(intersect(file_patient_name_DETERIORO,names_1),intersect(file_patient_name_DETERIORO,names_2))
NO_DETERIORO_CLUST <- union(intersect(file_patient_name_NO_DETERIORO,names_1),intersect(file_patient_name_NO_DETERIORO,names_2))
#DETERIORO
DETERIORO_patients = data.frame(t(rep("#4A235A", length(DETERIORO_CLUST))))
colnames(DETERIORO_patients)<- DETERIORO_CLUST
#NO DETERIORO
NO_DETERIORO_patients = data.frame(t(rep("#117864", length(NO_DETERIORO_CLUST))))
colnames(NO_DETERIORO_patients)<- NO_DETERIORO_CLUST
COLOR_PER <- cbind(NO_DETERIORO_patients,DETERIORO_patients)
order_PER <- union(names(DDclust_PER[DDclust_PER == 2]),names(DDclust_PER[DDclust_PER == 1]))
fviz_dend(hcintper_PER, k = 2,
k_colors = c("blue", "green3"),
label_cols = as.vector(COLOR_PER[,order_PER]), cex = 0.6)
n1 = length(intersect(file_patient_name_DETERIORO,names_1))
n2 = length(intersect(file_patient_name_DETERIORO,names_2))
n3 = length(intersect(file_patient_name_NO_DETERIORO,names_1))
n4 = length(intersect(file_patient_name_NO_DETERIORO,names_2))
conttingency_table <- data.frame("CLust1" = c(n1,n3), "Clust2" = c(n2,n4))
rownames(conttingency_table) <- c("DETERIORO","NO DETERIORO")
knitr::kable(conttingency_table, align = "lccrr")
CLust1 | Clust2 | |
---|---|---|
DETERIORO | 13 | 1 |
NO DETERIORO | 41 | 13 |
conttingency_table_prop <- data.frame(c(n1,n3)/(n1+n3),c(n2,n4)/(n2+n4))
rownames(conttingency_table_prop) <- c("DETERIORO","NO DETERIORO")
colnames(conttingency_table_prop) <- c("Clust1","Clust2")
knitr::kable(conttingency_table_prop, align = "lccrr")
Clust1 | Clust2 | |
---|---|---|
DETERIORO | 0.2407407 | 0.0714286 |
NO DETERIORO | 0.7592593 | 0.9285714 |
data_frame1_PER = data.frame("CLUSTER" = DDclust_PER)
data_frame2 = df_descriptive
data_frame_merge_PER <-
merge(data_frame1_PER, data_frame2, by = 'row.names', all = TRUE)
data_frame_merge_PER <- data_frame_merge_PER[, 2:dim(data_frame_merge_PER)[2]]
table(data_frame_merge_PER$CLUSTER)
##
## 1 2
## 54 14
newMWMOTE_PER <- oversample(data_frame_merge_PER, ratio = 0.85, method = "MWMOTE", classAttr = "CLUSTER")
table(newMWMOTE_PER$CLUSTER)
##
## 1 2
## 54 46
set.seed(123)
pos_1 = get_column_position(newMWMOTE_PER, "CLUSTER")
pos_2 = get_column_position(newMWMOTE_PER, "SNG")
col_names_factor <- names(newMWMOTE_PER[pos_1:pos_2])
newMWMOTE_PER[col_names_factor] <- lapply(newMWMOTE_PER[col_names_factor] , factor)
RF_PER <- randomForest(CLUSTER ~ ., data = newMWMOTE_PER)
print(RF_PER)
##
## Call:
## randomForest(formula = CLUSTER ~ ., data = newMWMOTE_PER)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 13%
## Confusion matrix:
## 1 2 class.error
## 1 49 5 0.09259259
## 2 8 38 0.17391304
Importance
kable(importance(RF_PER, type =2)[order(importance(RF_PER, type =2), decreasing = TRUE),])
x | |
---|---|
PESO | 6.5652544 |
EDAD | 5.9253091 |
SCORE_WOOD_DOWNES_INGRESO | 3.4485809 |
SAPI_0_8h | 3.2804061 |
SEXO | 3.1184644 |
SUERO | 2.6352030 |
SAPI_8_16h | 2.3921213 |
FR_0_8h | 2.2575836 |
ETIOLOGÍA | 2.1633623 |
FR_16_24h | 2.1364423 |
FR_8_16h | 1.9913936 |
SCORE_WOOD_DOWNES_24H | 1.8440330 |
FLUJO2_0_8H | 1.6464318 |
FLUJO2_8_16h | 1.6095209 |
LM | 1.6067669 |
SCORE_CRUCES_INGRESO | 1.2979480 |
EG | 1.1161027 |
SAPI_16_24h | 0.9595157 |
ENFERMEDAD_BASE | 0.6635130 |
PREMATURIDAD | 0.6338663 |
SNG | 0.4191274 |
DERMATITIS | 0.3744220 |
DESNUTRICION | 0.3711575 |
ALERGIAS | 0.3305712 |
ALIMENTACION | 0.2160089 |
PALIVIZUMAB | 0.1788668 |