Código

Libraries

library(caret)
library(ConfusionTableR)
library(DataExplorer)
library(dplyr)
library(ggplot2)
library(ggthemes)
library(kableExtra)
library(ModelMetrics)
library(openxlsx)
library(plotly)
library(probably)
library(pROC)
library(psych)
library(purrr)
library(randomForest)
library(reshape2)
library(skimr)
library(stringr)
library(tidymodels)
library(tidyverse)
library(univariateML)
library(vip)
library(xgboost)
library(WRS2)

Dataset

Descripción

Attribute information:

Gender: M(male), F(female) Age: Age of the patient Smoking: YES=2 , NO=1 Yellow fingers: YES=2 , NO=1 Anxiety: YES=2 , NO=1 Peer_pressure: YES=2 , NO=1 Chronic Disease: YES=2 , NO=1 Fatigue: YES=2 , NO=1 Allergy: YES=2 , NO=1 Wheezing: YES=2 , NO=1 Alcohol: YES=2 , NO=1 Coughing: YES=2 , NO=1 Shortness of Breath: YES=2 , NO=1 Swallowing Difficulty: YES=2 , NO=1 Chest pain: YES=2 , NO=1 Lung Cancer: YES , NO

Preprocesado

data = readr::read_csv("lung_cancer.csv")


names(data) = c("Gender", "Age", "Smoking", "Yellow_fingers", "Anxiety", "Peer_pressure", "Chronic_disease", "Fatigue", "Allergy", "Wheezing", "Alcohol_consuming", "Coughing", "Shortness_of_breath", "Swallowing_difficulty", "Chest_pain", "LUNG_CANCER")
twoTOone = function(x){
  return(ifelse(x == 2, 1, 0))
}


data[ ,3:15] = twoTOone(data[ ,3:15])
data = data %>%
  mutate(Diagnosis = ifelse(str_detect(LUNG_CANCER,"NO"), "Negative", "Positive")) %>%
  mutate(Diagnosis = factor(Diagnosis, levels = c("Positive", "Negative"), labels = c("Positive", "Negative"))) %>%
  relocate(Diagnosis, .before = LUNG_CANCER) %>%
  select(-LUNG_CANCER) #Sustituído por Diagnosis


DT::datatable(data,
              rownames = F,
              options = list(pageLength = 10, scrollX = T),
              class = "white-space: nowrap")
dim(data)
[1] 309  16
cols = c(1, 3:15)

data[ ,cols] = lapply(data[ ,cols], as.factor)


str(data)
tibble [309 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender               : Factor w/ 2 levels "F","M": 2 2 1 2 1 1 2 1 1 2 ...
 $ Age                  : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
 $ Smoking              : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 2 2 2 ...
 $ Yellow_fingers       : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 1 2 1 2 ...
 $ Anxiety              : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 2 2 ...
 $ Peer_pressure        : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 2 1 2 ...
 $ Chronic_disease      : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 2 ...
 $ Fatigue              : Factor w/ 2 levels "0","1": 2 2 2 1 1 2 2 2 2 1 ...
 $ Allergy              : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 2 ...
 $ Wheezing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Alcohol_consuming    : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 1 2 ...
 $ Coughing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Shortness_of_breath  : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
 $ Swallowing_difficulty: Factor w/ 2 levels "0","1": 2 2 1 2 1 1 1 2 1 2 ...
 $ Chest_pain           : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 1 2 ...
 $ Diagnosis            : Factor w/ 2 levels "Positive","Negative": 1 1 2 2 2 1 1 1 2 1 ...
skim(data) %>% yank("factor")

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Gender 0 1 FALSE 2 M: 162, F: 147
Smoking 0 1 FALSE 2 1: 174, 0: 135
Yellow_fingers 0 1 FALSE 2 1: 176, 0: 133
Anxiety 0 1 FALSE 2 0: 155, 1: 154
Peer_pressure 0 1 FALSE 2 1: 155, 0: 154
Chronic_disease 0 1 FALSE 2 1: 156, 0: 153
Fatigue 0 1 FALSE 2 1: 208, 0: 101
Allergy 0 1 FALSE 2 1: 172, 0: 137
Wheezing 0 1 FALSE 2 1: 172, 0: 137
Alcohol_consuming 0 1 FALSE 2 1: 172, 0: 137
Coughing 0 1 FALSE 2 1: 179, 0: 130
Shortness_of_breath 0 1 FALSE 2 1: 198, 0: 111
Swallowing_difficulty 0 1 FALSE 2 0: 164, 1: 145
Chest_pain 0 1 FALSE 2 1: 172, 0: 137
Diagnosis 0 1 FALSE 2 Pos: 270, Neg: 39
skim(data) %>% yank("numeric")

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 62.67 8.21 21 57 62 69 87 ▁▁▆▇▂
na_values = function(x){sum(is.na(x))}

apply(data, 2, na_values)
               Gender                   Age               Smoking 
                    0                     0                     0 
       Yellow_fingers               Anxiety         Peer_pressure 
                    0                     0                     0 
      Chronic_disease               Fatigue               Allergy 
                    0                     0                     0 
             Wheezing     Alcohol_consuming              Coughing 
                    0                     0                     0 
  Shortness_of_breath Swallowing_difficulty            Chest_pain 
                    0                     0                     0 
            Diagnosis 
                    0 

Gráficos

Variables Categóricas

# Lista de variables
variables <- c("Gender", "Smoking", "Yellow_fingers", "Anxiety", "Peer_pressure", "Chronic_disease", "Fatigue", "Allergy", "Wheezing", "Alcohol_consuming", "Coughing", "Shortness_of_breath", "Swallowing_difficulty", "Chest_pain")

# Bucle for para generar y visualizar gráficos
for (variable in variables) {
  p <- ggplot(data, aes_string(x = variable)) +
    geom_bar(aes(fill = Diagnosis), show.legend = FALSE) +
    facet_wrap(~Diagnosis) +
    scale_fill_brewer(palette = 10) +
    theme_classic()
  p_plotly <- ggplotly(p)
  
  # Puedes imprimir o visualizar los gráficos según tus necesidades
  print(p_plotly)
}

Variables Numéricas

p8= ggplot(data, aes(Diagnosis, Age)) +
  geom_boxplot(aes(fill = Diagnosis)) +
  scale_fill_brewer(palette = 10) +
  theme_classic()
ggplotly(p8)
data_r = melt(data, id.vars = "Diagnosis", measure.vars = c(2)) #solo hay 1 variable numérica


ggplot(data_r, aes(x = Diagnosis, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, nrow = 1, ncol = 1, scales = "free")

Comparaciones

Variables Categóricas

m = matrix(nrow = 14, #hay 14 variables
           ncol = 2, #lo quiero en dos columnas
           dimnames = list(colnames(data[,c(1,3:15)]),
                           c("p_valor", "Coeficiente_V_de_Cramer")
                           ))

#m


for (i in c(1,3:15)) {
  tabla = table(data$Diagnosis, data[[i]]
                )
  test = chisq.test(tabla)
  
  cramer = function(x) {
    unname(sqrt(chisq.test(x)$statistic / (sum(x)*(min(dim(x))-1))))
  }
  
  m[colnames(data)[i], ] = c(round(test$p.value, 4),
                             round(cramer(tabla), 4)
                             )
}



m = data.frame(m)

formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                full_width = F,
                                position = "center",
                                font_size = 12) %>%
  column_spec(2, background = ifelse(m$p_valor > (0.05), "#ff0000", "#C0FF3E"))
p_valor Coeficiente_V_de_Cramer
Gender 0.3122 0.0575
Smoking 0.3953 0.0484
Yellow_fingers 0.0026 0.1715
Anxiety 0.0175 0.1352
Peer_pressure 0.0019 0.1766
Chronic_disease 0.0754 0.1011
Fatigue 0.0137 0.1403
Allergy 0.0000 0.3180
Wheezing 0.0000 0.2395
Alcohol_consuming 0.0000 0.2787
Coughing 0.0000 0.2387
Shortness_of_breath 0.3739 0.0506
Swallowing_difficulty 0.0000 0.2500
Chest_pain 0.0015 0.1806
# Asociación: pequeña: <0.01, mediana: 0.1-0.3 y grande: <0.3
library(corrplot)
library(vcd)

df = data[ ,c(1,3:16)]

#Partimos de una matriz vacía con las dimensiones apropiadas
empty_m = matrix(ncol = length(df),
                 nrow = length(df),
                 dimnames = list(names(df),
                                 names(df)))

#Calculamos el estadístico y vamos rellenando la matriz
calculate_cramer = function(m, df) {
  for (r in seq(nrow(m))) {
    for (c in seq(ncol(m))) {
      m[[r,c]] = assocstats(table(df[[r]], df[[c]]))$cramer
    }
  }
  return(m)
}


cor_matrix = calculate_cramer(empty_m, df)

#Ya podemos graficarlo
corrplot(cor_matrix, method = "circle", is.corr = F, type = "upper", diag = F, cl.lim = c(0,1))

remove(df)

Variables Numéricas

No paramétrica

m = matrix(nrow = 1,
           ncol = 1,
           dimnames = list(colnames(data[ , c(2)]),
                           c("p-valor")))

#m


for (i in c(2)) {
  f = formula(paste(colnames(data[i]), "~ Diagnosis"))
  test = wilcox.test(f, data = data)
  m[colnames(data)[i], ] = c(round(test$p.value, 4))
}



formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                full_width = F,
                                position = "center",
                                font_size = 12)
p-valor
Age 0.1823

Robusta

# La funicón pb2gen utiliza un método de remuestreo con reemplazo. Esto significa que se generan nuevas muestras de los datos originales seleeccionados aleatoriamente. Sembrando una semilla los resultados serán posteriormente reproducibles. set.seed()

m = matrix(nrow = 1,
           ncol = 1,
           dimnames = list(colnames(data[ , c(2)]),
                           c("p-valor")))

m
    p-valor
Age      NA
for (i in c(2)) {
  f = formula(paste(colnames(data[i]), "~ Diagnosis"))
  test = pb2gen(f, 
                data = data,
                est = "median")
  
  m[colnames(data)[i], ] = c(round(test$p.value, 4))
}




formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                full_width = F,
                                position = "center",
                                font_size = 12)
p-valor
Age 0.3072
m = data.frame(m)

Tablas

Variables numéricas

# lung_cancer = 0

lung_cancer_0 = data |> filter(Diagnosis %in% c("Negative")) #"|>" es lo mismo que %>%

names_num = colnames(select_if(lung_cancer_0, is.numeric))

data_num_lung_cancer0 = lung_cancer_0 |> select(all_of(names_num))



results_data_num_lung_cancer0 = NULL

for (i in colnames(data_num_lung_cancer0)) {
  median = median(data_num_lung_cancer0[[i]], na.rm = T) #convertivos a vector con [[]]
  iqr = IQR(data_num_lung_cancer0[[i]], na.rm = T)
  row = data.frame("VARIABLE" = i,
                   "MEDIAN_lung_cancer_0" = median,
                   "IQR_lung_cancer_0" = iqr)
  results_data_num_lung_cancer0 = rbind(results_data_num_lung_cancer0, row)
}


rownames(results_data_num_lung_cancer0) = NULL
results_data_num_lung_cancer0 |> kable() |> kable_styling()
VARIABLE MEDIAN_lung_cancer_0 IQR_lung_cancer_0
Age 61 8.5
# lung_cancer = 1

lung_cancer_1 = data |> filter(Diagnosis %in% c("Positive")) #"|>" es lo mismo que %>%

names_num = colnames(select_if(lung_cancer_1, is.numeric))

data_num_lung_cancer1 = lung_cancer_1 |> select(all_of(names_num))



results_data_num_lung_cancer1 = NULL

for (i in colnames(data_num_lung_cancer1)) {
  median = median(data_num_lung_cancer1[[i]], na.rm = T) #convertivos a vector con [[]]
  iqr = IQR(data_num_lung_cancer1[[i]], na.rm = T)
  row = data.frame("VARIABLE" = i,
                   "MEDIAN_lung_cancer_1" = median,
                   "IQR_lung_cancer_1" = iqr)
  results_data_num_lung_cancer1 = rbind(results_data_num_lung_cancer1, row)
}


rownames(results_data_num_lung_cancer1) = NULL
results_data_num_lung_cancer1 |> kable() |> kable_styling()
VARIABLE MEDIAN_lung_cancer_1 IQR_lung_cancer_1
Age 62.5 11
descriptive_table = data.frame(
  variable = c("Age"),
  median_lung_cancer_0 = results_data_num_lung_cancer0$MEDIAN_lung_cancer_0,
  IQR_lung_cancer_0 = results_data_num_lung_cancer0$IQR_lung_cancer_0,
  median_lung_cancer_1 = results_data_num_lung_cancer1$MEDIAN_lung_cancer_1,
  IQR_lung_cancer_1 = results_data_num_lung_cancer1$IQR_lung_cancer_1,
  p_value = m$p.valor)

descriptive_table
  variable median_lung_cancer_0 IQR_lung_cancer_0 median_lung_cancer_1
1      Age                   61               8.5                 62.5
  IQR_lung_cancer_1 p_value
1                11  0.3072
names(descriptive_table) = c("Variable", "Median", "IQR", "Median", "IQR", "p-value")

descriptive_table
  Variable Median IQR Median IQR p-value
1      Age     61 8.5   62.5  11  0.3072
kbl(descriptive_table,
    caption = "Numerical Variables") %>%
  kable_paper("striped", full_width = F) %>%
  add_header_above(c(" " = 1, "Negative" = 2, "Positive" = 2, " " = 1), #de esta manera me coge el primer cuadrado vacío, los dos siguientes negativo, los dos siguientes positivos y el final vacío
                   italic = T,
                   align = "c",
                   color = "white",
                   background = "lightsalmon") %>%
  column_spec(6, background = ifelse(m$p.valor >(0.05), "#ff0000", "#C0FF3E")) %>%
  footnote(general = "p-value < 0.05 significant")
Numerical Variables
Negative
Positive
Variable Median IQR Median IQR p-value
Age 61 8.5 62.5 11 0.3072
Note:
p-value < 0.05 significant

Variables Categóricas

m = matrix(nrow = 14, #hay 14 variables
           ncol = 2, #lo quiero en dos columnas
           dimnames = list(colnames(data[,c(1,3:15)]),
                           c("p_valor", "Coeficiente_V_de_Cramer")
                           ))

#m



for (i in c(1,3:15)) {
  tabla = table(data$Diagnosis, data[[i]]
                )
  test = chisq.test(tabla)
  
  cramer = function(x) {
    unname(sqrt(chisq.test(x)$statistic / (sum(x)*(min(dim(x))-1))))
  }
  
  m[colnames(data)[i], ] = c(round(test$p.value, 4),
                             round(cramer(tabla), 4)
                             )
}



m = data.frame(m)
str(data)
tibble [309 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender               : Factor w/ 2 levels "F","M": 2 2 1 2 1 1 2 1 1 2 ...
 $ Age                  : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
 $ Smoking              : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 2 2 2 ...
 $ Yellow_fingers       : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 1 2 1 2 ...
 $ Anxiety              : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 2 2 ...
 $ Peer_pressure        : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 2 1 2 ...
 $ Chronic_disease      : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 2 ...
 $ Fatigue              : Factor w/ 2 levels "0","1": 2 2 2 1 1 2 2 2 2 1 ...
 $ Allergy              : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 2 ...
 $ Wheezing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Alcohol_consuming    : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 1 2 ...
 $ Coughing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Shortness_of_breath  : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
 $ Swallowing_difficulty: Factor w/ 2 levels "0","1": 2 2 1 2 1 1 1 2 1 2 ...
 $ Chest_pain           : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 1 2 ...
 $ Diagnosis            : Factor w/ 2 levels "Positive","Negative": 1 1 2 2 2 1 1 1 2 1 ...
# https://www.danieldsjoberg.com/gtsummary/articles/tbl_summary.html

library(gtsummary)


data %>%
  tbl_summary(by = Diagnosis,
              statistic = list(
                Age ~ "{mean}({sd})",
                all_categorical() ~"{n} / {N} ({p}%)"),
              missing = "no") %>% # no lista elementos faltantes separados
  add_overall() %>%
  add_n() %>%
  add_p(pvalue_fun = ~ style_pvalue(.x, digits = 3)) %>%
  modify_header(label = "**Variables**") %>%
  bold_labels() %>%
  italicize_levels() %>%
  modify_caption("**Table 1. Summary table of patients with Positive or Negative diagnosis**")
Table 1. Summary table of patients with Positive or Negative diagnosis
Variables N Overall, N = 3091 Positive, N = 2701 Negative, N = 391 p-value2
Gender 309


0.237
    F
147 / 309 (48%) 125 / 270 (46%) 22 / 39 (56%)
    M
162 / 309 (52%) 145 / 270 (54%) 17 / 39 (44%)
Age 309 63(8) 63(8) 61(10) 0.182
Smoking 309


0.306
    0
135 / 309 (44%) 115 / 270 (43%) 20 / 39 (51%)
    1
174 / 309 (56%) 155 / 270 (57%) 19 / 39 (49%)
Yellow_fingers 309


0.001
    0
133 / 309 (43%) 107 / 270 (40%) 26 / 39 (67%)
    1
176 / 309 (57%) 163 / 270 (60%) 13 / 39 (33%)
Anxiety 309


0.011
    0
155 / 309 (50%) 128 / 270 (47%) 27 / 39 (69%)
    1
154 / 309 (50%) 142 / 270 (53%) 12 / 39 (31%)
Peer_pressure 309


0.001
    0
154 / 309 (50%) 125 / 270 (46%) 29 / 39 (74%)
    1
155 / 309 (50%) 145 / 270 (54%) 10 / 39 (26%)
Chronic_disease 309


0.051
    0
153 / 309 (50%) 128 / 270 (47%) 25 / 39 (64%)
    1
156 / 309 (50%) 142 / 270 (53%) 14 / 39 (36%)
Fatigue 309


0.008
    0
101 / 309 (33%) 81 / 270 (30%) 20 / 39 (51%)
    1
208 / 309 (67%) 189 / 270 (70%) 19 / 39 (49%)
Allergy 309


<0.001
    0
137 / 309 (44%) 103 / 270 (38%) 34 / 39 (87%)
    1
172 / 309 (56%) 167 / 270 (62%) 5 / 39 (13%)
Wheezing 309


<0.001
    0
137 / 309 (44%) 107 / 270 (40%) 30 / 39 (77%)
    1
172 / 309 (56%) 163 / 270 (60%) 9 / 39 (23%)
Alcohol_consuming 309


<0.001
    0
137 / 309 (44%) 105 / 270 (39%) 32 / 39 (82%)
    1
172 / 309 (56%) 165 / 270 (61%) 7 / 39 (18%)
Coughing 309


<0.001
    0
130 / 309 (42%) 101 / 270 (37%) 29 / 39 (74%)
    1
179 / 309 (58%) 169 / 270 (63%) 10 / 39 (26%)
Shortness_of_breath 309


0.286
    0
111 / 309 (36%) 94 / 270 (35%) 17 / 39 (44%)
    1
198 / 309 (64%) 176 / 270 (65%) 22 / 39 (56%)
Swallowing_difficulty 309


<0.001
    0
164 / 309 (53%) 130 / 270 (48%) 34 / 39 (87%)
    1
145 / 309 (47%) 140 / 270 (52%) 5 / 39 (13%)
Chest_pain 309


<0.001
    0
137 / 309 (44%) 110 / 270 (41%) 27 / 39 (69%)
    1
172 / 309 (56%) 160 / 270 (59%) 12 / 39 (31%)
1 n / N (%); Mean(SD)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test

Modelos

Random forest

library(tidymodels)
library(caret)
library(ROCR)


df_results_rf = NULL

for (i in (1:10)) {
  set.seed(i) #distintas semillas diferentes datos para poder comparar
  train_row_numbers = createDataPartition(data$Diagnosis, p = 0.8, list = F)
  d_train = data[train_row_numbers, ]
  d_test = data[-train_row_numbers, ]
  
  transformer = recipe(formula = Diagnosis ~ .,
                       data = d_train) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_center(where(is.numeric)) %>%
    step_scale(where(is.numeric))
  
  data_train = transformer %>%
    prep(d_train) %>%
    bake(new_data = NULL)
  
  data_test = transformer %>%
    prep(d_test) %>%
    bake(new_data = d_test)
  
  ctrl = trainControl(method = "cv",
                      number = 10,
                      returnResamp = "final",
                      verboseIter = F,
                      summaryFunction = twoClassSummary,
                      classProbs = T,
                      savePredictions = T,
                      allowParallel = T,
                      sampling = "up")
  
  
  tuneGrid = expand.grid(mtry = 1:15) #15 predictores totales
  
  set.seed(i)
  rf_fit = train(Diagnosis ~ .,
                 data = data_train,
                 method = "rf",
                 metric = "ROC", #coge el máximo modelo de sensi y especi
                 trControl = ctrl,
                 tuneGrid = tuneGrid,
                 )
  
  
  probs = seq(0.1, 0.9, by = 0.1)
  set.seed(i)
  ths_rf_fit = thresholder(rf_fit,
                           threshold = probs,
                           final = T,
                           statistics = "all")
  
  
  ths_rf_fit %>%
    mutate(prob = probs) %>%
    filter(J == max(J)) %>%
    pull(prob) -> thresh_prob_rf_fit
  ths_rf_fit %>%
    mutate(prob = probs) %>%
    filter(J == max(J)) %>%
    pull(J) -> max_J_train
  preds = as.factor(ifelse(predict(rf_fit, data_test, type = "prob")[, "Positive"] >= thresh_prob_rf_fit, "Positive", "Negative"))
  real = factor(data_test$Diagnosis)
  cm = ConfusionTableR::binary_class_cm(preds,
                                        real,
                                        mode = 'everything',
                                        positive = 'Positive')
  
  
  sensitivity = cm$confusion_matrix$byClass[1]
  specificity = cm$confusion_matrix$byClass[2]
  df = data.frame(preds = preds, real = real)
  df$preds = as.numeric(ifelse(df$preds == "Positive", 1, 0))
  df$real = as.numeric(ifelse(df$real == "Positive", 1, 0))
  prediction = prediction(df$preds, df$real)
  AUC = as.numeric(performance(prediction, "auc") @y.values)
  row = data.frame(model = "Random forest",
                   seed = i,
                   probab = thresh_prob_rf_fit,
                   max_J_train = max_J_train,
                   sensitivity = sensitivity,
                   specificity = specificity,
                   AUC = AUC)
  df_results_rf = rbind(df_results_rf, row)
  
}


df_results_rf %>% kable() %>% kable_styling()
model seed probab max_J_train sensitivity specificity AUC
Sensitivity Random forest 1 0.6 0.7586580 0.9259259 0.8571429 0.8915344
Sensitivity1 Random forest 2 0.6 0.7786797 0.7777778 1.0000000 0.8888889
Sensitivity2 Random forest 3 0.6 0.8256494 0.7037037 0.8571429 0.7804233
Sensitivity3 Random forest 4 0.8 0.7993506 0.7777778 1.0000000 0.8888889
Sensitivity4 Random forest 5 0.6 0.8037879 0.7777778 0.8571429 0.8174603
Sensitivity5 Random forest 6 0.6 0.7784632 0.8333333 0.7142857 0.7738095
Sensitivity6 Random forest 7 0.6 0.8240260 0.8703704 0.7142857 0.7923280
Sensitivity7 Random forest 8 0.6 0.7734848 0.7962963 1.0000000 0.8981481
Sensitivity8 Random forest 9 0.6 0.7935065 0.8703704 0.8571429 0.8637566
Sensitivity9 Random forest 10 0.6 0.7212121 0.8148148 0.8571429 0.8359788
write.xlsx(df_results_rf, "lung_rf.xlsx")

Métrica

df_results = rbind(df_results_rf)
write.csv(df_results, "resultados_modelos_lung_cancer.csv")
data = read.csv("resultados_modelos_lung_cancer.csv")

str(data)
'data.frame':   10 obs. of  8 variables:
 $ X          : chr  "Sensitivity" "Sensitivity1" "Sensitivity2" "Sensitivity3" ...
 $ model      : chr  "Random forest" "Random forest" "Random forest" "Random forest" ...
 $ seed       : int  1 2 3 4 5 6 7 8 9 10
 $ probab     : num  0.6 0.6 0.6 0.8 0.6 0.6 0.6 0.6 0.6 0.6
 $ max_J_train: num  0.759 0.779 0.826 0.799 0.804 ...
 $ sensitivity: num  0.926 0.778 0.704 0.778 0.778 ...
 $ specificity: num  0.857 1 0.857 1 0.857 ...
 $ AUC        : num  0.892 0.889 0.78 0.889 0.817 ...
data["X"] = NULL
data["seed"] = NULL
data["max_J_train"] = NULL
data["probab"] = NULL


data$model = as.factor(as.character(data$model))

formato = c("striped", "bordered", "hover", "responsive")

data %>% kable() %>% kable_styling(bootstrap_options = formato,
                                   full_width = F,
                                   position = "center",
                                   font_size = 12) %>%
  row_spec(0, bold = T, color = "black", align = "center") %>%
  row_spec(1:nrow(data), bold = T, color = "grey", align = "center")
model sensitivity specificity AUC
Random forest 0.9259259 0.8571429 0.8915344
Random forest 0.7777778 1.0000000 0.8888889
Random forest 0.7037037 0.8571429 0.7804233
Random forest 0.7777778 1.0000000 0.8888889
Random forest 0.7777778 0.8571429 0.8174603
Random forest 0.8333333 0.7142857 0.7738095
Random forest 0.8703704 0.7142857 0.7923280
Random forest 0.7962963 1.0000000 0.8981481
Random forest 0.8703704 0.8571429 0.8637566
Random forest 0.8148148 0.8571429 0.8359788

Sensibilidad

gsen = 
  ggplot(data, aes(x = model, y = data$sensitivity, fill = model))+
  geom_boxplot()+
  scale_fill_brewer(palette = 10)+
  theme_classic()

ggplotly(gsen)
sensibilidad = tapply(data$sensitivity, data$model, median)

sensibilidad = data.frame(sensibilidad)

names(sensibilidad) = c("Sensibilidad")

Especificidad

gspe =
  ggplot(data, aes(x = model, y = data$specificity, fill = model)) +
  geom_boxplot() +
  scale_fill_brewer(palette = 10) +
  theme_classic()

ggplotly(gspe)
especificidad = tapply(data$specificity, data$model, median)

especificidad = data.frame(especificidad)

names(especificidad) = c("Especificidad")

AUC

gauc = 
  ggplot(data, aes(x = model, y = data$AUC, fill = model)) +
  geom_boxplot() +
  scale_fill_brewer(palette = 10) +
  theme_classic()

ggplotly(gauc)
AUC = tapply(data$AUC, data$model, median)

AUC = data.frame(AUC)

names(AUC) = c("AUC")

Resultado final

final = cbind(sensibilidad, especificidad, AUC)

formato = c("striped", "bordered", "hover", "responsive")

final %>% kable() %>% kable_styling(bootstrap_options = formato,
                                   full_width = F,
                                   position = "center",
                                   font_size = 12)
Sensibilidad Especificidad AUC
Random forest 0.8055556 0.8571429 0.8498677

Comparación métricas

data_r = melt(data, id.vars = "model", measure.vars = 2:4)

ggplot(data_r, aes(x = model, y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable, nrow = 3, ncol = 1, scales = "free")

m = matrix(nrow = 3,
           ncol = 1,
           dimnames = list(colnames(data)[-1], #quitamos model
                                    c("p-valor")))

m
            p-valor
sensitivity      NA
specificity      NA
AUC              NA
for (i in 2:4) {
  f = formula(paste(colnames(data)[i], "~ model"))
  test = med1way(f,
                 data = data) #comparación por medianas
  m[colnames(data)[i], ] = c(round(test$p.value, 4))
}

formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                   full_width = F,
                                   position = "center",
                                   font_size = 12)
p-valor
sensitivity NA
specificity 0.102
AUC NA

Explicación

  • Sensibilidad: Proporción de individuos enfermos que poseen un diagnóstico (+)

Sensibilidad = VP/(VP+FN)

Menos FN —–> Más Sensibilidad (Más importante para el estudio)

FN = Diagnóstico (-), pero realmente Enfermo

*Especificidad = VN/(VN+FP)

Menos FP —–> Más Especificidad

El análisis basado en las curvas ROC permite:

  1. Determinar el punto de corte en el que se alcanza la sensibilidad y la especificidad más alta

  2. Evaluar la capacidad de diferenciar individuos sanos frente enfermos

Los ejes de la curva ROC tienen valores entre 0 y 1 (0% y 100%), lo que delimita un cuadrado de área 1. A medida que el AUC se acerca a 1 mayor será la capacidad de diferenciar individuos sanos frente enfermos.

El punto de corte que determina la Sensibilidad y Especificidad más alta, es aquel que presenta el mayor índice de Youden, calculado según la fórmula: Sensibilidad + (1- Especificidad).

Gráficamente, este corresponde al punto de la curva ROC más cercano al ángulo superior izquierdo del gráfico; es decir, más cercano al punto del gráfico cuya Sensibilidad y Especificidad es igual a 100%.

Es precisio aclarar que el Índice de Youden identifica el punto de corte que determina la Sensibilidad y Especificidad más alta conjuntamnete, es decir, un mismo punto de corte no necesariamente determinada la Sensibilidad ni la Especificidad más alta, ya que generalmente, la Sensibilidad más alta está determinada por un punto de corte, mientras que la Especificidad más alta está determinada por otro.

*El eje Y del gráfico de la curva ROC corresponde a la Sensibilidad: % de VP sobre el total de Enfermos

*El eje X del gráfico de la curva ROC corresponde a (1-Especificidad): % FP sobre el total de Sanos

m = matrix(nrow = 2,
           ncol = 2,
           byrow = T,
           dimnames = list(c("Diagnóstico (+)", "Diagnóstico (-)"),
                           c("Enfermo", "No enfermo")
                           ))

df = as.data.frame(m)

#Fila 1
df [1,1] = "Verdaderos Positivos (VP)"
df [1,2] = "Falsos Positivos (FP)"

#Fila 2
df[2,1] = "Falsos Negativos (FN)"
df[2,2] = "Verdaderos Negativos (FN)"

df %>% kable() %>%
  kable_styling("striped",
                full_width = F,
                position = "center",
                font_size = 16) %>%
  row_spec(0,monospace = T ,bold = T, color = "black") %>%
  row_spec(1:2, color = "White", background = "lightgrey") %>%
  column_spec(1:1, bold = T, color = "orange", background = "white")
Enfermo No enfermo
Diagnóstico (+) Verdaderos Positivos (VP) Falsos Positivos (FP)
Diagnóstico (-) Falsos Negativos (FN) Verdaderos Negativos (FN)