Código

Libraries

library(caret)
library(ConfusionTableR)
library(DataExplorer)
library(dplyr)
library(ggplot2)
library(ggthemes)
library(kableExtra)
library(ModelMetrics)
library(openxlsx)
library(plotly)
library(probably)
library(pROC)
library(psych)
library(purrr)
library(randomForest)
library(reshape2)
library(skimr)
library(stringr)
library(tidymodels)
library(tidyverse)
library(univariateML)
library(vip)
library(xgboost)
library(WRS2)

Dataset

Descripción

Attribute information:

Gender: M(male), F(female) Age: Age of the patient Smoking: YES=2 , NO=1 Yellow fingers: YES=2 , NO=1 Anxiety: YES=2 , NO=1 Peer_pressure: YES=2 , NO=1 Chronic Disease: YES=2 , NO=1 Fatigue: YES=2 , NO=1 Allergy: YES=2 , NO=1 Wheezing: YES=2 , NO=1 Alcohol: YES=2 , NO=1 Coughing: YES=2 , NO=1 Shortness of Breath: YES=2 , NO=1 Swallowing Difficulty: YES=2 , NO=1 Chest pain: YES=2 , NO=1 Lung Cancer: YES , NO

Preprocesado

data = readr::read_csv("lung_cancer.csv")


names(data) = c("Gender", "Age", "Smoking", "Yellow_fingers", "Anxiety", "Peer_pressure", "Chronic_disease", "Fatigue", "Allergy", "Wheezing", "Alcohol_consuming", "Coughing", "Shortness_of_breath", "Swallowing_difficulty", "Chest_pain", "LUNG_CANCER")

twoTOone = function(x){
  return(ifelse(x == 2, 1, 0))
}


data[ ,3:15] = twoTOone(data[ ,3:15])

data = data %>%
  mutate(Diagnosis = ifelse(str_detect(LUNG_CANCER,"NO"), "Negative", "Positive")) %>%
  mutate(Diagnosis = factor(Diagnosis, levels = c("Positive", "Negative"), labels = c("Positive", "Negative"))) %>%
  relocate(Diagnosis, .before = LUNG_CANCER) %>%
  select(-LUNG_CANCER) #Sustituído por Diagnosis


DT::datatable(data,
              rownames = F,
              options = list(pageLength = 10, scrollX = T),
              class = "white-space: nowrap")

dim(data)

[1] 309  16

cols = c(1, 3:15)

data[ ,cols] = lapply(data[ ,cols], as.factor)


str(data)

tibble [309 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender               : Factor w/ 2 levels "F","M": 2 2 1 2 1 1 2 1 1 2 ...
 $ Age                  : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
 $ Smoking              : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 2 2 2 ...
 $ Yellow_fingers       : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 1 2 1 2 ...
 $ Anxiety              : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 2 2 ...
 $ Peer_pressure        : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 2 1 2 ...
 $ Chronic_disease      : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 2 ...
 $ Fatigue              : Factor w/ 2 levels "0","1": 2 2 2 1 1 2 2 2 2 1 ...
 $ Allergy              : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 2 ...
 $ Wheezing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Alcohol_consuming    : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 1 2 ...
 $ Coughing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Shortness_of_breath  : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
 $ Swallowing_difficulty: Factor w/ 2 levels "0","1": 2 2 1 2 1 1 1 2 1 2 ...
 $ Chest_pain           : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 1 2 ...
 $ Diagnosis            : Factor w/ 2 levels "Positive","Negative": 1 1 2 2 2 1 1 1 2 1 ...

skim(data) %>% yank("factor")

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
Gender	1	FALSE	2	M: 162, F: 147
Smoking	1	FALSE	2	1: 174, 0: 135
Yellow_fingers	1	FALSE	2	1: 176, 0: 133
Anxiety	1	FALSE	2	0: 155, 1: 154
Peer_pressure	1	FALSE	2	1: 155, 0: 154
Chronic_disease	1	FALSE	2	1: 156, 0: 153
Fatigue	1	FALSE	2	1: 208, 0: 101
Allergy	1	FALSE	2	1: 172, 0: 137
Wheezing	1	FALSE	2	1: 172, 0: 137
Alcohol_consuming	1	FALSE	2	1: 172, 0: 137
Coughing	1	FALSE	2	1: 179, 0: 130
Shortness_of_breath	1	FALSE	2	1: 198, 0: 111
Swallowing_difficulty	1	FALSE	2	0: 164, 1: 145
Chest_pain	1	FALSE	2	1: 172, 0: 137
Diagnosis	1	FALSE	2	Pos: 270, Neg: 39

skim(data) %>% yank("numeric")

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	0	1	62.67	8.21	21	57	62	69	87	▁▁▆▇▂

na_values = function(x){sum(is.na(x))}

apply(data, 2, na_values)

               Gender                   Age               Smoking 
                    0                     0                     0 
       Yellow_fingers               Anxiety         Peer_pressure 
                    0                     0                     0 
      Chronic_disease               Fatigue               Allergy 
                    0                     0                     0 
             Wheezing     Alcohol_consuming              Coughing 
                    0                     0                     0 
  Shortness_of_breath Swallowing_difficulty            Chest_pain 
                    0                     0                     0 
            Diagnosis 
                    0

Gráficos

Variables Categóricas

# Lista de variables
variables <- c("Gender", "Smoking", "Yellow_fingers", "Anxiety", "Peer_pressure", "Chronic_disease", "Fatigue", "Allergy", "Wheezing", "Alcohol_consuming", "Coughing", "Shortness_of_breath", "Swallowing_difficulty", "Chest_pain")

# Bucle for para generar y visualizar gráficos
for (variable in variables) {
  p <- ggplot(data, aes_string(x = variable)) +
    geom_bar(aes(fill = Diagnosis), show.legend = FALSE) +
    facet_wrap(~Diagnosis) +
    scale_fill_brewer(palette = 10) +
    theme_classic()
  p_plotly <- ggplotly(p)
  
  # Puedes imprimir o visualizar los gráficos según tus necesidades
  print(p_plotly)
}

Variables Numéricas

p8= ggplot(data, aes(Diagnosis, Age)) +
  geom_boxplot(aes(fill = Diagnosis)) +
  scale_fill_brewer(palette = 10) +
  theme_classic()
ggplotly(p8)

data_r = melt(data, id.vars = "Diagnosis", measure.vars = c(2)) #solo hay 1 variable numérica


ggplot(data_r, aes(x = Diagnosis, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, nrow = 1, ncol = 1, scales = "free")

Comparaciones

Variables Categóricas

m = matrix(nrow = 14, #hay 14 variables
           ncol = 2, #lo quiero en dos columnas
           dimnames = list(colnames(data[,c(1,3:15)]),
                           c("p_valor", "Coeficiente_V_de_Cramer")
                           ))

#m


for (i in c(1,3:15)) {
  tabla = table(data$Diagnosis, data[[i]]
                )
  test = chisq.test(tabla)
  
  cramer = function(x) {
    unname(sqrt(chisq.test(x)$statistic / (sum(x)*(min(dim(x))-1))))
  }
  
  m[colnames(data)[i], ] = c(round(test$p.value, 4),
                             round(cramer(tabla), 4)
                             )
}



m = data.frame(m)

formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                full_width = F,
                                position = "center",
                                font_size = 12) %>%
  column_spec(2, background = ifelse(m$p_valor > (0.05), "#ff0000", "#C0FF3E"))

	p_valor	Coeficiente_V_de_Cramer
Gender	0.3122	0.0575
Smoking	0.3953	0.0484
Yellow_fingers	0.0026	0.1715
Anxiety	0.0175	0.1352
Peer_pressure	0.0019	0.1766
Chronic_disease	0.0754	0.1011
Fatigue	0.0137	0.1403
Allergy	0.0000	0.3180
Wheezing	0.0000	0.2395
Alcohol_consuming	0.0000	0.2787
Coughing	0.0000	0.2387
Shortness_of_breath	0.3739	0.0506
Swallowing_difficulty	0.0000	0.2500
Chest_pain	0.0015	0.1806

# Asociación: pequeña: <0.01, mediana: 0.1-0.3 y grande: <0.3

library(corrplot)
library(vcd)

df = data[ ,c(1,3:16)]

#Partimos de una matriz vacía con las dimensiones apropiadas
empty_m = matrix(ncol = length(df),
                 nrow = length(df),
                 dimnames = list(names(df),
                                 names(df)))

#Calculamos el estadístico y vamos rellenando la matriz
calculate_cramer = function(m, df) {
  for (r in seq(nrow(m))) {
    for (c in seq(ncol(m))) {
      m[[r,c]] = assocstats(table(df[[r]], df[[c]]))$cramer
    }
  }
  return(m)
}


cor_matrix = calculate_cramer(empty_m, df)

#Ya podemos graficarlo
corrplot(cor_matrix, method = "circle", is.corr = F, type = "upper", diag = F, cl.lim = c(0,1))

remove(df)

Variables Numéricas

No paramétrica

m = matrix(nrow = 1,
           ncol = 1,
           dimnames = list(colnames(data[ , c(2)]),
                           c("p-valor")))

#m


for (i in c(2)) {
  f = formula(paste(colnames(data[i]), "~ Diagnosis"))
  test = wilcox.test(f, data = data)
  m[colnames(data)[i], ] = c(round(test$p.value, 4))
}



formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                full_width = F,
                                position = "center",
                                font_size = 12)

	p-valor
Age	0.1823

Robusta

# La funicón pb2gen utiliza un método de remuestreo con reemplazo. Esto significa que se generan nuevas muestras de los datos originales seleeccionados aleatoriamente. Sembrando una semilla los resultados serán posteriormente reproducibles. set.seed()

m = matrix(nrow = 1,
           ncol = 1,
           dimnames = list(colnames(data[ , c(2)]),
                           c("p-valor")))

m

    p-valor
Age      NA

for (i in c(2)) {
  f = formula(paste(colnames(data[i]), "~ Diagnosis"))
  test = pb2gen(f, 
                data = data,
                est = "median")
  
  m[colnames(data)[i], ] = c(round(test$p.value, 4))
}




formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                full_width = F,
                                position = "center",
                                font_size = 12)

	p-valor
Age	0.3072

m = data.frame(m)

Tablas

Variables numéricas

# lung_cancer = 0

lung_cancer_0 = data |> filter(Diagnosis %in% c("Negative")) #"|>" es lo mismo que %>%

names_num = colnames(select_if(lung_cancer_0, is.numeric))

data_num_lung_cancer0 = lung_cancer_0 |> select(all_of(names_num))



results_data_num_lung_cancer0 = NULL

for (i in colnames(data_num_lung_cancer0)) {
  median = median(data_num_lung_cancer0[[i]], na.rm = T) #convertivos a vector con [[]]
  iqr = IQR(data_num_lung_cancer0[[i]], na.rm = T)
  row = data.frame("VARIABLE" = i,
                   "MEDIAN_lung_cancer_0" = median,
                   "IQR_lung_cancer_0" = iqr)
  results_data_num_lung_cancer0 = rbind(results_data_num_lung_cancer0, row)
}


rownames(results_data_num_lung_cancer0) = NULL
results_data_num_lung_cancer0 |> kable() |> kable_styling()

VARIABLE	MEDIAN_lung_cancer_0	IQR_lung_cancer_0
Age	61	8.5

# lung_cancer = 1

lung_cancer_1 = data |> filter(Diagnosis %in% c("Positive")) #"|>" es lo mismo que %>%

names_num = colnames(select_if(lung_cancer_1, is.numeric))

data_num_lung_cancer1 = lung_cancer_1 |> select(all_of(names_num))



results_data_num_lung_cancer1 = NULL

for (i in colnames(data_num_lung_cancer1)) {
  median = median(data_num_lung_cancer1[[i]], na.rm = T) #convertivos a vector con [[]]
  iqr = IQR(data_num_lung_cancer1[[i]], na.rm = T)
  row = data.frame("VARIABLE" = i,
                   "MEDIAN_lung_cancer_1" = median,
                   "IQR_lung_cancer_1" = iqr)
  results_data_num_lung_cancer1 = rbind(results_data_num_lung_cancer1, row)
}


rownames(results_data_num_lung_cancer1) = NULL
results_data_num_lung_cancer1 |> kable() |> kable_styling()

VARIABLE	MEDIAN_lung_cancer_1	IQR_lung_cancer_1
Age	62.5	11

descriptive_table = data.frame(
  variable = c("Age"),
  median_lung_cancer_0 = results_data_num_lung_cancer0$MEDIAN_lung_cancer_0,
  IQR_lung_cancer_0 = results_data_num_lung_cancer0$IQR_lung_cancer_0,
  median_lung_cancer_1 = results_data_num_lung_cancer1$MEDIAN_lung_cancer_1,
  IQR_lung_cancer_1 = results_data_num_lung_cancer1$IQR_lung_cancer_1,
  p_value = m$p.valor)

descriptive_table

  variable median_lung_cancer_0 IQR_lung_cancer_0 median_lung_cancer_1
1      Age                   61               8.5                 62.5
  IQR_lung_cancer_1 p_value
1                11  0.3072

names(descriptive_table) = c("Variable", "Median", "IQR", "Median", "IQR", "p-value")

descriptive_table

  Variable Median IQR Median IQR p-value
1      Age     61 8.5   62.5  11  0.3072

kbl(descriptive_table,
    caption = "Numerical Variables") %>%
  kable_paper("striped", full_width = F) %>%
  add_header_above(c(" " = 1, "Negative" = 2, "Positive" = 2, " " = 1), #de esta manera me coge el primer cuadrado vacío, los dos siguientes negativo, los dos siguientes positivos y el final vacío
                   italic = T,
                   align = "c",
                   color = "white",
                   background = "lightsalmon") %>%
  column_spec(6, background = ifelse(m$p.valor >(0.05), "#ff0000", "#C0FF3E")) %>%
  footnote(general = "p-value < 0.05 significant")

Numerical Variables
	Negative		Positive
Variable	Median	IQR	Median	IQR	p-value
Age	61	8.5	62.5	11	0.3072
Note:
p-value < 0.05 significant

Variables Categóricas

m = matrix(nrow = 14, #hay 14 variables
           ncol = 2, #lo quiero en dos columnas
           dimnames = list(colnames(data[,c(1,3:15)]),
                           c("p_valor", "Coeficiente_V_de_Cramer")
                           ))

#m



for (i in c(1,3:15)) {
  tabla = table(data$Diagnosis, data[[i]]
                )
  test = chisq.test(tabla)
  
  cramer = function(x) {
    unname(sqrt(chisq.test(x)$statistic / (sum(x)*(min(dim(x))-1))))
  }
  
  m[colnames(data)[i], ] = c(round(test$p.value, 4),
                             round(cramer(tabla), 4)
                             )
}



m = data.frame(m)

str(data)

tibble [309 × 16] (S3: tbl_df/tbl/data.frame)
 $ Gender               : Factor w/ 2 levels "F","M": 2 2 1 2 1 1 2 1 1 2 ...
 $ Age                  : num [1:309] 69 74 59 63 63 75 52 51 68 53 ...
 $ Smoking              : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 2 2 2 2 ...
 $ Yellow_fingers       : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 1 2 1 2 ...
 $ Anxiety              : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 2 2 ...
 $ Peer_pressure        : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 2 1 2 ...
 $ Chronic_disease      : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 2 ...
 $ Fatigue              : Factor w/ 2 levels "0","1": 2 2 2 1 1 2 2 2 2 1 ...
 $ Allergy              : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 2 ...
 $ Wheezing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Alcohol_consuming    : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 1 2 ...
 $ Coughing             : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 2 1 1 1 ...
 $ Shortness_of_breath  : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 1 ...
 $ Swallowing_difficulty: Factor w/ 2 levels "0","1": 2 2 1 2 1 1 1 2 1 2 ...
 $ Chest_pain           : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 1 2 ...
 $ Diagnosis            : Factor w/ 2 levels "Positive","Negative": 1 1 2 2 2 1 1 1 2 1 ...

# https://www.danieldsjoberg.com/gtsummary/articles/tbl_summary.html

library(gtsummary)


data %>%
  tbl_summary(by = Diagnosis,
              statistic = list(
                Age ~ "{mean}({sd})",
                all_categorical() ~"{n} / {N} ({p}%)"),
              missing = "no") %>% # no lista elementos faltantes separados
  add_overall() %>%
  add_n() %>%
  add_p(pvalue_fun = ~ style_pvalue(.x, digits = 3)) %>%
  modify_header(label = "**Variables**") %>%
  bold_labels() %>%
  italicize_levels() %>%
  modify_caption("**Table 1. Summary table of patients with Positive or Negative diagnosis**")

**Table 1. Summary table of patients with Positive or Negative diagnosis**
Variables	N	Overall, N = 309¹	Positive, N = 270¹	Negative, N = 39¹	p-value²
Gender	309				0.237
F		147 / 309 (48%)	125 / 270 (46%)	22 / 39 (56%)
M		162 / 309 (52%)	145 / 270 (54%)	17 / 39 (44%)
Age	309	63(8)	63(8)	61(10)	0.182
Smoking	309				0.306
0		135 / 309 (44%)	115 / 270 (43%)	20 / 39 (51%)
1		174 / 309 (56%)	155 / 270 (57%)	19 / 39 (49%)
Yellow_fingers	309				0.001
0		133 / 309 (43%)	107 / 270 (40%)	26 / 39 (67%)
1		176 / 309 (57%)	163 / 270 (60%)	13 / 39 (33%)
Anxiety	309				0.011
0		155 / 309 (50%)	128 / 270 (47%)	27 / 39 (69%)
1		154 / 309 (50%)	142 / 270 (53%)	12 / 39 (31%)
Peer_pressure	309				0.001
0		154 / 309 (50%)	125 / 270 (46%)	29 / 39 (74%)
1		155 / 309 (50%)	145 / 270 (54%)	10 / 39 (26%)
Chronic_disease	309				0.051
0		153 / 309 (50%)	128 / 270 (47%)	25 / 39 (64%)
1		156 / 309 (50%)	142 / 270 (53%)	14 / 39 (36%)
Fatigue	309				0.008
0		101 / 309 (33%)	81 / 270 (30%)	20 / 39 (51%)
1		208 / 309 (67%)	189 / 270 (70%)	19 / 39 (49%)
Allergy	309				<0.001
0		137 / 309 (44%)	103 / 270 (38%)	34 / 39 (87%)
1		172 / 309 (56%)	167 / 270 (62%)	5 / 39 (13%)
Wheezing	309				<0.001
0		137 / 309 (44%)	107 / 270 (40%)	30 / 39 (77%)
1		172 / 309 (56%)	163 / 270 (60%)	9 / 39 (23%)
Alcohol_consuming	309				<0.001
0		137 / 309 (44%)	105 / 270 (39%)	32 / 39 (82%)
1		172 / 309 (56%)	165 / 270 (61%)	7 / 39 (18%)
Coughing	309				<0.001
0		130 / 309 (42%)	101 / 270 (37%)	29 / 39 (74%)
1		179 / 309 (58%)	169 / 270 (63%)	10 / 39 (26%)
Shortness_of_breath	309				0.286
0		111 / 309 (36%)	94 / 270 (35%)	17 / 39 (44%)
1		198 / 309 (64%)	176 / 270 (65%)	22 / 39 (56%)
Swallowing_difficulty	309				<0.001
0		164 / 309 (53%)	130 / 270 (48%)	34 / 39 (87%)
1		145 / 309 (47%)	140 / 270 (52%)	5 / 39 (13%)
Chest_pain	309				<0.001
0		137 / 309 (44%)	110 / 270 (41%)	27 / 39 (69%)
1		172 / 309 (56%)	160 / 270 (59%)	12 / 39 (31%)
¹ n / N (%); Mean(SD)
² Pearson’s Chi-squared test; Wilcoxon rank sum test

Modelos

Random forest

library(tidymodels)
library(caret)
library(ROCR)


df_results_rf = NULL

for (i in (1:10)) {
  set.seed(i) #distintas semillas diferentes datos para poder comparar
  train_row_numbers = createDataPartition(data$Diagnosis, p = 0.8, list = F)
  d_train = data[train_row_numbers, ]
  d_test = data[-train_row_numbers, ]
  
  transformer = recipe(formula = Diagnosis ~ .,
                       data = d_train) %>%
    step_dummy(all_nominal_predictors()) %>%
    step_center(where(is.numeric)) %>%
    step_scale(where(is.numeric))
  
  data_train = transformer %>%
    prep(d_train) %>%
    bake(new_data = NULL)
  
  data_test = transformer %>%
    prep(d_test) %>%
    bake(new_data = d_test)
  
  ctrl = trainControl(method = "cv",
                      number = 10,
                      returnResamp = "final",
                      verboseIter = F,
                      summaryFunction = twoClassSummary,
                      classProbs = T,
                      savePredictions = T,
                      allowParallel = T,
                      sampling = "up")
  
  
  tuneGrid = expand.grid(mtry = 1:15) #15 predictores totales
  
  set.seed(i)
  rf_fit = train(Diagnosis ~ .,
                 data = data_train,
                 method = "rf",
                 metric = "ROC", #coge el máximo modelo de sensi y especi
                 trControl = ctrl,
                 tuneGrid = tuneGrid,
                 )
  
  
  probs = seq(0.1, 0.9, by = 0.1)
  set.seed(i)
  ths_rf_fit = thresholder(rf_fit,
                           threshold = probs,
                           final = T,
                           statistics = "all")
  
  
  ths_rf_fit %>%
    mutate(prob = probs) %>%
    filter(J == max(J)) %>%
    pull(prob) -> thresh_prob_rf_fit
  ths_rf_fit %>%
    mutate(prob = probs) %>%
    filter(J == max(J)) %>%
    pull(J) -> max_J_train
  preds = as.factor(ifelse(predict(rf_fit, data_test, type = "prob")[, "Positive"] >= thresh_prob_rf_fit, "Positive", "Negative"))
  real = factor(data_test$Diagnosis)
  cm = ConfusionTableR::binary_class_cm(preds,
                                        real,
                                        mode = 'everything',
                                        positive = 'Positive')
  
  
  sensitivity = cm$confusion_matrix$byClass[1]
  specificity = cm$confusion_matrix$byClass[2]
  df = data.frame(preds = preds, real = real)
  df$preds = as.numeric(ifelse(df$preds == "Positive", 1, 0))
  df$real = as.numeric(ifelse(df$real == "Positive", 1, 0))
  prediction = prediction(df$preds, df$real)
  AUC = as.numeric(performance(prediction, "auc") @y.values)
  row = data.frame(model = "Random forest",
                   seed = i,
                   probab = thresh_prob_rf_fit,
                   max_J_train = max_J_train,
                   sensitivity = sensitivity,
                   specificity = specificity,
                   AUC = AUC)
  df_results_rf = rbind(df_results_rf, row)
  
}


df_results_rf %>% kable() %>% kable_styling()

	model	seed	probab	max_J_train	sensitivity	specificity	AUC
Sensitivity	Random forest	1	0.6	0.7586580	0.9259259	0.8571429	0.8915344
Sensitivity1	Random forest	2	0.6	0.7786797	0.7777778	1.0000000	0.8888889
Sensitivity2	Random forest	3	0.6	0.8256494	0.7037037	0.8571429	0.7804233
Sensitivity3	Random forest	4	0.8	0.7993506	0.7777778	1.0000000	0.8888889
Sensitivity4	Random forest	5	0.6	0.8037879	0.7777778	0.8571429	0.8174603
Sensitivity5	Random forest	6	0.6	0.7784632	0.8333333	0.7142857	0.7738095
Sensitivity6	Random forest	7	0.6	0.8240260	0.8703704	0.7142857	0.7923280
Sensitivity7	Random forest	8	0.6	0.7734848	0.7962963	1.0000000	0.8981481
Sensitivity8	Random forest	9	0.6	0.7935065	0.8703704	0.8571429	0.8637566
Sensitivity9	Random forest	10	0.6	0.7212121	0.8148148	0.8571429	0.8359788

write.xlsx(df_results_rf, "lung_rf.xlsx")

Métrica

df_results = rbind(df_results_rf)
write.csv(df_results, "resultados_modelos_lung_cancer.csv")

data = read.csv("resultados_modelos_lung_cancer.csv")

str(data)

'data.frame':   10 obs. of  8 variables:
 $ X          : chr  "Sensitivity" "Sensitivity1" "Sensitivity2" "Sensitivity3" ...
 $ model      : chr  "Random forest" "Random forest" "Random forest" "Random forest" ...
 $ seed       : int  1 2 3 4 5 6 7 8 9 10
 $ probab     : num  0.6 0.6 0.6 0.8 0.6 0.6 0.6 0.6 0.6 0.6
 $ max_J_train: num  0.759 0.779 0.826 0.799 0.804 ...
 $ sensitivity: num  0.926 0.778 0.704 0.778 0.778 ...
 $ specificity: num  0.857 1 0.857 1 0.857 ...
 $ AUC        : num  0.892 0.889 0.78 0.889 0.817 ...

data["X"] = NULL
data["seed"] = NULL
data["max_J_train"] = NULL
data["probab"] = NULL


data$model = as.factor(as.character(data$model))

formato = c("striped", "bordered", "hover", "responsive")

data %>% kable() %>% kable_styling(bootstrap_options = formato,
                                   full_width = F,
                                   position = "center",
                                   font_size = 12) %>%
  row_spec(0, bold = T, color = "black", align = "center") %>%
  row_spec(1:nrow(data), bold = T, color = "grey", align = "center")

model	sensitivity	specificity	AUC
Random forest	0.9259259	0.8571429	0.8915344
Random forest	0.7777778	1.0000000	0.8888889
Random forest	0.7037037	0.8571429	0.7804233
Random forest	0.7777778	1.0000000	0.8888889
Random forest	0.7777778	0.8571429	0.8174603
Random forest	0.8333333	0.7142857	0.7738095
Random forest	0.8703704	0.7142857	0.7923280
Random forest	0.7962963	1.0000000	0.8981481
Random forest	0.8703704	0.8571429	0.8637566
Random forest	0.8148148	0.8571429	0.8359788

Sensibilidad

gsen = 
  ggplot(data, aes(x = model, y = data$sensitivity, fill = model))+
  geom_boxplot()+
  scale_fill_brewer(palette = 10)+
  theme_classic()

ggplotly(gsen)

sensibilidad = tapply(data$sensitivity, data$model, median)

sensibilidad = data.frame(sensibilidad)

names(sensibilidad) = c("Sensibilidad")

Especificidad

gspe =
  ggplot(data, aes(x = model, y = data$specificity, fill = model)) +
  geom_boxplot() +
  scale_fill_brewer(palette = 10) +
  theme_classic()

ggplotly(gspe)

especificidad = tapply(data$specificity, data$model, median)

especificidad = data.frame(especificidad)

names(especificidad) = c("Especificidad")

AUC

gauc = 
  ggplot(data, aes(x = model, y = data$AUC, fill = model)) +
  geom_boxplot() +
  scale_fill_brewer(palette = 10) +
  theme_classic()

ggplotly(gauc)

AUC = tapply(data$AUC, data$model, median)

AUC = data.frame(AUC)

names(AUC) = c("AUC")

Resultado final

final = cbind(sensibilidad, especificidad, AUC)

formato = c("striped", "bordered", "hover", "responsive")

final %>% kable() %>% kable_styling(bootstrap_options = formato,
                                   full_width = F,
                                   position = "center",
                                   font_size = 12)

	Sensibilidad	Especificidad	AUC
Random forest	0.8055556	0.8571429	0.8498677

Comparación métricas

data_r = melt(data, id.vars = "model", measure.vars = 2:4)

ggplot(data_r, aes(x = model, y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable, nrow = 3, ncol = 1, scales = "free")

m = matrix(nrow = 3,
           ncol = 1,
           dimnames = list(colnames(data)[-1], #quitamos model
                                    c("p-valor")))

m

            p-valor
sensitivity      NA
specificity      NA
AUC              NA

for (i in 2:4) {
  f = formula(paste(colnames(data)[i], "~ model"))
  test = med1way(f,
                 data = data) #comparación por medianas
  m[colnames(data)[i], ] = c(round(test$p.value, 4))
}

formato = c("striped", "bordered", "hover", "responsive")

m %>% kable() %>% kable_styling(bootstrap_options = formato,
                                   full_width = F,
                                   position = "center",
                                   font_size = 12)

	p-valor
sensitivity	NA
specificity	0.102
AUC	NA

Explicación

Sensibilidad: Proporción de individuos enfermos que poseen un diagnóstico (+)

Sensibilidad = VP/(VP+FN)

Menos FN —–> Más Sensibilidad (Más importante para el estudio)

FN = Diagnóstico (-), pero realmente Enfermo

*Especificidad = VN/(VN+FP)

Menos FP —–> Más Especificidad

El análisis basado en las curvas ROC permite:

Determinar el punto de corte en el que se alcanza la sensibilidad y la especificidad más alta
Evaluar la capacidad de diferenciar individuos sanos frente enfermos

Los ejes de la curva ROC tienen valores entre 0 y 1 (0% y 100%), lo que delimita un cuadrado de área 1. A medida que el AUC se acerca a 1 mayor será la capacidad de diferenciar individuos sanos frente enfermos.

El punto de corte que determina la Sensibilidad y Especificidad más alta, es aquel que presenta el mayor índice de Youden, calculado según la fórmula: Sensibilidad + (1- Especificidad).

Gráficamente, este corresponde al punto de la curva ROC más cercano al ángulo superior izquierdo del gráfico; es decir, más cercano al punto del gráfico cuya Sensibilidad y Especificidad es igual a 100%.

Es precisio aclarar que el Índice de Youden identifica el punto de corte que determina la Sensibilidad y Especificidad más alta conjuntamnete, es decir, un mismo punto de corte no necesariamente determinada la Sensibilidad ni la Especificidad más alta, ya que generalmente, la Sensibilidad más alta está determinada por un punto de corte, mientras que la Especificidad más alta está determinada por otro.

*El eje Y del gráfico de la curva ROC corresponde a la Sensibilidad: % de VP sobre el total de Enfermos

*El eje X del gráfico de la curva ROC corresponde a (1-Especificidad): % FP sobre el total de Sanos

m = matrix(nrow = 2,
           ncol = 2,
           byrow = T,
           dimnames = list(c("Diagnóstico (+)", "Diagnóstico (-)"),
                           c("Enfermo", "No enfermo")
                           ))

df = as.data.frame(m)

#Fila 1
df [1,1] = "Verdaderos Positivos (VP)"
df [1,2] = "Falsos Positivos (FP)"

#Fila 2
df[2,1] = "Falsos Negativos (FN)"
df[2,2] = "Verdaderos Negativos (FN)"

df %>% kable() %>%
  kable_styling("striped",
                full_width = F,
                position = "center",
                font_size = 16) %>%
  row_spec(0,monospace = T ,bold = T, color = "black") %>%
  row_spec(1:2, color = "White", background = "lightgrey") %>%
  column_spec(1:1, bold = T, color = "orange", background = "white")

	Enfermo	No enfermo
Diagnóstico (+)	Verdaderos Positivos (VP)	Falsos Positivos (FP)
Diagnóstico (-)	Falsos Negativos (FN)	Verdaderos Negativos (FN)

CARDONAMACHADO_ACTIVIDAD_14_15

CRISTIAN CARDONA

2024-02-06

Código

Libraries

Dataset

Descripción

Preprocesado

Gráficos

Variables Categóricas

Variables Numéricas

Comparaciones

Variables Categóricas

Variables Numéricas

No paramétrica

Robusta

Tablas

Variables numéricas

Variables Categóricas

Modelos

Random forest

Métrica

Sensibilidad

Especificidad

AUC

Resultado final

Comparación métricas

Explicación