library(egg)
## Warning: package 'egg' was built under R version 4.5.3
## Cargando paquete requerido: gridExtra
## Warning: package 'gridExtra' was built under R version 4.5.2
## Cargando paquete requerido: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
# Instalar y cargar paquetes
pacman::p_load(
readxl, # Importar archivos de Excel
dlookr, # Diagnóstico y exploración de datos
DataExplorer, # Exploración de datos
ggplot2, # Visualización
gtsummary, # Resúmenes estadísticos
skimr, # Resumen de datos
visdat, # Visualización de valores NA
corrplot, # Matriz de correlación
plotly, # Gráficos interactivos
missRanger, # Imputación de valores faltantes
flextable, # Tablas para informes
tidyverse # Colección de paquetes para ciencia de datos
)
# Set the working directory so that the relative file name used when reading
# the Excel workbook resolves inside this folder.
# NOTE(review): hard-coded, user-specific path — confirm it exists on the
# machine running the script, or consider a project-relative path instead.
setwd("~/MAESTRIA EPIDEMIOLOGIA ICESI/aseguramiento_datos/semana5-6")
2.1 Cargar Base de Datos
# Import the cardiology dataset from the Excel workbook found in the current
# working directory.
library(readxl)
cardiology_dataset <- read_excel("cardiology_dataset.xlsx")

# Open the spreadsheet-style viewer only in interactive sessions: View() needs
# a GUI and has no use when the script is rendered or run in batch mode.
if (interactive()) {
  View(cardiology_dataset)
}
Cambiar nombre de BD a data
# Alias the imported table as `data`, the name used by the rest of the analysis.
data <- cardiology_dataset
2.2 Estructura y contenido del conjunto de datos
# Preview the first ten rows of the dataset.
data %>% head(n = 10)
## # A tibble: 10 × 23
## patient_id age sex bmi systolic_bp diastolic_bp chol_total ldl hdl
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 71 Male 23.8 124 66 120 85 15
## 2 2 63 Male 30.8 117 83 188 131 34
## 3 3 73 Female 27.8 105 65 226 129 37
## 4 4 85 Female 20 119 93 229 99 52
## 5 5 62 Female 30.1 102 78 203 120 53
## 6 6 62 Female 25.1 172 92 236 166 60
## 7 7 86 Male 32.4 130 75 302 91 NA
## 8 8 75 Male 28 129 83 268 61 30
## 9 9 59 Female 30 142 82 179 134 NA
## 10 10 72 Female 23.2 149 80 142 117 NA
## # ℹ 14 more variables: triglycerides <dbl>, ejection_fraction <dbl>,
## # troponin <dbl>, creatinine <dbl>, diabetes <chr>, hypertension <chr>,
## # heart_failure <chr>, smoking_status <chr>, treat_statin <chr>,
## # treat_beta_blocker <chr>, treat_acei <chr>, length_of_stay <dbl>,
## # mortality_30d <chr>, readmission_30d <chr>
# Compact column-by-column overview: name, type and first values of each column.
dplyr::glimpse(data)
## Rows: 2,506
## Columns: 23
## $ patient_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ age <dbl> 71, 63, 73, 85, 62, 62, 86, 75, 59, 72, 59, 59, 68,…
## $ sex <chr> "Male", "Male", "Female", "Female", "Female", "Fema…
## $ bmi <dbl> 23.8, 30.8, 27.8, 20.0, 30.1, 25.1, 32.4, 28.0, 30.…
## $ systolic_bp <dbl> 124, 117, 105, 119, 102, 172, 130, 129, 142, 149, 1…
## $ diastolic_bp <dbl> 66, 83, 65, 93, 78, 92, 75, 83, 82, 80, 76, 84, 71,…
## $ chol_total <dbl> 120, 188, 226, 229, 203, 236, 302, 268, 179, 142, 1…
## $ ldl <dbl> 85, 131, 129, 99, 120, 166, 91, 61, 134, 117, 99, 1…
## $ hdl <dbl> 15, 34, 37, 52, 53, 60, NA, 30, NA, NA, 31, NA, 41,…
## $ triglycerides <dbl> 70, 125, 192, 170, 124, 150, 108, 148, 357, 258, 15…
## $ ejection_fraction <dbl> 49.9, 67.5, 74.0, 66.7, 60.6, 62.6, 73.6, 62.6, 49.…
## $ troponin <dbl> 0.045, 0.036, 0.089, 0.060, 0.060, 0.057, 0.079, 0.…
## $ creatinine <dbl> 0.96, 0.95, 0.98, NA, 0.93, NA, 0.96, 0.77, NA, NA,…
## $ diabetes <chr> "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes",…
## $ hypertension <chr> "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes",…
## $ heart_failure <chr> "No", "No", "No", "No", "No", "No", "Yes", "Yes", "…
## $ smoking_status <chr> "Never", "Former", NA, NA, "Never", "Never", "Forme…
## $ treat_statin <chr> "No", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes"…
## $ treat_beta_blocker <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Ye…
## $ treat_acei <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Y…
## $ length_of_stay <dbl> 4, 3, 6, 7, 6, 8, 5, 7, 6, 6, 7, 7, 7, 4, 6, 6, 6, …
## $ mortality_30d <chr> "No", "Yes", "No", "Yes", "No", "Yes", "Yes", "No",…
## $ readmission_30d <chr> "No", "No", "No", "Yes", "No", "Yes", "No", "No", "…
Podemos pensar en diagnosticar nuestros datos de manera similar al diagnóstico de una enfermedad: solo tratamos de descubrir qué es lo que está mal.
El diagnóstico de datos es un proceso importante para conocer la estructura y calidad de los datos antes de realizar cualquier análisis.
# General diagnosis of every variable (type, missing values, unique values),
# rendered as a flextable.
flextable(diagnose(data))
variables | types | missing_count | missing_percent | unique_count | unique_rate |
|---|---|---|---|---|---|
patient_id | numeric | 0 | 0.0000000 | 2,500 | 0.9976057462 |
age | numeric | 0 | 0.0000000 | 72 | 0.0287310455 |
sex | character | 0 | 0.0000000 | 2 | 0.0007980846 |
bmi | numeric | 10 | 0.3990423 | 261 | 0.1041500399 |
systolic_bp | numeric | 0 | 0.0000000 | 103 | 0.0411013567 |
diastolic_bp | numeric | 0 | 0.0000000 | 62 | 0.0247406225 |
chol_total | numeric | 451 | 17.9968077 | 192 | 0.0766161213 |
ldl | numeric | 10 | 0.3990423 | 157 | 0.0626496409 |
hdl | numeric | 448 | 17.8770950 | 83 | 0.0331205108 |
triglycerides | numeric | 0 | 0.0000000 | 239 | 0.0953711093 |
ejection_fraction | numeric | 0 | 0.0000000 | 422 | 0.1683958500 |
troponin | numeric | 10 | 0.3990423 | 276 | 0.1101356744 |
creatinine | numeric | 451 | 17.9968077 | 143 | 0.0570630487 |
diabetes | character | 0 | 0.0000000 | 2 | 0.0007980846 |
hypertension | character | 0 | 0.0000000 | 2 | 0.0007980846 |
heart_failure | character | 0 | 0.0000000 | 2 | 0.0007980846 |
smoking_status | character | 449 | 17.9169992 | 4 | 0.0015961692 |
treat_statin | character | 0 | 0.0000000 | 2 | 0.0007980846 |
treat_beta_blocker | character | 0 | 0.0000000 | 2 | 0.0007980846 |
treat_acei | character | 0 | 0.0000000 | 2 | 0.0007980846 |
length_of_stay | numeric | 0 | 0.0000000 | 15 | 0.0059856345 |
mortality_30d | character | 0 | 0.0000000 | 2 | 0.0007980846 |
readmission_30d | character | 0 | 0.0000000 | 2 | 0.0007980846 |
Hablando de variables categóricas: usando la función diagnose_category() podemos diagnosticar todas las variables categóricas de nuestro conjunto de datos a la vez. Dicho diagnóstico revela los nombres de las categorías, sus conteos y porcentajes e incluso el rango (posición) de cada categoría, de la más frecuente a la menos frecuente.
variables: nombres de variables
levels: nombres de niveles/categorias
N: número de observaciones
freq: número de observación en los niveles/categorias
ratio: porcentaje de observación en los niveles
rank: rango de tasa de ocupación de los niveles
# Diagnosis of the categorical variables (levels, counts, ratios and rank),
# rendered as a flextable.
flextable(diagnose_category(data))
variables | levels | N | freq | ratio | rank |
|---|---|---|---|---|---|
sex | Male | 2,506 | 1,341 | 53.511572 | 1 |
sex | Female | 2,506 | 1,165 | 46.488428 | 2 |
diabetes | No | 2,506 | 1,756 | 70.071828 | 1 |
diabetes | Yes | 2,506 | 750 | 29.928172 | 2 |
hypertension | Yes | 2,506 | 1,508 | 60.175579 | 1 |
hypertension | No | 2,506 | 998 | 39.824421 | 2 |
heart_failure | No | 2,506 | 1,868 | 74.541101 | 1 |
heart_failure | Yes | 2,506 | 638 | 25.458899 | 2 |
smoking_status | Never | 2,506 | 939 | 37.470072 | 1 |
smoking_status | Former | 2,506 | 713 | 28.451716 | 2 |
smoking_status | 2,506 | 449 | 17.916999 | 3 | |
smoking_status | Current | 2,506 | 405 | 16.161213 | 4 |
treat_statin | Yes | 2,506 | 1,770 | 70.630487 | 1 |
treat_statin | No | 2,506 | 736 | 29.369513 | 2 |
treat_beta_blocker | Yes | 2,506 | 1,467 | 58.539505 | 1 |
treat_beta_blocker | No | 2,506 | 1,039 | 41.460495 | 2 |
treat_acei | Yes | 2,506 | 1,255 | 50.079808 | 1 |
treat_acei | No | 2,506 | 1,251 | 49.920192 | 2 |
mortality_30d | No | 2,506 | 2,282 | 91.061453 | 1 |
mortality_30d | Yes | 2,506 | 224 | 8.938547 | 2 |
readmission_30d | No | 2,506 | 2,113 | 84.317638 | 1 |
readmission_30d | Yes | 2,506 | 393 | 15.682362 | 2 |
El diagnóstico de todas las variables numéricas a la vez es igualmente fácil. La función diagnose_numeric() no solo calcula las estadísticas descriptivas más comunes, como mínimo, primer cuartil, promedio, mediana, tercer cuartil y máximo, sino que también le brinda la cantidad de ceros, valores negativos e incluso la cantidad de posibles valores atípicos para cada variable numérica.
# Diagnosis of the numeric variables (descriptive statistics, zeros, negative
# values and potential outliers), rendered as a flextable.
data %>%
  dplyr::select(dplyr::all_of(c(
    "patient_id", "age", "bmi", "systolic_bp", "diastolic_bp",
    "chol_total", "ldl", "hdl", "triglycerides", "ejection_fraction",
    "troponin", "creatinine", "length_of_stay"
  ))) %>%
  diagnose_numeric() %>%
  flextable()
variables | min | Q1 | mean | median | Q3 | max | zero | minus | outlier |
|---|---|---|---|---|---|---|---|---|---|
patient_id | 1.00 | 626.25000 | 1,251.07501995 | 1,251.500 | 1,875.750 | 2,500.00 | 0 | 0 | 0 |
age | -5.00 | 57.00000 | 65.38268156 | 65.000 | 74.000 | 160.00 | 0 | 1 | 15 |
bmi | -3.00 | 24.40000 | 27.85729167 | 27.800 | 31.300 | 95.00 | 0 | 1 | 12 |
systolic_bp | -20.00 | 117.00000 | 130.40502793 | 130.000 | 144.000 | 198.00 | 0 | 1 | 10 |
diastolic_bp | 50.00 | 73.00000 | 79.76217079 | 80.000 | 86.000 | 114.00 | 0 | 0 | 29 |
chol_total | 120.00 | 173.00000 | 201.19805353 | 201.000 | 228.000 | 337.00 | 0 | 0 | 9 |
ldl | 50.00 | 100.00000 | 120.71634615 | 121.000 | 141.000 | 250.00 | 0 | 0 | 6 |
hdl | 15.00 | 40.00000 | 50.20505345 | 50.000 | 61.000 | 100.00 | 0 | 0 | 5 |
triglycerides | 50.00 | 121.00000 | 155.31843575 | 149.000 | 182.000 | 439.00 | 0 | 0 | 40 |
ejection_fraction | 18.40 | 47.82500 | 54.68599362 | 54.900 | 61.400 | 120.00 | 0 | 0 | 6 |
troponin | -0.50 | 0.02975 | 0.09874279 | 0.051 | 0.085 | 75.00 | 0 | 1 | 167 |
creatinine | 0.42 | 0.83000 | 1.02418978 | 0.990 | 1.190 | 2.31 | 0 | 0 | 24 |
length_of_stay | 1.00 | 4.00000 | 6.03711093 | 6.000 | 7.000 | 15.00 | 0 | 0 | 43 |
#3.4 Valores atípicos (outliers) #
Esto incluye la identificación de valores extremos o anómalos en el conjunto de datos que puedan afectar los resultados de los análisis.
# Outlier diagnosis for all variables except the patient identifier,
# rendered as a flextable.
flextable(diagnose_outlier(dplyr::select(data, !patient_id)))
variables | outliers_cnt | outliers_ratio | outliers_mean | with_mean | without_mean |
|---|---|---|---|---|---|
age | 15 | 0.5985634 | 35.0000000 | 65.38268156 | 65.56563629 |
bmi | 12 | 0.4788508 | 43.4250000 | 27.85729167 | 27.78208535 |
systolic_bp | 10 | 0.3990423 | 168.6000000 | 130.40502793 | 130.25200321 |
diastolic_bp | 29 | 1.1572227 | 68.9655172 | 79.76217079 | 79.88857489 |
chol_total | 9 | 0.3591381 | 324.7777778 | 201.19805353 | 200.65444770 |
ldl | 6 | 0.2394254 | 214.5000000 | 120.71634615 | 120.49036145 |
hdl | 5 | 0.1995211 | 96.0000000 | 50.20505345 | 50.09352168 |
triglycerides | 40 | 1.5961692 | 304.2000000 | 155.31843575 | 152.90348743 |
ejection_fraction | 6 | 0.2394254 | 40.0333333 | 54.68599362 | 54.72116000 |
troponin | 167 | 6.6640064 | 0.6913353 | 0.09874279 | 0.05625118 |
creatinine | 24 | 0.9577015 | 1.9158333 | 1.02418978 | 1.01365337 |
length_of_stay | 43 | 1.7158819 | 12.7209302 | 6.03711093 | 5.92042225 |
#3.4.1 Gráfico de valores atípicos #
Los gráficos de valores atípicos son herramientas útiles para identificar valores extremos o anómalos en un conjunto de datos. En Rstudio, hay varias formas de crear gráficos para identificar valores atípicos, tales como:
Gráfico de caja y bigote: Este gráfico muestra la distribución de los datos y permite identificar valores atípicos fuera de los límites de la caja que contiene la mayoría de los datos.
Histograma: Este gráfico muestra la frecuencia de ocurrencia de los valores en un conjunto de datos y permite identificar valores atípicos que están fuera de la distribución normal.
Gráfico de dispersión: Este gráfico muestra la relación entre dos variables y permite identificar valores atípicos que se alejan de la tendencia general.
# Outlier diagnostic plots for triglycerides.
plot_outlier(data, triglycerides)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the dlookr package.
## Please report the issue at <https://github.com/choonghyunryu/dlookr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
En el Boxplot con Outliers, se observan varios pacientes con
concentraciones de triglicéridos mucho más altas que el resto,
alcanzando aproximadamente 450 mg/dL. En el histograma con outliers, la
mayoría de los pacientes presenta valores entre 100 y 200 mg/dL. Existe
una cola hacia la derecha (asimetría positiva), causada por algunos
pacientes con valores muy elevados. En el boxplot sin outliers,
desaparecen los puntos extremos y la caja mantiene una posición similar.
En el histograma sin outliers, la mayor concentración de pacientes
continúa entre 100 y 200 mg/dL, y la forma general de la distribución se
conserva.
# Outlier diagnostic plots for troponin.
plot_outlier(data, troponin)
En primer lugar, debemos asegurarnos de que tenemos valores faltantes en nuestro conjunto de datos. Usando la función plot_na_pareto() del paquete {dlookr} podemos producir un gráfico de Pareto, que muestra los recuentos y las proporciones de los valores faltantes en cada variable. Si tiene muchas variables y desea mostrar solo las que tienen valores faltantes, use el argumento only_na = TRUE. El único problema con el gráfico de Pareto es que no sabemos si los valores faltantes en diferentes columnas pertenecen a la misma observación.
# Pareto chart of missing values, restricted to variables that have any NA.
plot_na_pareto(data, only_na = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the dlookr package.
## Please report the issue at <https://github.com/choonghyunryu/dlookr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Estas cuatro variables concentran prácticamente todos los datos
faltantes del conjunto de datos. Aunque el porcentaje no supera el 20 %,
es suficientemente alto como para requerir un tratamiento antes del
análisis, por ejemplo mediante imputación o evaluando si la ausencia de
datos es aleatoria.
La función plot_na_intersect() del paquete {dlookr} visualiza las combinaciones de valores faltantes en las columnas. El eje x muestra las variables con valores faltantes, mientras que los recuentos de valores faltantes se muestran en la parte superior de la gráfica como barras. El eje y representa la combinación de variables y sus frecuencias.
# Combinations of variables that are missing together.
plot_na_intersect(data)

# Cell-level map of missing values across the whole dataset.
vis_miss(data)
Este 3.2 % representa el porcentaje de celdas con datos faltantes en
toda la matriz de datos, no el porcentaje de pacientes con algún NA.
La estadística descriptiva es un conjunto de técnicas que se utilizan para resumir y describir las características de un conjunto de datos. En Rstudio, existen muchas funciones y paquetes que permiten realizar estadística descriptiva de un dataset. Algunas de las técnicas más comunes incluyen calcular la media, la mediana, la desviación estándar, el percentil y el rango. También se pueden generar gráficos de barras, histogramas, box plots, etc., para visualizar la distribución de los datos.
# Descriptive summary of every variable except the patient identifier.
# The result is still piped into skim() so the summary header keeps reporting
# the input as piped data.
dplyr::select(data, !patient_id) %>%
  skimr::skim()
| Name | Piped data |
| Number of rows | 2506 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| numeric | 12 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| sex | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| diabetes | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| hypertension | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| heart_failure | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| smoking_status | 449 | 0.82 | 5 | 7 | 0 | 3 | 0 |
| treat_statin | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_beta_blocker | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_acei | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| mortality_30d | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| readmission_30d | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1.00 | 65.38 | 12.87 | -5.00 | 57.00 | 65.00 | 74.00 | 160.00 | ▁▅▇▁▁ |
| bmi | 10 | 1.00 | 27.86 | 5.24 | -3.00 | 24.40 | 27.80 | 31.30 | 95.00 | ▁▇▁▁▁ |
| systolic_bp | 0 | 1.00 | 130.41 | 20.04 | -20.00 | 117.00 | 130.00 | 144.00 | 198.00 | ▁▁▂▇▁ |
| diastolic_bp | 0 | 1.00 | 79.76 | 9.84 | 50.00 | 73.00 | 80.00 | 86.00 | 114.00 | ▁▅▇▃▁ |
| chol_total | 451 | 0.82 | 201.20 | 39.56 | 120.00 | 173.00 | 201.00 | 228.00 | 337.00 | ▃▇▇▂▁ |
| ldl | 10 | 1.00 | 120.72 | 29.84 | 50.00 | 100.00 | 121.00 | 141.00 | 250.00 | ▂▇▆▁▁ |
| hdl | 448 | 0.82 | 50.21 | 14.88 | 15.00 | 40.00 | 50.00 | 61.00 | 100.00 | ▂▇▇▂▁ |
| triglycerides | 0 | 1.00 | 155.32 | 47.14 | 50.00 | 121.00 | 149.00 | 182.00 | 439.00 | ▅▇▂▁▁ |
| ejection_fraction | 0 | 1.00 | 54.69 | 9.75 | 18.40 | 47.82 | 54.90 | 61.40 | 120.00 | ▁▇▅▁▁ |
| troponin | 10 | 1.00 | 0.10 | 1.50 | -0.50 | 0.03 | 0.05 | 0.09 | 75.00 | ▇▁▁▁▁ |
| creatinine | 451 | 0.82 | 1.02 | 0.26 | 0.42 | 0.83 | 0.99 | 1.19 | 2.31 | ▃▇▃▁▁ |
| length_of_stay | 0 | 1.00 | 6.04 | 2.23 | 1.00 | 4.00 | 6.00 | 7.00 | 15.00 | ▂▇▅▁▁ |
En el análisis descriptivo agrupado, podemos utilizar la función skim() junto con datos agrupados creados mediante dplyr::group_by(). En este caso, se agrupa por la variable “sex” para realizar un análisis descriptivo agrupado:
# Descriptive summary grouped by sex, excluding the patient identifier.
# The grouped table is still piped into skim() so the summary header keeps
# reporting the input as piped data.
dplyr::group_by(dplyr::select(data, !patient_id), sex) %>%
  skimr::skim()
| Name | Piped data |
| Number of rows | 2506 |
| Number of columns | 22 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 12 |
| ________________________ | |
| Group variables | sex |
Variable type: character
| skim_variable | sex | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|---|
| diabetes | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| diabetes | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| hypertension | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| hypertension | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| heart_failure | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| heart_failure | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| smoking_status | Female | 211 | 0.82 | 5 | 7 | 0 | 3 | 0 |
| smoking_status | Male | 238 | 0.82 | 5 | 7 | 0 | 3 | 0 |
| treat_statin | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_statin | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_beta_blocker | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_beta_blocker | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_acei | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_acei | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| mortality_30d | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| mortality_30d | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| readmission_30d | Female | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| readmission_30d | Male | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | sex | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|---|
| age | Female | 0 | 1.00 | 65.32 | 12.61 | 23.00 | 56.00 | 65.00 | 74.00 | 95.00 | ▁▂▇▇▂ |
| age | Male | 0 | 1.00 | 65.44 | 13.10 | -5.00 | 57.00 | 65.00 | 74.00 | 160.00 | ▁▅▇▁▁ |
| bmi | Female | 6 | 0.99 | 27.83 | 5.47 | -3.00 | 24.45 | 27.80 | 31.30 | 95.00 | ▁▇▁▁▁ |
| bmi | Male | 4 | 1.00 | 27.88 | 5.03 | 15.00 | 24.40 | 27.80 | 31.30 | 45.60 | ▂▇▇▂▁ |
| systolic_bp | Female | 0 | 1.00 | 129.86 | 20.53 | -20.00 | 117.00 | 129.00 | 143.00 | 198.00 | ▁▁▂▇▁ |
| systolic_bp | Male | 0 | 1.00 | 130.88 | 19.60 | 90.00 | 117.00 | 131.00 | 145.00 | 193.00 | ▃▇▇▃▁ |
| diastolic_bp | Female | 0 | 1.00 | 80.25 | 9.92 | 50.00 | 73.00 | 80.00 | 87.00 | 114.00 | ▁▅▇▃▁ |
| diastolic_bp | Male | 0 | 1.00 | 79.34 | 9.76 | 50.00 | 73.00 | 79.00 | 86.00 | 113.00 | ▁▆▇▃▁ |
| chol_total | Female | 212 | 0.82 | 202.09 | 38.89 | 120.00 | 175.00 | 201.00 | 228.00 | 324.00 | ▃▇▇▃▁ |
| chol_total | Male | 239 | 0.82 | 200.43 | 40.14 | 120.00 | 172.00 | 200.00 | 228.00 | 337.00 | ▃▇▇▂▁ |
| ldl | Female | 6 | 0.99 | 121.31 | 29.55 | 50.00 | 100.00 | 122.00 | 141.00 | 250.00 | ▂▇▆▁▁ |
| ldl | Male | 4 | 1.00 | 120.20 | 30.10 | 50.00 | 100.00 | 120.00 | 140.00 | 214.00 | ▂▇▇▃▁ |
| hdl | Female | 200 | 0.83 | 49.71 | 15.06 | 15.00 | 39.00 | 50.00 | 60.00 | 96.00 | ▂▇▇▃▁ |
| hdl | Male | 248 | 0.82 | 50.64 | 14.72 | 15.00 | 41.00 | 51.00 | 61.00 | 100.00 | ▂▇▇▃▁ |
| triglycerides | Female | 0 | 1.00 | 155.80 | 46.72 | 50.00 | 122.00 | 148.00 | 183.00 | 439.00 | ▅▇▂▁▁ |
| triglycerides | Male | 0 | 1.00 | 154.90 | 47.53 | 57.00 | 120.00 | 149.00 | 181.00 | 437.00 | ▆▇▂▁▁ |
| ejection_fraction | Female | 0 | 1.00 | 54.81 | 9.70 | 22.00 | 48.00 | 54.70 | 61.60 | 75.00 | ▁▂▇▇▃ |
| ejection_fraction | Male | 0 | 1.00 | 54.58 | 9.79 | 18.40 | 47.70 | 54.90 | 61.40 | 120.00 | ▁▇▅▁▁ |
| troponin | Female | 4 | 1.00 | 0.13 | 2.20 | 0.00 | 0.03 | 0.05 | 0.09 | 75.00 | ▇▁▁▁▁ |
| troponin | Male | 6 | 1.00 | 0.07 | 0.06 | -0.50 | 0.03 | 0.05 | 0.09 | 0.65 | ▁▁▇▁▁ |
| creatinine | Female | 215 | 0.82 | 1.03 | 0.26 | 0.51 | 0.84 | 1.00 | 1.20 | 2.31 | ▅▇▃▁▁ |
| creatinine | Male | 236 | 0.82 | 1.02 | 0.26 | 0.42 | 0.83 | 0.99 | 1.17 | 2.19 | ▃▇▃▁▁ |
| length_of_stay | Female | 0 | 1.00 | 5.96 | 2.31 | 1.00 | 4.00 | 6.00 | 7.00 | 15.00 | ▂▇▅▁▁ |
| length_of_stay | Male | 0 | 1.00 | 6.10 | 2.15 | 1.00 | 5.00 | 6.00 | 7.00 | 15.00 | ▂▇▆▁▁ |
Análisis univariado (variables numéricas). En esta sección, nos enfocaremos en el análisis univariado de variables numéricas. El análisis univariado se centra en examinar una variable a la vez, sin considerar las relaciones con otras variables. Al explorar variables numéricas, podemos utilizar diversas técnicas de visualización para obtener una comprensión más profunda de la distribución, la tendencia central, la dispersión y la presencia de valores atípicos.
Edad (en años)
library(ggplot2)

# Base plot object bound to the dataset; layers are added on top of it.
gg <- ggplot(data)

# Histogram of age in bins of width 5, with vertical reference lines at the
# mean (red) and the median (blue).
p1 <- gg +
  geom_histogram(
    mapping = aes(x = age),
    binwidth = 5,
    fill = "white",
    color = "black"
  ) +
  geom_vline(xintercept = mean(data$age, na.rm = TRUE), color = "red") +
  geom_vline(xintercept = median(data$age, na.rm = TRUE), color = "blue") +
  labs(title = "Edad", x = "Edad", y = "Conteo")

# Boxplot of age, flipped so that it is drawn horizontally.
p2 <- ggplot(data, aes(x = "", y = age)) + geom_boxplot() + coord_flip()

# Display both plots.
p1
p2
# Análisis univariado (variables categóricas) #
En esta sección, nos centraremos en el análisis univariado de variables categóricas, lo cual implica examinar una variable a la vez sin considerar su relación con otras variables. A través de diversas técnicas de visualización podemos revelar patrones, frecuencias y distribuciones de categorías en nuestros conjuntos de datos.
# Bar charts of the categorical variables; every selected column is converted
# to a factor before plotting.
data %>%
  dplyr::select(dplyr::all_of(c(
    "sex", "diabetes", "hypertension", "heart_failure", "smoking_status",
    "treat_statin", "treat_beta_blocker", "treat_acei",
    "mortality_30d", "readmission_30d"
  ))) %>%
  dplyr::mutate(dplyr::across(dplyr::everything(), as.factor)) %>%
  dlookr::plot_bar_category()
La línea azul punteada representa el promedio de frecuencias entre las
categorías de esa variable. Sirve como referencia visual para
identificar qué categorías tienen más o menos observaciones que el
promedio.
La comprobación de normalidad se utiliza para determinar si un conjunto de datos se distribuye de manera normal o no. La normalidad es una suposición importante en muchos métodos estadísticos, por lo que es crucial verificar si los datos cumplen con esta condición antes de continuar con el análisis.
# Normality test for every numeric variable except the patient identifier.
# Uses where(is.numeric), as the correlation sections of this script do,
# instead of the superseded select_if(); the selected columns are the same.
data %>%
  dplyr::select(where(is.numeric), -patient_id) %>%
  normality()
## # A tibble: 12 × 4
## vars statistic p_value sample
## <chr> <dbl> <dbl> <dbl>
## 1 age 0.991 2.57e-11 2506
## 2 bmi 0.963 8.44e-25 2506
## 3 systolic_bp 0.989 9.16e-13 2506
## 4 diastolic_bp 0.998 4.51e- 3 2506
## 5 chol_total 0.994 2.06e- 7 2506
## 6 ldl 0.998 5.48e- 4 2506
## 7 hdl 0.997 1.51e- 3 2506
## 8 triglycerides 0.961 2.33e-25 2506
## 9 ejection_fraction 0.992 3.96e-10 2506
## 10 troponin 0.0122 8.50e-78 2506
## 11 creatinine 0.969 1.67e-20 2506
## 12 length_of_stay 0.967 1.10e-23 2506
# Normality diagnostic plots for every numeric variable except the patient
# identifier. Uses where(is.numeric), as the correlation sections of this
# script do, instead of the superseded select_if(); the selected columns are
# the same.
data %>%
  dplyr::select(where(is.numeric), -patient_id) %>%
  plot_normality()
# 4.4 Análisis bivariado de las características #
El objetivo es descubrir las relaciones y los patrones entre cada par de características. Preguntas como:
¿Hay multicolinealidad en las características continuas?
¿Son independientes las variables categóricas?
En general, ¿hay alguna relación entre estas características médicas?
Matriz de correlación La matriz de correlación se representa en forma de una tabla cuadrada, donde las filas y columnas representan las variables y los valores en cada celda representan el coeficiente de correlación entre esas dos variables. Los coeficientes de correlación varían entre -1 y 1, donde -1 indica una correlación negativa perfecta, 1 indica una correlación positiva perfecta y un valor de cero indica la ausencia de correlación.
Es importante tener en cuenta que la matriz de correlación sólo mide la correlación lineal entre las variables y no puede determinar la causalidad entre ellas. Es una herramienta útil para explorar las relaciones entre las variables y para seleccionar las variables que deben incluirse en un modelo estadístico.
# Pairwise correlation coefficients between the numeric variables.
# patient_id is a row identifier, not a measurement, so it is excluded here
# just as it is in the other analyses of this script.
data %>%
  dplyr::select(where(is.numeric), -patient_id) %>%
  dlookr::correlate()
## # A tibble: 156 × 3
## var1 var2 coef_corr
## <fct> <fct> <dbl>
## 1 age patient_id -0.00197
## 2 bmi patient_id -0.00507
## 3 systolic_bp patient_id -0.00972
## 4 diastolic_bp patient_id -0.00381
## 5 chol_total patient_id -0.00189
## 6 ldl patient_id 0.00859
## 7 hdl patient_id -0.00879
## 8 triglycerides patient_id 0.0129
## 9 ejection_fraction patient_id 0.0285
## 10 troponin patient_id -0.0112
## # ℹ 146 more rows
library(dlookr)

# Pairwise correlation coefficients for the clinical numeric variables,
# stored in `corr` and then printed.
corr <- data %>%
  dplyr::select(dplyr::all_of(c(
    "age", "bmi", "systolic_bp", "diastolic_bp", "chol_total", "ldl",
    "hdl", "triglycerides", "ejection_fraction", "troponin",
    "creatinine", "length_of_stay"
  ))) %>%
  correlate()

corr
## # A tibble: 132 × 3
## var1 var2 coef_corr
## <fct> <fct> <dbl>
## 1 bmi age 0.0110
## 2 systolic_bp age -0.0246
## 3 diastolic_bp age 0.0282
## 4 chol_total age 0.0196
## 5 ldl age -0.0272
## 6 hdl age -0.0327
## 7 triglycerides age -0.0256
## 8 ejection_fraction age 0.0652
## 9 troponin age 0.0128
## 10 creatinine age 0.0498
## # ℹ 122 more rows
# Matriz de correlación #
# Correlation matrix of the numeric variables (patient identifier excluded).
# Each coefficient is computed from the rows where both variables are present.
matriz <- cor(
  dplyr::select(data, where(is.numeric), -patient_id),
  use = "pairwise.complete.obs"
)

# Draw the matrix as a colour-coded grid.
corrplot(matriz, method = "color")
La matriz de correlación de las variables numéricas muestra que la
mayoría de los coeficientes de correlación son cercanos a cero, lo que
indica relaciones lineales débiles entre las variables analizadas. No se
identifican correlaciones positivas o negativas de gran magnitud fuera
de la diagonal principal.
#4.4.2 Variables categóricas #
En este caso, utilizamos el estadístico V de Cramér como medida de asociación para evaluar la relación entre dos variables categóricas. La V de Cramér indica la fuerza de la asociación entre estas variables (no su dirección), y su valor oscila entre 0 y 1. Un valor cercano a 0 indica una asociación débil, mientras que un valor cercano a 1 indica una asociación fuerte. De esta manera, podemos analizar cuán fuerte es la relación entre las variables en cuestión.
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.5.3
# Keep only the four categorical variables whose pairwise association is assessed.
df_cat <- dplyr::select(
  data,
  dplyr::all_of(c("sex", "hypertension", "heart_failure", "diabetes"))
)

# Apply Cramer's V to every pair of columns.
m <- DescTools::PairApply(df_cat, FUN = DescTools::CramerV)

# Plot the resulting matrix; is.corr = FALSE because it holds association
# measures rather than correlation coefficients.
corrplot::corrplot(
  m,
  method = "circle",
  is.corr = FALSE,
  tl.srt = 45,
  tl.cex = 0.6,
  tl.col = "black"
)
# 5. Tabla 1. #
El objetivo general de la tabla 1. es proporcionar una visión general de la población o muestra estudiada y de las variables relevantes en el estudio, permitiendo al lector comprender de manera clara y concisa los resultados y la metodología utilizada.
# Use English labels in gtsummary tables and print large numbers without a
# thousands separator.
gtsummary::theme_gtsummary_language(language = "en", big.mark = "")
## Setting theme "language: en"
# Table 1 (overall): median (Q1, Q3) for continuous variables and n (%) for
# categorical ones, with a "(Missing)" row whenever a variable has NAs.
data %>%
  dplyr::select(-patient_id) %>%
  gtsummary::tbl_summary(
    by = NULL,
    missing = "ifany",
    missing_text = "(Missing)",
    statistic = list(all_continuous() ~ "{median} ({p25}, {p75})"),
    digits = list(
      all_categorical() ~ c(0, 1),
      all_continuous() ~ c(1, 1),
      # Small-magnitude laboratory values: one decimal collapses them to
      # uninformative figures such as "0.1 (0.0, 0.1)", so show more decimals.
      troponin ~ 3,
      creatinine ~ 2
    )
  ) %>%
  gtsummary::modify_header(label = "**Variable**") %>%
  gtsummary::add_n() %>%
  gtsummary::bold_labels()
| Variable | N | N = 25061 |
|---|---|---|
| age | 2506 | 65.0 (57.0, 74.0) |
| sex | 2506 | |
| Female | 1165 (46.5%) | |
| Male | 1341 (53.5%) | |
| bmi | 2496 | 27.8 (24.4, 31.3) |
| (Missing) | 10 | |
| systolic_bp | 2506 | 130.0 (117.0, 144.0) |
| diastolic_bp | 2506 | 80.0 (73.0, 86.0) |
| chol_total | 2055 | 201.0 (173.0, 228.0) |
| (Missing) | 451 | |
| ldl | 2496 | 121.0 (100.0, 141.0) |
| (Missing) | 10 | |
| hdl | 2058 | 50.0 (40.0, 61.0) |
| (Missing) | 448 | |
| triglycerides | 2506 | 149.0 (121.0, 182.0) |
| ejection_fraction | 2506 | 54.9 (47.8, 61.4) |
| troponin | 2496 | 0.1 (0.0, 0.1) |
| (Missing) | 10 | |
| creatinine | 2055 | 1.0 (0.8, 1.2) |
| (Missing) | 451 | |
| diabetes | 2506 | 750 (29.9%) |
| hypertension | 2506 | 1508 (60.2%) |
| heart_failure | 2506 | 638 (25.5%) |
| smoking_status | 2057 | |
| Current | 405 (19.7%) | |
| Former | 713 (34.7%) | |
| Never | 939 (45.6%) | |
| (Missing) | 449 | |
| treat_statin | 2506 | 1770 (70.6%) |
| treat_beta_blocker | 2506 | 1467 (58.5%) |
| treat_acei | 2506 | 1255 (50.1%) |
| length_of_stay | 2506 | 6.0 (4.0, 7.0) |
| mortality_30d | 2506 | 224 (8.9%) |
| readmission_30d | 2506 | 393 (15.7%) |
| 1 Median (Q1, Q3); n (%) | ||
#5.1 Tabla 1. Estratificada
# Re-apply the English gtsummary theme (no thousands separator) before the stratified table.
gtsummary::theme_gtsummary_language(language = "en", big.mark = "")
## Setting theme "language: en"
# Table 1 stratified by sex, with an additional overall column.
# The patient identifier is dropped because it is not an analysis variable.
data %>%
  dplyr::select(-patient_id) %>%
  gtsummary::tbl_summary(
    by = sex,
    # Show a "(Missing)" row only for variables that actually have NAs.
    missing = "ifany",
    missing_text = "(Missing)",
    digits = list(
      all_categorical() ~ c(0, 1),
      all_continuous() ~ c(1, 1)
    )
  ) %>%
  gtsummary::add_n() %>%
  gtsummary::add_overall() %>%
  gtsummary::modify_header(label = "**Variable**") %>%
  # The spanning header must name the stratifying variable, which is sex
  # (this data set has no stroke variable).
  gtsummary::modify_spanning_header(all_stat_cols() ~ "**Sex**") %>%
  gtsummary::bold_labels()
| Variable | N |
stroke
|
||
|---|---|---|---|---|
| Overall N = 25061 |
Female N = 11651 |
Male N = 13411 |
||
| age | 2506 | 65.0 (57.0, 74.0) | 65.0 (56.0, 74.0) | 65.0 (57.0, 74.0) |
| bmi | 2496 | 27.8 (24.4, 31.3) | 27.8 (24.4, 31.3) | 27.8 (24.4, 31.3) |
| (Missing) | 10 | 6 | 4 | |
| systolic_bp | 2506 | 130.0 (117.0, 144.0) | 129.0 (117.0, 143.0) | 131.0 (117.0, 145.0) |
| diastolic_bp | 2506 | 80.0 (73.0, 86.0) | 80.0 (73.0, 87.0) | 79.0 (73.0, 86.0) |
| chol_total | 2055 | 201.0 (173.0, 228.0) | 201.0 (175.0, 228.0) | 200.0 (172.0, 228.0) |
| (Missing) | 451 | 212 | 239 | |
| ldl | 2496 | 121.0 (100.0, 141.0) | 122.0 (100.0, 141.0) | 120.0 (100.0, 140.0) |
| (Missing) | 10 | 6 | 4 | |
| hdl | 2058 | 50.0 (40.0, 61.0) | 50.0 (39.0, 60.0) | 51.0 (41.0, 61.0) |
| (Missing) | 448 | 200 | 248 | |
| triglycerides | 2506 | 149.0 (121.0, 182.0) | 148.0 (122.0, 183.0) | 149.0 (120.0, 181.0) |
| ejection_fraction | 2506 | 54.9 (47.8, 61.4) | 54.7 (48.0, 61.6) | 54.9 (47.7, 61.4) |
| troponin | 2496 | 0.1 (0.0, 0.1) | 0.1 (0.0, 0.1) | 0.1 (0.0, 0.1) |
| (Missing) | 10 | 4 | 6 | |
| creatinine | 2055 | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) |
| (Missing) | 451 | 215 | 236 | |
| diabetes | 2506 | 750 (29.9%) | 345 (29.6%) | 405 (30.2%) |
| hypertension | 2506 | 1508 (60.2%) | 695 (59.7%) | 813 (60.6%) |
| heart_failure | 2506 | 638 (25.5%) | 277 (23.8%) | 361 (26.9%) |
| smoking_status | 2057 | |||
| Current | 405 (19.7%) | 183 (19.2%) | 222 (20.1%) | |
| Former | 713 (34.7%) | 341 (35.7%) | 372 (33.7%) | |
| Never | 939 (45.6%) | 430 (45.1%) | 509 (46.1%) | |
| (Missing) | 449 | 211 | 238 | |
| treat_statin | 2506 | 1770 (70.6%) | 827 (71.0%) | 943 (70.3%) |
| treat_beta_blocker | 2506 | 1467 (58.5%) | 679 (58.3%) | 788 (58.8%) |
| treat_acei | 2506 | 1255 (50.1%) | 600 (51.5%) | 655 (48.8%) |
| length_of_stay | 2506 | 6.0 (4.0, 7.0) | 6.0 (4.0, 7.0) | 6.0 (5.0, 7.0) |
| mortality_30d | 2506 | 224 (8.9%) | 105 (9.0%) | 119 (8.9%) |
| readmission_30d | 2506 | 393 (15.7%) | 176 (15.1%) | 217 (16.2%) |
| 1 Median (Q1, Q3); n (%) | ||||
# Table 1 stratified by sex with between-group p-values.
# The patient identifier is removed first: it is a row key, not a clinical
# variable, so summarising or testing it is meaningless.
data %>%
  dplyr::select(-patient_id) %>%
  gtsummary::tbl_summary(
    by = sex,
    # Show a "(Missing)" row only for variables that actually have NAs.
    missing = "ifany",
    missing_text = "(Missing)",
    digits = list(
      all_categorical() ~ c(0, 1),
      all_continuous() ~ c(1, 1)
    )
  ) %>%
  gtsummary::add_n() %>%
  gtsummary::add_overall() %>%
  # Wilcoxon rank-sum test for continuous variables, Pearson chi-squared
  # test for categorical variables.
  gtsummary::add_p(
    test = list(
      all_continuous() ~ "wilcox.test",
      all_categorical() ~ "chisq.test"
    )
  ) %>%
  gtsummary::modify_header(label = "**Variable**") %>%
  gtsummary::bold_p() %>%
  gtsummary::modify_spanning_header(all_stat_cols() ~ "**sex**") %>%
  gtsummary::bold_labels()
| Variable | N |
sex
|
p-value2 | ||
|---|---|---|---|---|---|
| Overall N = 25061 |
Female N = 11651 |
Male N = 13411 |
|||
| patient_id | 2506 | 1251.5 (626.0, 1876.0) | 1249.0 (639.0, 1875.0) | 1254.0 (615.0, 1878.0) | 0.9 |
| age | 2506 | 65.0 (57.0, 74.0) | 65.0 (56.0, 74.0) | 65.0 (57.0, 74.0) | >0.9 |
| bmi | 2496 | 27.8 (24.4, 31.3) | 27.8 (24.4, 31.3) | 27.8 (24.4, 31.3) | 0.8 |
| (Missing) | 10 | 6 | 4 | ||
| systolic_bp | 2506 | 130.0 (117.0, 144.0) | 129.0 (117.0, 143.0) | 131.0 (117.0, 145.0) | 0.2 |
| diastolic_bp | 2506 | 80.0 (73.0, 86.0) | 80.0 (73.0, 87.0) | 79.0 (73.0, 86.0) | 0.019 |
| chol_total | 2055 | 201.0 (173.0, 228.0) | 201.0 (175.0, 228.0) | 200.0 (172.0, 228.0) | 0.3 |
| (Missing) | 451 | 212 | 239 | ||
| ldl | 2496 | 121.0 (100.0, 141.0) | 122.0 (100.0, 141.0) | 120.0 (100.0, 140.0) | 0.4 |
| (Missing) | 10 | 6 | 4 | ||
| hdl | 2058 | 50.0 (40.0, 61.0) | 50.0 (39.0, 60.0) | 51.0 (41.0, 61.0) | 0.13 |
| (Missing) | 448 | 200 | 248 | ||
| triglycerides | 2506 | 149.0 (121.0, 182.0) | 148.0 (122.0, 183.0) | 149.0 (120.0, 181.0) | 0.5 |
| ejection_fraction | 2506 | 54.9 (47.8, 61.4) | 54.7 (48.0, 61.6) | 54.9 (47.7, 61.4) | 0.6 |
| troponin | 2496 | 0.1 (0.0, 0.1) | 0.1 (0.0, 0.1) | 0.1 (0.0, 0.1) | 0.5 |
| (Missing) | 10 | 4 | 6 | ||
| creatinine | 2055 | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) | 0.2 |
| (Missing) | 451 | 215 | 236 | ||
| diabetes | 2506 | 750 (29.9%) | 345 (29.6%) | 405 (30.2%) | 0.8 |
| hypertension | 2506 | 1508 (60.2%) | 695 (59.7%) | 813 (60.6%) | 0.7 |
| heart_failure | 2506 | 638 (25.5%) | 277 (23.8%) | 361 (26.9%) | 0.079 |
| smoking_status | 2057 | 0.6 | |||
| Current | 405 (19.7%) | 183 (19.2%) | 222 (20.1%) | ||
| Former | 713 (34.7%) | 341 (35.7%) | 372 (33.7%) | ||
| Never | 939 (45.6%) | 430 (45.1%) | 509 (46.1%) | ||
| (Missing) | 449 | 211 | 238 | ||
| treat_statin | 2506 | 1770 (70.6%) | 827 (71.0%) | 943 (70.3%) | 0.7 |
| treat_beta_blocker | 2506 | 1467 (58.5%) | 679 (58.3%) | 788 (58.8%) | 0.8 |
| treat_acei | 2506 | 1255 (50.1%) | 600 (51.5%) | 655 (48.8%) | 0.2 |
| length_of_stay | 2506 | 6.0 (4.0, 7.0) | 6.0 (4.0, 7.0) | 6.0 (5.0, 7.0) | 0.045 |
| mortality_30d | 2506 | 224 (8.9%) | 105 (9.0%) | 119 (8.9%) | >0.9 |
| readmission_30d | 2506 | 393 (15.7%) | 176 (15.1%) | 217 (16.2%) | 0.5 |
| 1 Median (Q1, Q3); n (%) | |||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | |||||
# *************************************************************************************************** #
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.3
##
## Adjuntando el paquete: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(gtsummary)
(Los demás paquetes ya fueron cargados en el desarrollo de la guía de clase, en la parte superior.)
# Re-import the workbook and pass it through janitor::clean_names().
data <- janitor::clean_names(readxl::read_excel("cardiology_dataset.xlsx"))
# Number of rows and columns of the imported data.
dim(data)
## [1] 2506 23
# Compact overview of every column: type and first values.
dplyr::glimpse(data)
## Rows: 2,506
## Columns: 23
## $ patient_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ age <dbl> 71, 63, 73, 85, 62, 62, 86, 75, 59, 72, 59, 59, 68,…
## $ sex <chr> "Male", "Male", "Female", "Female", "Female", "Fema…
## $ bmi <dbl> 23.8, 30.8, 27.8, 20.0, 30.1, 25.1, 32.4, 28.0, 30.…
## $ systolic_bp <dbl> 124, 117, 105, 119, 102, 172, 130, 129, 142, 149, 1…
## $ diastolic_bp <dbl> 66, 83, 65, 93, 78, 92, 75, 83, 82, 80, 76, 84, 71,…
## $ chol_total <dbl> 120, 188, 226, 229, 203, 236, 302, 268, 179, 142, 1…
## $ ldl <dbl> 85, 131, 129, 99, 120, 166, 91, 61, 134, 117, 99, 1…
## $ hdl <dbl> 15, 34, 37, 52, 53, 60, NA, 30, NA, NA, 31, NA, 41,…
## $ triglycerides <dbl> 70, 125, 192, 170, 124, 150, 108, 148, 357, 258, 15…
## $ ejection_fraction <dbl> 49.9, 67.5, 74.0, 66.7, 60.6, 62.6, 73.6, 62.6, 49.…
## $ troponin <dbl> 0.045, 0.036, 0.089, 0.060, 0.060, 0.057, 0.079, 0.…
## $ creatinine <dbl> 0.96, 0.95, 0.98, NA, 0.93, NA, 0.96, 0.77, NA, NA,…
## $ diabetes <chr> "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes",…
## $ hypertension <chr> "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes",…
## $ heart_failure <chr> "No", "No", "No", "No", "No", "No", "Yes", "Yes", "…
## $ smoking_status <chr> "Never", "Former", NA, NA, "Never", "Never", "Forme…
## $ treat_statin <chr> "No", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes"…
## $ treat_beta_blocker <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Ye…
## $ treat_acei <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Y…
## $ length_of_stay <dbl> 4, 3, 6, 7, 6, 8, 5, 7, 6, 6, 7, 7, 7, 4, 6, 6, 6, …
## $ mortality_30d <chr> "No", "Yes", "No", "Yes", "No", "Yes", "Yes", "No",…
## $ readmission_30d <chr> "No", "No", "No", "Yes", "No", "Yes", "No", "No", "…
# Per-variable summary (missingness, quantiles, histograms) of the raw data.
# Called directly rather than piped so the summary is labelled with the object name.
skimr::skim(data)
| Name | data |
| Number of rows | 2506 |
| Number of columns | 23 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| sex | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| diabetes | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| hypertension | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| heart_failure | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| smoking_status | 449 | 0.82 | 5 | 7 | 0 | 3 | 0 |
| treat_statin | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_beta_blocker | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| treat_acei | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| mortality_30d | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
| readmission_30d | 0 | 1.00 | 2 | 3 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| patient_id | 0 | 1.00 | 1251.08 | 721.58 | 1.00 | 626.25 | 1251.50 | 1875.75 | 2500.00 | ▇▇▇▇▇ |
| age | 0 | 1.00 | 65.38 | 12.87 | -5.00 | 57.00 | 65.00 | 74.00 | 160.00 | ▁▅▇▁▁ |
| bmi | 10 | 1.00 | 27.86 | 5.24 | -3.00 | 24.40 | 27.80 | 31.30 | 95.00 | ▁▇▁▁▁ |
| systolic_bp | 0 | 1.00 | 130.41 | 20.04 | -20.00 | 117.00 | 130.00 | 144.00 | 198.00 | ▁▁▂▇▁ |
| diastolic_bp | 0 | 1.00 | 79.76 | 9.84 | 50.00 | 73.00 | 80.00 | 86.00 | 114.00 | ▁▅▇▃▁ |
| chol_total | 451 | 0.82 | 201.20 | 39.56 | 120.00 | 173.00 | 201.00 | 228.00 | 337.00 | ▃▇▇▂▁ |
| ldl | 10 | 1.00 | 120.72 | 29.84 | 50.00 | 100.00 | 121.00 | 141.00 | 250.00 | ▂▇▆▁▁ |
| hdl | 448 | 0.82 | 50.21 | 14.88 | 15.00 | 40.00 | 50.00 | 61.00 | 100.00 | ▂▇▇▂▁ |
| triglycerides | 0 | 1.00 | 155.32 | 47.14 | 50.00 | 121.00 | 149.00 | 182.00 | 439.00 | ▅▇▂▁▁ |
| ejection_fraction | 0 | 1.00 | 54.69 | 9.75 | 18.40 | 47.82 | 54.90 | 61.40 | 120.00 | ▁▇▅▁▁ |
| troponin | 10 | 1.00 | 0.10 | 1.50 | -0.50 | 0.03 | 0.05 | 0.09 | 75.00 | ▇▁▁▁▁ |
| creatinine | 451 | 0.82 | 1.02 | 0.26 | 0.42 | 0.83 | 0.99 | 1.19 | 2.31 | ▃▇▃▁▁ |
| length_of_stay | 0 | 1.00 | 6.04 | 2.23 | 1.00 | 4.00 | 6.00 | 7.00 | 15.00 | ▂▇▅▁▁ |
#3. Limpieza de base de datos #
# Count missing values per column of the raw data.
data %>%
  is.na() %>%
  colSums()
## patient_id age sex bmi
## 0 0 0 10
## systolic_bp diastolic_bp chol_total ldl
## 0 0 451 10
## hdl triglycerides ejection_fraction troponin
## 448 0 0 10
## creatinine diabetes hypertension heart_failure
## 451 0 0 0
## smoking_status treat_statin treat_beta_blocker treat_acei
## 449 0 0 0
## length_of_stay mortality_30d readmission_30d
## 0 0 0
O también mediante el comando de diagnóstico suministrado en la guía de clase:
# General data-quality diagnosis (types, missing counts, unique counts),
# rendered as a flextable.
flextable::flextable(dlookr::diagnose(data))
variables | types | missing_count | missing_percent | unique_count | unique_rate |
|---|---|---|---|---|---|
patient_id | numeric | 0 | 0.0000000 | 2,500 | 0.9976057462 |
age | numeric | 0 | 0.0000000 | 72 | 0.0287310455 |
sex | character | 0 | 0.0000000 | 2 | 0.0007980846 |
bmi | numeric | 10 | 0.3990423 | 261 | 0.1041500399 |
systolic_bp | numeric | 0 | 0.0000000 | 103 | 0.0411013567 |
diastolic_bp | numeric | 0 | 0.0000000 | 62 | 0.0247406225 |
chol_total | numeric | 451 | 17.9968077 | 192 | 0.0766161213 |
ldl | numeric | 10 | 0.3990423 | 157 | 0.0626496409 |
hdl | numeric | 448 | 17.8770950 | 83 | 0.0331205108 |
triglycerides | numeric | 0 | 0.0000000 | 239 | 0.0953711093 |
ejection_fraction | numeric | 0 | 0.0000000 | 422 | 0.1683958500 |
troponin | numeric | 10 | 0.3990423 | 276 | 0.1101356744 |
creatinine | numeric | 451 | 17.9968077 | 143 | 0.0570630487 |
diabetes | character | 0 | 0.0000000 | 2 | 0.0007980846 |
hypertension | character | 0 | 0.0000000 | 2 | 0.0007980846 |
heart_failure | character | 0 | 0.0000000 | 2 | 0.0007980846 |
smoking_status | character | 449 | 17.9169992 | 4 | 0.0015961692 |
treat_statin | character | 0 | 0.0000000 | 2 | 0.0007980846 |
treat_beta_blocker | character | 0 | 0.0000000 | 2 | 0.0007980846 |
treat_acei | character | 0 | 0.0000000 | 2 | 0.0007980846 |
length_of_stay | numeric | 0 | 0.0000000 | 15 | 0.0059856345 |
mortality_30d | character | 0 | 0.0000000 | 2 | 0.0007980846 |
readmission_30d | character | 0 | 0.0000000 | 2 | 0.0007980846 |
# Number of rows that are exact duplicates of an earlier row.
data %>%
  duplicated() %>%
  sum()
## [1] 6
# Keep only the first occurrence of each fully duplicated row.
dataclean <- dplyr::distinct(data)
# Sanity check: no duplicated rows should remain.
dataclean %>%
  duplicated() %>%
  sum()
## [1] 0
# Check the data type of every variable after de-duplication.
dplyr::glimpse(dataclean)
## Rows: 2,500
## Columns: 23
## $ patient_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ age <dbl> 71, 63, 73, 85, 62, 62, 86, 75, 59, 72, 59, 59, 68,…
## $ sex <chr> "Male", "Male", "Female", "Female", "Female", "Fema…
## $ bmi <dbl> 23.8, 30.8, 27.8, 20.0, 30.1, 25.1, 32.4, 28.0, 30.…
## $ systolic_bp <dbl> 124, 117, 105, 119, 102, 172, 130, 129, 142, 149, 1…
## $ diastolic_bp <dbl> 66, 83, 65, 93, 78, 92, 75, 83, 82, 80, 76, 84, 71,…
## $ chol_total <dbl> 120, 188, 226, 229, 203, 236, 302, 268, 179, 142, 1…
## $ ldl <dbl> 85, 131, 129, 99, 120, 166, 91, 61, 134, 117, 99, 1…
## $ hdl <dbl> 15, 34, 37, 52, 53, 60, NA, 30, NA, NA, 31, NA, 41,…
## $ triglycerides <dbl> 70, 125, 192, 170, 124, 150, 108, 148, 357, 258, 15…
## $ ejection_fraction <dbl> 49.9, 67.5, 74.0, 66.7, 60.6, 62.6, 73.6, 62.6, 49.…
## $ troponin <dbl> 0.045, 0.036, 0.089, 0.060, 0.060, 0.057, 0.079, 0.…
## $ creatinine <dbl> 0.96, 0.95, 0.98, NA, 0.93, NA, 0.96, 0.77, NA, NA,…
## $ diabetes <chr> "Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes",…
## $ hypertension <chr> "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes",…
## $ heart_failure <chr> "No", "No", "No", "No", "No", "No", "Yes", "Yes", "…
## $ smoking_status <chr> "Never", "Former", NA, NA, "Never", "Never", "Forme…
## $ treat_statin <chr> "No", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes"…
## $ treat_beta_blocker <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Ye…
## $ treat_acei <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Y…
## $ length_of_stay <dbl> 4, 3, 6, 7, 6, 8, 5, 7, 6, 6, 7, 7, 7, 4, 6, 6, 6, …
## $ mortality_30d <chr> "No", "Yes", "No", "Yes", "No", "Yes", "Yes", "No",…
## $ readmission_30d <chr> "No", "No", "No", "Yes", "No", "Yes", "No", "No", "…
# Alternative view of the same structure using base R.
utils::str(dataclean)
## tibble [2,500 × 23] (S3: tbl_df/tbl/data.frame)
## $ patient_id : num [1:2500] 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num [1:2500] 71 63 73 85 62 62 86 75 59 72 ...
## $ sex : chr [1:2500] "Male" "Male" "Female" "Female" ...
## $ bmi : num [1:2500] 23.8 30.8 27.8 20 30.1 25.1 32.4 28 30 23.2 ...
## $ systolic_bp : num [1:2500] 124 117 105 119 102 172 130 129 142 149 ...
## $ diastolic_bp : num [1:2500] 66 83 65 93 78 92 75 83 82 80 ...
## $ chol_total : num [1:2500] 120 188 226 229 203 236 302 268 179 142 ...
## $ ldl : num [1:2500] 85 131 129 99 120 166 91 61 134 117 ...
## $ hdl : num [1:2500] 15 34 37 52 53 60 NA 30 NA NA ...
## $ triglycerides : num [1:2500] 70 125 192 170 124 150 108 148 357 258 ...
## $ ejection_fraction : num [1:2500] 49.9 67.5 74 66.7 60.6 62.6 73.6 62.6 49.1 58.9 ...
## $ troponin : num [1:2500] 0.045 0.036 0.089 0.06 0.06 0.057 0.079 0.04 0.194 0.019 ...
## $ creatinine : num [1:2500] 0.96 0.95 0.98 NA 0.93 NA 0.96 0.77 NA NA ...
## $ diabetes : chr [1:2500] "Yes" "No" "Yes" "No" ...
## $ hypertension : chr [1:2500] "No" "No" "Yes" "Yes" ...
## $ heart_failure : chr [1:2500] "No" "No" "No" "No" ...
## $ smoking_status : chr [1:2500] "Never" "Former" NA NA ...
## $ treat_statin : chr [1:2500] "No" "Yes" "No" "Yes" ...
## $ treat_beta_blocker: chr [1:2500] "Yes" "Yes" "Yes" "Yes" ...
## $ treat_acei : chr [1:2500] "No" "No" "No" "Yes" ...
## $ length_of_stay : num [1:2500] 4 3 6 7 6 8 5 7 6 6 ...
## $ mortality_30d : chr [1:2500] "No" "Yes" "No" "Yes" ...
## $ readmission_30d : chr [1:2500] "No" "No" "No" "Yes" ...
# Normalise missing values in text variables: the literal string "NA" and
# the empty string are both converted to a real NA.
datacleanN <- dplyr::mutate(
  dataclean,
  dplyr::across(
    where(is.character),
    \(texto) na_if(na_if(texto, "NA"), "")
  )
)

# Names of the variables that are categorical.
cat_vars <- c(
  "sex",
  "diabetes",
  "hypertension",
  "heart_failure",
  "smoking_status",
  "treat_statin",
  "treat_beta_blocker",
  "treat_acei",
  "mortality_30d",
  "readmission_30d"
)

# Convert every categorical variable to a factor.
datacleanN <- dplyr::mutate(
  datacleanN,
  dplyr::across(all_of(cat_vars), as.factor)
)

# Confirm the new column types.
dplyr::glimpse(datacleanN)
## Rows: 2,500
## Columns: 23
## $ patient_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ age <dbl> 71, 63, 73, 85, 62, 62, 86, 75, 59, 72, 59, 59, 68,…
## $ sex <fct> Male, Male, Female, Female, Female, Female, Male, M…
## $ bmi <dbl> 23.8, 30.8, 27.8, 20.0, 30.1, 25.1, 32.4, 28.0, 30.…
## $ systolic_bp <dbl> 124, 117, 105, 119, 102, 172, 130, 129, 142, 149, 1…
## $ diastolic_bp <dbl> 66, 83, 65, 93, 78, 92, 75, 83, 82, 80, 76, 84, 71,…
## $ chol_total <dbl> 120, 188, 226, 229, 203, 236, 302, 268, 179, 142, 1…
## $ ldl <dbl> 85, 131, 129, 99, 120, 166, 91, 61, 134, 117, 99, 1…
## $ hdl <dbl> 15, 34, 37, 52, 53, 60, NA, 30, NA, NA, 31, NA, 41,…
## $ triglycerides <dbl> 70, 125, 192, 170, 124, 150, 108, 148, 357, 258, 15…
## $ ejection_fraction <dbl> 49.9, 67.5, 74.0, 66.7, 60.6, 62.6, 73.6, 62.6, 49.…
## $ troponin <dbl> 0.045, 0.036, 0.089, 0.060, 0.060, 0.057, 0.079, 0.…
## $ creatinine <dbl> 0.96, 0.95, 0.98, NA, 0.93, NA, 0.96, 0.77, NA, NA,…
## $ diabetes <fct> Yes, No, Yes, No, No, Yes, No, Yes, No, No, Yes, No…
## $ hypertension <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Y…
## $ heart_failure <fct> No, No, No, No, No, No, Yes, Yes, No, Yes, No, No, …
## $ smoking_status <fct> Never, Former, NA, NA, Never, Never, Former, Curren…
## $ treat_statin <fct> No, Yes, No, Yes, Yes, Yes, No, Yes, Yes, No, Yes, …
## $ treat_beta_blocker <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, No…
## $ treat_acei <fct> No, No, No, Yes, No, No, No, No, Yes, Yes, Yes, Yes…
## $ length_of_stay <dbl> 4, 3, 6, 7, 6, 8, 5, 7, 6, 6, 7, 7, 7, 4, 6, 6, 6, …
## $ mortality_30d <fct> No, Yes, No, Yes, No, Yes, Yes, No, No, Yes, No, No…
## $ readmission_30d <fct> No, No, No, Yes, No, Yes, No, No, No, No, Yes, Yes,…
# Plausible range (inclusive lower and upper limit) for each numeric variable;
# values outside these limits are treated as implausible further below.
rangos <- tibble::tibble(
  variable = c(
    "age", "bmi", "systolic_bp", "diastolic_bp", "ejection_fraction",
    "troponin", "creatinine", "chol_total", "ldl", "hdl", "triglycerides"
  ),
  minimo = c(18, 10, 60, 40, 10, 0, 0, 100, 30, 10, 30),
  maximo = c(110, 70, 250, 160, 90, 50, 8, 450, 300, 120, 800)
)
rangos
## # A tibble: 11 × 3
## variable minimo maximo
## <chr> <dbl> <dbl>
## 1 age 18 110
## 2 bmi 10 70
## 3 systolic_bp 60 250
## 4 diastolic_bp 40 160
## 5 ejection_fraction 10 90
## 6 troponin 0 50
## 7 creatinine 0 8
## 8 chol_total 100 450
## 9 ldl 30 300
## 10 hdl 10 120
## 11 triglycerides 30 800
# For each variable in `rangos`, count how many non-missing values of
# `datacleanN` fall outside [minimo, maximo].
reporte_fuera <- dplyr::mutate(
  rangos,
  n_fuera_rango = purrr::pmap_int(
    list(variable, minimo, maximo),
    \(nombre, inferior, superior) {
      dentro <- dplyr::between(datacleanN[[nombre]], inferior, superior)
      # NAs are ignored: a missing value is not counted as out of range.
      sum(!dentro, na.rm = TRUE)
    }
  )
)
# Show the variables with the most out-of-range values first.
dplyr::arrange(reporte_fuera, dplyr::desc(n_fuera_rango))
## # A tibble: 11 × 4
## variable minimo maximo n_fuera_rango
## <chr> <dbl> <dbl> <int>
## 1 age 18 110 2
## 2 bmi 10 70 2
## 3 troponin 0 50 2
## 4 systolic_bp 60 250 1
## 5 ejection_fraction 10 90 1
## 6 diastolic_bp 40 160 0
## 7 creatinine 0 8 0
## 8 chol_total 100 450 0
## 9 ldl 30 300 0
## 10 hdl 10 120 0
## 11 triglycerides 30 800 0
# Recode implausible values to NA.
# The limits are read from `rangos` instead of being typed a second time, so
# the recoding cannot drift from the out-of-range report built on that table.
datacleanNfinal <- datacleanN %>%
  mutate(
    across(
      all_of(rangos$variable),
      \(valores) {
        # Row of `rangos` holding the limits for the column being processed.
        limites <- rangos[rangos$variable == cur_column(), ]
        if_else(
          between(valores, limites$minimo, limites$maximo),
          valores,
          NA_real_
        )
      }
    )
  )
# Missing-value count per column after the recoding.
colSums(is.na(datacleanNfinal))
## patient_id age sex bmi
## 0 2 0 12
## systolic_bp diastolic_bp chol_total ldl
## 1 0 450 10
## hdl triglycerides ejection_fraction troponin
## 448 0 1 12
## creatinine diabetes hypertension heart_failure
## 450 0 0 0
## smoking_status treat_statin treat_beta_blocker treat_acei
## 449 0 0 0
## length_of_stay mortality_30d readmission_30d
## 0 0 0
# Per-variable summary of the cleaned data set (after de-duplication,
# factor conversion and recoding of out-of-range values to NA).
skimr::skim(datacleanNfinal)
| Name | datacleanNfinal |
| Number of rows | 2500 |
| Number of columns | 23 |
| _______________________ | |
| Column type frequency: | |
| factor | 10 |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| sex | 0 | 1.00 | FALSE | 2 | Mal: 1340, Fem: 1160 |
| diabetes | 0 | 1.00 | FALSE | 2 | No: 1751, Yes: 749 |
| hypertension | 0 | 1.00 | FALSE | 2 | Yes: 1506, No: 994 |
| heart_failure | 0 | 1.00 | FALSE | 2 | No: 1863, Yes: 637 |
| smoking_status | 449 | 0.82 | FALSE | 3 | Nev: 936, For: 712, Cur: 403 |
| treat_statin | 0 | 1.00 | FALSE | 2 | Yes: 1766, No: 734 |
| treat_beta_blocker | 0 | 1.00 | FALSE | 2 | Yes: 1462, No: 1038 |
| treat_acei | 0 | 1.00 | FALSE | 2 | Yes: 1252, No: 1248 |
| mortality_30d | 0 | 1.00 | FALSE | 2 | No: 2276, Yes: 224 |
| readmission_30d | 0 | 1.00 | FALSE | 2 | No: 2108, Yes: 392 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| patient_id | 0 | 1.00 | 1250.50 | 721.83 | 1.00 | 625.75 | 1250.50 | 1875.25 | 2500.00 | ▇▇▇▇▇ |
| age | 2 | 1.00 | 65.38 | 12.65 | 23.00 | 57.00 | 65.00 | 74.00 | 95.00 | ▁▂▇▇▂ |
| bmi | 12 | 1.00 | 27.84 | 5.03 | 15.00 | 24.40 | 27.80 | 31.30 | 45.60 | ▂▇▇▂▁ |
| systolic_bp | 1 | 1.00 | 130.45 | 19.82 | 90.00 | 117.00 | 130.00 | 144.00 | 198.00 | ▃▇▆▂▁ |
| diastolic_bp | 0 | 1.00 | 79.77 | 9.85 | 50.00 | 73.00 | 80.00 | 86.00 | 114.00 | ▁▅▇▃▁ |
| chol_total | 450 | 0.82 | 201.19 | 39.57 | 120.00 | 173.00 | 201.00 | 228.00 | 337.00 | ▃▇▇▂▁ |
| ldl | 10 | 1.00 | 120.69 | 29.84 | 50.00 | 100.00 | 121.00 | 141.00 | 250.00 | ▂▇▆▁▁ |
| hdl | 448 | 0.82 | 50.21 | 14.90 | 15.00 | 40.00 | 50.00 | 61.00 | 100.00 | ▂▇▇▂▁ |
| triglycerides | 0 | 1.00 | 155.32 | 47.19 | 50.00 | 121.00 | 149.00 | 182.00 | 439.00 | ▅▇▂▁▁ |
| ejection_fraction | 1 | 1.00 | 54.65 | 9.66 | 18.40 | 47.80 | 54.90 | 61.40 | 75.00 | ▁▂▆▇▃ |
| troponin | 12 | 1.00 | 0.07 | 0.06 | 0.00 | 0.03 | 0.05 | 0.09 | 0.65 | ▇▁▁▁▁ |
| creatinine | 450 | 0.82 | 1.02 | 0.26 | 0.42 | 0.83 | 0.99 | 1.19 | 2.31 | ▃▇▃▁▁ |
| length_of_stay | 0 | 1.00 | 6.04 | 2.22 | 1.00 | 4.00 | 6.00 | 7.00 | 15.00 | ▂▇▅▁▁ |
# Table 1 on the cleaned data, stratified by sex, with an overall column.
# The patient identifier is dropped because it is not an analysis variable.
tabla1 <- datacleanNfinal %>%
  dplyr::select(-patient_id) %>%
  gtsummary::tbl_summary(
    by = sex,
    digits = list(all_categorical() ~ c(0, 1), all_continuous() ~ c(1, 1)),
    # Show a "(Missing)" row only for variables that actually have NAs.
    missing = "ifany",
    missing_text = "(Missing)"
  ) %>%
  gtsummary::add_n() %>%
  gtsummary::add_overall() %>%
  gtsummary::modify_header(label = "**Variable**") %>%
  gtsummary::modify_spanning_header(all_stat_cols() ~ "**Sex**") %>%
  gtsummary::bold_labels()

# Print the table.
tabla1
| Variable | N |
Sex
|
||
|---|---|---|---|---|
| Overall N = 25001 |
Female N = 11601 |
Male N = 13401 |
||
| age | 2498 | 65.0 (57.0, 74.0) | 65.0 (56.0, 74.0) | 65.0 (57.0, 74.0) |
| (Missing) | 2 | 0 | 2 | |
| bmi | 2488 | 27.8 (24.4, 31.3) | 27.8 (24.4, 31.3) | 27.8 (24.4, 31.3) |
| (Missing) | 12 | 8 | 4 | |
| systolic_bp | 2499 | 130.0 (117.0, 144.0) | 129.0 (117.0, 143.0) | 131.0 (117.0, 145.0) |
| (Missing) | 1 | 1 | 0 | |
| diastolic_bp | 2500 | 80.0 (73.0, 86.0) | 80.0 (73.0, 87.0) | 79.0 (73.0, 86.0) |
| chol_total | 2050 | 201.0 (173.0, 228.0) | 201.0 (175.0, 228.0) | 200.0 (172.0, 228.0) |
| (Missing) | 450 | 211 | 239 | |
| ldl | 2490 | 121.0 (100.0, 141.0) | 122.0 (100.0, 141.0) | 120.0 (100.0, 140.5) |
| (Missing) | 10 | 6 | 4 | |
| hdl | 2052 | 50.0 (40.0, 61.0) | 50.0 (39.0, 60.0) | 51.0 (41.0, 61.0) |
| (Missing) | 448 | 200 | 248 | |
| triglycerides | 2500 | 149.0 (121.0, 182.0) | 148.0 (122.0, 183.0) | 149.0 (120.0, 181.0) |
| ejection_fraction | 2499 | 54.9 (47.8, 61.4) | 54.7 (48.0, 61.5) | 54.9 (47.7, 61.4) |
| (Missing) | 1 | 0 | 1 | |
| troponin | 2488 | 0.1 (0.0, 0.1) | 0.1 (0.0, 0.1) | 0.1 (0.0, 0.1) |
| (Missing) | 12 | 5 | 7 | |
| creatinine | 2050 | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) |
| (Missing) | 450 | 215 | 235 | |
| diabetes | 2500 | 749 (30.0%) | 344 (29.7%) | 405 (30.2%) |
| hypertension | 2500 | 1506 (60.2%) | 693 (59.7%) | 813 (60.7%) |
| heart_failure | 2500 | 637 (25.5%) | 276 (23.8%) | 361 (26.9%) |
| smoking_status | 2051 | |||
| Current | 403 (19.6%) | 181 (19.1%) | 222 (20.1%) | |
| Former | 712 (34.7%) | 341 (35.9%) | 371 (33.7%) | |
| Never | 936 (45.6%) | 427 (45.0%) | 509 (46.2%) | |
| (Missing) | 449 | 211 | 238 | |
| treat_statin | 2500 | 1766 (70.6%) | 823 (70.9%) | 943 (70.4%) |
| treat_beta_blocker | 2500 | 1462 (58.5%) | 675 (58.2%) | 787 (58.7%) |
| treat_acei | 2500 | 1252 (50.1%) | 598 (51.6%) | 654 (48.8%) |
| length_of_stay | 2500 | 6.0 (4.0, 7.0) | 6.0 (4.0, 7.0) | 6.0 (5.0, 7.0) |
| mortality_30d | 2500 | 224 (9.0%) | 105 (9.1%) | 119 (8.9%) |
| readmission_30d | 2500 | 392 (15.7%) | 175 (15.1%) | 217 (16.2%) |
| 1 Median (Q1, Q3); n (%) | ||||
# 11. Stratified descriptive table
# Cleaned data summarised by 30-day mortality, with an overall column and
# p-values from gtsummary's default tests.
tabla1_mortalidad <- datacleanNfinal %>%
  dplyr::select(-patient_id) %>%
  gtsummary::tbl_summary(
    by = mortality_30d,
    digits = list(all_categorical() ~ c(0, 1), all_continuous() ~ c(1, 1)),
    # Show a "(Missing)" row only for variables that actually have NAs.
    missing = "ifany",
    missing_text = "(Missing)"
  ) %>%
  gtsummary::add_n() %>%
  gtsummary::add_overall() %>%
  gtsummary::add_p() %>%
  gtsummary::modify_header(label = "**Variable**") %>%
  gtsummary::modify_spanning_header(
    all_stat_cols() ~ "**Mortalidad a 30 días**"
  ) %>%
  gtsummary::bold_labels()

# Print the table.
tabla1_mortalidad
| Variable | N |
Mortalidad a 30 días
|
p-value2 | ||
|---|---|---|---|---|---|
| Overall N = 25001 |
No N = 22761 |
Yes N = 2241 |
|||
| age | 2498 | 65.0 (57.0, 74.0) | 65.0 (56.0, 73.0) | 71.5 (62.0, 79.5) | <0.001 |
| (Missing) | 2 | 2 | 0 | ||
| sex | 2500 | 0.9 | |||
| Female | 1160 (46.4%) | 1055 (46.4%) | 105 (46.9%) | ||
| Male | 1340 (53.6%) | 1221 (53.6%) | 119 (53.1%) | ||
| bmi | 2488 | 27.8 (24.4, 31.3) | 27.8 (24.4, 31.3) | 28.5 (24.7, 31.5) | 0.2 |
| (Missing) | 12 | 12 | 0 | ||
| systolic_bp | 2499 | 130.0 (117.0, 144.0) | 130.0 (117.0, 144.0) | 129.5 (116.0, 145.0) | 0.8 |
| (Missing) | 1 | 1 | 0 | ||
| diastolic_bp | 2500 | 80.0 (73.0, 86.0) | 80.0 (73.0, 86.0) | 79.0 (72.0, 87.0) | 0.6 |
| chol_total | 2050 | 201.0 (173.0, 228.0) | 200.0 (173.0, 227.0) | 205.0 (173.0, 236.0) | 0.092 |
| (Missing) | 450 | 400 | 50 | ||
| ldl | 2490 | 121.0 (100.0, 141.0) | 121.0 (100.0, 141.0) | 121.0 (103.5, 139.0) | 0.5 |
| (Missing) | 10 | 10 | 0 | ||
| hdl | 2052 | 50.0 (40.0, 61.0) | 50.0 (40.0, 61.0) | 50.0 (41.0, 59.0) | 0.7 |
| (Missing) | 448 | 418 | 30 | ||
| triglycerides | 2500 | 149.0 (121.0, 182.0) | 148.5 (122.0, 182.0) | 149.0 (118.5, 189.0) | >0.9 |
| ejection_fraction | 2499 | 54.9 (47.8, 61.4) | 54.8 (47.8, 61.3) | 55.7 (48.5, 62.8) | 0.2 |
| (Missing) | 1 | 1 | 0 | ||
| troponin | 2488 | 0.1 (0.0, 0.1) | 0.1 (0.0, 0.1) | 0.0 (0.0, 0.1) | 0.2 |
| (Missing) | 12 | 10 | 2 | ||
| creatinine | 2050 | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) | 1.0 (0.8, 1.2) | 0.3 |
| (Missing) | 450 | 413 | 37 | ||
| diabetes | 2500 | 749 (30.0%) | 687 (30.2%) | 62 (27.7%) | 0.4 |
| hypertension | 2500 | 1506 (60.2%) | 1364 (59.9%) | 142 (63.4%) | 0.3 |
| heart_failure | 2500 | 637 (25.5%) | 578 (25.4%) | 59 (26.3%) | 0.8 |
| smoking_status | 2051 | >0.9 | |||
| Current | 403 (19.6%) | 366 (19.7%) | 37 (19.4%) | ||
| Former | 712 (34.7%) | 646 (34.7%) | 66 (34.6%) | ||
| Never | 936 (45.6%) | 848 (45.6%) | 88 (46.1%) | ||
| (Missing) | 449 | 416 | 33 | ||
| treat_statin | 2500 | 1766 (70.6%) | 1601 (70.3%) | 165 (73.7%) | 0.3 |
| treat_beta_blocker | 2500 | 1462 (58.5%) | 1332 (58.5%) | 130 (58.0%) | 0.9 |
| treat_acei | 2500 | 1252 (50.1%) | 1142 (50.2%) | 110 (49.1%) | 0.8 |
| length_of_stay | 2500 | 6.0 (4.0, 7.0) | 6.0 (4.0, 7.0) | 6.0 (5.0, 7.0) | 0.9 |
| readmission_30d | 2500 | 392 (15.7%) | 353 (15.5%) | 39 (17.4%) | 0.5 |
| 1 Median (Q1, Q3); n (%) | |||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | |||||
Para el desarrollo de este taller contamos con una base de datos de cardiología que inicialmente tenía 2506 filas y 23 columnas. Para la limpieza de la base de datos, primero se hizo un diagnóstico en el que se determinó que la creatinina y el colesterol total eran las variables con mayor cantidad de valores faltantes (451 cada una), seguidas por el tabaquismo (449), el colesterol HDL (448) y, con 10 cada una, el colesterol LDL, el índice de masa corporal y la troponina. Después, se identificaron los registros duplicados por el ID de cada paciente, que fueron 6 y posteriormente se eliminaron. Luego, se corrigió el tipo de dato según la variable; en este caso, las variables categóricas se convirtieron a factor. A continuación, se definieron los rangos según el documento de la base de datos y, a partir de estos rangos, se detectaron los valores atípicos que no son clínicamente plausibles. Finalmente, esos valores fuera de rango se recodificaron a NA.
Después de haber eliminado los duplicados y recodificado los valores fuera de rango a NA, el tabaquismo se mantuvo con 449 NA, la creatinina y el colesterol total quedaron con 450, el HDL se mantuvo con 448, el IMC y la troponina aumentaron a 12, el colesterol LDL se mantuvo con 10, la edad quedó con 2 y, finalmente, la presión sistólica y la fracción de eyección con 1 cada una. Así se obtuvo una base de datos limpia de 2500 filas, a la cual se le aplicó el comando skim para observar las medidas de tendencia central y la distribución de las variables categóricas. Finalmente, se tabularon los resultados con la mediana y el rango intercuartílico para las variables cuantitativas, y la frecuencia y el porcentaje para las variables cualitativas: primero sobre la población total, luego estratificados por sexo y, por último, por el desenlace de mortalidad a 30 días.
La población tiene una mediana de edad de 65 años, con sobrepeso grado II (IMC de 27.8) y una presión arterial sistólica con mediana de 130 mmHg; los valores de perfil lipídico, fracción de eyección, creatinina y comorbilidades presentan una distribución similar en ambos sexos. En cuanto a la estratificación por el desenlace de mortalidad a 30 días, la mediana de edad de los pacientes fallecidos fue ligeramente mayor (71.5 años) que la de los no fallecidos (65 años), con significancia estadística (p < 0.001). Otra variable importante fue la hipertensión, aunque esta no alcanzó significancia estadística (p = 0.3).