El trabajo consiste en realizar un analisis discriminante en un dataset de Strokes, tomando como variables predictoras Age, Hypertension y Bmi
library(missRanger)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(dplyr)
library(broom)
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
datos <- read.csv("C:/Users/Lucas/Desktop/Maestria/Infartos.csv")
datos$bmi[datos$bmi == "N/A"] <- NA
datos$smoking_status[datos$smoking_status == "Unknown"] <- NA
head(datos)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 <NA> never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24 never smoked 1
## 6 Urban 186.21 29 formerly smoked 1
str(datos)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : chr "36.6" NA "32.5" "34.4" ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
sum(is.na(datos))
## [1] 1745
#Ante la presencia de valores nulos, se procede a imputar valores faltantes por medio de MissRanger
datos$gender <- as.factor(datos$gender)
datos$hypertension <- as.factor(datos$hypertension)
datos$ever_married <- as.factor(datos$ever_married)
datos$work_type <- as.factor(datos$work_type)
datos$Residence_type <- as.factor(datos$Residence_type)
datos$bmi <- as.numeric(datos$bmi)
datos$smoking_status <- as.factor(datos$smoking_status)
str(datos)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 1 2 1 1 2 2 1 1 1 ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 2 1 1 1 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 2 2 ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
## $ Residence_type : Factor w/ 2 levels "Rural","Urban": 2 1 1 2 1 2 1 2 1 2 ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : num 36.6 NA 32.5 34.4 24 29 27.4 22.8 NA 24.2 ...
## $ smoking_status : Factor w/ 3 levels "formerly smoked",..: 1 2 2 3 2 1 2 2 NA NA ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
# Imputar valores faltantes con missRanger
datos_imputados <- missRanger(datos)
##
## Missing value imputation by random forests
##
## Variables to impute: bmi, smoking_status
## Variables used to impute: id, gender, age, hypertension, heart_disease, ever_married, work_type, Residence_type, avg_glucose_level, bmi, smoking_status, stroke
##
## iter 1
## | | | 0% | |=================================== | 50% | |======================================================================| 100%
## iter 2
## | | | 0% | |=================================== | 50% | |======================================================================| 100%
## iter 3
## | | | 0% | |=================================== | 50% | |======================================================================| 100%
## iter 4
## | | | 0% | |=================================== | 50% | |======================================================================| 100%
sum(is.na(datos_imputados))
## [1] 0
#Comparación
table(datos$smoking_status)
##
## formerly smoked never smoked smokes
## 885 1892 789
table(datos_imputados$smoking_status)
##
## formerly smoked never smoked smokes
## 983 3326 801
# Definir la función age_cohort
age_intervalos <- function(age) {
ifelse(age >= 0 & age <= 20, "0-20",
ifelse(age > 20 & age <= 40, "20-40",
ifelse(age > 40 & age <= 50, "40-50",
ifelse(age > 50 & age <= 60, "50-60",
ifelse(age > 60, "60+", NA)))))
}
edad <- datos_imputados %>%
mutate(age_group = sapply(age, age_intervalos))%>%
arrange(age_group)
###################################################################
#Ver si los datos siguen distribucion normal
datos_imputados$stroke <- as.factor(datos$stroke)
# gráfico Q-Q para bmi, diferenciado por stroke
ggplot(datos_imputados, aes(sample = bmi, color = stroke)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~stroke, ncol = 2, labeller = labeller(stroke = c("0" = "No Stroke", "1" = "Stroke"))) +
theme_minimal() +
labs(title = "Q-Q Plot for BMI by Stroke Status",
x = "Theoretical Quantiles",
y = "Sample Quantiles") +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
labels = c("0" = "No Stroke", "1" = "Stroke")) +
theme(legend.position = "bottom")
datos_imputados$hypertension <- as.factor(datos$hypertension)
# gráfico Q-Q para hypertension, diferenciado por hypertension
ggplot(datos_imputados, aes(sample = hypertension, color = stroke)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~stroke, ncol = 2, labeller = labeller(stroke = c("0" = "No Stroke", "1" = "Stroke"))) +
theme_minimal() +
labs(title = "Q-Q Plot for hypertension by Stroke Status",
x = "Theoretical Quantiles",
y = "Sample Quantiles") +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
labels = c("0" = "No Stroke", "1" = "Stroke")) +
theme(legend.position = "bottom")
# gráfico Q-Q para bmi, diferenciado por age
ggplot(datos_imputados, aes(sample = age, color = stroke)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~stroke, ncol = 2, labeller = labeller(stroke = c("0" = "No Stroke", "1" = "Stroke"))) +
theme_minimal() +
labs(title = "Q-Q Plot for age by Stroke Status",
x = "Theoretical Quantiles",
y = "Sample Quantiles") +
scale_color_manual(values = c("0" = "blue", "1" = "red"),
labels = c("0" = "No Stroke", "1" = "Stroke")) +
theme(legend.position = "bottom")
# Partición de la tabla
set.seed(123)
# Crear una partición de datos con 70% para entrenamiento y 30% para prueba
particion <- createDataPartition(datos_imputados$stroke, p = 0.7, list = FALSE)
datos_train <- datos_imputados[particion, ]
datos_test <- datos_imputados[-particion, ]
param <- datos_train %>%
preProcess(method = c("center","scale"))
#Transformar los datos usando parametros estimados
datos_train_transformed <- param %>%
predict(datos_train)
datos_test_transformed <- param %>%
predict(datos_test)
model <- lda(stroke ~ hypertension + age + avg_glucose_level , data = datos_train_transformed)
predictions <- model %>%
predict(datos_test_transformed)
mean(predictions$class==datos_test_transformed$stroke)
## [1] 0.9516971
Para el grupo -0.222 la probabilidad a priori es aproximadamente 0.951.
Para el grupo 4.4899 (que probablemente representa los afectados por un derrame cerebral), la probabilidad a priori es aproximadamente 0.047.
Esto sugiere que en el conjunto de datos de entrenamiento, alrededor del 95.1% de las observaciones no tenían un derrame cerebral, mientras que el 4.7% sí lo tenían.
El primer discriminante lineal explica aproximadamente el 94.8% de la varianza entre los grupos, lo cual sugiere que LD1 captura la mayor parte de la información necesaria para discriminar entre los grupos.
Group means:
En promedio, las personas que tuvieron un derrame cerebral (grupo 4.48999363891418) tienden a ser mayores, tener un nivel de glucosa en sangre más alto y una mayor prevalencia de hipertensión.
lda = cbind(datos_train_transformed,predict(model)$x)
ggplot(lda, aes(x = LD1, fill = stroke)) +
geom_histogram(position = "identity", alpha = 0.7, bins = 30) +
theme_minimal() +
labs(title = "Distribución de LD1 por Clase de Stroke",
x = "LD1 Score",
y = "Frecuencia",
fill = "Stroke") +
scale_fill_manual(values = c("0" = "blue", "1" = "red"))
roc_obj <- roc(lda$stroke, lda$LD1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, main = "Curva ROC para el modelo LDA")
auc(roc_obj)
## Area under the curve: 0.8375
roc_obj <- roc(lda$stroke, lda$LD1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_value <- auc(roc_obj)
Un area bajo la curva de 0.83 es considerado bueno y significa que hay un 85% de probabilidad de que el modelo clasifique correctamente un caso positivo (stroke) elegido al azar por encima de un caso negativo (no stroke) elegido al azar.