1. Cargar librerías y datos

library(readr)
library(neuralnet)
library(corrplot)
library(caret)
library(dplyr)

# Read the raw dataset from the working directory into a tibble.
diabetes <- readr::read_csv(file = "diabetes.csv")

2. Eliminar valores faltantes

# Count the missing values in each column before any cleaning.
diabetes %>%
  is.na() %>%
  colSums()
##                          ...1                           Age 
##                             0                             0 
##                           Sex                     Ethnicity 
##                             0                             0 
##                           BMI           Waist_Circumference 
##                             0                             0 
##         Fasting_Blood_Glucose                         HbA1c 
##                             0                             0 
##       Blood_Pressure_Systolic      Blood_Pressure_Diastolic 
##                             0                             0 
##             Cholesterol_Total               Cholesterol_HDL 
##                             0                             0 
##               Cholesterol_LDL                           GGT 
##                             0                             0 
##                   Serum_Urate       Physical_Activity_Level 
##                             0                             0 
##       Dietary_Intake_Calories           Alcohol_Consumption 
##                             0                             0 
##                Smoking_Status    Family_History_of_Diabetes 
##                             0                             0 
## Previous_Gestational_Diabetes 
##                             0
# Keep only the rows with no missing value in any column, then recount the
# NAs per column to confirm that none remain.
diabetes <- diabetes[which(complete.cases(diabetes)), ]
diabetes %>%
  is.na() %>%
  colSums()
##                          ...1                           Age 
##                             0                             0 
##                           Sex                     Ethnicity 
##                             0                             0 
##                           BMI           Waist_Circumference 
##                             0                             0 
##         Fasting_Blood_Glucose                         HbA1c 
##                             0                             0 
##       Blood_Pressure_Systolic      Blood_Pressure_Diastolic 
##                             0                             0 
##             Cholesterol_Total               Cholesterol_HDL 
##                             0                             0 
##               Cholesterol_LDL                           GGT 
##                             0                             0 
##                   Serum_Urate       Physical_Activity_Level 
##                             0                             0 
##       Dietary_Intake_Calories           Alcohol_Consumption 
##                             0                             0 
##                Smoking_Status    Family_History_of_Diabetes 
##                             0                             0 
## Previous_Gestational_Diabetes 
##                             0

3. Crear variable objetivo

# Binary target: 1 when HbA1c is at or above 6.5, 0 otherwise.
diabetes[["diabetic"]] <- as.numeric(diabetes[["HbA1c"]] >= 6.5)

4. Codificar variables categóricas

# Replace each categorical column with zero-based integer codes taken from
# its factor levels (first level -> 0, second level -> 1, ...).
# NOTE(review): for character columns the level order is alphabetical, so
# these codes impose an arbitrary ordering on nominal variables such as
# Ethnicity -- consider one-hot encoding instead.
categorical_cols <- c(
  "Sex",
  "Ethnicity",
  "Physical_Activity_Level",
  "Alcohol_Consumption",
  "Smoking_Status"
)
diabetes[categorical_cols] <- lapply(
  diabetes[categorical_cols],
  function(column) as.numeric(as.factor(column)) - 1
)

5. Normalizar variables numéricas

# Min-max scale every numeric column except the target: (x - min) / (max - min).
numeric_cols <- vapply(diabetes, is.numeric, logical(1)) &
  colnames(diabetes) != "diabetic"
numeric_data <- diabetes[, numeric_cols]

# Per-column extremes used as centre and scale.
mins <- vapply(numeric_data, min, numeric(1))
maxs <- vapply(numeric_data, max, numeric(1))
ranges <- maxs - mins
# A constant column has zero range; divide by 1 instead to avoid 0 / 0.
ranges[ranges == 0] <- 1

scaled <- as.data.frame(
  scale(numeric_data, center = mins, scale = ranges)
)
# Re-attach the columns that were left unscaled (the target among them).
scaled_data <- cbind(scaled, diabetes[, !numeric_cols])

6. Análisis exploratorio

# Class balance of the target (number of rows with 0 and with 1).
table(scaled_data[["diabetic"]])
## 
##    0    1 
## 2216 7784
# Pairwise correlations between all numeric columns, drawn as the upper
# triangle of a matrix with the coefficients printed as numbers.
cor_matrix <- cor(
  scaled_data[, vapply(scaled_data, is.numeric, logical(1))]
)
corrplot(
  cor_matrix,
  method = "number",
  type = "upper",
  tl.cex = 0.7
)

7. División de datos

# Split the data into a training set (80%) and a test set (20%).
set.seed(123)
# Datasets with more than 2000 rows are reduced to 1000 random rows first.
# NOTE(review): presumably done to keep neuralnet training time manageable.
if (nrow(scaled_data) > 2000) {
  scaled_data <- scaled_data[sample(nrow(scaled_data), 1000), ]
}
# Pass the target as a factor so createDataPartition() stratifies by class.
# Given a numeric 0/1 vector it bins the values by quantiles instead, which
# can put both classes in the same bin and silently drop the stratification.
indices <- createDataPartition(
  factor(scaled_data$diabetic),
  p = 0.8,
  list = FALSE
)
train <- scaled_data[indices, ]
test <- scaled_data[-indices, ]

8. Fórmula del modelo

# Build the model formula: diabetic ~ <every remaining column>.
# Columns kept out of the predictors:
#   - "diabetic": the response itself.
#   - "HbA1c":    the response is computed from it, so it would leak the answer.
#   - "...1":     the unnamed first CSV column (readr names it "...1").
#                 NOTE(review): presumably a row index, not a clinical
#                 measurement -- confirm against the CSV.
excluded_cols <- c("diabetic", "HbA1c", "...1")
predictors <- setdiff(names(train), excluded_cols)
formula_nn <- reformulate(predictors, response = "diabetic")

9. Crear y entrenar red neuronal

# Fit a classification network with two hidden layers (3 and 2 units),
# logistic activations (also on the output unit, since linear.output is
# FALSE) and cross-entropy error.
# The seed is set right before training so the fit is reproducible.
set.seed(123)
red <- neuralnet(
  formula = formula_nn,
  data = train,
  hidden = c(3, 2),
  threshold = 0.2,
  stepmax = 1e6,
  err.fct = "ce",
  act.fct = "logistic",
  linear.output = FALSE
)
# Draw the fitted network.
plot(red)

10. Evaluación del modelo

# In-sample predictions: red$net.result[[1]] holds the network output for the
# training rows. The comparison must be parenthesised before multiplying:
# `x > 0.5 * 1` parses as `x > (0.5 * 1)` and yields TRUE/FALSE, whereas
# `(x > 0.5) * 1` yields the intended 0/1 class labels.
predicciones_bin <- (unlist(red$net.result[[1]]) > 0.5) * 1

# Side-by-side view of predicted vs. actual class for each training row.
# NOTE(review): these are training-set results, so the hit rate computed from
# them is likely optimistic compared with performance on unseen data.
tablita <- data.frame(
  Predicho = predicciones_bin,
  Real = train$diabetic,
  Acertado = predicciones_bin == train$diabetic
)
head(tablita)
##      Predicho Real Acertado
## 8718     TRUE    1     TRUE
## 2986     TRUE    1     TRUE
## 1842     TRUE    1     TRUE
## 9334    FALSE    0     TRUE
## 3371     TRUE    0    FALSE
## 6746     TRUE    0    FALSE
# Percentage of wrong and right predictions.
# Both levels are fixed explicitly so the table always has a FALSE and a TRUE
# cell, in that order. Without this, a model that gets every row right (or
# every row wrong) produces a one-cell table and the two labels below no
# longer line up with the counts.
tabla_resultados <- table(
  factor(tablita$Acertado, levels = c(FALSE, TRUE))
)
porcentajes <- round(prop.table(tabla_resultados) * 100, 2)
names(porcentajes) <- c("Falló (FALSE)", "Acertó (TRUE)")
porcentajes
## Falló (FALSE) Acertó (TRUE) 
##         15.12         84.88

11. Matriz de confusión

# Confusion matrix on the held-out test set, so the reported metrics reflect
# rows the network did not see during training.
# predict() on a neuralnet fit returns a matrix of outputs, one row per row
# of `newdata`; column 1 is the single output unit.
prob_test <- predict(red, newdata = test)
# Convert to integer 0/1 before building the factor: factor(x, levels = c(0, 1))
# turns every value that is not 0 or 1 (e.g. TRUE/FALSE) into NA, which would
# leave the confusion matrix empty.
pred_test <- as.integer(prob_test[, 1] > 0.5)
predicciones_factor <- factor(pred_test, levels = c(0, 1))
real_factor <- factor(test$diabetic, levels = c(0, 1))
# Report sensitivity/specificity with class 1 (diabetic) as the positive class.
# NOTE: re-knit the document to refresh the printed output that follows.
confusionMatrix(predicciones_factor, real_factor, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 0 1
##          0 0 0
##          1 0 0
##                                   
##                Accuracy : NaN     
##                  95% CI : (NA, NA)
##     No Information Rate : NA      
##     P-Value [Acc > NIR] : NA      
##                                   
##                   Kappa : NaN     
##                                   
##  Mcnemar's Test P-Value : NA      
##                                   
##             Sensitivity :  NA     
##             Specificity :  NA     
##          Pos Pred Value :  NA     
##          Neg Pred Value :  NA     
##              Prevalence : NaN     
##          Detection Rate : NaN     
##    Detection Prevalence : NaN     
##       Balanced Accuracy :  NA     
##                                   
##        'Positive' Class : 0       
## 

12. ¿Qué aprendiste de la práctica?

13. ¿Qué mejorarías del modelo?