library(readr)
library(neuralnet)
library(corrplot)
library(caret)
library(dplyr)
# Load the diabetes data set from "diabetes.csv" in the working directory.
# NOTE(review): the "...1" column in the output below is the name readr gives
# to an unnamed first CSV column -- presumably a saved row index; confirm
# against the file.
diabetes <- read_csv("diabetes.csv")
# Count missing values per column (recorded console output follows).
colSums(is.na(diabetes))
## ...1 Age
## 0 0
## Sex Ethnicity
## 0 0
## BMI Waist_Circumference
## 0 0
## Fasting_Blood_Glucose HbA1c
## 0 0
## Blood_Pressure_Systolic Blood_Pressure_Diastolic
## 0 0
## Cholesterol_Total Cholesterol_HDL
## 0 0
## Cholesterol_LDL GGT
## 0 0
## Serum_Urate Physical_Activity_Level
## 0 0
## Dietary_Intake_Calories Alcohol_Consumption
## 0 0
## Smoking_Status Family_History_of_Diabetes
## 0 0
## Previous_Gestational_Diabetes
## 0
# Keep only rows without any missing value. The recorded counts above are all
# zero, so for this snapshot of the data the filter acts as a safeguard.
diabetes <- diabetes[complete.cases(diabetes), ]
# Re-check the per-column missing counts after filtering.
colSums(is.na(diabetes))
## ...1 Age
## 0 0
## Sex Ethnicity
## 0 0
## BMI Waist_Circumference
## 0 0
## Fasting_Blood_Glucose HbA1c
## 0 0
## Blood_Pressure_Systolic Blood_Pressure_Diastolic
## 0 0
## Cholesterol_Total Cholesterol_HDL
## 0 0
## Cholesterol_LDL GGT
## 0 0
## Serum_Urate Physical_Activity_Level
## 0 0
## Dietary_Intake_Calories Alcohol_Consumption
## 0 0
## Smoking_Status Family_History_of_Diabetes
## 0 0
## Previous_Gestational_Diabetes
## 0
# Binary target: 1 when HbA1c is at or above 6.5, otherwise 0.
diabetes[["diabetic"]] <- ifelse(diabetes[["HbA1c"]] >= 6.5, 1, 0)

# Replace each categorical column with zero-based numeric codes; the codes
# follow the order of the factor levels produced by as.factor().
for (col_name in c(
  "Sex",
  "Ethnicity",
  "Physical_Activity_Level",
  "Alcohol_Consumption",
  "Smoking_Status"
)) {
  diabetes[[col_name]] <- as.numeric(as.factor(diabetes[[col_name]])) - 1
}
# Flag every numeric column except the target as a feature to rescale.
numeric_cols <- vapply(diabetes, is.numeric, logical(1)) &
  colnames(diabetes) != "diabetic"
numeric_data <- diabetes[, numeric_cols]

# Per-column extremes for min-max scaling.
maxs <- vapply(numeric_data, max, numeric(1))
mins <- vapply(numeric_data, min, numeric(1))

# A constant column has range 0; use 1 instead to avoid dividing by zero.
ranges <- maxs - mins
ranges <- replace(ranges, ranges == 0, 1)

# (x - min) / range for every feature column.
scaled <- as.data.frame(
  scale(numeric_data, center = mins, scale = ranges)
)

# Re-attach the columns that were not rescaled (including the target).
scaled_data <- cbind(scaled, diabetes[, !numeric_cols])

# Class balance of the target (recorded console output follows).
table(scaled_data[["diabetic"]])
##
## 0 1
## 2216 7784

# Pairwise correlations between all numeric columns, drawn as an upper
# triangle of coefficients.
cor_matrix <- cor(
  scaled_data[, vapply(scaled_data, is.numeric, logical(1))]
)
corrplot(cor_matrix, method = "number", type = "upper", tl.cex = 0.7)
# Reproducible down-sampling: data sets with more than 2000 rows are reduced
# to a random subset of 1000 rows (presumably to keep training time short).
set.seed(123)
if (nrow(scaled_data) > 2000) {
  scaled_data <- scaled_data[sample(nrow(scaled_data), 1000), ]
}

# 80/20 train/test split stratified by class. The outcome is passed as a
# factor so createDataPartition() samples within each class; a numeric 0/1
# vector is binned by quantiles instead, which can put both classes into a
# single bin and silently disable the stratification.
indices <- createDataPartition(
  factor(scaled_data$diabetic),
  p = 0.8,
  list = FALSE
)
train <- scaled_data[indices, ]
test <- scaled_data[-indices, ]

# Predictors are all columns except:
#   - "diabetic": the target itself;
#   - "HbA1c":    the target is derived from it, so it would leak the label;
#   - "...1":     the name readr assigns to an unnamed first CSV column,
#                 presumably a row index with no predictive meaning
#                 (NOTE(review): confirm against the CSV).
excluded_cols <- c("diabetic", "HbA1c", "...1")
predictors <- names(train)[!names(train) %in% excluded_cols]
formula_nn <- as.formula(
  paste("diabetic ~", paste(predictors, collapse = " + "))
)

# Feed-forward network with two hidden layers (3 and 2 units), logistic
# activations and cross-entropy error for the binary target.
set.seed(123)
red <- neuralnet(
  formula_nn,
  data = train,
  hidden = c(3, 2),
  linear.output = FALSE,
  act.fct = "logistic",
  err.fct = "ce",
  threshold = 0.2,
  stepmax = 1e6
)
plot(red)
# neuralnet() only stores fitted values when training converged; fail with a
# clear message instead of an obscure subsetting error further down.
if (is.null(red$net.result)) {
  stop("neuralnet did not converge: no fitted values available", call. = FALSE)
}

# Training-set predictions: threshold the fitted probabilities at 0.5 and
# store them as integer 0/1 labels. The comparison has to be evaluated before
# the numeric conversion: `x > 0.5 * 1` parses as `x > (0.5 * 1)` and yields
# logicals, whose labels "TRUE"/"FALSE" match neither factor level used for
# the confusion matrix below.
prob_train <- red$net.result[[1]][, 1]
predicciones_bin <- as.integer(prob_train > 0.5)

# Per-row comparison of predicted vs. actual class; row names keep the
# original row identifiers carried by the fitted values.
tablita <- data.frame(
  Predicho = predicciones_bin,
  Real = train$diabetic,
  Acertado = predicciones_bin == train$diabetic,
  row.names = names(prob_train)
)
head(tablita)
## Predicho Real Acertado
## 8718 1 1 TRUE
## 2986 1 1 TRUE
## 1842 1 1 TRUE
## 9334 0 0 TRUE
## 3371 1 0 FALSE
## 6746 1 0 FALSE

# Share of wrong and right predictions. Tabulating over fixed levels
# guarantees both categories exist, so the two labels below always fit even
# when every prediction is right (or wrong).
tabla_resultados <- table(factor(tablita$Acertado, levels = c(FALSE, TRUE)))
porcentajes <- round(prop.table(tabla_resultados) * 100, 2)
names(porcentajes) <- c("Falló (FALSE)", "Acertó (TRUE)")
porcentajes
## Falló (FALSE) Acertó (TRUE)
## 15.12 84.88

# Confusion matrix with class 1 (diabetic) reported as the positive class, so
# sensitivity refers to detecting diabetic cases.
# NOTE(review): these metrics are computed on the training rows; the held-out
# `test` split is not evaluated in the visible code.
# The console output previously recorded here was an empty, all-NaN matrix
# caused by the logical predictions; re-run the script to regenerate it.
predicciones_factor <- factor(predicciones_bin, levels = c(0, 1))
real_factor <- factor(train$diabetic, levels = c(0, 1))
confusionMatrix(predicciones_factor, real_factor, positive = "1")
# Se proponen posibles mejoras al modelo, como ajustar la estructura de la red neuronal (agregar más capas o neuronas) y utilizar técnicas como la validación cruzada para obtener una evaluación más robusta del rendimiento del modelo.
# Este flujo de trabajo cubre todo el proceso de modelado en un análisis de datos de diabetes utilizando redes neuronales, desde la preparación de los datos hasta la evaluación del modelo final.