# Global knitr chunk option: echo = TRUE shows each chunk's source code in the
# knitted output alongside its results.
knitr::opts_chunk$set(echo = TRUE)
# Cargar librerías
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(missForest)
# Load the water potability data set from CSV into a data frame
df <- read.csv(
  file = "/cloud/project/water_potability.csv",
  header = TRUE
)
# Print the first six rows for a quick look at the columns
head(df, n = 6L)
## ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
## 1 NA 204.8905 20791.32 7.300212 368.5164 564.3087 10.379783
## 2 3.716080 129.4229 18630.06 6.635246 NA 592.8854 15.180013
## 3 8.099124 224.2363 19909.54 9.275884 NA 418.6062 16.868637
## 4 8.316766 214.3734 22018.42 8.059332 356.8861 363.2665 18.436524
## 5 9.092223 181.1015 17978.99 6.546600 310.1357 398.4108 11.558279
## 6 5.584087 188.3133 28748.69 7.544869 326.6784 280.4679 8.399735
## Trihalomethanes Turbidity Potability
## 1 86.99097 2.963135 0
## 2 56.32908 4.500656 0
## 3 66.42009 3.055934 0
## 4 100.34167 4.628771 0
## 5 31.99799 4.075075 0
## 6 54.91786 2.559708 0
# Per-column summary statistics (quartiles, mean and NA counts)
df %>% summary()
## ph Hardness Solids Chloramines
## Min. : 0.000 Min. : 47.43 Min. : 320.9 Min. : 0.352
## 1st Qu.: 6.093 1st Qu.:176.85 1st Qu.:15666.7 1st Qu.: 6.127
## Median : 7.037 Median :196.97 Median :20927.8 Median : 7.130
## Mean : 7.081 Mean :196.37 Mean :22014.1 Mean : 7.122
## 3rd Qu.: 8.062 3rd Qu.:216.67 3rd Qu.:27332.8 3rd Qu.: 8.115
## Max. :14.000 Max. :323.12 Max. :61227.2 Max. :13.127
## NA's :491
## Sulfate Conductivity Organic_carbon Trihalomethanes
## Min. :129.0 Min. :181.5 Min. : 2.20 Min. : 0.738
## 1st Qu.:307.7 1st Qu.:365.7 1st Qu.:12.07 1st Qu.: 55.845
## Median :333.1 Median :421.9 Median :14.22 Median : 66.622
## Mean :333.8 Mean :426.2 Mean :14.28 Mean : 66.396
## 3rd Qu.:360.0 3rd Qu.:481.8 3rd Qu.:16.56 3rd Qu.: 77.337
## Max. :481.0 Max. :753.3 Max. :28.30 Max. :124.000
## NA's :781 NA's :162
## Turbidity Potability
## Min. :1.450 Min. :0.0000
## 1st Qu.:3.440 1st Qu.:0.0000
## Median :3.955 Median :0.0000
## Mean :3.967 Mean :0.3901
## 3rd Qu.:4.500 3rd Qu.:1.0000
## Max. :6.739 Max. :1.0000
##
# Structure of the data set: dimensions and the type of every column
df %>% str()
## 'data.frame': 3276 obs. of 10 variables:
## $ ph : num NA 3.72 8.1 8.32 9.09 ...
## $ Hardness : num 205 129 224 214 181 ...
## $ Solids : num 20791 18630 19910 22018 17979 ...
## $ Chloramines : num 7.3 6.64 9.28 8.06 6.55 ...
## $ Sulfate : num 369 NA NA 357 310 ...
## $ Conductivity : num 564 593 419 363 398 ...
## $ Organic_carbon : num 10.4 15.2 16.9 18.4 11.6 ...
## $ Trihalomethanes: num 87 56.3 66.4 100.3 32 ...
## $ Turbidity : num 2.96 4.5 3.06 4.63 4.08 ...
## $ Potability : int 0 0 0 0 0 0 0 0 0 0 ...
# Class balance of the dependent variable ("Potability") as a donut chart.
# The labels are positional: they follow the order of the table (0, then 1).
potability_counts <- table(df$Potability)
pie_chart <- layout(
  plot_ly(
    type = "pie",
    labels = c("Not Potable", "Potable"),
    values = potability_counts,
    hole = 0.35,
    opacity = 0.8
  ),
  title = "Pie Chart of Potability Feature",
  # Hide both axes: they carry no meaning for a pie trace
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
)
pie_chart
# Pairwise correlations between all columns, shown as a heatmap.
# ph, Sulfate and Trihalomethanes contain NAs, so a plain cor(df) returns NA
# for every pair that involves them. "pairwise.complete.obs" computes each
# coefficient from the rows where both variables are observed.
correlation_matrix <- cor(df, use = "pairwise.complete.obs")
heatmap <- plot_ly(
  z = correlation_matrix,
  x = colnames(df),
  y = colnames(df),
  type = "heatmap",
  colorscale = "YlGnBu",
  reversescale = TRUE,
  # Fix the colour range to the full correlation range
  zmin = -1,
  zmax = 1
) %>%
  layout(
    title = "Correlation Matrix",
    xaxis = list(title = ""),
    yaxis = list(title = "")
  )
heatmap
# Feature distributions for the non-potable (blue) and potable (red) classes
non_potable <- df %>% filter(Potability == 0)
potable <- df %>% filter(Potability == 1)

# Every column except the target; selected by name rather than by position
feature_cols <- setdiff(colnames(df), "Potability")

# Build the overlaid density plot for one feature.
# A function is used instead of a for loop on purpose: aes() captures its
# arguments lazily, so each plot needs its own binding of `feature`; in a loop
# every plot would end up drawing the last column.
# `.data[[feature]]` replaces the deprecated aes_string().
density_plot <- function(feature) {
  ggplot() +
    geom_density(
      data = non_potable, aes(x = .data[[feature]]),
      color = "blue", fill = "blue", alpha = 0.5
    ) +
    geom_density(
      data = potable, aes(x = .data[[feature]]),
      color = "red", fill = "red", alpha = 0.5
    ) +
    # Set the axis label explicitly so it shows the column name
    labs(title = feature, x = feature) +
    theme_minimal()
}

# Named list of plots, one entry per feature
plots <- lapply(feature_cols, density_plot)
names(plots) <- feature_cols
# Draw all density plots on one page, three per row.
# The warnings recorded below come from stat_density() dropping rows with
# non-finite values. The counts pair up to 491, 781 and 162, which matches the
# NA counts reported by summary(df) (presumably ph, Sulfate and
# Trihalomethanes, one warning per class).
grid.arrange(grobs = plots, ncol = 3)
## Warning: Removed 314 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 488 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 293 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 107 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 55 rows containing non-finite outside the scale range
## (`stat_density()`).

# Missing values per variable: one stacked bar per column, split into observed
# (blue) and missing (red) rows.
# The logical flag is mapped to `fill` only and geom_bar() counts the rows, so
# the bar heights are real counts. pivot_longer() replaces the superseded
# gather().
missingness_matrix <- df %>%
  is.na() %>%
  as.data.frame() %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing"
  ) %>%
  ggplot(aes(x = variable, fill = missing)) +
  geom_bar() +
  scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
  labs(
    title = "Missing Values",
    x = "Variable",
    y = "Number of observations",
    fill = "Missing"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
missingness_matrix

# Replace missing values in ph, Sulfate and Trihalomethanes with the mean of
# the respective column.
# NOTE(review): the means are computed over the full data set, i.e. before the
# train/test split further down, so test rows contribute to the imputed values.
df_filled <- df %>%
  mutate(
    across(
      c(ph, Sulfate, Trihalomethanes),
      function(column) replace_na(column, mean(column, na.rm = TRUE))
    )
  )
# Check that no missing values remain
sum(is.na(df_filled))
## [1] 0
# Split the data into training (70 %) and test (30 %) sets.
# createDataPartition() only stratifies by class when the outcome is a factor.
# A numeric outcome is binned by quantiles instead, and for a 0/1 variable the
# bins collapse into one, giving a plain random split. Passing a factor keeps
# the class proportions the same in both sets.
set.seed(3)
train_index <- createDataPartition(
  factor(df_filled$Potability),
  p = 0.7,
  list = FALSE
)
train_data <- df_filled[train_index, ]
test_data <- df_filled[-train_index, ]
# Min-max normalization: rescale a numeric vector linearly to [0, 1].
#
# x      Numeric vector.
# na.rm  Ignore NAs when computing the range? NAs stay NA in the result.
#
# Returns a numeric vector of the same length as `x`. A constant vector maps
# to all zeros instead of NaN (0 / 0). Empty or all-NA input is returned
# unchanged.
normalize <- function(x, na.rm = TRUE) {
  if (all(is.na(x))) {
    return(x)
  }
  limits <- range(x, na.rm = na.rm)
  span <- limits[2] - limits[1]
  if (is.na(span)) {
    # Only reachable with na.rm = FALSE and NAs present: propagate NA
    return(x - limits[1])
  }
  if (span == 0) {
    # Zero range: avoid dividing by zero
    return(x - limits[1])
  }
  (x - limits[1]) / span
}
# Scale every feature column (all columns except the target) to the range of
# the TRAINING data. The test set reuses the training minimum and range, so
# both sets share one scale and no test-set statistics enter preprocessing.
# Test values may therefore fall slightly outside [0, 1].
feature_cols <- setdiff(colnames(train_data), "Potability")
train_min <- vapply(train_data[feature_cols], min, numeric(1))
train_span <- vapply(train_data[feature_cols], max, numeric(1)) - train_min
# A constant training column would divide by zero; leave it at 0 instead
train_span[train_span == 0] <- 1

# Apply the training-set min-max scaling to the feature columns of `data`
scale_with_train <- function(data) {
  data[feature_cols] <- Map(
    function(values, lo, span) (values - lo) / span,
    data[feature_cols],
    train_min,
    train_span
  )
  data
}

# The statistics above are computed before train_data is overwritten
train_data <- scale_with_train(train_data)
test_data <- scale_with_train(test_data)