knitr::opts_chunk$set(echo = TRUE)
# Cargar librerías
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(missForest)
# Read the water potability dataset from its CSV file into a data frame.
df <- read.csv(file = "/cloud/project/water_potability.csv")

# Preview the first rows of the dataset.
head(x = df)
##         ph Hardness   Solids Chloramines  Sulfate Conductivity Organic_carbon
## 1       NA 204.8905 20791.32    7.300212 368.5164     564.3087      10.379783
## 2 3.716080 129.4229 18630.06    6.635246       NA     592.8854      15.180013
## 3 8.099124 224.2363 19909.54    9.275884       NA     418.6062      16.868637
## 4 8.316766 214.3734 22018.42    8.059332 356.8861     363.2665      18.436524
## 5 9.092223 181.1015 17978.99    6.546600 310.1357     398.4108      11.558279
## 6 5.584087 188.3133 28748.69    7.544869 326.6784     280.4679       8.399735
##   Trihalomethanes Turbidity Potability
## 1        86.99097  2.963135          0
## 2        56.32908  4.500656          0
## 3        66.42009  3.055934          0
## 4       100.34167  4.628771          0
## 5        31.99799  4.075075          0
## 6        54.91786  2.559708          0
# Per-column summary statistics of the dataset; the "NA's" rows in the output
# show which columns contain missing values and how many.
summary(df)
##        ph            Hardness          Solids         Chloramines    
##  Min.   : 0.000   Min.   : 47.43   Min.   :  320.9   Min.   : 0.352  
##  1st Qu.: 6.093   1st Qu.:176.85   1st Qu.:15666.7   1st Qu.: 6.127  
##  Median : 7.037   Median :196.97   Median :20927.8   Median : 7.130  
##  Mean   : 7.081   Mean   :196.37   Mean   :22014.1   Mean   : 7.122  
##  3rd Qu.: 8.062   3rd Qu.:216.67   3rd Qu.:27332.8   3rd Qu.: 8.115  
##  Max.   :14.000   Max.   :323.12   Max.   :61227.2   Max.   :13.127  
##  NA's   :491                                                         
##     Sulfate       Conductivity   Organic_carbon  Trihalomethanes  
##  Min.   :129.0   Min.   :181.5   Min.   : 2.20   Min.   :  0.738  
##  1st Qu.:307.7   1st Qu.:365.7   1st Qu.:12.07   1st Qu.: 55.845  
##  Median :333.1   Median :421.9   Median :14.22   Median : 66.622  
##  Mean   :333.8   Mean   :426.2   Mean   :14.28   Mean   : 66.396  
##  3rd Qu.:360.0   3rd Qu.:481.8   3rd Qu.:16.56   3rd Qu.: 77.337  
##  Max.   :481.0   Max.   :753.3   Max.   :28.30   Max.   :124.000  
##  NA's   :781                                     NA's   :162      
##    Turbidity       Potability    
##  Min.   :1.450   Min.   :0.0000  
##  1st Qu.:3.440   1st Qu.:0.0000  
##  Median :3.955   Median :0.0000  
##  Mean   :3.967   Mean   :0.3901  
##  3rd Qu.:4.500   3rd Qu.:1.0000  
##  Max.   :6.739   Max.   :1.0000  
## 
# Structure of the dataset: number of observations, column names and the
# storage type of each column.
str(df)
## 'data.frame':    3276 obs. of  10 variables:
##  $ ph             : num  NA 3.72 8.1 8.32 9.09 ...
##  $ Hardness       : num  205 129 224 214 181 ...
##  $ Solids         : num  20791 18630 19910 22018 17979 ...
##  $ Chloramines    : num  7.3 6.64 9.28 8.06 6.55 ...
##  $ Sulfate        : num  369 NA NA 357 310 ...
##  $ Conductivity   : num  564 593 419 363 398 ...
##  $ Organic_carbon : num  10.4 15.2 16.9 18.4 11.6 ...
##  $ Trihalomethanes: num  87 56.3 66.4 100.3 32 ...
##  $ Turbidity      : num  2.96 4.5 3.06 4.63 4.08 ...
##  $ Potability     : int  0 0 0 0 0 0 0 0 0 0 ...
# Class balance of the dependent variable ("Potability") as a donut chart.
# The labels are supplied by hand in the order table() reports the counts
# (class 0 first, then class 1).
potability_counts <- table(df$Potability)
pie_chart <- plotly::layout(
  plot_ly(
    labels = c("Not Potable", "Potable"),
    values = potability_counts,
    type = "pie",
    hole = 0.35,
    opacity = 0.8
  ),
  title = "Pie Chart of Potability Feature",
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
)

pie_chart
# Heatmap of the pairwise correlations between all columns.
#
# ph, Sulfate and Trihalomethanes contain missing values. With cor()'s default
# (use = "everything") every coefficient involving one of those columns is NA,
# which leaves blank rows/columns in the heatmap. "pairwise.complete.obs"
# computes each coefficient from the rows where both variables are observed.
correlation_matrix <- cor(df, use = "pairwise.complete.obs")
heatmap <- plot_ly(
  z = correlation_matrix,
  colorscale = "YlGnBu",
  reversescale = TRUE,
  type = "heatmap",
  # Take the axis labels from the matrix itself so they always match `z`.
  x = colnames(correlation_matrix),
  y = rownames(correlation_matrix),
  zmin = -1,
  zmax = 1
) %>%
  layout(
    title = "Correlation Matrix",
    xaxis = list(title = ""),
    yaxis = list(title = "")
  )
heatmap
# Density of every feature, drawn separately for the non-potable (blue) and
# potable (red) classes so the two distributions can be compared.
non_potable <- df %>% filter(Potability == 0)
potable <- df %>% filter(Potability == 1)

# Select the features by name rather than by column position.
feature_cols <- setdiff(colnames(df), "Potability")

# Build one plot per feature, collected in a list named by feature.
# `.data[[col]]` replaces the deprecated aes_string(). The mapping is evaluated
# lazily when the plot is rendered, so each plot is built inside its own
# function call to keep its own value of `col`.
plots <- lapply(setNames(nm = feature_cols), function(col) {
  ggplot() +
    geom_density(
      data = non_potable, aes(x = .data[[col]]),
      color = "blue", fill = "blue", alpha = 0.5
    ) +
    geom_density(
      data = potable, aes(x = .data[[col]]),
      color = "red", fill = "red", alpha = 0.5
    ) +
    # Set the x label explicitly so it stays the plain column name.
    labs(title = col, x = col) +
    theme_minimal()
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Draw all per-feature density plots together on a three-column grid.
gridExtra::grid.arrange(
  grobs = plots,
  ncol = 3
)
## Warning: Removed 314 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 488 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 293 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 107 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 55 rows containing non-finite outside the scale range
## (`stat_density()`).

# Missing-value overview: for every variable, a stacked bar showing how many
# observations are present (blue) and how many are missing (red).
#
# The TRUE/FALSE flags are counted per variable before plotting; mapping the
# raw logical flag to `y` would not produce bar heights that represent counts.
missingness_matrix <- df %>%
  is.na() %>%
  as.data.frame() %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing"
  ) %>%
  count(variable, missing, name = "n_obs") %>%
  ggplot(aes(x = variable, y = n_obs, fill = missing)) +
  geom_col() +
  scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
  labs(title = "Missing Values", x = "Variable", y = "Observations") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

missingness_matrix

# Mean imputation: replace the missing entries of ph, Sulfate and
# Trihalomethanes with the mean of the observed values of the same column.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split performed further down in this script.
df_filled <- df %>%
  mutate(
    across(
      c(ph, Sulfate, Trihalomethanes),
      \(x) coalesce(x, mean(x, na.rm = TRUE))
    )
  )

# Confirm that no missing values remain (expected count: 0).
df_filled %>%
  is.na() %>%
  sum()
## [1] 0
# Split the data into training (70%) and test (30%) sets.
#
# The outcome is passed as a factor so that createDataPartition() samples
# within each class and both sets keep the original potable/non-potable ratio.
# A numeric outcome is binned by percentiles instead, which does not separate
# a 0/1 variable into its two classes.
set.seed(3)
train_index <- createDataPartition(
  factor(df_filled$Potability),
  p = 0.7,
  list = FALSE
)
train_data <- df_filled[train_index, ]
test_data <- df_filled[-train_index, ]

# Min-max normalisation.
#
# Linearly rescales a numeric vector so that its minimum maps to 0 and its
# maximum maps to 1.
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, missing values are ignored when computing the minimum and
#          maximum and stay NA in the result. If FALSE (default), any NA in
#          `x` makes the whole result NA.
#
# Returns a numeric vector of the same length as `x`. A vector whose values
# are all equal (zero range) is mapped to 0 instead of NaN from 0 / 0.
normalize <- function(x, na.rm = FALSE) {
  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]

  # Zero range: every (non-missing) value equals the minimum, so subtracting
  # it yields 0 while keeping NA positions intact.
  if (!is.na(span) && span == 0) {
    return(x - bounds[1])
  }

  (x - bounds[1]) / span
}

# Min-max scale the feature columns of both sets using the minimum and range
# of the TRAINING set only. Scaling the test set with its own min/max would
# put the two sets on different scales and use test-set statistics in
# preprocessing. Test values outside the training range may therefore fall
# slightly outside [0, 1].
feature_cols <- setdiff(names(train_data), "Potability")
train_min <- vapply(train_data[feature_cols], min, numeric(1))
train_max <- vapply(train_data[feature_cols], max, numeric(1))
train_span <- train_max - train_min
# A constant feature has zero range; divide by 1 so it becomes 0, not NaN.
train_span[which(train_span == 0)] <- 1

train_data[feature_cols] <- scale(
  train_data[feature_cols],
  center = train_min,
  scale = train_span
)
test_data[feature_cols] <- scale(
  test_data[feature_cols],
  center = train_min,
  scale = train_span
)