Potabilidad de agua

El acceso al agua potable es un derecho humano esencial, pero más de 2 mil millones de personas en el mundo carecen de fuentes seguras. La contaminación, la sobreexplotación de acuíferos y el cambio climático complican la situación, afectando la salud y el bienestar de las comunidades. En áreas urbanas, el agua es tratada y distribuida por sistemas de tuberías, mientras que en zonas rurales a menudo se depende de pozos. Para mejorar el acceso, se promueven tecnologías de tratamiento, la conservación del agua y la cooperación internacional. Proteger y gestionar adecuadamente los recursos hídricos es crucial para un futuro sostenible.

Se carga la base de datos

library(readr)

# Read the water-quality measurements from the CSV file into `df`.
df <- read_csv(file = "water_potability.csv")
## Rows: 3276 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): ph, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Se analizan las dimensiones

# Report the number of rows and columns of the loaded data.
dim(df)
## [1] 3276   10

Se visualiza la base de datos

# Show the structure of df: column names, column types and leading values.
str(df)
## spc_tbl_ [3,276 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ph             : num [1:3276] NA 3.72 8.1 8.32 9.09 ...
##  $ Hardness       : num [1:3276] 205 129 224 214 181 ...
##  $ Solids         : num [1:3276] 20791 18630 19910 22018 17979 ...
##  $ Chloramines    : num [1:3276] 7.3 6.64 9.28 8.06 6.55 ...
##  $ Sulfate        : num [1:3276] 369 NA NA 357 310 ...
##  $ Conductivity   : num [1:3276] 564 593 419 363 398 ...
##  $ Organic_carbon : num [1:3276] 10.4 15.2 16.9 18.4 11.6 ...
##  $ Trihalomethanes: num [1:3276] 87 56.3 66.4 100.3 32 ...
##  $ Turbidity      : num [1:3276] 2.96 4.5 3.06 4.63 4.08 ...
##  $ Potability     : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ph = col_double(),
##   ..   Hardness = col_double(),
##   ..   Solids = col_double(),
##   ..   Chloramines = col_double(),
##   ..   Sulfate = col_double(),
##   ..   Conductivity = col_double(),
##   ..   Organic_carbon = col_double(),
##   ..   Trihalomethanes = col_double(),
##   ..   Turbidity = col_double(),
##   ..   Potability = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Histograms of selected variables.
# hist() bins values regardless of their order, so wrapping the column in
# sort() was redundant and only leaked "sort(df$...)" into the default title
# and axis label. Explicit titles/labels are set instead.
# The Hardness histogram used to be drawn twice; it is now drawn once.
hist(df$Hardness, main = "Hardness", xlab = "Hardness")

# las = 2 draws the axis tick labels perpendicular to the axes.
hist(df$Potability, main = "Potability", xlab = "Potability", las = 2)

hist(df$Turbidity, main = "Turbidity", xlab = "Turbidity", las = 2)

library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Missingness map of the data set.
# df is a tibble (read with readr), and calling missmap() on it directly
# produced "Unknown or uninitialised column" warnings for `arguments` and
# `imputations`: tibbles warn when `$` is used on a column that does not
# exist. Passing a plain data.frame avoids those spurious warnings without
# changing the plotted data.
missmap(as.data.frame(df))
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

# Boxplots of every predictor, drawn as two multi-panel figures.
# par() returns the previous settings; they are restored at the end so the
# multi-panel layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(1, 4))
boxplot(df$Sulfate, main = "Sulfate")
boxplot(df$ph, main = "ph")
boxplot(df$Turbidity, main = "Turbidity")
boxplot(df$Conductivity, main = "Conductivity")

par(mfrow = c(1, 5))
boxplot(df$Hardness, main = "Hardness")
boxplot(df$Solids, main = "Solids")
# Title previously misspelled as "Cloramines".
boxplot(df$Chloramines, main = "Chloramines")
boxplot(df$Organic_carbon, main = "Organic")
boxplot(df$Trihalomethanes, main = "Trihalomethanes")

# Restore the graphics parameters that were active before this block.
par(old_par)

summary(df) # Examine the summary statistics (central tendency, quartiles, NA counts)
##        ph            Hardness          Solids         Chloramines    
##  Min.   : 0.000   Min.   : 47.43   Min.   :  320.9   Min.   : 0.352  
##  1st Qu.: 6.093   1st Qu.:176.85   1st Qu.:15666.7   1st Qu.: 6.127  
##  Median : 7.037   Median :196.97   Median :20927.8   Median : 7.130  
##  Mean   : 7.081   Mean   :196.37   Mean   :22014.1   Mean   : 7.122  
##  3rd Qu.: 8.062   3rd Qu.:216.67   3rd Qu.:27332.8   3rd Qu.: 8.115  
##  Max.   :14.000   Max.   :323.12   Max.   :61227.2   Max.   :13.127  
##  NA's   :491                                                         
##     Sulfate       Conductivity   Organic_carbon  Trihalomethanes  
##  Min.   :129.0   Min.   :181.5   Min.   : 2.20   Min.   :  0.738  
##  1st Qu.:307.7   1st Qu.:365.7   1st Qu.:12.07   1st Qu.: 55.845  
##  Median :333.1   Median :421.9   Median :14.22   Median : 66.622  
##  Mean   :333.8   Mean   :426.2   Mean   :14.28   Mean   : 66.396  
##  3rd Qu.:360.0   3rd Qu.:481.8   3rd Qu.:16.56   3rd Qu.: 77.337  
##  Max.   :481.0   Max.   :753.3   Max.   :28.30   Max.   :124.000  
##  NA's   :781                                     NA's   :162      
##    Turbidity       Potability    
##  Min.   :1.450   Min.   :0.0000  
##  1st Qu.:3.440   1st Qu.:0.0000  
##  Median :3.955   Median :0.0000  
##  Mean   :3.967   Mean   :0.3901  
##  3rd Qu.:4.500   3rd Qu.:1.0000  
##  Max.   :6.739   Max.   :1.0000  
## 
# Five side-by-side boxplots for the remaining predictors.
# par() returns the previous settings; they are restored afterwards so the
# 1 x 5 layout does not affect later base-graphics plots.
old_par <- par(mfrow = c(1, 5))
boxplot(df$Hardness, main = "Hardness")
boxplot(df$Solids, main = "Solids")
# Title previously misspelled as "Cloramines".
boxplot(df$Chloramines, main = "Chloramines")
boxplot(df$Organic_carbon, main = "Organic")
boxplot(df$Trihalomethanes, main = "Trihalomethanes")
par(old_par)

# Cargar los paquetes necesarios
library(ggplot2)
library(reshape2)
library(pheatmap)
library(dplyr) # Para la selección de columnas numéricas
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the numeric columns of the data frame.
# select(where(...)) replaces the superseded select_if() helper.
numeric_df <- df %>% select(where(is.numeric))

# Correlation matrix of the numeric columns. use = "complete.obs" computes it
# only from rows that have no missing value in any column.
cor_matrix <- cor(numeric_df, use = "complete.obs")

# Draw the correlation matrix as a heat map with pheatmap.
pheatmap(
  mat = cor_matrix,
  # 100-step colour ramp from pink through white to purple
  color = colorRampPalette(c("pink", "white", "purple"))(100),
  display_numbers = TRUE, # print the coefficient inside each cell
  number_format = "%.2f", # two decimal places
  main = "Correlation of Water Potability" # plot title
)

# Análisis:
# 1. Correlaciones Fuertes Positivas:
#    - Se observa que la variable `Hardness` tiene una correlación positiva fuerte con `Solids`.
#      Esto indica que a medida que aumentan los sólidos disueltos en el agua, también aumenta la dureza.
#      Este comportamiento es esperado ya que los sólidos contribuyen a la dureza del agua.

# 2. Correlaciones Negativas:
#    - `pH` muestra una correlación negativa moderada con `Chloramines`, sugiriendo que a medida que el pH aumenta,
#      la concentración de cloraminas tiende a disminuir. Esta relación podría estar relacionada con la química del agua.

# 3. Correlaciones Débiles o Nulas:
#    - Las correlaciones entre `Conductivity` y otras variables como `pH` y `Turbidity` son bastante débiles,
#      lo que indica que no hay una relación lineal significativa entre ellas.

# 4. Relación con la Potabilidad:
#    - Si analizamos la correlación de `Potability` con otras variables, vemos que no hay correlaciones lineales fuertes.
#
library(ggplot2)


# Treat the 0/1 potability flag as a categorical variable so it can drive
# the facets below.
df$Potability <- as.factor(df$Potability)

# pH distribution per potability class: a density-scaled histogram with a
# density curve on top, one facet per class.
# after_stat(density) replaces the `..density..` notation deprecated in
# ggplot2 3.4.0. na.rm = TRUE drops the rows with missing pH explicitly
# instead of emitting a "Removed ... rows" warning per layer.
ggplot(df, aes(x = ph)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "pink", color = "black", alpha = 0.7,
    na.rm = TRUE
  ) +
  geom_density(color = "red", na.rm = TRUE) +
  facet_wrap(~ Potability) +
  ggtitle("Distribution of pH by Potability") +
  theme_minimal() +
  theme(plot.title = element_text(size = 15))
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_density()`).

# Análisis del gráfico:
# 1. Diferencias en la Distribución de pH:
#    - El gráfico muestra las distribuciones de pH para agua potable y no potable, usando facetas para separar los grupos.
#    - Comparando las facetas, podemos observar si hay diferencias notables en las distribuciones de pH entre los dos grupos.

# 2. Forma de las Distribuciones:
#    - La curva de densidad (en rojo) muestra la forma de la distribución del pH para cada grupo.
#    - Se debe observar si las distribuciones son más anchas (mayor variabilidad) o estrechas (menor variabilidad).
#    - Examinar la simetría y las colas de las distribuciones para identificar si hay diferencias significativas.

# 3. Valor Central del pH:
#    - Observar el valor central (pico de la distribución) en cada faceta.
#    - Determinar si el agua potable tiende a tener un rango de pH diferente comparado con el agua no potable.

# 4. Variabilidad en pH:
#    - Evaluar la dispersión de los valores de pH en cada grupo.
#    - Identificar si uno de los grupos presenta mayor variabilidad en pH, lo cual podría estar relacionado con la potabilidad.

# Posibles Implicaciones:
# - Si se observan diferencias significativas en las distribuciones de pH entre el agua potable y no potable, esto podría indicar que el pH es un factor importante en la determinación de la potabilidad.
# - En caso de que el agua no potable muestre una mayor dispersión o desviación hacia valores extremos de pH, esto podría sugerir que el control del pH es relevante para mejorar la potabilidad del agua.
# Cargar los paquetes necesarios
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(dplyr)

# Drop the 'Potability' column so that only the predictors remain.
df_without_potability <- df %>% select(-Potability)

# Build one histogram per remaining column and collect them in a list named
# by column.
# aes_string() was deprecated in ggplot2 3.0.0; the `.data` pronoun is the
# supported way to map a column whose name is held in a string.
# lapply() with a function is used instead of a for loop on purpose: the
# plots are rendered later, and `.data[[col]]` is evaluated at render time.
# Each function call keeps its own `col`, whereas a for loop would leave every
# stored plot pointing at the last column.
plot_list <- lapply(
  colnames(df_without_potability),
  function(col) {
    ggplot(df_without_potability, aes(x = .data[[col]])) +
      geom_histogram(
        bins = 50, fill = "skyblue", color = "black", alpha = 0.7,
        na.rm = TRUE # drop missing values without a per-plot warning
      ) +
      ggtitle(paste("Histogram of", col)) +
      xlab(col) + # keep the plain column name as the axis label
      theme_minimal()
  }
)
names(plot_list) <- colnames(df_without_potability)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Arrange the plots stored in plot_list in a 3 x 3 grid.
grid.arrange(grobs = plot_list, ncol = 3, nrow = 3)
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 781 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Cargar los paquetes necesarios
library(ggplot2)
library(dplyr)

# Make sure 'Potability' is a factor so it maps to a discrete fill scale.
df$Potability <- as.factor(df$Potability)

# Names of the numeric columns. Once 'Potability' is a factor it is no longer
# selected here. select(where(...)) replaces the superseded select_if().
numeric_columns <- df %>% select(where(is.numeric)) %>% colnames()

# One stacked histogram per numeric column, filled by potability class.
# `.data[[col]]` replaces the deprecated aes_string(), and after_stat()
# replaces the deprecated `..density..` notation. Each plot is printed inside
# the loop, so `col` still holds the right name when the plot is built.
for (col in numeric_columns) {
  p <- ggplot(df, aes(x = .data[[col]], fill = Potability)) +
    geom_histogram(
      position = "stack", bins = 20, alpha = 0.6, color = "purple",
      na.rm = TRUE # drop missing values without a per-plot warning
    ) +
    # NOTE(review): the histogram y axis is a count while this curve is a
    # density multiplied by 0.1, so the curve may be barely visible. The
    # original scaling is kept; confirm the intended scale.
    geom_density(
      aes(y = after_stat(density * 0.1)),
      color = "pink", alpha = 0.5, na.rm = TRUE
    ) +
    labs(title = paste(col, "vs Potability"), x = col) +
    theme_minimal() +
    theme(plot.title = element_text(size = 15))

  print(p) # display the plot
}
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_density()`).

## Warning: Removed 781 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 781 rows containing non-finite outside the scale range
## (`stat_density()`).

## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_density()`).

# Scatter plot for every pair of numeric columns, coloured by potability.
# combn() calls the function once per pair (cols[1], cols[2]).
# The call is wrapped in invisible() because only the printed plots are
# wanted: left bare, the return value of combn() was auto-printed as a large
# matrix of ggplot object internals. simplify = FALSE returns a plain list
# instead of trying to pack the plot objects into an array.
# `.data[[...]]` replaces the deprecated aes_string().
invisible(combn(numeric_columns, 2, FUN = function(cols) {
  p <- ggplot(
    df,
    aes(x = .data[[cols[1]]], y = .data[[cols[2]]], color = Potability)
  ) +
    # na.rm = TRUE drops incomplete pairs without one warning per plot
    geom_point(alpha = 0.6, na.rm = TRUE) +
    labs(
      title = paste("Dispersión de", cols[1], "vs", cols[2], "por Potability"),
      x = cols[1],
      y = cols[2]
    ) +
    theme_minimal() +
    theme(plot.title = element_text(size = 15))

  print(p)
}, simplify = FALSE))
## Warning: Removed 491 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 491 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 491 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1160 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 491 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 491 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 627 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 491 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 781 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 781 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 781 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 781 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 781 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 903 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 781 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_point()`).

##       [,1]             [,2]             [,3]             [,4]            
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,5]             [,6]             [,7]             [,8]            
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,9]             [,10]            [,11]            [,12]           
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,13]            [,14]            [,15]            [,16]           
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,17]            [,18]            [,19]            [,20]           
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,21]            [,22]            [,23]            [,24]           
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,25]            [,26]            [,27]            [,28]           
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,29]            [,30]            [,31]            [,32]           
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4          
##       [,33]            [,34]            [,35]            [,36]           
##  [1,] spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10   spec_tbl_df,10  
##  [2,] list,1           list,1           list,1           list,1          
##  [3,] ScalesList,2     ScalesList,2     ScalesList,2     ScalesList,2    
##  [4,] Guides,2         Guides,2         Guides,2         Guides,2        
##  [5,] uneval,3         uneval,3         uneval,3         uneval,3        
##  [6,] theme,136        theme,136        theme,136        theme,136       
##  [7,] CoordCartesian,5 CoordCartesian,5 CoordCartesian,5 CoordCartesian,5
##  [8,] FacetNull,2      FacetNull,2      FacetNull,2      FacetNull,2     
##  [9,] ?                ?                ?                ?               
## [10,] Layout,1         Layout,1         Layout,1         Layout,1        
## [11,] list,4           list,4           list,4           list,4
# Share of each potability class, drawn as a donut chart.
# The title announces a donut, but the previous code drew a solid pie. The
# single stacked bar is now placed at x = 2 and the x range starts at 0.5,
# which leaves the empty centre once the bar is wrapped into polar
# coordinates.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
df %>%
  count(Potability) %>%
  mutate(share = n / sum(n)) %>%
  ggplot(aes(x = 2, y = n, fill = Potability)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  xlim(0.5, 2.5) +
  # percentage label centred inside each segment
  geom_text(
    aes(label = paste0(round(share * 100, 1), "%")),
    position = position_stack(vjust = 0.5)
  ) +
  theme_void() +
  labs(title = "Distribución de Potability (Gráfico de Donut)") +
  theme(legend.position = "bottom")

# No potable (0): hay 1998 muestras clasificadas como no potables. Potable (1): hay 1278 muestras clasificadas como potables. Es decir, de todas las muestras analizadas, el 61% (1998/3276) no son potables y el 39% (1278/3276) son potables. En general, esto significa que menos de la mitad de las muestras son aptas para el consumo y que la mayoría no lo es.