library(readr)
## Warning: package 'readr' was built under R version 4.3.3
# Load the water potability measurements from the CSV in the working directory
df <- read_csv(file = "water_potability.csv")
## Rows: 3276 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): ph, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the dimensions of the data: number of rows, then number of columns
c(nrow(df), ncol(df))
## [1] 3276   10
# Preview the first six rows
head(df, n = 6L)
## # A tibble: 6 × 10
##      ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
##   <dbl>    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>          <dbl>
## 1 NA        205. 20791.        7.30    369.         564.          10.4 
## 2  3.72     129. 18630.        6.64     NA          593.          15.2 
## 3  8.10     224. 19910.        9.28     NA          419.          16.9 
## 4  8.32     214. 22018.        8.06    357.         363.          18.4 
## 5  9.09     181. 17979.        6.55    310.         398.          11.6 
## 6  5.58     188. 28749.        7.54    327.         280.           8.40
## # ℹ 3 more variables: Trihalomethanes <dbl>, Turbidity <dbl>, Potability <dbl>
# Compact overview of the structure: column names, types and leading values
str(object = df)
## spc_tbl_ [3,276 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ph             : num [1:3276] NA 3.72 8.1 8.32 9.09 ...
##  $ Hardness       : num [1:3276] 205 129 224 214 181 ...
##  $ Solids         : num [1:3276] 20791 18630 19910 22018 17979 ...
##  $ Chloramines    : num [1:3276] 7.3 6.64 9.28 8.06 6.55 ...
##  $ Sulfate        : num [1:3276] 369 NA NA 357 310 ...
##  $ Conductivity   : num [1:3276] 564 593 419 363 398 ...
##  $ Organic_carbon : num [1:3276] 10.4 15.2 16.9 18.4 11.6 ...
##  $ Trihalomethanes: num [1:3276] 87 56.3 66.4 100.3 32 ...
##  $ Turbidity      : num [1:3276] 2.96 4.5 3.06 4.63 4.08 ...
##  $ Potability     : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ph = col_double(),
##   ..   Hardness = col_double(),
##   ..   Solids = col_double(),
##   ..   Chloramines = col_double(),
##   ..   Sulfate = col_double(),
##   ..   Conductivity = col_double(),
##   ..   Organic_carbon = col_double(),
##   ..   Trihalomethanes = col_double(),
##   ..   Turbidity = col_double(),
##   ..   Potability = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Bar chart of the Potability class counts, most frequent class first
potability_counts <- table(df[["Potability"]])
potability_counts <- sort(potability_counts, decreasing = TRUE)
barplot(potability_counts, las = 2)

# Histogram of the Hardness column
hist(
  x = df[["Hardness"]],
  xlab = "Hardness",
  main = "Histograma de Hardness"
)

# Histogram of the Chloramines column
hist(
  x = df[["Chloramines"]],
  xlab = "Chloramines",
  main = "Histograma de Chloramines"
)

# Mapa de valores perdidos
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.3.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.3.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Missingness map of every column in the data set.
# `df` is a tibble (it comes from read_csv), and missmap() probes its argument
# with `$arguments` / `$imputations`; a tibble answers `$` on a non-existent
# column with an "Unknown or uninitialised column" warning. Handing over a
# plain data.frame keeps the plot the same and avoids those spurious warnings.
missmap(as.data.frame(df))
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

# Kernel density estimates for both variables (missing values are dropped)
density_hardness <- density(df$Hardness, na.rm = TRUE)
density_chloramines <- density(df$Chloramines, na.rm = TRUE)

# Draw the Hardness density first; this call fixes the coordinate system.
# Both axis limits are computed from BOTH curves: lines() only draws onto the
# existing axes and never rescales them, so a ylim taken from the first curve
# alone would clip the second curve whenever its peak is higher.
plot(density_hardness, main = "Density Plot of Hardness and Chloramines",
     xlab = "Value", col = "blue", lwd = 2,
     xlim = range(density_hardness$x, density_chloramines$x),
     ylim = range(density_hardness$y, density_chloramines$y))

# Overlay the Chloramines density on the same axes
lines(density_chloramines, col = "red", lwd = 2)

# Legend identifying the two curves by colour
legend("topright", legend = c("Hardness", "Chloramines"),
       col = c("blue", "red"), lwd = 2)

# Librerías necesarias para ggplot2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Box plot of Hardness for each Potability class, one fill colour per class
ggplot(data = df, mapping = aes(x = factor(Potability), y = Hardness)) +
  geom_boxplot(mapping = aes(fill = factor(Potability))) +
  scale_fill_manual(values = c("lightblue", "lightgreen")) +
  ggtitle("Hardness vs Potability") +
  xlab("Potability") +
  ylab("Hardness") +
  theme_minimal()

# Box plot of Chloramines for each Potability class, one fill colour per class
ggplot(data = df, mapping = aes(x = factor(Potability), y = Chloramines)) +
  geom_boxplot(mapping = aes(fill = factor(Potability))) +
  scale_fill_manual(values = c("lightblue", "lightgreen")) +
  ggtitle("Chloramines vs Potability") +
  xlab("Potability") +
  ylab("Chloramines") +
  theme_minimal()

# Scatter plot of Hardness against Chloramines; points are coloured by
# Potability class and drawn semi-transparent so overlapping points stay visible
ggplot(data = df, mapping = aes(x = Hardness, y = Chloramines)) +
  geom_point(mapping = aes(color = factor(Potability)), alpha = 0.6) +
  scale_color_manual(values = c("red", "green")) +
  labs(title = "Hardness vs Chloramines", x = "Hardness", y = "Chloramines") +
  theme_minimal() +
  theme(legend.position = "right") +
  guides(color = guide_legend(title = "Potability"))