library(readr)
## Warning: package 'readr' was built under R version 4.3.3
# Load the water potability dataset from the working directory
df <- readr::read_csv(file = "water_potability.csv")
## Rows: 3276 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): ph, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Report the number of rows and columns
c(nrow(df), ncol(df))
## [1] 3276 10
# Preview the first six rows
head(df, n = 6L)
## # A tibble: 6 × 10
## ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA 205. 20791. 7.30 369. 564. 10.4
## 2 3.72 129. 18630. 6.64 NA 593. 15.2
## 3 8.10 224. 19910. 9.28 NA 419. 16.9
## 4 8.32 214. 22018. 8.06 357. 363. 18.4
## 5 9.09 181. 17979. 6.55 310. 398. 11.6
## 6 5.58 188. 28749. 7.54 327. 280. 8.40
## # ℹ 3 more variables: Trihalomethanes <dbl>, Turbidity <dbl>, Potability <dbl>
# Compact summary of column types and leading values
utils::str(object = df)
## spc_tbl_ [3,276 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ph : num [1:3276] NA 3.72 8.1 8.32 9.09 ...
## $ Hardness : num [1:3276] 205 129 224 214 181 ...
## $ Solids : num [1:3276] 20791 18630 19910 22018 17979 ...
## $ Chloramines : num [1:3276] 7.3 6.64 9.28 8.06 6.55 ...
## $ Sulfate : num [1:3276] 369 NA NA 357 310 ...
## $ Conductivity : num [1:3276] 564 593 419 363 398 ...
## $ Organic_carbon : num [1:3276] 10.4 15.2 16.9 18.4 11.6 ...
## $ Trihalomethanes: num [1:3276] 87 56.3 66.4 100.3 32 ...
## $ Turbidity : num [1:3276] 2.96 4.5 3.06 4.63 4.08 ...
## $ Potability : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. ph = col_double(),
## .. Hardness = col_double(),
## .. Solids = col_double(),
## .. Chloramines = col_double(),
## .. Sulfate = col_double(),
## .. Conductivity = col_double(),
## .. Organic_carbon = col_double(),
## .. Trihalomethanes = col_double(),
## .. Turbidity = col_double(),
## .. Potability = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Bar chart of Potability class counts, most frequent class first
barplot(
  height = sort(table(df[["Potability"]]), decreasing = TRUE),
  las = 2
)

# Histogram of the Hardness column
hist(
  x = df[["Hardness"]],
  xlab = "Hardness",
  main = "Histograma de Hardness"
)

# Histogram of the Chloramines column
hist(
  x = df[["Chloramines"]],
  xlab = "Chloramines",
  main = "Histograma de Chloramines"
)

# Mapa de valores perdidos
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.3.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.3.3
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Draw the missingness map from a plain data.frame rather than the tibble
# returned by read_csv(). Passing the tibble directly makes missmap() emit
# "Unknown or uninitialised column" warnings for `arguments` and
# `imputations`, because `$` on a tibble warns when the column does not exist.
missmap(as.data.frame(df))

# Kernel density estimates for the two variables being compared
density_hardness <- density(df$Hardness, na.rm = TRUE)
density_chloramines <- density(df$Chloramines, na.rm = TRUE)

# Draw the Hardness density first. Both axis limits are taken over BOTH
# curves: plot() would otherwise size the y-axis from the Hardness density
# alone, and the second curve would be clipped wherever it rises above that.
# NOTE(review): the two variables are on very different scales (Hardness in
# the hundreds, Chloramines in single digits in the preview rows), so the
# wider curve will look flat on shared axes -- consider standardising both
# or using separate panels if shapes are to be compared.
plot(
  density_hardness,
  main = "Density Plot of Hardness and Chloramines",
  xlab = "Value",
  col = "blue",
  lwd = 2,
  xlim = range(c(density_hardness$x, density_chloramines$x)),
  ylim = range(c(density_hardness$y, density_chloramines$y))
)

# Overlay the Chloramines density on the same axes
lines(density_chloramines, col = "red", lwd = 2)

# Legend mapping colours to variables
legend(
  "topright",
  legend = c("Hardness", "Chloramines"),
  col = c("blue", "red"),
  lwd = 2
)

# Librerías necesarias para ggplot2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Boxplot: Hardness vs Potability
# Potability is numeric in `df`, so it is wrapped in factor() to get one box
# (and one fill colour) per class.
ggplot(
  df,
  aes(x = factor(Potability), y = Hardness, fill = factor(Potability))
) +
  geom_boxplot() +
  scale_fill_manual(values = c("lightblue", "lightgreen")) +
  # `fill` is labelled explicitly; otherwise the legend title would be the
  # raw mapping expression `factor(Potability)`.
  labs(
    title = "Hardness vs Potability",
    x = "Potability",
    y = "Hardness",
    fill = "Potability"
  ) +
  theme_minimal()

# Boxplot: Chloramines vs Potability
# Potability is numeric in `df`, so it is wrapped in factor() to get one box
# (and one fill colour) per class.
ggplot(
  df,
  aes(x = factor(Potability), y = Chloramines, fill = factor(Potability))
) +
  geom_boxplot() +
  scale_fill_manual(values = c("lightblue", "lightgreen")) +
  # `fill` is labelled explicitly; otherwise the legend title would be the
  # raw mapping expression `factor(Potability)`.
  labs(
    title = "Chloramines vs Potability",
    x = "Potability",
    y = "Chloramines",
    fill = "Potability"
  ) +
  theme_minimal()

# Scatter plot of Hardness against Chloramines, coloured by Potability class
ggplot(
  data = df,
  mapping = aes(
    x = Hardness,
    y = Chloramines,
    color = factor(Potability)
  )
) +
  geom_point(alpha = 0.6) +
  scale_color_manual(values = c("red", "green")) +
  labs(
    title = "Hardness vs Chloramines",
    x = "Hardness",
    y = "Chloramines"
  ) +
  theme_minimal() +
  theme(legend.position = "right") +
  # Replace the default legend title (the mapping expression) with a label
  guides(color = guide_legend(title = "Potability"))
