Se carga la base de datos
library(readr)
# Read the water-potability measurements from the CSV file into `df`.
df <- read_csv(file = "water_potability.csv")
## Rows: 3276 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): ph, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Se analizan las dimensiones
# Print the number of rows and columns of df.
dim(df)
## [1] 3276 10
Se visualiza la base de datos
# Print a compact overview of df: column names, types and first values.
str(df)
## spc_tbl_ [3,276 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ph : num [1:3276] NA 3.72 8.1 8.32 9.09 ...
## $ Hardness : num [1:3276] 205 129 224 214 181 ...
## $ Solids : num [1:3276] 20791 18630 19910 22018 17979 ...
## $ Chloramines : num [1:3276] 7.3 6.64 9.28 8.06 6.55 ...
## $ Sulfate : num [1:3276] 369 NA NA 357 310 ...
## $ Conductivity : num [1:3276] 564 593 419 363 398 ...
## $ Organic_carbon : num [1:3276] 10.4 15.2 16.9 18.4 11.6 ...
## $ Trihalomethanes: num [1:3276] 87 56.3 66.4 100.3 32 ...
## $ Turbidity : num [1:3276] 2.96 4.5 3.06 4.63 4.08 ...
## $ Potability : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. ph = col_double(),
## .. Hardness = col_double(),
## .. Solids = col_double(),
## .. Chloramines = col_double(),
## .. Sulfate = col_double(),
## .. Conductivity = col_double(),
## .. Organic_carbon = col_double(),
## .. Trihalomethanes = col_double(),
## .. Turbidity = col_double(),
## .. Potability = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Histograms of selected variables. hist() bins the values itself, so the data
# do not need to be sorted first; explicit titles replace the auto-generated
# "Histogram of sort(df$...)" labels.
hist(df$Hardness, main = "Histogram of Hardness", xlab = "Hardness")
hist(
  df$Potability,
  main = "Histogram of Potability", xlab = "Potability", las = 2
)
hist(df$Chloramines, main = "Histogram of Chloramines", xlab = "Chloramines")
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Plot the missingness map of df.
# df is a tibble, and calling missmap() on it emits "Unknown or uninitialised
# column" warnings (tibbles warn when `$` is used on a missing column).
# Passing a plain data.frame keeps the same data and avoids those warnings.
missmap(as.data.frame(df))
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
# Boxplots of every predictor, drawn as two rows of side-by-side panels
# (4 panels, then 5 panels).
# The previous graphics settings are saved and restored afterwards so the
# multi-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 4))
boxplot(df$Sulfate, main = "Sulfate")
boxplot(df$ph, main = "ph")
boxplot(df$Turbidity, main = "Turbidity")
boxplot(df$Conductivity, main = "Conductivity")
par(mfrow = c(1, 5))
boxplot(df$Hardness, main = "Hardness")
boxplot(df$Solids, main = "Solids")
boxplot(df$Chloramines, main = "Chloramines") # title typo fixed ("Cloramines")
boxplot(df$Organic_carbon, main = "Organic")
boxplot(df$Trihalomethanes, main = "Trihalomethanes")
par(old_par)
# Histogram of Turbidity. Sorting is unnecessary because hist() bins the values
# itself; an explicit title replaces the auto-generated "sort(df$Turbidity)".
hist(df$Turbidity, main = "Histogram of Turbidity", xlab = "Turbidity", las = 2)
# Examine the summary statistics of each column (measures of central tendency,
# quartiles and NA counts).
summary(df)
## ph Hardness Solids Chloramines
## Min. : 0.000 Min. : 47.43 Min. : 320.9 Min. : 0.352
## 1st Qu.: 6.093 1st Qu.:176.85 1st Qu.:15666.7 1st Qu.: 6.127
## Median : 7.037 Median :196.97 Median :20927.8 Median : 7.130
## Mean : 7.081 Mean :196.37 Mean :22014.1 Mean : 7.122
## 3rd Qu.: 8.062 3rd Qu.:216.67 3rd Qu.:27332.8 3rd Qu.: 8.115
## Max. :14.000 Max. :323.12 Max. :61227.2 Max. :13.127
## NA's :491
## Sulfate Conductivity Organic_carbon Trihalomethanes
## Min. :129.0 Min. :181.5 Min. : 2.20 Min. : 0.738
## 1st Qu.:307.7 1st Qu.:365.7 1st Qu.:12.07 1st Qu.: 55.845
## Median :333.1 Median :421.9 Median :14.22 Median : 66.622
## Mean :333.8 Mean :426.2 Mean :14.28 Mean : 66.396
## 3rd Qu.:360.0 3rd Qu.:481.8 3rd Qu.:16.56 3rd Qu.: 77.337
## Max. :481.0 Max. :753.3 Max. :28.30 Max. :124.000
## NA's :781 NA's :162
## Turbidity Potability
## Min. :1.450 Min. :0.0000
## 1st Qu.:3.440 1st Qu.:0.0000
## Median :3.955 Median :0.0000
## Mean :3.967 Mean :0.3901
## 3rd Qu.:4.500 3rd Qu.:1.0000
## Max. :6.739 Max. :1.0000
##
# Boxplots of every predictor, drawn as two rows of side-by-side panels
# (4 panels, then 5 panels).
# NOTE(review): this panel repeats the boxplots already drawn earlier in the
# file; consider keeping only one copy.
# The previous graphics settings are saved and restored afterwards so the
# multi-panel layout does not leak into later plots.
old_par <- par(mfrow = c(1, 4))
boxplot(df$Sulfate, main = "Sulfate")
boxplot(df$ph, main = "ph")
boxplot(df$Turbidity, main = "Turbidity")
boxplot(df$Conductivity, main = "Conductivity")
par(mfrow = c(1, 5))
boxplot(df$Hardness, main = "Hardness")
boxplot(df$Solids, main = "Solids")
boxplot(df$Chloramines, main = "Chloramines") # title typo fixed ("Cloramines")
boxplot(df$Organic_carbon, main = "Organic")
boxplot(df$Trihalomethanes, main = "Trihalomethanes")
par(old_par)
# Cargar los paquetes necesarios
library(ggplot2)
library(reshape2)
library(pheatmap)
library(dplyr) # Para la selección de columnas numéricas
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the numeric columns of df (select_if() is superseded by where()).
numeric_df <- df %>% select(where(is.numeric))
# Pairwise correlations, computed only on rows without any missing value.
cor_matrix <- cor(numeric_df, use = "complete.obs")
# Draw the correlation matrix as a clustered heatmap.
pheatmap(
  mat = cor_matrix,
  color = colorRampPalette(c("green", "white", "black"))(100), # colour palette
  # Pin the colour scale to [-1, 1] so that white always means zero
  # correlation; breaks needs one more element than the colour vector.
  breaks = seq(-1, 1, length.out = 101),
  display_numbers = TRUE, # print the coefficient inside each cell
  number_format = "%.2f", # two decimal places
  main = "Correlation of Water Potability" # plot title
)
# Análisis:
# NOTA (revisión): contrastar cada afirmación con los coeficientes numéricos de `cor_matrix` mostrados en el
# mapa de calor; los calificativos "fuerte" o "moderada" solo se justifican si el valor de |r| los respalda.
# 1. Correlaciones Fuertes Positivas:
# - Se observa que la variable `Hardness` tiene una correlación positiva fuerte con `Solids`.
# Esto indica que a medida que aumentan los sólidos disueltos en el agua, también aumenta la dureza.
# Este comportamiento es esperado, ya que los sólidos contribuyen a la dureza del agua.
# 2. Correlaciones Negativas:
# - `ph` muestra una correlación negativa moderada con `Chloramines`, lo que sugiere que a medida que el pH aumenta,
# la concentración de cloraminas tiende a disminuir. Esta relación podría deberse a la química del agua.
# 3. Correlaciones Débiles o Nulas:
# - Las correlaciones entre `Conductivity` y otras variables como `ph` y `Turbidity` son bastante débiles,
# lo que indica que no hay una relación lineal significativa entre ellas.
# 4. Relación con la Potabilidad:
# - Si analizamos la correlación de `Potability` con otras variables, vemos que no hay correlaciones lineales fuertes.
# Cargar los paquetes necesarios
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(dplyr)
# Histogram of every predictor: all columns of df except the Potability target.
df_without_potability <- df %>% select(-Potability)

# Build one histogram per column.
# Each plot is created inside its own function call, so `.data[[col]]` still
# refers to the right column when the plot is rendered later. This replaces
# the deprecated aes_string().
hist_columns <- colnames(df_without_potability)
plot_list <- lapply(hist_columns, function(col) {
  ggplot(df_without_potability, aes(x = .data[[col]])) +
    geom_histogram(bins = 50, fill = "skyblue", color = "black", alpha = 0.7) +
    labs(x = col) + # keep the column name as the axis label
    ggtitle(paste("Histogram of", col)) +
    theme_minimal()
})
# Name each plot after the column it shows.
names(plot_list) <- hist_columns
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Lay the histograms out on a 3 x 3 grid.
grid.arrange(
  grobs = plot_list,
  nrow = 3,
  ncol = 3
)
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 781 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Cargar los paquetes necesarios
library(ggplot2)
library(dplyr)
# Treat Potability as a categorical variable so it can be used as a fill group
# (as.factor() leaves an existing factor unchanged).
df$Potability <- as.factor(df$Potability)
# Names of the numeric columns; Potability is a factor now, so it is excluded.
numeric_columns <- df %>%
  select(where(is.numeric)) %>%
  colnames()
# One stacked histogram per numeric column, filled by Potability, with a
# density curve per class drawn on top.
for (col in numeric_columns) {
  p <- ggplot(df, aes(x = .data[[col]], fill = Potability)) +
    geom_histogram(
      position = "stack", bins = 20, alpha = 0.6, color = "purple"
    ) +
    # after_stat() replaces the deprecated `..density..` notation.
    # NOTE(review): the 0.1 factor is kept as-is; the density is not rescaled
    # to the histogram's count axis, so the curve may be barely visible.
    geom_density(
      aes(y = after_stat(density * 0.1)),
      color = "pink", alpha = 0.5
    ) +
    labs(title = paste(col, "vs Potability"), x = col) +
    theme_minimal() +
    theme(plot.title = element_text(size = 15))
  print(p) # plots built inside a loop must be printed explicitly
}
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 781 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 781 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_density()`).
# Donut chart of the class balance of Potability.
# The bar is placed at x = 2 and the x axis is widened to start at 0.5, which
# leaves the central hole once the bar is wrapped into polar coordinates
# (with x = "" the result is a full pie, not the donut the title announces).
df %>%
  count(Potability) %>%
  mutate(
    # Discrete fill even if Potability is still numeric at this point.
    Potability = as.factor(Potability),
    pct_label = paste0(round(n / sum(n) * 100, 1), "%")
  ) %>%
  ggplot(aes(x = 2, y = n, fill = Potability)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  xlim(0.5, 2.5) +
  geom_text(aes(label = pct_label), position = position_stack(vjust = 0.5)) +
  theme_void() +
  labs(title = "Distribución de Potability (Gráfico de Donut)") +
  theme(legend.position = "bottom")
# No potable (0): hay 1998 muestras que se consideran no potables.
# Potable (1): hay 1278 muestras que se consideran potables.
# Esto sugiere que, de todas las muestras analizadas, el 61 % (1998/3276) no son
# potables y el 39 % (1278/3276) son potables (coincide con la media 0.3901 de `Potability`).
# En general, esto significa que menos de la mitad de las muestras son aptas para el
# consumo y que la mayoría no lo es.