Se carga la base de datos
library(readr)
## Warning: package 'readr' was built under R version 4.4.1
# Load the water potability data set into a base data.frame.
# NOTE(review): readr is attached above, but base read.csv() is what is
# actually called here. The absolute, user-specific path ties the script
# to one machine -- consider a project-relative path.
df <- read.csv(file = "C:\\Users\\ACER\\Downloads\\water_potability.csv")
Se analizan las dimensiones
# Report the size of the data set as (rows, columns).
c(nrow(df), ncol(df))
## [1] 3276 10
Se visualiza la base de datos
# Preview the first six rows (the default for head()).
head(df, n = 6L)
## ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
## 1 NA 204.8905 20791.32 7.300212 368.5164 564.3087 10.379783
## 2 3.716080 129.4229 18630.06 6.635246 NA 592.8854 15.180013
## 3 8.099124 224.2363 19909.54 9.275884 NA 418.6062 16.868637
## 4 8.316766 214.3734 22018.42 8.059332 356.8861 363.2665 18.436524
## 5 9.092223 181.1015 17978.99 6.546600 310.1357 398.4108 11.558279
## 6 5.584087 188.3133 28748.69 7.544869 326.6784 280.4679 8.399735
## Trihalomethanes Turbidity Potability
## 1 86.99097 2.963135 0
## 2 56.32908 4.500656 0
## 3 66.42009 3.055934 0
## 4 100.34167 4.628771 0
## 5 31.99799 4.075075 0
## 6 54.91786 2.559708 0
Se observaron las primeras 6 filas de la base de datos; a continuación se examina la estructura y el tipo de cada variable
# Show the compact structure of the data frame: column names, types and
# the first few values of each column.
str(object = df)
## 'data.frame': 3276 obs. of 10 variables:
## $ ph : num NA 3.72 8.1 8.32 9.09 ...
## $ Hardness : num 205 129 224 214 181 ...
## $ Solids : num 20791 18630 19910 22018 17979 ...
## $ Chloramines : num 7.3 6.64 9.28 8.06 6.55 ...
## $ Sulfate : num 369 NA NA 357 310 ...
## $ Conductivity : num 564 593 419 363 398 ...
## $ Organic_carbon : num 10.4 15.2 16.9 18.4 11.6 ...
## $ Trihalomethanes: num 87 56.3 66.4 100.3 32 ...
## $ Turbidity : num 2.96 4.5 3.06 4.63 4.08 ...
## $ Potability : int 0 0 0 0 0 0 0 0 0 0 ...
# Exploratory base-graphics plots of individual variables.

# Class balance of the target: bars ordered by ascending count,
# axis labels drawn perpendicular to the axes (las = 2).
barplot(
  sort(table(df$Potability)),
  las = 2,
  main = "Frecuencia de Potabilidad",
  ylab = "Frecuencia"
)

# Histograms: hist() bins the values itself, so pre-sorting is unnecessary
# and only polluted the automatic title ("Histogram of sort(...)").
hist(
  df$Hardness,
  main = "Histograma de Hardness",
  xlab = "Hardness"
)
hist(
  df$Chloramines,
  main = "Histograma de Chloramines",
  xlab = "Chloramines"
)

# Boxplots to inspect spread and outliers.
boxplot(
  df$Sulfate,
  main = "Diagrama de caja de Sulfate",
  ylab = "Sulfate"
)
boxplot(
  df$Trihalomethanes,
  main = "Diagrama de caja de Trihalomethanes",
  ylab = "Trihalomethanes"
)
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.4.1
## Cargando paquete requerido: Rcpp
## Warning: package 'Rcpp' was built under R version 4.4.1
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Draw Amelia's missingness map for the data frame.
missmap(obj = df)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
library(plotly)
# Interactive scatter plot of pH against conductivity.
# String literals use plain ASCII quotes; typographic quotes are not valid
# R string delimiters and made this statement fail to parse.
plot_ly(
  data = df,
  x = ~ph,
  y = ~Conductivity,
  type = "scatter",
  mode = "markers",
  marker = list(
    size = 10,
    color = "rgba(255, 182, 193, .9)",
    line = list(color = "rgba(152, 0, 0, .8)", width = 2)
  )
) %>%
  layout(
    title = "Relación entre pH y Conductividad",
    xaxis = list(title = "pH"),
    yaxis = list(title = "Conductividad")
  )

# Install the package first if it is not available:
# install.packages("lattice")
# library() must sit on its own line; it was previously swallowed by a
# trailing comment, so splom() was never made available.
library(lattice)

# Scatter-plot matrix of selected variables.
splom(
  df[c("ph", "Hardness", "Conductivity", "Sulfate", "Turbidity")],
  main = "Matriz de Dispersión entre Variables"
)
library(highcharter)
# Interactive line chart of Hardness against the row index.
# Uses ASCII quotes (typographic quotes do not parse) and seq_len() instead
# of 1:nrow(df), which would yield c(1, 0) for an empty data frame.
hchart(
  df,
  type = "line",
  hcaes(x = seq_len(nrow(df)), y = Hardness),
  color = "blue"
) %>%
  hc_title(text = "Distribución de la Dureza del Agua")
library(ggridges)
# Ridge-line densities of pH, one ridge per potability class.
# Potability is stored as an integer, so it is converted to a factor for
# the fill as well as for the y axis; mapping the raw integer produced a
# continuous colour bar for what is a two-class variable.
ggplot(
  df,
  aes(x = ph, y = factor(Potability), fill = factor(Potability))
) +
  geom_density_ridges() +
  labs(
    title = "Distribución del pH según Potabilidad",
    x = "pH",
    y = "Potabilidad",
    fill = "Potabilidad"
  )
library(cowplot)
# Side-by-side histograms of Chloramines and Trihalomethanes.
# Each statement is on its own line (three statements on one line without
# separators do not parse) and string literals use ASCII quotes.
# bins = 30 spells out the geom_histogram() default, which avoids the
# "pick better value" message without changing the plot.
p1 <- ggplot(df, aes(x = Chloramines)) +
  geom_histogram(fill = "purple", bins = 30)

p2 <- ggplot(df, aes(x = Trihalomethanes)) +
  geom_histogram(fill = "green", bins = 30)

# Arrange both panels in one figure with automatic "A", "B" labels.
plot_grid(p1, p2, labels = "AUTO")