Potabilidad del agua

library(readr)
# Load the water potability dataset into `df` as a tibble.
# NOTE(review): the path is absolute and specific to one user's machine;
# it will need adjusting to run this report anywhere else.
df <- read_csv(
  file = "C:/Users/FREDY/Downloads/water_potability.csv"
)
## Rows: 3276 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): ph, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble (shows the first rows and the column types).
print(df)
## # A tibble: 3,276 × 10
##       ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
##    <dbl>    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>          <dbl>
##  1 NA        205. 20791.        7.30    369.         564.          10.4 
##  2  3.72     129. 18630.        6.64     NA          593.          15.2 
##  3  8.10     224. 19910.        9.28     NA          419.          16.9 
##  4  8.32     214. 22018.        8.06    357.         363.          18.4 
##  5  9.09     181. 17979.        6.55    310.         398.          11.6 
##  6  5.58     188. 28749.        7.54    327.         280.           8.40
##  7 10.2      248. 28750.        7.51    394.         284.          13.8 
##  8  8.64     203. 13672.        4.56    303.         475.          12.4 
##  9 NA        119. 14286.        7.80    269.         389.          12.7 
## 10 11.2      227. 25485.        9.08    404.         564.          17.9 
## # ℹ 3,266 more rows
## # ℹ 3 more variables: Trihalomethanes <dbl>, Turbidity <dbl>, Potability <dbl>
# Number of rows and number of columns of the dataset, in that order.
c(nrow(df), ncol(df))
## [1] 3276   10

La base de datos de potabilidad del agua consta de 3276 observaciones (filas) y 10 variables (columnas).

# Show the first six rows (the default for head()).
head(df, n = 6L)
## # A tibble: 6 × 10
##      ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
##   <dbl>    <dbl>  <dbl>       <dbl>   <dbl>        <dbl>          <dbl>
## 1 NA        205. 20791.        7.30    369.         564.          10.4 
## 2  3.72     129. 18630.        6.64     NA          593.          15.2 
## 3  8.10     224. 19910.        9.28     NA          419.          16.9 
## 4  8.32     214. 22018.        8.06    357.         363.          18.4 
## 5  9.09     181. 17979.        6.55    310.         398.          11.6 
## 6  5.58     188. 28749.        7.54    327.         280.           8.40
## # ℹ 3 more variables: Trihalomethanes <dbl>, Turbidity <dbl>, Potability <dbl>

Se observan las primeras 6 filas de la base de datos.

# Compact structural summary: class, dimensions and type of every column.
utils::str(object = df)
## spc_tbl_ [3,276 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ph             : num [1:3276] NA 3.72 8.1 8.32 9.09 ...
##  $ Hardness       : num [1:3276] 205 129 224 214 181 ...
##  $ Solids         : num [1:3276] 20791 18630 19910 22018 17979 ...
##  $ Chloramines    : num [1:3276] 7.3 6.64 9.28 8.06 6.55 ...
##  $ Sulfate        : num [1:3276] 369 NA NA 357 310 ...
##  $ Conductivity   : num [1:3276] 564 593 419 363 398 ...
##  $ Organic_carbon : num [1:3276] 10.4 15.2 16.9 18.4 11.6 ...
##  $ Trihalomethanes: num [1:3276] 87 56.3 66.4 100.3 32 ...
##  $ Turbidity      : num [1:3276] 2.96 4.5 3.06 4.63 4.08 ...
##  $ Potability     : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ph = col_double(),
##   ..   Hardness = col_double(),
##   ..   Solids = col_double(),
##   ..   Chloramines = col_double(),
##   ..   Sulfate = col_double(),
##   ..   Conductivity = col_double(),
##   ..   Organic_carbon = col_double(),
##   ..   Trihalomethanes = col_double(),
##   ..   Turbidity = col_double(),
##   ..   Potability = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Bar chart of how many observations fall in each Potability class,
# most frequent class first; las = 2 draws axis labels perpendicular
# to their axis.
df$Potability |>
  table() |>
  sort(decreasing = TRUE) |>
  barplot(las = 2)

# Distribution of the Hardness column.
# Sorting is unnecessary: histogram bin counts do not depend on the order
# of the values, and the sort() call leaked into the default title and
# x-axis label ("sort(df$Hardness)"). Explicit labels are set instead.
hist(
  df$Hardness,
  main = "Histograma de Hardness",
  xlab = "Hardness"
)

# Distribution of the Chloramines column.
# Sorting is unnecessary: histogram bin counts do not depend on the order
# of the values, and the sort() call leaked into the default title and
# x-axis label ("sort(df$Chloramines)"). Explicit labels are set instead.
hist(
  df$Chloramines,
  main = "Histograma de Chloramines",
  xlab = "Chloramines"
)

library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Missingness map: one cell per (row, column), marking which values are NA.
# `df` is a tibble, and a tibble warns ("Unknown or uninitialised column")
# whenever `$` is used with a name that is not a column, which missmap()
# does internally while inspecting its input. Passing a plain data.frame
# avoids those spurious warnings without changing the plotted data.
missmap(as.data.frame(df))

# Histogram of the `temperature` column of `pressure`.
# NOTE(review): `pressure` is not created anywhere in this report, so this
# presumably resolves to R's built-in `pressure` example dataset rather than
# the water data in `df`. It looks like a template leftover; confirm which
# `df` column was intended.
hist(pressure$temperature)

# Box plot of the `temperature` column of `pressure`.
# NOTE(review): `pressure` is not created anywhere in this report, so this
# presumably resolves to R's built-in `pressure` example dataset rather than
# the water data in `df`. It looks like a template leftover; confirm which
# `df` column was intended.
boxplot(pressure$temperature)

```