library(readr)
# Load the water potability CSV into the tibble `df`.
# NOTE(review): the path is absolute and points into one Windows user profile,
# so the script only runs on that machine; consider a project-relative path.
df <- read_csv("C:/Users/FREDY/Downloads/water_potability.csv")
## Rows: 3276 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (10): ph, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble for a first look at the values; missing entries show as NA.
df
## # A tibble: 3,276 × 10
## ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA 205. 20791. 7.30 369. 564. 10.4
## 2 3.72 129. 18630. 6.64 NA 593. 15.2
## 3 8.10 224. 19910. 9.28 NA 419. 16.9
## 4 8.32 214. 22018. 8.06 357. 363. 18.4
## 5 9.09 181. 17979. 6.55 310. 398. 11.6
## 6 5.58 188. 28749. 7.54 327. 280. 8.40
## 7 10.2 248. 28750. 7.51 394. 284. 13.8
## 8 8.64 203. 13672. 4.56 303. 475. 12.4
## 9 NA 119. 14286. 7.80 269. 389. 12.7
## 10 11.2 227. 25485. 9.08 404. 564. 17.9
## # ℹ 3,266 more rows
## # ℹ 3 more variables: Trihalomethanes <dbl>, Turbidity <dbl>, Potability <dbl>
# Report the size of the data set as c(number of rows, number of columns).
dim(df)
## [1] 3276 10
La base de datos de calidad del agua consta de 3276 observaciones (filas) y 10 variables (columnas).
# Preview the top of the data set; head() returns the first 6 rows by default.
head(df)
## # A tibble: 6 × 10
## ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA 205. 20791. 7.30 369. 564. 10.4
## 2 3.72 129. 18630. 6.64 NA 593. 15.2
## 3 8.10 224. 19910. 9.28 NA 419. 16.9
## 4 8.32 214. 22018. 8.06 357. 363. 18.4
## 5 9.09 181. 17979. 6.55 310. 398. 11.6
## 6 5.58 188. 28749. 7.54 327. 280. 8.40
## # ℹ 3 more variables: Trihalomethanes <dbl>, Turbidity <dbl>, Potability <dbl>
Se observan las primeras 6 filas de la base de datos.
# Inspect the structure: column names, storage types and the leading values.
str(df)
## spc_tbl_ [3,276 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ph : num [1:3276] NA 3.72 8.1 8.32 9.09 ...
## $ Hardness : num [1:3276] 205 129 224 214 181 ...
## $ Solids : num [1:3276] 20791 18630 19910 22018 17979 ...
## $ Chloramines : num [1:3276] 7.3 6.64 9.28 8.06 6.55 ...
## $ Sulfate : num [1:3276] 369 NA NA 357 310 ...
## $ Conductivity : num [1:3276] 564 593 419 363 398 ...
## $ Organic_carbon : num [1:3276] 10.4 15.2 16.9 18.4 11.6 ...
## $ Trihalomethanes: num [1:3276] 87 56.3 66.4 100.3 32 ...
## $ Turbidity : num [1:3276] 2.96 4.5 3.06 4.63 4.08 ...
## $ Potability : num [1:3276] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. ph = col_double(),
## .. Hardness = col_double(),
## .. Solids = col_double(),
## .. Chloramines = col_double(),
## .. Sulfate = col_double(),
## .. Conductivity = col_double(),
## .. Organic_carbon = col_double(),
## .. Trihalomethanes = col_double(),
## .. Turbidity = col_double(),
## .. Potability = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Bar chart of how often each Potability class occurs, most frequent class
# first; las = 2 draws the axis labels perpendicular to their axes.
df$Potability |>
  table() |>
  sort(decreasing = TRUE) |>
  barplot(las = 2)
# Distribution of two water-quality measurements. The columns are passed to
# hist() as they are: binning does not depend on the order of the values and
# hist() drops missing values itself, so wrapping them in sort() would only
# leak into the auto-generated title and axis label. Both are set explicitly
# so that each figure is readable on its own.
hist(df$Hardness, main = "Histogram of Hardness", xlab = "Hardness")
hist(df$Chloramines, main = "Histogram of Chloramines", xlab = "Chloramines")
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
# Draw the missingness map of the data set. The tibble is converted to a
# plain data.frame first: missmap() appears to probe its input for fields such
# as `arguments` and `imputations` via `$`, which a tibble answers with an
# "Unknown or uninitialised column" warning, while a data.frame silently
# returns NULL. The plotted data are the same.
missmap(as.data.frame(df))
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
# Histogram and box plot of the `temperature` column of `pressure`.
# NOTE(review): `pressure` is not created anywhere in the visible script;
# presumably it is R's built-in `datasets::pressure`, which is unrelated to
# `df`. Confirm these two plots are intentional and not template leftovers.
hist(pressure$temperature)
boxplot(pressure$temperature)
```