trabajo

#potabilidad del agua ##contexto

library(readr)
# Load the water-potability dataset from the working directory.
df <- read.csv(file = "water_potability.csv")

# Number of rows and columns.
dim(df)

## [1] 3276   10

# Compact overview of every column: type and first values.
str(df)

## 'data.frame':    3276 obs. of  10 variables:
##  $ ph             : num  NA 3.72 8.1 8.32 9.09 ...
##  $ Hardness       : num  205 129 224 214 181 ...
##  $ Solids         : num  20791 18630 19910 22018 17979 ...
##  $ Chloramines    : num  7.3 6.64 9.28 8.06 6.55 ...
##  $ Sulfate        : num  369 NA NA 357 310 ...
##  $ Conductivity   : num  564 593 419 363 398 ...
##  $ Organic_carbon : num  10.4 15.2 16.9 18.4 11.6 ...
##  $ Trihalomethanes: num  87 56.3 66.4 100.3 32 ...
##  $ Turbidity      : num  2.96 4.5 3.06 4.63 4.08 ...
##  $ Potability     : int  0 0 0 0 0 0 0 0 0 0 ...

# Bar chart of the Potability class counts, most frequent class first;
# las = 2 draws the axis labels perpendicular to their axes.
barplot(
  height = sort(table(df[["Potability"]]), decreasing = TRUE),
  main = "Frecuencia de Potabilidad",
  xlab = "Potabilidad",
  ylab = "Frecuencia",
  col = "blue",
  las = 2
)

#El diagrama de barras muestra la frecuencia de cada valor de la variable potabilidad, ordenada de forma descendente; el eje x representa la potabilidad y el eje y la frecuencia de cada valor.

# Horizontal box plot of pH.
# boxplot() has no `na.rm` argument: missing values are always omitted when
# the box statistics are computed, so the former `na.rm = TRUE` was silently
# swallowed and has been removed.
boxplot(df$ph, horizontal = TRUE, col = "steelblue",
        main = "Distribución del pH del Agua",
        xlab = "pH")

#boxplot que muestra la distribución de la variable ph.

library(ggplot2)
# Histogram of pH using 0.5-wide bins; ggplot2 drops the rows with
# missing pH and reports them in a warning.
ggplot(df, mapping = aes(x = ph)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "skyblue",
    colour = "black",
    alpha = 0.8
  ) +
  labs(
    title = "Histograma del pH del Agua",
    x = "pH",
    y = "Frecuencia"
  )

## Warning: Removed 491 rows containing non-finite outside the scale range
## (`stat_bin()`).

#El histograma muestra la distribución de los valores de pH del agua en el conjunto de datos. Cada barra representa la frecuencia de los valores de pH dentro de un intervalo específico, lo que permite visualizar cómo se distribuyen los valores a lo largo del rango observado.

# Cargar la librería ggplot2
library(ggplot2)

# Graficar el pH según la potabilidad utilizando las columnas del dataframe
# Compare the pH distribution of non-potable (0) and potable (1) samples.
# Stacked identity bars add up every pH value in each class, so their height
# tracks the number of samples rather than their pH; one box plot per class
# shows the distribution the title promises.
# Potability is converted to a factor so each class gets its own box, and
# na.rm = TRUE silently drops the rows with missing pH.
ggplot(data = df, aes(x = factor(Potability), y = ph)) +
  geom_boxplot(na.rm = TRUE) +
  labs(title = "Distribución del pH según Potabilidad",
       x = "Potabilidad",
       y = "pH")

## Warning: Removed 491 rows containing missing values or values outside the scale range
## (`geom_bar()`).

#El gráfico ayuda a visualizar cómo varían los niveles de pH entre muestras de agua que son potables y no potables.

# Number of samples in each Potability class, one bar per class.
# - after_stat(count) replaces the `..count..` notation deprecated in
#   ggplot2 3.4.0.
# - The theme(axis.text.x = ...) call that used to precede theme_bw() was
#   removed: theme_bw() is a complete theme and overrode it, so it never
#   affected the rendered plot.
# - The y label and strip title now describe what is plotted (class
#   frequencies); the plot does not involve pH.
# facet_wrap() on a constant string is kept so the title is shown as a strip.
G1 <- ggplot(df, aes(x = factor(Potability), y = after_stat(count),
                     fill = factor(Potability))) +
  geom_bar(stat = "count", width = 0.7, color = "black",
           position = position_dodge()) +
  labs(x = "Potabilidad", y = "Frecuencia") +
  theme_bw(base_size = 16) +
  facet_wrap(~"Distribución de Potabilidad")

G1

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# El gráfico presenta la frecuencia de las diferentes categorías de potabilidad.

# Bar chart of the Potability class counts with each count printed in the
# middle of its bar.
# after_stat(count) replaces the `..count..` notation deprecated in
# ggplot2 3.4.0.
ggplot(df, aes(fill = factor(Potability), x = factor(Potability))) +
  geom_bar(position = "stack") +
  geom_text(stat = "count", aes(label = after_stat(count)),
            position = position_stack(vjust = 0.5), size = 3, color = "black") +
  labs(title = "Distribución de Potabilidad",
       x = "Potabilidad",
       y = "Frecuencia") +
  scale_fill_manual(values = c("blue2", "pink2"), name = "Potabilidad",
                    labels = c("No Potable", "Potable")) +
  theme(legend.position = "right")  # legend position

#En este gráfico de barras se muestra la distribución de la potabilidad del agua. El eje x representa las categorías de potabilidad (potable o no potable), y el eje y muestra la frecuencia de muestras en cada categoría. Las barras están coloreadas para distinguir entre las categorías de potabilidad.

library(questionr)

# Frequency table for Potability: counts and percentages with cumulative
# columns, sorted by decreasing frequency, plus a total row.
Tabla_Potabilidad <- questionr::freq(
  df[["Potability"]],
  cum = TRUE,
  sort = "dec",
  total = TRUE
)

# Render the table.
knitr::kable(Tabla_Potabilidad)

	n	%	val%	%cum	val%cum
0	1998	61	61	61	61
1	1278	39	39	100	100
Total	3276	100	100	100	100

#La tabla muestra la frecuencia y la frecuencia acumulada de cada categoría de la variable Potability. Las categorías están ordenadas de manera descendente según su frecuencia.

# Treat Potability as a categorical variable.
FACTOR_POTABILITY <- as.factor(df[["Potability"]])

# summary() of a factor gives the number of observations per level.
FRECUENCIAS_POTABILITY <- summary(FACTOR_POTABILITY)

# Bar chart of the per-level counts.
barplot(
  height = FRECUENCIAS_POTABILITY,
  col = c("red", "green"),
  main = "Distribución de Potabilidad del Agua",
  xlab = "Potabilidad (0 = No potable, 1 = Potable)",
  ylab = "Frecuencia"
)

# Descriptive statistics (quartiles, mean and NA count where present)
# for each column of the dataset, one column at a time.
summary(df[["ph"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   6.093   7.037   7.081   8.062  14.000     491

summary(df[["Hardness"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   47.43  176.85  196.97  196.37  216.67  323.12

summary(df[["Solids"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   320.9 15666.7 20927.8 22014.1 27332.8 61227.2

summary(df[["Chloramines"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.352   6.127   7.130   7.122   8.115  13.127

summary(df[["Sulfate"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   129.0   307.7   333.1   333.8   360.0   481.0     781

summary(df[["Conductivity"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   181.5   365.7   421.9   426.2   481.8   753.3

summary(df[["Organic_carbon"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.20   12.07   14.22   14.28   16.56   28.30

summary(df[["Trihalomethanes"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.738  55.845  66.622  66.396  77.337 124.000     162

summary(df[["Turbidity"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.450   3.440   3.955   3.967   4.500   6.739

summary(df[["Potability"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3901  1.0000  1.0000

#Las tablas anteriores fueron generadas con summary(), que muestra estadísticas descriptivas para cada una de las variables del conjunto de datos sobre la potabilidad del agua.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(datasets::cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

trabajo

andrea

2024-08-13

R Markdown

Including Plots