1. Carga de Datos, Libreria y Extracción de la Variable

library(kableExtra)
library(knitr)
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
getwd()
## [1] "/cloud/project"
setwd("/cloud/project")

datos <- read.csv("china_water_pollution_data.csv",
                  header = TRUE,
                  sep = ",",
                  dec = ".")
Turbidez <- as.numeric(datos$Turbidity_NTU)
Turbidez <- na.omit(Turbidez)

2. Tabla de distribución de frecuencia

Hist_Turbidez <- hist(Turbidez,
                      breaks = 6,     # menos clases → FE ≥ 5
                      plot = FALSE)

Li <- Hist_Turbidez$breaks[-length(Hist_Turbidez$breaks)]
Ls <- Hist_Turbidez$breaks[-1]
ni <- Hist_Turbidez$counts
n  <- sum(ni)
Mc <- Hist_Turbidez$mids
hi <- ni / n

Ni_asc  <- cumsum(ni)
Hi_asc  <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))

TDFTurbidez <- data.frame(
  Li = round(Li, 2),
  Ls = round(Ls, 2),
  Mc = round(Mc, 2),
  ni = ni,
  hi = round(hi * 100, 2),
  Ni_asc = Ni_asc,
  Ni_desc = Ni_desc,
  Hi_asc = round(Hi_asc * 100, 2),
  Hi_desc = round(Hi_desc * 100, 2)
)

# Fila de totales
totales <- data.frame(
  Li = "TOTAL", Ls = "-", Mc = "-",
  ni = sum(ni),
  hi = sum(hi * 100),
  Ni_asc = "-", Ni_desc = "-",
  Hi_asc = "-", Hi_desc = "-"
)

TDFTurbidez <- rbind(TDFTurbidez, totales)
kable(TDFTurbidez,
      align = "c",
      caption = "Tabla de Frecuencias de la Turbidez (NTU)") %>%
  kable_styling(full_width = FALSE,
                position = "center",
                bootstrap_options = c("striped", "hover", "condensed"))
Tabla de Frecuencias de la Turbidez (NTU)
Li Ls Mc ni hi Ni_asc Ni_desc Hi_asc Hi_desc
0 10 5 2589 86.30 2589 3000 86.3 100
10 20 15 366 12.20 2955 411 98.5 13.7
20 30 25 38 1.27 2993 45 99.77 1.5
30 40 35 5 0.17 2998 7 99.93 0.23
40 50 45 2 0.07 3000 2 100 0.07
TOTAL
3000 100.00

3. Histograma y Conjetura

hist(Turbidez,
     breaks = 6,
     col = "pink",
     main = "Gráfica N°1: Distribución de la Turbidez (NTU)",
     xlab = "Turbidez (NTU)",
     ylab = "Cantidad")

base <- min(Turbidez)
lambda <- 1 / (mean(Turbidez) - base)
hist(Turbidez,
     breaks = 6,
     freq = FALSE,
     col = "orange2",
     main = "Gráfica N°2: Ajuste Exponencial de la Turbidez",
     xlab = "Turbidez (NTU)",
     ylab = "Cantidad")

curve(dexp(x - base, rate = lambda),
      from = base,
      to = max(Turbidez),
      col = "blue",
      lwd = 2,
      add = TRUE)

4. Cálculo de frecuencias observadas y esperadas

FO <- ni
FE <- numeric(length(FO))

for (i in 1:length(FO)) {
  P <- pexp(Ls[i] - base, rate = lambda) -
    pexp(Li[i] - base, rate = lambda)
  FE[i] <- n * P
}
FE
## [1] 2602.4079112  344.8989324   45.7096957    6.0579378    0.8028627
cor(FO, FE)
## [1] 0.9999471
x2 <- sum((FO - FE)^2 / FE)

gl <- length(FO) - 1 - 1   # k - 1 - parámetros
VC <- qchisq(0.95, gl)

x2
## [1] 4.630209
VC
## [1] 7.814728
x2 < VC
## [1] TRUE
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") # Crear un gráfico vacío
text(x = 1, y = 1, 
     labels = "Cálculos de probabilidad\n(Estimación general)\n
     ¿Cuál es la probabilidad de que\nel valor de Turbidez afectada
     ((NTU) \n esté entre 0.5 y 1 (NTU) ?\n\n
     R:8.69 (%)",
     cex = 1.5,  # Tamaño del texto (ajustable)
     col = "blue",  # Color del texto
     font =6) #tipo

plot(FO, FE,
     main = "Gráfica N°3: Frecuencias Observadas vs Esperadas",
     xlab = "Observadas",
     ylab = "Esperadas",
     pch = 19)

abline(lm(FE ~ FO), col = "red", lwd = 2)

P1 <- (pexp(1 - base, rate = lambda) -
         pexp(0.5 - base, rate = lambda)) * 100
P1
## [1] 8.687246
x <- seq(base, max(Turbidez), length.out = 1000)

plot(x, dexp(x - base, rate = lambda),
     type = "l",
     col = "red",
     lwd = 2,
     main = "Gráfica N°4: Modelo Exponencial de la Turbidez",
     xlab = "Turbidez (NTU)",
     ylab = "Densidad")

x_area <- seq(0.5, 1, length.out = 200)
polygon(c(0.5, x_area, 1),
        c(0, dexp(x_area - base, rate = lambda), 0),
        col = rgb(0, 0, 1, 0.3),
        border = NA)

legend("topright",
       legend = c("Modelo Exponencial", "Área [0.5 – 1]"),
       col = c("red", rgb(0, 0, 1, 0.3)),
       lwd = c(2, NA),
       pch = c(NA, 15),
       pt.cex = 2,
       bty = "n")

lim_inf <- mean(Turbidez) - 2 * sd(Turbidez) / sqrt(n)
lim_sup <- mean(Turbidez) + 2 * sd(Turbidez) / sqrt(n)

lim_inf
## [1] 4.768884
lim_sup
## [1] 5.127496
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "") # Crear un gráfico vacío
text(x = 1, y = 1,
     labels = "CONCLUSIONES",
     cex = 2, # Tamaño del texto (ajustable)
     col = "blue", # Color del texto
     font = 6) #tipo

# =========================================================
# TABLAS DE RESULTADOS DEL MODELO
# =========================================================

tabla_modelos <- data.frame(
  "Turbidez (NTU)" = c("[0.5 , 1.0]", "Media"),
  "Modelo" = c("Exponencial", ""),
  "Parámetros" = c(paste("Lambda =", round(lambda, 4)), ""),
  "Test Pearson" = c(round(cor(FO, FE), 3), ""),
  "Test Chi-cuadrado" = c(ifelse(x2 < VC, "Aprobado", "No aprobado"), "")
)

tabla_intervalos <- data.frame(
  "Intervalo de Confianza" = c("Límite Inferior", "Límite Superior"),
  "Grado Confianza (%)" = c("95", "95"),
  "Turbidez (NTU)" = c(round(lim_inf, 4), round(lim_sup, 4))
)



library(knitr)

kable(tabla_modelos,
      align = "c",
      caption = "Conclusiones del Modelo Exponencial para Turbidez (NTU)")
Conclusiones del Modelo Exponencial para Turbidez (NTU)
Turbidez..NTU. Modelo Parámetros Test.Pearson Test.Chi.cuadrado
[0.5 , 1.0] Exponencial Lambda = 0.2021 1 Aprobado
Media
kable(tabla_intervalos,
      align = "c",
      caption = "Intervalos de Confianza de la Media Poblacional")
Intervalos de Confianza de la Media Poblacional
Intervalo.de.Confianza Grado.Confianza…. Turbidez..NTU.
Límite Inferior 95 4.7689
Límite Superior 95 5.1275

5. Conclusión

La variable Turbidez (NTU) en la contaminación del agua en China presenta un comportamiento de tipo exponencial con parámetro λ = 1.99 y podemos afirmar con un 95% de confianza que la media aritmética poblacional se encuentra entre 4.7689 y 5.1275 NTU, y el modelo expresa en 99.99% la realidad.