# =============================
# CARGA DE LIBRERÍAS
# =============================
library(kableExtra)
library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(e1071)

# =============================
# CARGA DE DATOS
# =============================
# Load the monitoring dataset and extract the dissolved-oxygen column.
# NOTE(review): the hard-coded working directory ties the script to a single
# machine layout (presumably a Posit Cloud project) — consider launching the
# script from the project root instead. It is kept here because other parts
# of the script may rely on the working directory being set.
setwd("/cloud/project")
datos <- read.csv("china_water_pollution_data.csv", header = TRUE)

# `[[` matches the column name exactly and yields NULL when it is absent, so
# a renamed or missing column is reported here with a clear message instead
# of surfacing later as Inf/-Inf class limits.
Oxigeno_disuelto <- datos[["Dissolved_Oxygen_mg_L"]]
if (is.null(Oxigeno_disuelto) || !is.numeric(Oxigeno_disuelto)) {
  stop("La columna 'Dissolved_Oxygen_mg_L' no existe o no es numérica.",
       call. = FALSE)
}

# =============================
# TABLA DE FRECUENCIAS MANUAL
# =============================
# Manual frequency table: range, number of classes, class width, class
# limits, class marks and absolute frequencies.
minimo <- min(Oxigeno_disuelto)
maximo <- max(Oxigeno_disuelto)
R <- maximo - minimo
# Number of classes from the 1 + 3.33 * log10(n) rule, truncated.
K <- floor(1 + 3.33 * log10(length(Oxigeno_disuelto)))
A <- R / K

# A single vector of K + 1 unrounded boundaries. Building the lower and upper
# limits from two separately rounded sequences lets them disagree by 0.01
# (e.g. an upper limit of 11.23 followed by a lower limit of 11.24), and any
# observation falling in that gap is silently left out of every class.
limites <- seq(from = minimo, to = maximo, length.out = K + 1)

# Rounded copies are for display only; counting uses the exact boundaries, so
# each class's upper limit is always the next class's lower limit.
lim_inf <- round(limites[-(K + 1)], 2)
lim_sup <- round(limites[-1], 2)
MC <- (lim_inf + lim_sup) / 2

# Classes are [lower, upper), except the last one, which also includes the
# maximum. `all.inside = TRUE` keeps every observation within classes 1..K
# even if floating-point error nudges a boundary.
clase <- findInterval(Oxigeno_disuelto, limites,
                      rightmost.closed = TRUE, all.inside = TRUE)
ni <- tabulate(clase, nbins = K)

# Relative frequency (%) per class and cumulative frequencies in both
# directions.
hi <- round((ni / sum(ni)) * 100, 2)
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
# Cumulative percentages are derived from the cumulative counts rather than
# by summing the already-rounded `hi`: adding rounded values accumulates the
# rounding error, so the series would end at 99.99 instead of 100.
Hi_asc <- round((Ni_asc / sum(ni)) * 100, 2)
Hi_desc <- round((Ni_desc / sum(ni)) * 100, 2)

# Assemble the frequency table, one row per class. Only the two limit columns
# need an explicit name; data.frame() names the remaining columns after the
# variables passed in, which already carry the desired column names.
TDF_Oxigeno <- data.frame(
  Lim_inf = lim_inf, Lim_sup = lim_sup,
  MC, ni, hi,
  Ni_asc, Ni_desc,
  Hi_asc, Hi_desc
)

# Totals row appended to the frequency table.
#
# rbind() with the "TOTAL"/"-" labels turns the limit, class-mark and
# cumulative-percentage columns into character. The default conversion drops
# trailing zeros ("0.2", "5"), so the rendered columns would mix precisions.
# Formatting them first gives every value in a column the same number of
# decimals (at least two).
cuerpo <- TDF_Oxigeno
cols_texto <- c("Lim_inf", "Lim_sup", "MC", "Hi_asc", "Hi_desc")
cuerpo[cols_texto] <- lapply(
  cuerpo[cols_texto],
  function(v) format(v, nsmall = 2, trim = TRUE)
)

totales <- data.frame(
  Lim_inf = "TOTAL",
  Lim_sup = "-",
  MC = "-",
  ni = sum(ni),
  # Total taken from the unrounded shares: summing the rounded per-class
  # percentages reports 99.99 instead of 100.
  hi = round(sum(ni / sum(ni)) * 100, 2),
  Ni_asc = "-",
  Ni_desc = "-",
  Hi_asc = "-",
  Hi_desc = "-"
)

TDF_Oxigeno_total <- rbind(cuerpo, totales)

# Render the frequency table (with its totals row) as a centred, striped,
# hover-highlighted table that does not stretch to the full page width.
kable_styling(
  kable(
    TDF_Oxigeno_total,
    caption = "Tabla de distribución de Oxígeno Disuelto (mg/L)",
    align = "c"
  ),
  bootstrap_options = c("striped", "hover"),
  full_width = FALSE
)
## Tabla de distribución de Oxígeno Disuelto (mg/L)
## Lim_inf Lim_sup MC ni hi Ni_asc Ni_desc Hi_asc Hi_desc
## 1.47 2.55 2.01 6 0.20 6 2997 0.2 99.99
## 2.55 3.64 3.095 30 1.00 36 2991 1.2 99.79
## 3.64 4.72 4.18 114 3.80 150 2961 5 98.79
## 4.72 5.81 5.265 247 8.24 397 2847 13.24 94.99
## 5.81 6.89 6.35 470 15.68 867 2600 28.92 86.75
## 6.89 7.98 7.435 608 20.29 1475 2130 49.21 71.07
## 7.98 9.06 8.52 629 20.99 2104 1522 70.2 50.78
## 9.06 10.15 9.605 467 15.58 2571 893 85.78 29.79
## 10.15 11.23 10.69 274 9.14 2845 426 94.92 14.21
## 11.24 12.32 11.78 110 3.67 2955 152 98.59 5.07
## 12.32 13.4 12.86 33 1.10 2988 42 99.69 1.4
## 13.4 14.49 13.945 9 0.30 2997 9 99.99 0.3
## TOTAL - - 2997 99.99 - - - -
# =============================
# HISTOGRAMA SIMPLIFICADO
# =============================
# Simplified frequency table taken from hist()'s own binning (about 10
# classes requested; nothing is drawn here). Note that this overwrites `ni`,
# `hi`, `MC` and the cumulative vectors of the manual table.
Hist_Oxigeno <- hist(Oxigeno_disuelto, breaks = 10, plot = FALSE)

# Lower limits are every break but the last; upper limits every break but
# the first.
Li <- Hist_Oxigeno$breaks[-length(Hist_Oxigeno$breaks)]
Ls <- Hist_Oxigeno$breaks[-1]
ni <- Hist_Oxigeno$counts
hi <- round((ni / sum(ni)) * 100, 2)
MC <- Hist_Oxigeno$mids
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
# Cumulative percentages come from the cumulative counts: summing the
# already-rounded `hi` accumulates rounding error and may not reach 100.
Hi_asc <- round((Ni_asc / sum(ni)) * 100, 2)
Hi_desc <- round((Ni_desc / sum(ni)) * 100, 2)

# =============================
# GRÁFICAS
# =============================

# Histogram 1: hist()'s default binning, with an explicit frequency label on
# the y-axis.
hist(
  Oxigeno_disuelto,
  col = "skyblue",
  xlab = "Oxígeno Disuelto (mg/L)",
  ylab = "Frecuencia",
  main = "Gráfica 1: Histograma"
)

# Histogram 2 (global): same data and binning, different title and colour;
# the y-axis label is left at hist()'s default.
hist(
  Oxigeno_disuelto,
  col = "lightgreen",
  xlab = "Oxígeno Disuelto (mg/L)",
  main = "Gráfica 2: Histograma Global"
)

# Percentage bar chart: one bar per class, labelled with its class mark
# rounded to two decimals.
barplot(
  hi,
  names.arg = round(MC, digits = 2),
  ylab = "hi (%)",
  main = "Gráfica 3: Porcentaje por Intervalo",
  col = "skyblue"
)

# Local bar chart: the same percentages drawn with no gap between bars and
# without class labels.
barplot(
  hi,
  space = 0,
  ylab = "Porcentaje (%)",
  main = "Gráfica 4: Porcentaje Detallado",
  col = "lightblue"
)

# =============================
# OJIVA ASCENDENTE Y DESCENDENTE
# =============================
# Ogives: the descending cumulative frequency is drawn at the lower class
# limits (blue) and the ascending one at the upper class limits (red).
#
# Both axes are fixed from the two series together. Left to itself, plot()
# sizes the axes from `Li` and `Ni_desc` only, so the last ascending point,
# which sits at the largest upper limit, would fall outside the plotting
# region and be clipped.
plot(Li, Ni_desc,
     type = "o",
     col = "blue",
     lwd = 2,
     xlim = range(Li, Ls),
     ylim = range(Ni_asc, Ni_desc),
     main = "Ojiva Ascendente y Descendente",
     xlab = "Oxígeno Disuelto (mg/L)",
     ylab = "Frecuencia")

lines(Ls, Ni_asc, type = "o", col = "red", lwd = 2)

# Without a legend the two curves can only be told apart by reading the code.
legend("right",
       legend = c("Descendente", "Ascendente"),
       col = c("blue", "red"),
       lwd = 2,
       pch = 1,
       bty = "n")

# =============================
# DIAGRAMA DE CAJA
# =============================
# Horizontal box plot of dissolved oxygen; the measurement axis is therefore
# the x-axis, which carries the variable label.
boxplot(
  Oxigeno_disuelto,
  col = "orange",
  xlab = "Oxígeno Disuelto (mg/L)",
  main = "Gráfica 8: Diagrama de Caja",
  horizontal = TRUE
)

# =============================
# INDICADORES ESTADÍSTICOS
# =============================

# Summary indicators for dissolved oxygen, each rounded for display.
media <- round(mean(Oxigeno_disuelto), 2)
mediana <- round(median(Oxigeno_disuelto), 2)
varianza <- round(var(Oxigeno_disuelto), 2)

# Coefficient of variation (%) from the unrounded mean and standard
# deviation. Dividing the two already-rounded values carries their rounding
# error into the ratio. It is computed before `sd` is reassigned below, and
# through stats::sd so the function is referenced unambiguously.
cv <- round((stats::sd(Oxigeno_disuelto) / mean(Oxigeno_disuelto)) * 100, 2)

# NOTE(review): this variable shares its name with stats::sd(). The name is
# kept so any later code reading `sd` keeps working; consider renaming it.
sd <- round(stats::sd(Oxigeno_disuelto), 2)

# skewness() and kurtosis() are not base R functions; presumably they come
# from the e1071 package loaded at the top of the script.
asimetria <- round(skewness(Oxigeno_disuelto), 4)
curtosis <- round(kurtosis(Oxigeno_disuelto), 2)

# Outlier flag computed from the data with boxplot.stats() (1.5 * IQR rule by
# default) instead of a fixed string, so it stays correct if the data change.
hay_atipicos <- length(boxplot.stats(Oxigeno_disuelto)$out) > 0

tabla_indicadores <- data.frame(
  Variable = "Oxígeno Disuelto (mg/L)",
  # NOTE(review): fixed text, not derived from the data — presumably the
  # reference domain of the variable rather than the observed range; confirm.
  Rango = "[0;20]",
  Media = media,
  Mediana = mediana,
  # NOTE(review): fixed text, not computed from the data.
  Moda = "No existe",
  Varianza = varianza,
  Desv_Estandar = sd,
  CV = cv,
  Asimetria = asimetria,
  Curtosis = curtosis,
  Valores_Atipicos = if (hay_atipicos) "Sí existen" else "No existen"
)

# Render the one-row indicators table with centred columns.
kable(
  tabla_indicadores,
  caption = "Indicadores Estadísticos del Oxígeno Disuelto",
  align = "c"
)
## Indicadores Estadísticos del Oxígeno Disuelto
## Variable Rango Media Mediana Moda Varianza Desv_Estandar CV Asimetria Curtosis Valores_Atipicos
## Oxígeno Disuelto (mg/L) [0;20] 8.01 8.03 No existe 3.9 1.97 24.59 0.0056 -0.14 Sí existen