# Data loading
# NOTE(review): the working directory is a hard-coded, machine-specific path,
# so the script only runs as-is on a machine where this folder exists.
setwd(dir = "C:/Users/lenovo/OneDrive/Escritorio/ESTADISTICA")
# Read the CSV from the working directory set above.
datos <- read.csv(file = "china_water_pollution_data.csv")

# Instalar si falta
# install.packages("gt")

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gt)
## Warning: package 'gt' was built under R version 4.5.2
# ========= 1) Base table ==========
# One row per distinct Monitoring_Station; `ni` is the number of rows of
# `datos` that belong to that station.
TDF <- count(datos, Monitoring_Station, name = "ni")

# ========= 2) Intervals ==========
# Classify every station by its record count `ni` into right-closed bins:
# (0,5], (5,10], (10,20], (20,50], (50,100], (100,Inf].
TDF <- TDF %>%
  mutate(
    Intervalo = cut(
      ni,
      breaks = c(0, 5, 10, 20, 50, 100, Inf),
      labels = c(
        "1–5 registros", "6–10", "11–20", "21–50", "51–100", "100+"
      ),
      right = TRUE
    )
  )

# ========= 3) Summary table by interval ==========
# One row per interval present in TDF: number of stations in the interval
# (Num_Estaciones), total number of records (ni) and, for now, the UNROUNDED
# share of stations in percent (hi_exacto). Keeping the exact value lets the
# grand total be computed without accumulating rounding error.
tabla_intervalos <- TDF %>%
  group_by(Intervalo) %>%
  summarise(
    Num_Estaciones = n(),
    ni = sum(ni),
    .groups = "drop"
  ) %>%
  mutate(hi_exacto = Num_Estaciones / sum(Num_Estaciones) * 100)

# ========= 4) Grand total, computed from the numeric values ==========
# The totals are taken from the numeric columns instead of parsing strings
# back into numbers. Counts therefore stay integers (a double such as 1e5
# would be converted to the text "1e+05"), and the percentage total is
# rounded once, from the exact shares, rather than summing rounded values.
# Every column is then cast to character so the row can be bound to the
# interval rows, whose `Intervalo` column is text.
total_general <- tibble(
  Intervalo = "TOTAL GENERAL",
  Num_Estaciones = sum(tabla_intervalos$Num_Estaciones),
  ni = sum(tabla_intervalos$ni),
  `hi(%)` = round(sum(tabla_intervalos$hi_exacto), 2)
) %>%
  mutate(across(everything(), as.character))

# ========= 5) Final per-interval columns, cast to character ==========
# Replace the exact share with the 2-decimal `hi(%)` column shown in the
# table, then convert all columns to character so both tables have identical
# column names and types.
tabla_intervalos <- tabla_intervalos %>%
  mutate(`hi(%)` = round(hi_exacto, 2)) %>%
  select(-hi_exacto) %>%
  mutate(across(everything(), as.character))

# ========= 6) Append the total row ==========
tabla_final <- bind_rows(tabla_intervalos, total_general)

# ========= 7) Formatted gt table ==========
# Build the gt object step by step: header, source note, border/striping
# options, and finally bold text for the grand-total row.
tabla_final_gt <- gt(tabla_final)

# Title and subtitle (rendered from Markdown, both in bold).
tabla_final_gt <- tab_header(
  tabla_final_gt,
  title = md("**Tabla N° 1**"),
  subtitle = md("**Distribución de Frecuencias de las Estaciones de Monitoreo por Intervalos (2015–2023)**")
)

# Footer note with the authorship.
tabla_final_gt <- tab_source_note(
  tabla_final_gt,
  source_note = md("Autor: Grupo 1")
)

# Black outer borders and header rule; striped body rows.
tabla_final_gt <- tab_options(
  tabla_final_gt,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  column_labels.border.bottom.color = "black",
  row.striping.include_table_body = TRUE
)

# Emphasise the row whose Intervalo is the grand-total label.
tabla_final_gt <- tab_style(
  tabla_final_gt,
  style = cell_text(weight = "bold"),
  locations = cells_body(rows = Intervalo == "TOTAL GENERAL")
)

# Display the table.
tabla_final_gt
## Tabla N° 1
## Distribución de Frecuencias de las Estaciones de Monitoreo por Intervalos (2015–2023)
## Intervalo Num_Estaciones ni hi(%)
## 1–5 registros 1 5 0.56
## 6–10 17 158 9.44
## 11–20 136 2086 75.56
## 21–50 26 751 14.44
## TOTAL GENERAL 180 3000 100
## Autor: Grupo 1