# Load the raw dataset. The working directory is switched first because the
# CSV file is referenced by a bare file name.
setwd("C:/Users/LENOVO/OneDrive/Escritorio/ESTADISTICA")
datos <- read.csv("china_water_pollution_data.csv")

# Qualitative variable under study: the Monitoring_Station column.
Estacion <- datos$Monitoring_Station

# Frequency table with one row per distinct station. The columns produced by
# table() are renamed to "Estacion" (category) and "ni" (absolute frequency).
TDF_Estacion <- setNames(data.frame(table(Estacion)), c("Estacion", "ni"))

# Relative frequency of each station, expressed as a percentage.
TDF_Estacion$hi <- prop.table(TDF_Estacion$ni) * 100

# Totals row: sum of the absolute and of the relative frequencies.
Sumatoria <- with(
  TDF_Estacion,
  data.frame(Estacion = "Sumatoria", ni = sum(ni), hi = sum(hi))
)

# Frequency table with the totals row appended at the bottom.
TDF_Estacion_suma <- rbind(TDF_Estacion, Sumatoria)

#librerias
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.2
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(stringr)
## Warning: package 'stringr' was built under R version 4.5.2
# Final table: render the station frequency table (totals row included) with
# centered columns and a compact striped/hover Bootstrap style.
TDF_Estacion_suma |>
  kable(
    caption = "Tabla:1 Distribución de Frecuencias de las Estaciones de 
      Monitoreo del agua en China",
    align = "c"
  ) |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  )
Tabla:1 Distribución de Frecuencias de las Estaciones de Monitoreo del agua en China
Estacion ni hi
Beijing_Station_1 26 0.8666667
Beijing_Station_10 29 0.9666667
Beijing_Station_2 34 1.1333333
Beijing_Station_3 29 0.9666667
Beijing_Station_4 27 0.9000000
Beijing_Station_5 33 1.1000000
Beijing_Station_6 26 0.8666667
Beijing_Station_7 42 1.4000000
Beijing_Station_8 15 0.5000000
Beijing_Station_9 38 1.2666667
Chengdu_Station_1 17 0.5666667
Chengdu_Station_10 17 0.5666667
Chengdu_Station_2 20 0.6666667
Chengdu_Station_3 15 0.5000000
Chengdu_Station_4 16 0.5333333
Chengdu_Station_5 18 0.6000000
Chengdu_Station_6 14 0.4666667
Chengdu_Station_7 11 0.3666667
Chengdu_Station_8 18 0.6000000
Chengdu_Station_9 19 0.6333333
Dali_Station_1 13 0.4333333
Dali_Station_10 12 0.4000000
Dali_Station_2 20 0.6666667
Dali_Station_3 14 0.4666667
Dali_Station_4 12 0.4000000
Dali_Station_5 11 0.3666667
Dali_Station_6 15 0.5000000
Dali_Station_7 20 0.6666667
Dali_Station_8 14 0.4666667
Dali_Station_9 13 0.4333333
Guangzhou_Station_1 20 0.6666667
Guangzhou_Station_10 11 0.3666667
Guangzhou_Station_2 13 0.4333333
Guangzhou_Station_3 13 0.4333333
Guangzhou_Station_4 12 0.4000000
Guangzhou_Station_5 17 0.5666667
Guangzhou_Station_6 14 0.4666667
Guangzhou_Station_7 19 0.6333333
Guangzhou_Station_8 17 0.5666667
Guangzhou_Station_9 10 0.3333333
Hangzhou_Station_1 15 0.5000000
Hangzhou_Station_10 12 0.4000000
Hangzhou_Station_2 16 0.5333333
Hangzhou_Station_3 10 0.3333333
Hangzhou_Station_4 14 0.4666667
Hangzhou_Station_5 20 0.6666667
Hangzhou_Station_6 21 0.7000000
Hangzhou_Station_7 13 0.4333333
Hangzhou_Station_8 9 0.3000000
Hangzhou_Station_9 18 0.6000000
Jinan_Station_1 16 0.5333333
Jinan_Station_10 21 0.7000000
Jinan_Station_2 17 0.5666667
Jinan_Station_3 13 0.4333333
Jinan_Station_4 25 0.8333333
Jinan_Station_5 10 0.3333333
Jinan_Station_6 19 0.6333333
Jinan_Station_7 16 0.5333333
Jinan_Station_8 14 0.4666667
Jinan_Station_9 9 0.3000000
Kunming_Station_1 20 0.6666667
Kunming_Station_10 15 0.5000000
Kunming_Station_2 16 0.5333333
Kunming_Station_3 12 0.4000000
Kunming_Station_4 9 0.3000000
Kunming_Station_5 14 0.4666667
Kunming_Station_6 19 0.6333333
Kunming_Station_7 15 0.5000000
Kunming_Station_8 15 0.5000000
Kunming_Station_9 17 0.5666667
Luoyang_Station_1 18 0.6000000
Luoyang_Station_10 11 0.3666667
Luoyang_Station_2 12 0.4000000
Luoyang_Station_3 12 0.4000000
Luoyang_Station_4 11 0.3666667
Luoyang_Station_5 10 0.3333333
Luoyang_Station_6 19 0.6333333
Luoyang_Station_7 13 0.4333333
Luoyang_Station_8 13 0.4333333
Luoyang_Station_9 19 0.6333333
Mianyang_Station_1 15 0.5000000
Mianyang_Station_10 9 0.3000000
Mianyang_Station_2 10 0.3333333
Mianyang_Station_3 16 0.5333333
Mianyang_Station_4 18 0.6000000
Mianyang_Station_5 21 0.7000000
Mianyang_Station_6 18 0.6000000
Mianyang_Station_7 20 0.6666667
Mianyang_Station_8 10 0.3333333
Mianyang_Station_9 9 0.3000000
Nanjing_Station_1 8 0.2666667
Nanjing_Station_10 25 0.8333333
Nanjing_Station_2 11 0.3666667
Nanjing_Station_3 17 0.5666667
Nanjing_Station_4 20 0.6666667
Nanjing_Station_5 18 0.6000000
Nanjing_Station_6 14 0.4666667
Nanjing_Station_7 16 0.5333333
Nanjing_Station_8 12 0.4000000
Nanjing_Station_9 12 0.4000000
Ningbo_Station_1 16 0.5333333
Ningbo_Station_10 12 0.4000000
Ningbo_Station_2 14 0.4666667
Ningbo_Station_3 13 0.4333333
Ningbo_Station_4 18 0.6000000
Ningbo_Station_5 12 0.4000000
Ningbo_Station_6 19 0.6333333
Ningbo_Station_7 14 0.4666667
Ningbo_Station_8 20 0.6666667
Ningbo_Station_9 18 0.6000000
Qingdao_Station_1 15 0.5000000
Qingdao_Station_10 15 0.5000000
Qingdao_Station_2 13 0.4333333
Qingdao_Station_3 13 0.4333333
Qingdao_Station_4 9 0.3000000
Qingdao_Station_5 18 0.6000000
Qingdao_Station_6 15 0.5000000
Qingdao_Station_7 18 0.6000000
Qingdao_Station_8 13 0.4333333
Qingdao_Station_9 11 0.3666667
Shanghai_Station_1 34 1.1333333
Shanghai_Station_10 28 0.9333333
Shanghai_Station_2 39 1.3000000
Shanghai_Station_3 29 0.9666667
Shanghai_Station_4 39 1.3000000
Shanghai_Station_5 30 1.0000000
Shanghai_Station_6 28 0.9333333
Shanghai_Station_7 33 1.1000000
Shanghai_Station_8 26 0.8666667
Shanghai_Station_9 26 0.8666667
Shenzhen_Station_1 20 0.6666667
Shenzhen_Station_10 10 0.3333333
Shenzhen_Station_2 13 0.4333333
Shenzhen_Station_3 16 0.5333333
Shenzhen_Station_4 21 0.7000000
Shenzhen_Station_5 18 0.6000000
Shenzhen_Station_6 9 0.3000000
Shenzhen_Station_7 15 0.5000000
Shenzhen_Station_8 13 0.4333333
Shenzhen_Station_9 20 0.6666667
Suzhou_Station_1 11 0.3666667
Suzhou_Station_10 12 0.4000000
Suzhou_Station_2 12 0.4000000
Suzhou_Station_3 17 0.5666667
Suzhou_Station_4 11 0.3666667
Suzhou_Station_5 9 0.3000000
Suzhou_Station_6 15 0.5000000
Suzhou_Station_7 21 0.7000000
Suzhou_Station_8 16 0.5333333
Suzhou_Station_9 16 0.5333333
Wuhan_Station_1 16 0.5333333
Wuhan_Station_10 18 0.6000000
Wuhan_Station_2 13 0.4333333
Wuhan_Station_3 16 0.5333333
Wuhan_Station_4 15 0.5000000
Wuhan_Station_5 11 0.3666667
Wuhan_Station_6 19 0.6333333
Wuhan_Station_7 18 0.6000000
Wuhan_Station_8 11 0.3666667
Wuhan_Station_9 17 0.5666667
Yichang_Station_1 14 0.4666667
Yichang_Station_10 8 0.2666667
Yichang_Station_2 16 0.5333333
Yichang_Station_3 11 0.3666667
Yichang_Station_4 14 0.4666667
Yichang_Station_5 17 0.5666667
Yichang_Station_6 19 0.6333333
Yichang_Station_7 19 0.6333333
Yichang_Station_8 5 0.1666667
Yichang_Station_9 15 0.5000000
Zhengzhou_Station_1 13 0.4333333
Zhengzhou_Station_10 17 0.5666667
Zhengzhou_Station_2 19 0.6333333
Zhengzhou_Station_3 17 0.5666667
Zhengzhou_Station_4 20 0.6666667
Zhengzhou_Station_5 12 0.4000000
Zhengzhou_Station_6 12 0.4000000
Zhengzhou_Station_7 17 0.5666667
Zhengzhou_Station_8 15 0.5000000
Zhengzhou_Station_9 12 0.4000000
Sumatoria 3000 100.0000000
# Debido al gran número de Estaciones de Monitoreo, se realizó una agrupación por regiones


# Derive two grouping columns in a single mutate() call:
#   Ciudad - the leading run of ASCII letters in Monitoring_Station
#   Region - the region assigned to that city; any city that is not listed
#            below falls back to "Otra"
datos <- mutate(
  datos,
  Ciudad = str_extract(Monitoring_Station, "^[A-Za-z]+"),
  Region = case_when(
    Ciudad %in% c("Beijing", "Jinan", "Qingdao") ~ "Norte",
    Ciudad %in% c("Guangzhou", "Shenzhen", "Dali", "Kunming") ~ "Sur",
    Ciudad %in% c("Wuhan", "Zhengzhou", "Luoyang", "Yichang", "Mianyang") ~ "Centro",
    Ciudad %in% c("Shanghai", "Suzhou", "Hangzhou", "Ningbo", "Nanjing") ~ "Este",
    Ciudad %in% c("Chengdu") ~ "Oeste",
    TRUE ~ "Otra"
  )
)

# Frequency table by region: absolute frequency (ni) and relative frequency
# as a percentage rounded to two decimals (hi).
TDF_Region <- datos %>%
  count(Region, name = "ni") %>%
  mutate(
    hi = round((ni / sum(ni)) * 100, 2)
  )

# Row numbering. seq_len() is used instead of 1:nrow() so that an empty table
# gets an empty index instead of the two-element vector c(1, 0).
TDF_Region$N <- seq_len(nrow(TDF_Region))
TDF_Region <- TDF_Region[, c("N", "Region", "ni", "hi")]

# Totals row. N is left as an empty string so the cell renders blank; as a
# consequence the N column of the combined table is character, not integer.
# The hi total is the sum of the already-rounded percentages, so it can
# differ from 100 by a rounding error.
Sumatoria <- data.frame(
  N = "",
  Region = "Sumatoria",
  ni = sum(TDF_Region$ni),
  hi = sum(TDF_Region$hi)
)

# Frequency table with the totals row appended at the bottom.
TDF_Region_suma <- rbind(TDF_Region, Sumatoria)

# Final table: render the region frequency table (totals row included) with
# centered columns and a compact striped/hover Bootstrap style.
TDF_Region_suma |>
  kable(
    caption = "Tabla Nº 4:Distribución de frecuencias de las estaciones de 
  monitoreo por región en el estudio de contaminación del agua en China, 
  año 2023",
    align = "c"
  ) |>
  kable_styling(
    position = "center",
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed")
  )
Tabla Nº 4:Distribución de frecuencias de las estaciones de monitoreo por región en el estudio de contaminación del agua en China, año 2023
N Region ni hi
1 Centro 730 24.33
2 Este 909 30.30
3 Norte 599 19.97
4 Oeste 165 5.50
5 Sur 597 19.90
Sumatoria 3000 100.00
# Absolute frequencies per region, reused by the count-based charts.
ni <- TDF_Region$ni

# Charts

# Bar chart of counts on a local scale (y axis from 0 to 1000), one bar per
# region, with the axis labels drawn perpendicular to the axes (las = 2).
barplot(
  height = ni,
  names.arg = TDF_Region$Region,
  las = 2,
  col = "skyblue",
  ylim = c(0, 1000),
  xlab = "Región",
  ylab = "Cantidad",
  main = "Gráfica N°1: Distribución de las estaciones de
  monitoreo por región en el estudio de contaminación del agua en China, 
  año 2023"
)

# Relative frequencies (percentages) per region, reused by the
# percentage-based charts and by the pie chart.
hi <- TDF_Region$hi

# Bar chart of percentages on a local scale, one bar per region.
# The upper y limit is at least 30 and otherwise the largest percentage
# rounded up to the next multiple of 5, so the tallest bar always fits inside
# the axis (a fixed limit of 30 is exceeded as soon as one region is > 30 %).
barplot(
  hi,
  main = "Gráfica N°2: Distribución de frecuencias de las 
  estaciones de monitoreo por región en el estudio de contaminación del agua en 
  China, año 2023",
  xlab = "Región",
  ylab = "Porcentaje",
  col = "green",
  ylim = c(0, max(30, ceiling(max(hi) / 5) * 5)),
  names.arg = TDF_Region$Region,
  las = 2
)

# Bar chart of counts on a global scale (y axis from 0 to 3000), one bar per
# region. The x axis is labelled "Región" because the bars are the regions
# passed in names.arg.
barplot(
  ni,
  main = "Gráfica N°3: Distribución de frecuencias de las estaciones de
  monitoreo por región en el estudio de contaminación del agua en China, 
  año 2023",
  xlab = "Región",
  ylab = "Cantidad",
  col = "pink",
  ylim = c(0, 3000),
  names.arg = TDF_Region$Region,
  las = 2
)

# Bar chart of percentages on a global scale (y axis from 0 to 100), one bar
# per region, with the axis labels drawn perpendicular to the axes (las = 2).
barplot(
  height = hi,
  names.arg = TDF_Region$Region,
  las = 2,
  col = "green",
  ylim = c(0, 100),
  xlab = "Región",
  ylab = "Porcentaje",
  main = "Gráfica N°4: Distribución de frecuencias de las estaciones de 
  monitoreo por región en el estudio de contaminación del agua en China, 2023"
)

# One colour per slice: the heat.colors palette in reversed order, sized to
# the number of percentages in hi.
colores <- heat.colors(length(hi), rev = TRUE)

# Pie chart of the percentage per region; each slice is labelled with its
# percentage value followed by "%".
pie(
  x = hi,
  labels = paste0(hi, "%"),
  col = colores,
  radius = 1,
  cex = 1,
  cex.main = 1,
  main = "Gráfica N°5: Distribución de frecuencias de las estaciones de 
  monitoreo por región en el estudio de contaminación del agua en China, 2023"
)

# Legend mapping each slice colour to its region name.
legend(
  x = "bottomright",
  title = "Regiones",
  legend = TDF_Region$Region,
  fill = colores,
  cex = 0.9
)