# UNIVERSIDAD CENTRAL DEL ECUADOR
#
# PROYECTO: ESTUDIO ESTADÍSTICO DE LA CALIDAD DEL AIRE EN INDIA
#
# FECHA: 22/11/2025

# ====================================================================
# ESTADÍSTICA DESCRIPTIVA: ANÁLISIS DE CO (µg/m³) EN INDIA (2015-2020)
# Adaptado por: Ariel Chiluisa
# Fecha: 10/12/2025
# ====================================================================
library(gt)
library(dplyr)

# ============================================
# 1. DATA LOADING AND CLEANING (CO)
# ============================================
# Location of the input CSV. Edit this single value (or use a path relative
# to the working directory) when running the script on another machine.
ruta_datos <- "C:/Users/JOSELYN/Desktop/kangle/Datos Cambiados.csv"
if (!file.exists(ruta_datos)) {
  stop("No se encontró el archivo de datos: ", ruta_datos, call. = FALSE)
}
datos <- read.csv(ruta_datos, header = TRUE, sep = ",")

# Require an exact "CO" column: `datos$CO` would silently partial-match
# another column whose name merely starts with "CO".
if (!"CO" %in% names(datos)) {
  stop("El archivo de datos no contiene la columna 'CO'.", call. = FALSE)
}

# Remove the "-" placeholders, coerce to numeric, then discard whatever
# could not be parsed (as.numeric() turns it into NA).
CO_raw <- datos[["CO"]]
CO_raw <- CO_raw[!(CO_raw %in% "-")]
CO_raw <- as.numeric(as.character(CO_raw))
CO_clean <- CO_raw[!is.na(CO_raw)]
if (length(CO_clean) == 0) {
  stop("La columna 'CO' no contiene valores numéricos válidos.", call. = FALSE)
}
CO <- CO_clean # Main cleaned variable used by the rest of the script

# ====================================================================
# PART A: TABLE AND PLOTS BASED ON STURGES' RULE (Table No. 1)
# ====================================================================

# --- STURGES CALCULATIONS ---
n <- length(CO)
if (n == 0) {
  stop("No hay datos de CO para construir la tabla.", call. = FALSE)
}
min_CO <- min(CO)
max_CO <- max(CO)
R <- max_CO - min_CO
if (!is.finite(R) || R <= 0) {
  # With a zero range every class would have width 0 and hist() would fail.
  stop("El rango de CO debe ser finito y mayor que cero.", call. = FALSE)
}
k_sturges <- 1 + 3.322 * log10(n)
k <- round(k_sturges)
A <- R / k

# Derive every class limit from ONE vector of k + 1 break points, so the
# upper limit of class i is the very same number as the lower limit of
# class i + 1. Computing and rounding Li and Ls separately can leave a gap
# between adjacent classes (e.g. "... - 65.9287)" followed by "[65.9288 - ...").
breaks_intervals <- seq(from = min_CO, to = max_CO, length.out = k + 1)
breaks_intervals[k + 1] <- max_CO # pin the last break exactly on the maximum
Li <- breaks_intervals[seq_len(k)]
Ls <- breaks_intervals[-1]
MC <- (Li + Ls) / 2
# Rounded copies are for display only; counting uses the exact values.
Li_R <- round(Li, 4)
Ls_R <- round(Ls, 4)
MC_R <- round(MC, 4)

# --- FREQUENCY COUNT ---
# Classes are left-closed [Li, Ls); rightmost.closed = TRUE makes the last
# class [Li, Ls] so the maximum is counted.
clase <- findInterval(CO, breaks_intervals, rightmost.closed = TRUE)
ni <- tabulate(clase, nbins = k)

# --- DERIVED FREQUENCIES ---
hi <- (ni / sum(ni)) * 100          # relative frequency, in percent
Ni_asc <- cumsum(ni)                # cumulative absolute, ascending
Hi_asc <- cumsum(hi)                # cumulative relative, ascending
Ni_desc <- rev(cumsum(rev(ni)))     # cumulative absolute, descending
Hi_desc <- rev(cumsum(rev(hi)))     # cumulative relative, descending

# --- BUILD TABLE NO. 1 ---
# Interval labels: every class is written half-open "[a - b)" except the
# last one, which is closed "[a - b]".
cierre_intervalo <- c(rep(")", k - 1), "]")
Intervalo_txt <- paste0("[", Li_R, " - ", Ls_R, cierre_intervalo)

TDF_CO <- data.frame(
  Intervalo      = Intervalo_txt,
  MC             = MC_R,
  ni             = ni,
  hi             = round(hi, 2),
  Ni_ascendente  = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente  = round(Hi_asc, 2),
  Hi_descendente = round(Hi_desc, 2)
)

# Copy without the totals row, kept for the plots.
TDF_CO_Sturges <- TDF_CO

# Totals row: only ni and hi have a meaningful total; the remaining cells
# show "-" (which turns those columns into character after rbind()).
totales <- data.frame(
  Intervalo      = "Totales",
  MC             = "-",
  ni             = sum(ni),
  hi             = sum(hi),
  Ni_ascendente  = "-",
  Ni_descendente = "-",
  Hi_ascendente  = "-",
  Hi_descendente = "-"
)
TDF_Final <- rbind(TDF_CO, totales)

# SHOW TABLE NO. 1 (the styling itself is intentionally left unchanged)
# Helper: 2px solid black border on the requested side of a cell.
borde_negro <- function(lado) {
  cell_borders(sides = lado, color = "black", weight = px(2), style = "solid")
}

tabla_1 <- gt(TDF_Final)
tabla_1 <- tab_header(
  tabla_1,
  title = md("**Tabla Nro. 1**"),
  subtitle = md(paste0("*Distribución de frecuencia de CO (k=", k, " según Sturges)*"))
)
tabla_1 <- tab_source_note(
  tabla_1,
  source_note = md("Fuente: Elaboración propia a partir de Datos Cambiados.csv")
)
# Vertical rules on both sides of body cells, then of column labels.
tabla_1 <- tab_style(tabla_1, style = borde_negro("left"), locations = cells_body())
tabla_1 <- tab_style(tabla_1, style = borde_negro("right"), locations = cells_body())
tabla_1 <- tab_style(tabla_1, style = borde_negro("left"), locations = cells_column_labels())
tabla_1 <- tab_style(tabla_1, style = borde_negro("right"), locations = cells_column_labels())
tabla_1 <- tab_options(
  tabla_1,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  table.border.top.style = "solid",
  table.border.bottom.style = "solid",
  column_labels.border.top.color = "black",
  column_labels.border.bottom.color = "black",
  column_labels.border.bottom.width = px(2),
  row.striping.include_table_body = TRUE,
  heading.border.bottom.color = "black",
  heading.border.bottom.width = px(2),
  table_body.hlines.color = "gray",
  table_body.border.bottom.color = "black"
)
# Evaluating the object at top level renders the table.
tabla_1
# --- Rendered output pasted from a previous run (kept for reference) ---
# Tabla Nro. 1
# Distribución de frecuencia de CO (k=16 según Sturges)
# Intervalo MC ni hi Ni_ascendente Ni_descendente Hi_ascendente Hi_descendente
# [0 - 10.9881) 5.4941 26414 96.15 26414 27472 96.15 100
# [10.9881 - 21.9763) 16.4822 485 1.77 26899 1058 97.91 3.85
# [21.9763 - 32.9644) 27.4703 270 0.98 27169 573 98.9 2.09
# [32.9644 - 43.9525) 38.4584 134 0.49 27303 303 99.38 1.1
# [43.9525 - 54.9406) 49.4466 68 0.25 27371 169 99.63 0.62
# [54.9406 - 65.9287) 60.4347 37 0.13 27408 101 99.77 0.37
# [65.9288 - 76.9169) 71.4228 17 0.06 27425 64 99.83 0.23
# [76.9169 - 87.905) 82.4109 15 0.05 27440 47 99.88 0.17
# [87.905 - 98.8931) 93.3991 13 0.05 27453 32 99.93 0.12
# [98.8931 - 109.8812) 104.3872 6 0.02 27459 19 99.95 0.07
# [109.8812 - 120.8694) 115.3753 7 0.03 27466 13 99.98 0.05
# [120.8694 - 131.8575) 126.3634 1 0.00 27467 6 99.98 0.02
# [131.8575 - 142.8456) 137.3516 3 0.01 27470 5 99.99 0.02
# [142.8456 - 153.8338) 148.3397 1 0.00 27471 2 100 0.01
# [153.8338 - 164.8219) 159.3278 0 0.00 27471 1 100 0
# [164.8219 - 175.81] 170.3159 1 0.00 27472 1 100 0
# Totales - 27472 100.00 - - - -
# Fuente: Elaboración propia a partir de Datos Cambiados.csv
# ====================================================================
# PART B: TABLE AND PLOTS BASED ON K=12 (Table No. 2)
# ====================================================================

# --- K=12 CALCULATIONS ---
k <- 12 # fixed number of classes
A <- R / k

# Build the k + 1 break points in one go. seq(..., length.out = ) always
# returns exactly k + 1 values ending on max_CO, whereas stepping with
# `by = A` up to `max_CO - A` relies on floating-point luck to produce k
# classes. Lower and upper limits are slices of the same vector, so
# adjacent classes always share their common limit.
breaks_k12 <- seq(from = min_CO, to = max_CO, length.out = k + 1)
breaks_k12[k + 1] <- max_CO # pin the last break exactly on the maximum
Lis <- breaks_k12[seq_len(k)]
Lss <- breaks_k12[-1]
MCs <- (Lis + Lss) / 2

# --- FREQUENCY COUNT ---
# Count on the exact data: classes are [Lis, Lss), and the last one is
# closed on the right so the maximum is included.
clase_k12 <- findInterval(CO, breaks_k12, rightmost.closed = TRUE)
ni <- tabulate(clase_k12, nbins = k)

# --- RELATIVE AND CUMULATIVE FREQUENCIES ---
# Accumulate the exact percentages and round only for display. Rounding
# first and then pushing the leftover onto the last class misreports that
# class (e.g. 1 of 27472 observations is 0.004 %, not 0.003 %).
hi_exacto <- (ni / sum(ni)) * 100
hi <- round(hi_exacto, 3)

Hi_asc <- round(cumsum(hi_exacto), 3)
Hi_desc <- round(rev(cumsum(rev(hi_exacto))), 3)
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))

# --- BUILD TABLE NO. 2 ---
# Use the actual number of classes rather than the global `k`: assigning to
# position k of a shorter vector would silently append an extra label.
n_clases <- length(Lis)

# Every class is written half-open "[a - b)" except the last one, "[a - b]".
cierre_intervalo <- c(rep(")", n_clases - 1), "]")
Intervalo_txt <- paste0(
  "[", round(Lis, 2), " - ", round(Lss, 2), cierre_intervalo
)

TDF_CO_Simplificada <- data.frame(
  Intervalo = Intervalo_txt, MC = round(MCs, 3), ni = ni, hi = hi,
  Ni_ascendente = Ni_asc, Hi_ascendente = Hi_asc,
  Ni_descendente = Ni_desc, Hi_descendente = Hi_desc
)
# Display headers (not syntactic R names, hence set after construction).
colnames(TDF_CO_Simplificada) <- c(
  "Intervalo", "MC", "ni", "hi(%)",
  "Ni_asc", "Hi_asc (%)", "Ni_desc", "Hi_desc (%)"
)

# Totals row: only ni and hi have a meaningful total; other cells show "-".
# Column names are copied from the table so rbind() can match them.
totales <- data.frame(
  Intervalo = "Totales", MC = "-", ni = sum(ni), hi = 100,
  Ni_asc = "-", Hi_asc = "-", Ni_desc = "-", Hi_desc = "-"
)
colnames(totales) <- colnames(TDF_CO_Simplificada)
TDF_Final_Simplificada <- rbind(TDF_CO_Simplificada, totales)

# SHOW TABLE NO. 2 (the styling itself is intentionally left unchanged)
# Helper: 2px solid black border on the requested side of a cell.
borde_negro <- function(lado) {
  cell_borders(sides = lado, color = "black", weight = px(2), style = "solid")
}

tabla_2 <- gt(TDF_Final_Simplificada)
tabla_2 <- tab_header(
  tabla_2,
  title = md("**Tabla Nro. 2**"),
  subtitle = md(paste0("*Distribución de frecuencia simplificada de CO (k=", k, ")*"))
)
tabla_2 <- tab_source_note(
  tabla_2,
  source_note = md("Fuente: Elaboración propia a partir de Datos Cambiados.csv")
)
# Vertical rules on both sides of body cells, then of column labels.
tabla_2 <- tab_style(tabla_2, style = borde_negro("left"), locations = cells_body())
tabla_2 <- tab_style(tabla_2, style = borde_negro("right"), locations = cells_body())
tabla_2 <- tab_style(tabla_2, style = borde_negro("left"), locations = cells_column_labels())
tabla_2 <- tab_style(tabla_2, style = borde_negro("right"), locations = cells_column_labels())
tabla_2 <- tab_options(
  tabla_2,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  table.border.top.style = "solid",
  table.border.bottom.style = "solid",
  column_labels.border.top.color = "black",
  column_labels.border.bottom.color = "black",
  column_labels.border.bottom.width = px(2),
  row.striping.include_table_body = TRUE,
  heading.border.bottom.color = "black",
  heading.border.bottom.width = px(2),
  table_body.hlines.color = "gray",
  table_body.border.bottom.color = "black"
)
# Evaluating the object at top level renders the table.
tabla_2
# --- Rendered output pasted from a previous run (kept for reference) ---
# Tabla Nro. 2
# Distribución de frecuencia simplificada de CO (k=12)
# Intervalo MC ni hi(%) Ni_asc Hi_asc (%) Ni_desc Hi_desc (%)
# [0 - 14.65) 7.325 26628 96.928 26628 96.928 27472 100
# [14.65 - 29.3) 21.976 465 1.693 27093 98.621 844 3.072
# [29.3 - 43.95) 36.627 210 0.764 27303 99.385 379 1.379
# [43.95 - 58.6) 51.278 81 0.295 27384 99.68 169 0.615
# [58.6 - 73.25) 65.929 38 0.138 27422 99.818 88 0.32
# [73.25 - 87.9) 80.58 18 0.066 27440 99.884 50 0.182
# [87.9 - 102.56) 95.23 15 0.055 27455 99.939 32 0.116
# [102.56 - 117.21) 109.881 7 0.025 27462 99.964 17 0.061
# [117.21 - 131.86) 124.532 5 0.018 27467 99.982 10 0.036
# [131.86 - 146.51) 139.183 4 0.015 27471 99.997 5 0.018
# [146.51 - 161.16) 153.834 0 0.000 27471 99.997 1 0.003
# [161.16 - 175.81] 168.485 1 0.003 27472 100 1 0.003
# Totales - 27472 100.000 - - - -
# Fuente: Elaboración propia a partir de Datos Cambiados.csv
# --------------------------------------------------------------------
# PLOT No. 1: HISTOGRAM OF CO (Absolute frequency - Sturges)
# --------------------------------------------------------------------
# right = FALSE makes the classes left-closed [a, b), the convention used
# by the frequency table; hist() defaults to (a, b]. include.lowest = TRUE
# then closes the last class on the right so the maximum is counted.
histo_CO_Sturges <- hist(CO_clean, breaks = breaks_intervals,
                         right = FALSE, include.lowest = TRUE, plot = FALSE)

# Draw the histogram object computed above instead of binning a second
# time. The y-axis leaves 10 % headroom above the tallest bar drawn.
plot(histo_CO_Sturges,
     main = "Gráfica N°1: Histograma de Frecuencia Absoluta (Sturges)",
     xlab = "Concentración de CO (µg/m³)",
     ylab = "Frecuencia (ni)",
     ylim = c(0, max(histo_CO_Sturges$counts) * 1.1),
     col = "lightskyblue", cex.main = 0.9, cex.lab = 1, cex.axis = 0.9,
     xaxt = "n", border = "black")
# Custom x-axis: one tick per class limit, labelled as whole numbers.
axis(1, at = histo_CO_Sturges$breaks,
     labels = round(histo_CO_Sturges$breaks, 0),
     las = 2, cex.axis = 0.8)

# --------------------------------------------------------------------
# PLOT No. 2: HISTOGRAM OF ABSOLUTE FREQUENCY (Sturges)
# Y-AXIS SPANS 0 TO THE TOTAL NUMBER OF OBSERVATIONS
# --------------------------------------------------------------------

# right = FALSE makes the classes left-closed [a, b), the convention used
# by the frequency table; hist() defaults to (a, b]. include.lowest = TRUE
# then closes the last class on the right so the maximum is counted.
histo_CO_Sturges <- hist(CO_clean, breaks = breaks_intervals,
                         right = FALSE, include.lowest = TRUE, plot = FALSE)

# The upper y-limit is the sample size, taken from the data rather than
# hard-coded, so the plot stays correct if the input file changes.
plot(histo_CO_Sturges,
     main = "Gráfica N°2: Histograma de Frecuencia Absoluta (Sturges)",
     xlab = "Concentración de CO (µg/m³)",
     ylab = "Frecuencia (ni)",
     ylim = c(0, length(CO_clean)),
     col = "lightskyblue",
     cex.main = 0.9, cex.lab = 1, cex.axis = 0.9,
     xaxt = "n", border = "black")

# Custom x-axis: one tick per class limit, labelled as whole numbers.
axis(1, at = histo_CO_Sturges$breaks,
     labels = round(histo_CO_Sturges$breaks, 0),
     las = 2, cex.axis = 0.8)

# --------------------------------------------------------------------
# PLOT No. 3: HISTOGRAM OF CO (Absolute frequency - k=12)
# --------------------------------------------------------------------
breaks_simplificado <- c(Lis[1], Lss)
# right = FALSE makes the classes left-closed [a, b), the convention used
# by the frequency table; hist() defaults to (a, b]. include.lowest = TRUE
# then closes the last class on the right so the maximum is counted.
histo_CO_simplificado <- hist(CO, breaks = breaks_simplificado,
                              right = FALSE, include.lowest = TRUE,
                              plot = FALSE)

# The y-limit comes from the counts actually drawn (10 % headroom), not
# from the global `ni`, which is reassigned elsewhere in the script.
plot(histo_CO_simplificado,
     main = "Gráfica N°3: Histograma de Frecuencia Absoluta (k=12)",
     xlab = "CO (µg/m³)",
     ylab = "Frecuencia (ni)",
     ylim = c(0, max(histo_CO_simplificado$counts) * 1.1),
     col = "lightskyblue", cex.main = 0.9, cex.lab = 1, cex.axis = 0.9,
     xaxt = "n", border = "black")
# Custom x-axis: one tick per class limit, labelled as whole numbers.
axis(1, at = histo_CO_simplificado$breaks,
     labels = round(histo_CO_simplificado$breaks, 0),
     las = 2, cex.axis = 0.8)

# --------------------------------------------------------------------
# PLOT No. 4: HISTOGRAM OF CO (Absolute frequency - k=12)
# Y-AXIS SPANS 0 TO THE TOTAL NUMBER OF OBSERVATIONS
# --------------------------------------------------------------------

breaks_simplificado <- c(Lis[1], Lss)
# right = FALSE makes the classes left-closed [a, b), the convention used
# by the frequency table; hist() defaults to (a, b]. include.lowest = TRUE
# then closes the last class on the right so the maximum is counted.
histo_CO_simplificado <- hist(CO, breaks = breaks_simplificado,
                              right = FALSE, include.lowest = TRUE,
                              plot = FALSE)

# The upper y-limit is the sample size, taken from the data rather than
# hard-coded, so the plot stays correct if the input file changes.
plot(histo_CO_simplificado,
     main = "Gráfica N°4: Histograma de Frecuencia Absoluta (k=12)",
     xlab = "CO (µg/m³)",
     ylab = "Frecuencia (ni)",
     ylim = c(0, length(CO)),
     col = "lightskyblue",
     cex.main = 0.9, cex.lab = 1, cex.axis = 0.9,
     xaxt = "n", border = "black")

# Custom x-axis: one tick per class limit, labelled as whole numbers.
axis(1, at = histo_CO_simplificado$breaks,
     labels = round(histo_CO_simplificado$breaks, 0),
     las = 2, cex.axis = 0.8)

# --------------------------------------------------------------------
# PLOT No. 5: BOXPLOT
# --------------------------------------------------------------------
# Single horizontal box for the cleaned CO values.
boxplot(
  CO,
  main = "Gráfica N°5: Diagrama de Caja de la Concentración de CO",
  xlab = "CO (µg/m³)",
  horizontal = TRUE,
  col = "lightgreen",
  border = "black"
)

# --------------------------------------------------------------------
# PLOT No. 6: OGIVE (Absolute frequency - k=12)
# --------------------------------------------------------------------
# Built from the k=12 variables: Lis, Lss, Ni_asc, Ni_desc.
# x: every class limit. The ascending curve starts at 0 on the first limit;
# the descending curve ends at 0 on the last limit.
X_coordenadas <- c(Lis[1], Lss)
Y_asc <- c(0, Ni_asc)
Y_desc <- c(Ni_desc, 0)

etiquetas_ojiva <- c("Ojiva Ascendente (Ni)", "Ojiva Descendente (Ni)")
colores_ojiva <- c("darkblue", "red")

# Ascending curve sets up the plot; the descending one is overlaid on it.
plot(
  x = X_coordenadas, y = Y_asc, type = "b",
  pch = 19, col = colores_ojiva[1],
  ylim = c(0, max(Ni_asc)), xaxt = "n",
  main = "Gráfica N°6: Ojiva Ascendente y Descendente",
  xlab = "CO (µg/m³)", ylab = "Frecuencia Absoluta Acumulada (Ni)"
)
points(X_coordenadas, Y_desc, type = "b", col = colores_ojiva[2], pch = 19)

# Custom x-axis: one tick per class limit, labelled as whole numbers.
axis(1, at = X_coordenadas,
     labels = round(X_coordenadas, 0),
     las = 2, cex.axis = 0.8)
legend("topright", legend = etiquetas_ojiva,
       col = colores_ojiva, lty = 1, pch = 19, cex = 0.8)

# --------------------------------------------------------------------
# PLOT No. 7: OGIVE (Relative percentage frequency - k=12)
# --------------------------------------------------------------------
# Built from the k=12 variables: Lis, Lss, Hi_asc, Hi_desc.
# The ascending curve starts at 0 and its final point is pinned to 100;
# the descending curve has its first point pinned to 100 and ends at 0.
Y_asc_pct <- c(0, head(Hi_asc, -1), 100)
Y_desc_pct <- c(100, Hi_desc[-1], 0)

etiquetas_ojiva_pct <- c("Ojiva Ascendente (Hi%)", "Ojiva Descendente (Hi%)")
colores_ojiva_pct <- c("darkgreen", "red")

# Ascending curve sets up the plot; the descending one is overlaid on it.
plot(
  x = X_coordenadas, y = Y_asc_pct, type = "b",
  pch = 19, col = colores_ojiva_pct[1],
  ylim = c(0, 100), xaxt = "n",
  main = "Gráfica N°7: Ojiva Ascendente y Descendente (Porcentaje)",
  xlab = "CO (µg/m³)", ylab = "Frecuencia Relativa Acumulada (%)"
)
points(X_coordenadas, Y_desc_pct, type = "b",
       col = colores_ojiva_pct[2], pch = 19)

# Custom x-axis: one tick per class limit, labelled as whole numbers.
axis(1, at = X_coordenadas,
     labels = round(X_coordenadas, 0),
     las = 2, cex.axis = 0.8)
legend("topright", legend = etiquetas_ojiva_pct,
       col = colores_ojiva_pct, lty = 1, pch = 19, cex = 0.8)