UNIVERSIDAD CENTRAL DEL ECUADOR

PROYECTO: ESTUDIO ESTADÍSTICO DE LA CALIDAD DEL AIRE EN INDIA

FECHA: 21/11/2025

# Estadística Descriptiva
# 09/12/2025
# Sebastian Chiluisa

# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)

# ======= LOAD DATA =======
# read.csv() already defaults to header = TRUE, sep = "," and dec = ".",
# so only the file path needs to be supplied.
datos <- read.csv(file = "~/ariana tercer semestre/Estadistica/city_day.csv")

# ======= CLEAN THE NOx VARIABLE =======
# Entries equal to "-" are discarded (presumably the file's marker for a
# missing reading). Real NA entries are discarded too: `NA != "-"` evaluates
# to NA, and indexing with NA would carry NA into NOx, which would turn the
# min()/max() computed from it into NA.
NOx <- datos$NOx[!is.na(datos$NOx) & datos$NOx != "-"]
NOx <- as.numeric(NOx)

# Number of usable observations
length(NOx)
## [1] 25346
# ======= MIN, MAX, RANGE =======
min_NOx <- min(NOx)
max_NOx <- max(NOx)
R <- max_NOx - min_NOx

# ======= NUMBER OF CLASSES: k = 16 =======
k <- 16
A <- R / k  # class width

# ======= CLASS LIMITS =======
# Lower limits: min, min + A, ..., min + (k - 1) * A  (exactly k classes).
# Upper limits are taken from the next class's lower limit, with the maximum
# closing the last class, so adjacent classes always share the very same
# boundary value instead of two independently generated ones.
Li <- min_NOx + A * (seq_len(k) - 1)
Ls <- c(Li[-1], max_NOx)

MC <- (Li + Ls) / 2  # class marks (midpoints)

# ======= ABSOLUTE FREQUENCIES =======
# Data and limits are rounded to 3 decimals before being compared.
NOx <- round(NOx, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)

# Classes are [Li, Ls), except the last one, which is closed: [Li, Ls].
# findInterval() assigns every value to its class in one vectorised pass;
# rightmost.closed = TRUE places values equal to the maximum in the last
# class. tabulate() then counts the values per class.
ni <- as.numeric(tabulate(
  findInterval(NOx, c(Li, Ls[length(Ls)]), rightmost.closed = TRUE),
  nbins = length(Li)
))

# ======= DERIVED FREQUENCIES =======
N <- sum(ni)                 # total number of counted observations
hi <- prop.table(ni) * 100   # relative frequency of each class, in percent

# Ascending accumulations (first class -> last class)
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)

# Descending accumulations: accumulate from the last class backwards and
# then restore the original class order.
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))

# ======= INTERVAL LABELS =======
# Every class is printed as "[a - b)"; only the last one is closed with "]".
Intervalo <- paste0(
  "[", round(Li, 2), " - ", round(Ls, 2),
  rep(c(")", "]"), times = c(length(Li) - 1, 1))
)

# ======= TOTALS ROW =======
# Only the absolute and relative frequencies have a meaningful total; every
# other column shows "-" in this row.
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = sum(hi),
  Ni_ascendente = "-",
  Ni_descendente = "-",
  Hi_ascendente = "-",
  Hi_descendente = "-"
)

# ======= FINAL TABLE =======
# One row per class (values rounded to 2 decimals for display), followed by
# the totals row.
TDF_NOx <- rbind(
  data.frame(
    Intervalo = Intervalo,
    MC = round(MC, 2),
    ni = ni,
    hi = round(hi, 2),
    Ni_ascendente = Ni_asc,
    Ni_descendente = Ni_desc,
    Hi_ascendente = round(Hi_asc, 2),
    Hi_descendente = round(Hi_desc, 2)
  ),
  totales
)

# ======= FORMATTED TABLE WITH gt() =======
# The same 2px solid black left/right border is applied to the body cells
# and to the column labels, so both sides and both locations are handled by
# a single tab_style() call.
TDF_NOx %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    subtitle = md("**Distribución de frecuencia de la concentración de NOx, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    # outer frame of the table
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    # title block
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    # column labels
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # body
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
Tabla Nro. 1
Distribución de frecuencia de la concentración de NOx, estudio de calidad del aire en India
Intervalo MC ni hi Ni_ascendente Ni_descendente Hi_ascendente Hi_descendente
[0 - 29.23) 14.61 15554 61.37 15554 25346 61.37 100
[29.23 - 58.45) 43.84 6269 24.73 21823 9792 86.1 38.63
[58.45 - 87.68) 73.07 1961 7.74 23784 3523 93.84 13.9
[87.68 - 116.91) 102.29 830 3.27 24614 1562 97.11 6.16
[116.91 - 146.13) 131.52 401 1.58 25015 732 98.69 2.89
[146.13 - 175.36) 160.75 191 0.75 25206 331 99.45 1.31
[175.36 - 204.59) 189.97 76 0.30 25282 140 99.75 0.55
[204.59 - 233.82) 219.2 35 0.14 25317 64 99.89 0.25
[233.82 - 263.04) 248.43 18 0.07 25335 29 99.96 0.11
[263.04 - 292.27) 277.66 5 0.02 25340 11 99.98 0.04
[292.27 - 321.5) 306.88 2 0.01 25342 6 99.98 0.02
[321.5 - 350.72) 336.11 0 0.00 25342 4 99.98 0.02
[350.72 - 379.95) 365.34 2 0.01 25344 4 99.99 0.02
[379.95 - 409.18) 394.56 1 0.00 25345 2 100 0.01
[409.18 - 438.4) 423.79 0 0.00 25345 1 100 0
[438.4 - 467.63] 453.02 1 0.00 25346 1 100 0
Totales - 25346 100.00 - - - -
Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india
# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE NOx
# ============================================

library(gt)
library(dplyr)

# Rebuild NOx from the raw column. Entries equal to "-" are discarded, and
# so are real NA entries: `NA != "-"` evaluates to NA, and indexing with NA
# would carry NA into NOx and turn min()/max() below into NA.
NOx <- datos$NOx[!is.na(datos$NOx) & datos$NOx != "-"]
NOx <- as.numeric(NOx)

n <- length(NOx)  # number of usable observations

min_NOx <- min(NOx)
max_NOx <- max(NOx)
R <- max_NOx - min_NOx

# Simplified table: 12 classes of equal width
k <- 12
A <- R / k

# Lower limits: min, min + A, ..., min + (k - 1) * A  (exactly k classes).
# Upper limits are taken from the next class's lower limit, with the maximum
# closing the last class, so adjacent classes always share the very same
# boundary value instead of two independently generated ones.
Lis <- min_NOx + A * (seq_len(k) - 1)
Lss <- c(Lis[-1], max_NOx)

MCs <- (Lis + Lss) / 2  # class marks (midpoints)

# Data and limits are rounded to 3 decimals before being compared.
NOx <- round(NOx, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)

# Classes are [Lis, Lss), except the last one, which is closed: [Lis, Lss].
# findInterval() assigns every value to its class in one vectorised pass;
# rightmost.closed = TRUE places values equal to the maximum in the last
# class. tabulate() then counts the values per class.
ni <- as.numeric(tabulate(
  findInterval(NOx, c(Lis, Lss[length(Lss)]), rightmost.closed = TRUE),
  nbins = length(Lis)
))

# RELATIVE FREQUENCIES
# Kept unrounded. Rounding every class to 2 decimals and then forcing the
# last class to absorb the residual (100 minus the sum of the others) gave
# the last class a negative frequency (-0.01). Rounding is now applied only
# to the values shown in the table.
hi <- (ni / sum(ni)) * 100

# Cumulative frequencies (ascending and descending)
Ni_asc <- cumsum(ni)
Hi_asc <- round(cumsum(hi), 2)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- round(rev(cumsum(rev(hi))), 2)

# BUILD THE TABLE
# check.names = FALSE keeps the column names exactly as written; otherwise
# data.frame() rewrites "hi(%)" as "hi..." and "Hi_asc (%)" as "Hi_asc....".
# The last class is counted as a closed interval, so its label ends in "]".
TDF_NOx <- data.frame(
  Intervalo = paste0(
    "[", Lis, " - ", Lss,
    rep(c(")", "]"), times = c(length(Lis) - 1, 1))
  ),
  MC = round(MCs, 3),
  ni = ni,
  `hi(%)` = round(hi, 2),
  Ni_asc = Ni_asc,
  `Hi_asc (%)` = Hi_asc,
  Ni_desc = Ni_desc,
  `Hi_desc (%)` = Hi_desc,
  check.names = FALSE
)

# Totals row: only ni and hi have a meaningful total; the rest show "-".
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  `hi(%)` = round(sum(hi), 2),
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-",
  check.names = FALSE
)

# MC and hi(%) are already numeric in the class rows, so no conversion is
# needed before binding. The "-" entries of the totals row turn the
# affected columns into text.
TDF_NOx <- rbind(TDF_NOx, totales)

# FORMATTED TABLE
# The same 2px solid black left/right border is applied to the body cells
# and to the column labels, so both sides and both locations are handled by
# a single tab_style() call.
TDF_NOx %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de NOx, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    # outer frame of the table
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    # title block
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    # column labels
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # body
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
Tabla Nro. 2
Distribución de frecuencia simplificada de la concentración de NOx, estudio de calidad del aire en India
Intervalo MC ni hi... Ni_asc Hi_asc.... Ni_desc Hi_desc....
[0 - 38.969) 19.485 18733 73.91 18733 73.91 25346 100
[38.969 - 77.938) 58.454 4597 18.14 23330 92.05 6613 26.09
[77.938 - 116.908) 97.423 1284 5.07 24614 97.12 2016 7.95
[116.908 - 155.877) 136.392 488 1.93 25102 99.05 732 2.88
[155.877 - 194.846) 175.361 156 0.62 25258 99.67 244 0.95
[194.846 - 233.815) 214.33 59 0.23 25317 99.9 88 0.33
[233.815 - 272.784) 253.3 20 0.08 25337 99.98 29 0.1
[272.784 - 311.753) 292.269 5 0.02 25342 100 9 0.02
[311.753 - 350.722) 331.238 0 0.00 25342 100 4 0
[350.722 - 389.692) 370.207 3 0.01 25345 100.01 4 0
[389.692 - 428.661) 409.176 0 0.00 25345 100.01 1 -0.01
[428.661 - 467.63) 448.145 1 -0.01 25346 100 1 -0.01
Totales - 25346 100.00 - - - -
Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india
# ========= COMPUTE THE HISTOGRAM ONCE TO GET ITS BREAKS AND COUNTS ==========
Histograma_NOx <- hist(NOx, breaks = 11, plot = FALSE)

# ========= PLOT 1 — NOx HISTOGRAM, Y-AXIS FITTED TO THE BARS =========
# The y-axis limit is taken from the histogram's own counts. With
# breaks = 11, hist() chooses its own bins, which are not the classes of the
# frequency table, so max(ni) can be smaller than the tallest bar and would
# cut it off.
hist(NOx, breaks = 11,
     main = "Gráfica N°1: Distribución de la Concentración de NOx
     presente en el estudio sobre calidad del aire en India entre 2015-2020",
     xlab = "NOx (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, max(Histograma_NOx$counts)),
     col = "#FFC300",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")  # default x-axis suppressed; drawn below at the breaks

axis(1, at = Histograma_NOx$breaks,
     labels = Histograma_NOx$breaks,
     las = 1,
     cex.axis = 0.9)

# ========= PLOT 2 — SAME HISTOGRAM, Y-AXIS UP TO THE SAMPLE SIZE =========
hist(NOx, breaks = 11,
     main = "Gráfica N°2: Distribución de la Concentración de NOx
     presente en el estudio sobre calidad del aire en India entre 2015-2020",
     xlab = "NOx (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, length(NOx)),
     col = "#FF5733",
     cex.main = 1,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")  # default x-axis suppressed; drawn below at the breaks

axis(1, at = Histograma_NOx$breaks,
     labels = Histograma_NOx$breaks,
     las = 1,
     cex.axis = 0.9)

# --- Number of class rows (the last row of the table is the totals row) ---
n <- nrow(TDF_NOx) - 1

# --- Class marks as numbers, class rows only ---
MC_num <- as.numeric(TDF_NOx$MC[seq_len(n)])

# --- Locate the relative-frequency column ---
# The pattern is anchored and case-sensitive so that it can only match the
# "hi..." column; the check fails loudly if zero or several columns match,
# instead of failing obscurely inside `[[`.
col_hi <- grep("^hi", names(TDF_NOx), value = TRUE)
stopifnot(length(col_hi) == 1)
hi_num <- as.numeric(TDF_NOx[[col_hi]][seq_len(n)])

# --- Plot 3: relative frequencies on a fixed 0-100 % axis ---
bp <- barplot(
  height = hi_num,
  space = 0,
  col = "#33C3FF",
  main = "Gráfica N°3: Distribución de la concentración de NOx\nEstudio calidad del aire en India, 2015-2020",
  xlab = "NOx (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = rep("", n),   # blank labels; the x-axis is drawn below
  ylim = c(0, 100),
  las = 1
)

# --- X-axis: one tick per bar, labelled with the class mark as an integer ---
axis(
  side = 1,
  at = bp,
  labels = round(MC_num, 0),
  las = 2,
  cex.axis = 0.8
)

# --- Number of class rows (the last row of the table is the totals row) ---
n <- nrow(TDF_NOx) - 1

# --- Class marks as numbers, class rows only ---
MC_num <- as.numeric(TDF_NOx$MC[seq_len(n)])

# --- Locate the relative-frequency column ---
# The pattern is anchored and case-sensitive so that it can only match the
# "hi..." column; the check fails loudly if zero or several columns match,
# instead of failing obscurely inside `[[`.
col_hi <- grep("^hi", names(TDF_NOx), value = TRUE)
stopifnot(length(col_hi) == 1)
hi_num <- as.numeric(TDF_NOx[[col_hi]][seq_len(n)])

# --- Plot 4: relative frequencies, y-axis chosen by barplot() ---
bp4 <- barplot(
  height = hi_num,
  space = 0,
  col = "#33FF99",
  main = "Gráfica N°4: Distribución de la concentración de NOx\nEstudio calidad del aire en India, 2015-2020",
  xlab = "NOx (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = rep("", n),  # blank labels; the x-axis is drawn below
  las = 1
)

# --- X-axis: one tick per bar, labelled with the class mark as an integer ---
axis(
  side = 1,
  at = bp4,
  labels = round(MC_num, 0),
  las = 2,
  cex.axis = 0.8
)

# --- Plot 5: horizontal boxplot of NOx ---
# The value returned by boxplot() (its summary statistics) is kept in
# CajaNOx.
CajaNOx <- boxplot(
  NOx,
  main = "Gráfica No. 5: Distribución de la concentración de NOx\nEstudio calidad del aire en India 2015-2020",
  xlab = "NOx (µg/m3)",
  horizontal = TRUE,
  border = "black",
  col = "#FFB347"
)

# --- Requires Lis, Lss, ni and hi of the classes (no totals row) ---
# Recompute the cumulative frequencies
Ni_asc  <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc  <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))

k <- length(Lis)  # number of classes

# --- Plot 6: ascending and descending ogives (counts) for NOx ---
# The ascending curve is drawn at the upper limits and the descending curve
# at the lower limits. xlim spans both sets of limits; without it the axis is
# fitted to Lss only and the first descending point, at Lis[1], falls
# outside the plot.
plot(Lss, Ni_asc,
     type = "b",
     main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - NOx",
     xlab = "NOx (µg/m3)",
     ylab = "Cantidad",
     pch = 19,
     col = "#33C3FF",   # ascending ogive
     xlim = range(Lis, Lss),
     ylim = c(0, max(Ni_asc)))

lines(Lis, Ni_desc, type = "b", col = "#FF5733", pch = 19)  # descending ogive

# --- Plot 7: ascending and descending ogives (percentage) for NOx ---
plot(Lss, Hi_asc,
     type = "b",
     main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - NOx",
     xlab = "NOx (µg/m3)",
     ylab = "Porcentaje (%)",
     pch = 19,
     col = "blue",   # ascending ogive
     xlim = range(Lis, Lss),
     ylim = c(0, max(Hi_asc, Hi_desc)))

lines(Lis, Hi_desc, type = "b", col = "#FF5733", pch = 19)  # descending ogive