# UNIVERSIDAD CENTRAL DEL ECUADOR

# PROYECTO: ESTUDIO ESTADÍSTICO DE LA CALIDAD DEL AIRE EN INDIA

# FECHA: 21/11/2025

# Estadística Descriptiva
# 09/12/2025
# Sebastian Chiluisa

# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)

# ======= LOAD DATA =======
# Read the daily air-quality records from a comma-separated file located
# under the user's home directory.
datos <- read.csv(
  "~/ariana tercer semestre/Estadistica/city_day.csv",
  header = TRUE,
  sep = ",",
  dec = "."
)

# ======= CLEAN THE NOx VARIABLE =======
# Missing readings may show up as the "-" placeholder, as empty strings or as
# NA, depending on how the CSV was exported. A bare `x != "-"` comparison
# yields NA for NA inputs, and indexing with NA keeps NA elements, which would
# then turn min(), max() and every later statistic into NA. All three forms
# are therefore dropped before converting to numeric.
NOx_crudo <- datos$NOx
es_valido <- !is.na(NOx_crudo) &
  !(trimws(as.character(NOx_crudo)) %in% c("-", ""))
NOx <- as.numeric(NOx_crudo[es_valido])

# Number of valid NOx observations (printed when run interactively).
length(NOx)

## [1] 25346

# ======= MIN, MAX, RANGE =======
# range() returns c(minimum, maximum); R is the spread between them.
extremos <- range(NOx)
min_NOx <- extremos[1]
max_NOx <- extremos[2]
R <- max_NOx - min_NOx

# ======= NUMBER OF CLASSES FIXED AT k = 16 =======
k <- 16
A <- R / k  # class width

# ======= CLASS LIMITS =======
# Lower limits start at the minimum and advance one width at a time; upper
# limits are the same grid shifted by one width, with the last one pinned to
# the observed maximum.
limite_final <- max_NOx - A
Li <- seq(min_NOx, limite_final, by = A)
Ls <- c(seq(min_NOx + A, limite_final, by = A), max_NOx)

# Class marks (midpoint of each class).
MC <- 0.5 * (Li + Ls)

# ======= FREQUENCY COUNTS =======
# Data and class limits are rounded to 3 decimals before counting
# (presumably so boundary comparisons are made at the same precision).
NOx <- round(NOx, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)

# Absolute frequency of each class. Every class is [Li, Ls) except the last,
# which is closed on the right so that the maximum is counted.
# seq_along() is used instead of 1:length(Li), which would iterate over
# c(1, 0) if Li were empty.
ultima_clase <- length(Li)
ni <- vapply(
  seq_along(Li),
  function(i) {
    sobre_inferior <- NOx >= Li[i]
    bajo_superior <- if (i < ultima_clase) NOx < Ls[i] else NOx <= Ls[i]
    sum(sobre_inferior & bajo_superior)
  },
  numeric(1)
)

# ======= DERIVED FREQUENCIES =======
N <- sum(ni)                 # total number of counted observations
hi <- prop.table(ni) * 100   # relative frequency of each class, in percent

# Cumulative counts and percentages. The descending version accumulates from
# the last class back to the first.
acumular_desc <- function(x) rev(cumsum(rev(x)))
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- acumular_desc(ni)
Hi_desc <- acumular_desc(hi)

# ======= INTERVAL LABELS =======
# Classes are printed as "[Li - Ls)" with limits rounded to 2 decimals; the
# last class is closed with "]".
cierre <- ifelse(seq_along(Li) < length(Li), ")", "]")
Intervalo <- paste0("[", round(Li, 2), " - ", round(Ls, 2), cierre)

# ======= FINAL TABLE =======
# Body rows: one per class. Class marks and percentages are rounded to two
# decimals for display.
cuerpo <- data.frame(
  Intervalo = Intervalo,
  MC = round(MC, 2),
  ni = ni,
  hi = round(hi, 2),
  Ni_ascendente = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente = round(Hi_asc, 2),
  Hi_descendente = round(Hi_desc, 2)
)

# ======= TOTALS ROW =======
# Only ni and hi carry a total; the remaining columns get a "-" placeholder,
# which makes those columns character once the rows are bound together.
sin_total <- "-"
totales <- data.frame(
  Intervalo = "Totales",
  MC = sin_total,
  ni = sum(ni),
  hi = sum(hi),
  Ni_ascendente = sin_total,
  Ni_descendente = sin_total,
  Hi_ascendente = sin_total,
  Hi_descendente = sin_total
)

TDF_NOx <- rbind(cuerpo, totales)

# ======= FORMATTED TABLE WITH gt() =======
# Helper: a 2px solid black border on the requested side of a cell.
borde_negro <- function(lado) {
  cell_borders(sides = lado, color = "black", weight = px(2), style = "solid")
}

TDF_NOx %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    subtitle = md("**Distribución de frecuencia de la concentración de NOx, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Vertical rules on both sides of every body cell.
  tab_style(style = borde_negro("left"), locations = cells_body()) %>%
  tab_style(style = borde_negro("right"), locations = cells_body()) %>%
  # Same vertical rules on the column-label row.
  tab_style(style = borde_negro("left"), locations = cells_column_labels()) %>%
  tab_style(style = borde_negro("right"), locations = cells_column_labels()) %>%
  tab_options(
    # Outer frame of the table.
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    # Heading and column-label separators.
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # Body rows: striping, grey horizontal lines and a black closing line.
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )

## Intervalo	MC	ni	hi	Ni_ascendente	Ni_descendente	Hi_ascendente	Hi_descendente
## Tabla Nro. 1
## Distribución de frecuencia de la concentración de NOx, estudio de calidad del aire en India
## [0 - 29.23)	14.61	15554	61.37	15554	25346	61.37	100
## [29.23 - 58.45)	43.84	6269	24.73	21823	9792	86.1	38.63
## [58.45 - 87.68)	73.07	1961	7.74	23784	3523	93.84	13.9
## [87.68 - 116.91)	102.29	830	3.27	24614	1562	97.11	6.16
## [116.91 - 146.13)	131.52	401	1.58	25015	732	98.69	2.89
## [146.13 - 175.36)	160.75	191	0.75	25206	331	99.45	1.31
## [175.36 - 204.59)	189.97	76	0.30	25282	140	99.75	0.55
## [204.59 - 233.82)	219.2	35	0.14	25317	64	99.89	0.25
## [233.82 - 263.04)	248.43	18	0.07	25335	29	99.96	0.11
## [263.04 - 292.27)	277.66	5	0.02	25340	11	99.98	0.04
## [292.27 - 321.5)	306.88	2	0.01	25342	6	99.98	0.02
## [321.5 - 350.72)	336.11	0	0.00	25342	4	99.98	0.02
## [350.72 - 379.95)	365.34	2	0.01	25344	4	99.99	0.02
## [379.95 - 409.18)	394.56	1	0.00	25345	2	100	0.01
## [409.18 - 438.4)	423.79	0	0.00	25345	1	100	0
## [438.4 - 467.63]	453.02	1	0.00	25346	1	100	0
## Totales	-	25346	100.00	-	-	-	-
## Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india

# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE NOx
# ============================================

library(gt)
library(dplyr)

# Rebuild NOx from the raw column (the earlier copy was rounded in place).
# Missing readings may appear as "-", as empty strings or as NA; a bare
# `x != "-"` comparison yields NA for NA inputs and would keep NA elements,
# so all three forms are dropped explicitly before converting to numeric.
NOx_crudo <- datos$NOx
es_valido <- !is.na(NOx_crudo) &
  !(trimws(as.character(NOx_crudo)) %in% c("-", ""))
NOx <- as.numeric(NOx_crudo[es_valido])

n <- length(NOx)  # number of valid observations

# Range of the data.
min_NOx <- min(NOx)
max_NOx <- max(NOx)
R <- max_NOx - min_NOx

# Simplified distribution: 12 classes of equal width A.
k <- 12
A <- R / k

# Lower and upper class limits; the last upper limit is the observed maximum.
Lis <- seq(from = min_NOx, to = max_NOx - A, by = A)
Lss <- c(seq(from = min_NOx + A, to = max_NOx - A, by = A), max_NOx)

# Class marks (midpoints), computed before the limits are rounded.
MCs <- (Lis + Lss) / 2

# Data and limits are rounded to 3 decimals before counting.
NOx <- round(NOx, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)

# Absolute frequency of each class. Every class is [Lis, Lss) except the
# last, which is closed on the right so that the maximum is counted.
# seq_along() is used instead of 1:length(Lis), which would iterate over
# c(1, 0) if Lis were empty.
ultima_clase <- length(Lis)
ni <- vapply(
  seq_along(Lis),
  function(i) {
    sobre_inferior <- NOx >= Lis[i]
    bajo_superior <- if (i < ultima_clase) NOx < Lss[i] else NOx <= Lss[i]
    sum(sobre_inferior & bajo_superior)
  },
  numeric(1)
)

# RELATIVE FREQUENCIES
# hi is kept at full precision and rounded only when the table is built.
# Rounding every class first and then forcing the last class to
# `100 - sum(others)` pushes the accumulated rounding error into that class
# and can make it negative (e.g. -0.01 %), which also contaminates the
# descending cumulative percentages.
hi <- (ni / sum(ni)) * 100

Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))

# TABLE CONSTRUCTION
# Classes are [Lis, Lss) except the last one, whose upper limit is included
# when counting, so its label is closed with "]".
cierre <- ifelse(seq_along(Lis) < length(Lis), ")", "]")

# check.names = FALSE keeps the labels "hi(%)", "Hi_asc (%)" and
# "Hi_desc (%)" literally; without it data.frame() rewrites them into
# syntactic names such as "hi...".
TDF_NOx <- data.frame(
  Intervalo = paste0("[", Lis, " - ", Lss, cierre),
  MC = round(MCs, 3),
  ni = ni,
  `hi(%)` = round(hi, 2),
  Ni_asc = Ni_asc,
  `Hi_asc (%)` = round(Hi_asc, 2),
  Ni_desc = Ni_desc,
  `Hi_desc (%)` = round(Hi_desc, 2),
  check.names = FALSE
)

# Totals row: only ni and hi carry a total; the other columns get "-".
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  `hi(%)` = round(sum(hi), 2),
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-",
  check.names = FALSE
)

# Actual name of the relative-frequency column in the table.
col_hi <- grep("^hi", colnames(TDF_NOx), value = TRUE)

# MC and the hi column are already numeric at this point, so no conversion
# (and no suppressWarnings()) is needed before binding the totals row.
TDF_NOx <- rbind(TDF_NOx, totales)

# FORMATTED TABLE
# Helper: a 2px solid black border on the requested side of a cell.
borde_negro <- function(lado) {
  cell_borders(sides = lado, color = "black", weight = px(2), style = "solid")
}

TDF_NOx %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de NOx, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Vertical rules on both sides of every body cell.
  tab_style(style = borde_negro("left"), locations = cells_body()) %>%
  tab_style(style = borde_negro("right"), locations = cells_body()) %>%
  # Same vertical rules on the column-label row.
  tab_style(style = borde_negro("left"), locations = cells_column_labels()) %>%
  tab_style(style = borde_negro("right"), locations = cells_column_labels()) %>%
  tab_options(
    # Outer frame of the table.
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    # Heading and column-label separators.
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # Body rows: striping, grey horizontal lines and a black closing line.
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )

## Intervalo	MC	ni	hi...	Ni_asc	Hi_asc....	Ni_desc	Hi_desc....
## Tabla Nro. 2
## Distribución de frecuencia simplificada de la concentración de NOx, estudio de calidad del aire en India
## [0 - 38.969)	19.485	18733	73.91	18733	73.91	25346	100
## [38.969 - 77.938)	58.454	4597	18.14	23330	92.05	6613	26.09
## [77.938 - 116.908)	97.423	1284	5.07	24614	97.12	2016	7.95
## [116.908 - 155.877)	136.392	488	1.93	25102	99.05	732	2.88
## [155.877 - 194.846)	175.361	156	0.62	25258	99.67	244	0.95
## [194.846 - 233.815)	214.33	59	0.23	25317	99.9	88	0.33
## [233.815 - 272.784)	253.3	20	0.08	25337	99.98	29	0.1
## [272.784 - 311.753)	292.269	5	0.02	25342	100	9	0.02
## [311.753 - 350.722)	331.238	0	0.00	25342	100	4	0
## [350.722 - 389.692)	370.207	3	0.01	25345	100.01	4	0
## [389.692 - 428.661)	409.176	0	0.00	25345	100.01	1	-0.01
## [428.661 - 467.63)	448.145	1	-0.01	25346	100	1	-0.01
## Totales	-	25346	100.00	-	-	-	-
## Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india

# ========= BUILD THE HISTOGRAM OBJECT TO GET ITS BREAKS AND COUNTS ==========
# With a single number, `breaks` is only a suggestion: hist() chooses
# "pretty" break points, so these bins are not the classes of the frequency
# table computed earlier.
Histograma_NOx <- hist(NOx, breaks = 11, plot = FALSE)

# ========= PLOT 1 — NOx HISTOGRAM, Y AXIS FITTED TO THE TALLEST BAR =========
# The y limit comes from the histogram's own counts. Using max(ni) from the
# frequency table is wrong here because that table uses different classes,
# so the tallest bar could be clipped.
hist(NOx, breaks = 11,
     main = "Gráfica N°1: Distribución de la Concentración de NOx
     presente en el estudio sobre calidad del aire en India entre 2015-2020",
     xlab = "NOx (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, max(Histograma_NOx$counts)),
     col = "#FFC300",   # warm yellow
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")

# Custom x axis: one tick at every histogram break.
axis(1, at = Histograma_NOx$breaks,
     labels = Histograma_NOx$breaks,
     las = 1,
     cex.axis = 0.9)

# ========= PLOT 2 — SAME HISTOGRAM, Y AXIS SPANNING ALL OBSERVATIONS =========
hist(NOx, breaks = 11,
     main = "Gráfica N°2: Distribución de la Concentración de NOx
     presente en el estudio sobre calidad del aire en India entre 2015-2020",
     xlab = "NOx (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, length(NOx)),
     col = "#FF5733",   # orange-red
     cex.main = 1,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")

# Custom x axis: one tick at every histogram break.
axis(1, at = Histograma_NOx$breaks,
     labels = Histograma_NOx$breaks,
     las = 1,
     cex.axis = 0.9)

# --- Class rows only: the last row of TDF_NOx is the totals row ---
n <- nrow(TDF_NOx) - 1
filas <- seq_len(n)  # safe even if the table had no class rows

# --- Class marks as numbers (MC is character once the totals row is bound) ---
MC_num <- as.numeric(TDF_NOx$MC[filas])

# --- Relative-frequency column ---
# The pattern is anchored so that only the column whose name starts with
# "hi" is selected; `[[` needs exactly one name.
col_hi <- grep("^hi", names(TDF_NOx), value = TRUE)
hi_num <- as.numeric(TDF_NOx[[col_hi]][filas])

# The values above are computed once and shared by both bar charts.

# --- Plot 3: relative frequencies on a fixed 0-100 % scale ---
bp <- barplot(
  height = hi_num,
  space = 0,
  col = "#33C3FF",   # light blue
  main = "Gráfica N°3: Distribución de la concentración de NOx\nEstudio calidad del aire en India, 2015-2020",
  xlab = "NOx (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = rep("", n),   # blank labels; the axis is drawn below
  ylim = c(0, 100),
  las = 1
)

# --- X axis labelled with class marks rounded to integers ---
axis(
  side = 1,
  at = bp,
  labels = round(MC_num, 0),
  las = 2,
  cex.axis = 0.8
)

# --- Plot 4: same bars, y axis left at barplot()'s default limits ---
bp4 <- barplot(
  height = hi_num,
  space = 0,
  col = "#33FF99",   # mint green
  main = "Gráfica N°4: Distribución de la concentración de NOx\nEstudio calidad del aire en India, 2015-2020",
  xlab = "NOx (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = rep("", n),  # blank labels; the axis is drawn below
  las = 1
)

# --- X axis labelled with class marks rounded to integers ---
axis(
  side = 1,
  at = bp4,
  labels = round(MC_num, 0),
  las = 2,
  cex.axis = 0.8
)

# --- Plot 5: horizontal box plot of NOx ---
# The value returned by boxplot() is kept in CajaNOx.
titulo_caja <- "Gráfica No. 5: Distribución de la concentración de NOx\nEstudio calidad del aire en India 2015-2020"
CajaNOx <- boxplot(
  x = NOx,
  main = titulo_caja,
  xlab = "NOx (µg/m3)",
  col = "#FFB347",   # soft orange
  border = "black",
  horizontal = TRUE
)

# --- Cumulative frequencies from the class vectors (no totals row) ---
# Requires Lis, Lss, ni and hi from the frequency-table section.
Ni_asc  <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc  <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))

k <- length(Lis)  # number of classes

# The ascending curve is drawn at the upper limits (Lss) and the descending
# curve at the lower limits (Lis). Without an explicit xlim the axis is
# fitted to Lss only, so the first descending point (at Lis[1]) falls outside
# the plot region. Both plots therefore share an x range covering all limits.
rango_x <- range(Lis, Lss)

# --- Plot 6: ogives (counts) for NOx ---
plot(Lss, Ni_asc,
     type = "b",
     main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - NOx",
     xlab = "NOx (µg/m3)",
     ylab = "Cantidad",
     pch = 19,
     col = "#33C3FF",   # light blue
     xlim = rango_x,
     ylim = c(0, max(Ni_asc)))

lines(Lis, Ni_desc, type = "b", col = "#FF5733", pch = 19)  # orange-red

# --- Plot 7: ogives (percentage) for NOx ---
plot(Lss, Hi_asc,
     type = "b",
     main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - NOx",
     xlab = "NOx (µg/m3)",
     ylab = "Porcentaje (%)",
     pch = 19,
     col = "blue",
     xlim = rango_x,
     ylim = c(0, max(Hi_asc, Hi_desc)))

lines(Lis, Hi_desc, type = "b", col = "#FF5733", pch = 19)  # orange-red