UNIVERSIDAD CENTRAL DEL ECUADOR

PROYECTO: ESTUDIO ESTADÍSTICO DE LA CALIDAD DEL AIRE EN INDIA

FECHA: 21/11/2025

# Estadística Descriptiva
# 09/12/2025
# Ariana Viteri

# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)

# ======= LOAD DATA =======
# Read the city-level daily records. A header row, "," as field separator and
# "." as decimal mark are already the read.csv() defaults, so only the path
# needs to be given.
datos <- read.csv(file = "~/ariana tercer semestre/Estadistica/city_day.csv")

# ======= CLEAN THE PM2.5 VARIABLE =======
# Keep only usable PM2.5 readings and convert them to numeric.
# "-" is the placeholder this script filters out; NA and blank cells are
# dropped as well, because `x[x != "-"]` alone would propagate NA into the
# vector and make min()/max() further down return NA.
PM25 <- datos$PM2.5
PM25 <- PM25[!is.na(PM25) & !(trimws(PM25) %in% c("-", ""))]
PM25 <- as.numeric(PM25)

# Number of valid observations
length(PM25)

## [1] 24933

# ======= MIN, MAX, RANGE =======
max_PM25 <- max(PM25)
min_PM25 <- min(PM25)
R <- diff(c(min_PM25, max_PM25))  # range = max - min

# ======= NUMBER OF CLASSES: k = 16 =======
k <- 16
A <- R / k                        # common class width

# ======= CLASS LIMITS =======
# Lower limits start at the minimum and advance by A; upper limits are the
# same grid shifted by one width, with the exact maximum appended as the
# upper limit of the last class.
Li <- seq(min_PM25, max_PM25 - A, by = A)
Ls <- c(seq(min_PM25 + A, max_PM25 - A, by = A), max_PM25)

# Class midpoints
MC <- 0.5 * (Li + Ls)

# ======= FREQUENCY COUNTS =======
# Data and class limits are rounded to 3 decimals before comparing
# (presumably so that floating-point noise in the computed limits does not
# push boundary values into the wrong class).
PM25 <- round(PM25, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)

# Absolute frequency of each class. Every class is half-open, [Li, Ls),
# except the last one, which is closed, [Li, Ls], so the maximum is counted.
# seq_along() replaces 1:length(Li), which would iterate over c(1, 0) if Li
# were ever empty; vapply() guarantees one numeric count per class.
ni <- vapply(
  seq_along(Li),
  function(i) {
    if (i < length(Li)) {
      sum(PM25 >= Li[i] & PM25 < Ls[i])
    } else {
      sum(PM25 >= Li[i] & PM25 <= Ls[i])
    }
  },
  numeric(1)
)

# ======= DERIVED FREQUENCIES =======
N <- sum(ni)                  # total of the absolute frequencies
hi <- prop.table(ni) * 100    # relative frequency of each class, in percent

# Cumulative series: "asc" accumulates from the first class onwards,
# "desc" accumulates from the last class back to the first.
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))

# ======= INTERVAL LABELS =======
# "[Li - Ls)" for every class, except the last one, which is closed: "[Li - Ls]".
Intervalo <- paste0(
  "[", round(Li, 2), " - ", round(Ls, 2),
  rep(c(")", "]"), times = c(length(Li) - 1, 1))
)

# ======= FREQUENCY TABLE =======
# One row per class; values are rounded to 2 decimals for display only.
TDF_PM25 <- data.frame(
  Intervalo,
  MC = round(MC, 2),
  ni,
  hi = round(hi, 2),
  Ni_ascendente = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente = round(Hi_asc, 2),
  Hi_descendente = round(Hi_desc, 2)
)

# ======= TOTALS ROW =======
# Only ni and hi have meaningful totals; the other cells carry a "-"
# placeholder, which turns those columns into character after rbind().
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = sum(hi),
  Ni_ascendente = "-",
  Ni_descendente = "-",
  Hi_ascendente = "-",
  Hi_descendente = "-"
)

TDF_PM25 <- rbind(TDF_PM25, totales)

# ======= FORMATTED TABLE WITH gt() =======
# Render the frequency table with a title, a source note and black borders.
TDF_PM25 %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    subtitle = md("**Distribución de frecuencia de la concentración de PM2.5, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Left and right borders for both the body cells and the column labels,
  # declared once instead of one tab_style() call per side and location.
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    # outer frame
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    # heading and column labels
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # body
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )

Intervalo	MC	ni	hi	Ni_ascendente	Ni_descendente	Hi_ascendente	Hi_descendente
Tabla Nro. 1
Distribución de frecuencia de la concentración de PM2.5, estudio de calidad del aire en India
[0.04 - 59.41)	29.73	15069	60.44	15069	24933	60.44	100
[59.41 - 118.78)	89.1	6572	26.36	21641	9864	86.8	39.56
[118.78 - 178.16)	148.47	1772	7.11	23413	3292	93.9	13.2
[178.16 - 237.53)	207.84	820	3.29	24233	1520	97.19	6.1
[237.53 - 296.9)	267.21	401	1.61	24634	700	98.8	2.81
[296.9 - 356.27)	326.59	138	0.55	24772	299	99.35	1.2
[356.27 - 415.64)	385.96	75	0.30	24847	161	99.66	0.65
[415.64 - 475.02)	445.33	39	0.16	24886	86	99.81	0.34
[475.02 - 534.39)	504.7	16	0.06	24902	47	99.88	0.19
[534.39 - 593.76)	564.07	11	0.04	24913	31	99.92	0.12
[593.76 - 653.13)	623.44	5	0.02	24918	20	99.94	0.08
[653.13 - 712.5)	682.82	2	0.01	24920	15	99.95	0.06
[712.5 - 771.87)	742.19	2	0.01	24922	13	99.96	0.05
[771.87 - 831.25)	801.56	2	0.01	24924	11	99.96	0.04
[831.25 - 890.62)	860.93	3	0.01	24927	9	99.98	0.04
[890.62 - 949.99]	920.3	6	0.02	24933	6	100	0.02
Totales	-	24933	100.00	-	-	-	-
Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india

# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE PM2.5
# ============================================

library(gt)
library(dplyr)

# Rebuild the clean PM2.5 vector from the raw column (the previous section
# overwrote PM25 with rounded values).
# "-" is the placeholder this script filters out; NA and blank cells are
# dropped too, because `x[x != "-"]` alone would propagate NA into the vector
# and make min()/max() below return NA.
PM25 <- datos$PM2.5
PM25 <- PM25[!is.na(PM25) & !(trimws(PM25) %in% c("-", ""))]
PM25 <- as.numeric(PM25)

n <- length(PM25)   # number of valid observations

min_PM25 <- min(PM25)
max_PM25 <- max(PM25)
R <- max_PM25 - min_PM25   # range

k <- 12       # number of classes for the simplified table
A <- R / k    # common class width

# Lower limits start at the minimum and advance by A; upper limits are the
# same grid shifted by one width, with the exact maximum appended as the
# upper limit of the last class.
Lis <- seq(from = min_PM25, to = max_PM25 - A, by = A)
Lss <- c(seq(from = min_PM25 + A, to = max_PM25 - A, by = A), max_PM25)

# Class midpoints
MCs <- (Lis + Lss) / 2

# Data and class limits are rounded to 3 decimals before comparing
# (presumably so that floating-point noise in the computed limits does not
# push boundary values into the wrong class).
PM25 <- round(PM25, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)

# Absolute frequency of each class. Every class is half-open, [Lis, Lss),
# except the last one, which is closed, [Lis, Lss], so the maximum is counted.
# seq_along() replaces 1:length(Lis), which would iterate over c(1, 0) if Lis
# were ever empty; vapply() guarantees one numeric count per class.
ni <- vapply(
  seq_along(Lis),
  function(i) {
    if (i < length(Lis)) {
      sum(PM25 >= Lis[i] & PM25 < Lss[i])
    } else {
      sum(PM25 >= Lis[i] & PM25 <= Lss[i])
    }
  },
  numeric(1)
)

# RELATIVE FREQUENCIES
# Percentages are rounded to 2 decimals first; the last class then absorbs
# the rounding residue so that the column adds up to 100.
hi <- round(prop.table(ni) * 100, 2)
hi[length(hi)] <- 100 - sum(head(hi, -1))

# Cumulative series: "asc" accumulates from the first class onwards,
# "desc" accumulates from the last class back to the first.
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc <- round(cumsum(hi), 2)
Hi_desc <- round(rev(cumsum(rev(hi))), 2)

# BUILD THE SIMPLIFIED TABLE
# check.names = FALSE keeps the headers exactly as written ("hi(%)",
# "Hi_asc (%)", ...); with the default, data.frame() rewrites them to
# "hi...", "Hi_asc....", which is what ended up in the rendered table.
# The last interval is labelled with "]" because it is counted as closed.
TDF_PM25 <- data.frame(
  Intervalo = paste0(
    "[", Lis, " - ", Lss,
    rep(c(")", "]"), times = c(length(Lis) - 1, 1))
  ),
  MC = round(MCs, 3),
  ni = ni,
  `hi(%)` = hi,
  Ni_asc = Ni_asc,
  `Hi_asc (%)` = Hi_asc,
  Ni_desc = Ni_desc,
  `Hi_desc (%)` = Hi_desc,
  check.names = FALSE
)

# Totals row: only ni and hi(%) have meaningful totals; the remaining cells
# carry a "-" placeholder. It must use the same (unmangled) column names so
# that rbind() can match the columns.
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  `hi(%)` = sum(hi),
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-",
  check.names = FALSE
)

# Name of the relative-frequency column as it actually appears in the table.
col_hi <- grep("^hi", colnames(TDF_PM25), value = TRUE)

# MC and the hi column are already numeric at this point, so the former
# suppressWarnings(as.numeric(...)) conversions were no-ops and are omitted.

# Appending the totals row turns every column holding "-" into character.
TDF_PM25 <- rbind(TDF_PM25, totales)
# Formatted table (Tabla Nro. 2) rendered with gt(): title, source note and
# black borders. The subtitle typo "simploificada" is corrected.
TDF_PM25 %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de PM2.5, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Left and right borders for both the body cells and the column labels,
  # declared once instead of one tab_style() call per side and location.
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )

Intervalo	MC	ni	hi...	Ni_asc	Hi_asc....	Ni_desc	Hi_desc....
Tabla Nro. 2
Distribución de frecuencia simplificada de la concentración de PM2.5, estudio de calidad del aire en India
[0.04 - 79.203)	39.621	18527	74.31	18527	74.31	24933	100
[79.203 - 158.365)	118.784	4426	17.75	22953	92.06	6406	25.69
[158.365 - 237.528)	197.946	1280	5.13	24233	97.19	1980	7.94
[237.528 - 316.69)	277.109	464	1.86	24697	99.05	700	2.81
[316.69 - 395.853)	356.271	134	0.54	24831	99.59	236	0.95
[395.853 - 475.015)	435.434	55	0.22	24886	99.81	102	0.41
[475.015 - 554.178)	514.596	21	0.08	24907	99.89	47	0.19
[554.178 - 633.34)	593.759	8	0.03	24915	99.92	26	0.11
[633.34 - 712.503)	672.921	5	0.02	24920	99.94	18	0.08
[712.503 - 791.665)	752.084	2	0.01	24922	99.95	13	0.06
[791.665 - 870.828)	831.246	5	0.02	24927	99.97	11	0.05
[870.828 - 949.99)	910.409	6	0.03	24933	100	6	0.03
Totales	-	24933	100.00	-	-	-	-
Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india

# ========= BUILD THE HISTOGRAM OBJECT TO OBTAIN ITS BREAKS ==========
# plot = FALSE only computes the bins; the breaks and counts are reused below.
Histograma_PM25 <- hist(PM25, breaks = 11, plot = FALSE)

# ========= PLOT 1 — PM2.5 HISTOGRAM, LOCAL SCALE =========
# The y axis is scaled to the tallest bar of THIS histogram. The previous
# limit, max(ni), came from the frequency table's classes, which need not
# coincide with the bins hist() chooses, so the tallest bar could be clipped.
hist(PM25, breaks = 11,
     main = "Gráfica N°1: Distribución de la Concentración de PM2.5
     presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
     xlab = "PM2.5 (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, max(Histograma_PM25$counts)),
     col = "lightskyblue",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")   # default x axis suppressed; drawn below at the breaks

# x axis with one tick per histogram break
axis(1, at = Histograma_PM25$breaks,
     labels = Histograma_PM25$breaks,
     las = 1,
     cex.axis = 0.9)

# ========= PLOT 2 — PM2.5 HISTOGRAM, GLOBAL SCALE =========
# Same bins, but the y axis runs up to the total number of observations.
hist(PM25, breaks = 11,
     main = "Gráfica N°2: Distribución de la Concentración de PM2.5
     presente en el estudio sobre calidad del aire en India entre 2015-2020",
     xlab = "PM2.5 (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, length(PM25)),
     col = "lightskyblue",
     cex.main = 1,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")   # default x axis suppressed; drawn below at the breaks

# x axis with one tick per histogram break
axis(1, at = Histograma_PM25$breaks,
     labels = Histograma_PM25$breaks,
     las = 1,
     cex.axis = 0.9)

# --- Number of data rows (the last row of TDF_PM25 is the totals row) ---
n <- nrow(TDF_PM25) - 1

# --- Midpoints as numbers, data rows only (the totals row holds "-") ---
# seq_len(n) instead of 1:n, which would yield c(1, 0) when n is 0.
MC_num <- as.numeric(TDF_PM25$MC[seq_len(n)])

# --- Locate the relative-frequency column ---
# The pattern is anchored at the start so that it can only match the "hi..."
# column and never another name that merely contains "hi".
col_hi <- grep("^hi", names(TDF_PM25), value = TRUE)
hi_num <- as.numeric(TDF_PM25[[col_hi]][seq_len(n)])

# --- Plot 3: relative frequencies on a fixed 0-100 % scale ---
# barplot() returns the bar centres, reused below to place the axis labels.
bp <- barplot(
  height = hi_num,
  space = 0,
  col = "skyblue",
  main = "Gráfica N°3: Distribución de la concentración de PM2.5\nEstudio calidad del aire en India, 2015-2020",
  xlab = "PM2.5 (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = rep("", n),   # placeholder labels; real ones are added by axis()
  ylim = c(0, 100),
  las = 1
)

# --- X axis labelled with the class midpoints rounded to integers ---
axis(
  side = 1,
  at = bp,
  labels = round(MC_num, 0),
  las = 2,
  cex.axis = 0.8
)

# --- Number of data rows (the last row of TDF_PM25 is the totals row) ---
n <- nrow(TDF_PM25) - 1

# --- Midpoints as numbers, data rows only (the totals row holds "-") ---
# seq_len(n) instead of 1:n, which would yield c(1, 0) when n is 0.
MC_num <- as.numeric(TDF_PM25$MC[seq_len(n)])

# --- Locate the relative-frequency column ---
# The pattern is anchored at the start so that it can only match the "hi..."
# column and never another name that merely contains "hi".
col_hi <- grep("^hi", names(TDF_PM25), value = TRUE)
hi_num <- as.numeric(TDF_PM25[[col_hi]][seq_len(n)])

# --- Plot 4: relative frequencies, y axis scaled to the data (no ylim) ---
# barplot() returns the bar centres, reused below to place the axis labels.
bp4 <- barplot(
  height = hi_num,
  space = 0,
  col = "skyblue",
  main = "Gráfica N°4: Distribución de la concentración de PM2.5\nEstudio calidad del aire en India, 2015-2020",
  xlab = "PM2.5 (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = rep("", n),  # placeholder labels; real ones are added by axis()
  las = 1
)

# --- X axis labelled with the class midpoints rounded to integers ---
axis(
  side = 1,
  at = bp4,
  labels = round(MC_num, 0),
  las = 2,
  cex.axis = 0.8
)

# --- Plot 5: horizontal boxplot of PM2.5 ---
# The value returned by boxplot() is kept in CajaPM25.
CajaPM25 <- boxplot(
  x = PM25,
  main = "Gráfica No. 5: Distribución de la concentración de PM2.5\nEstudio calidad del aire en India 2015-2020",
  xlab = "PM2.5 (µg/m3)",
  horizontal = TRUE,
  border = "black",
  col = "turquoise"
)

# --- Requires Lis, Lss, ni and hi (class rows only, no totals row) ---
# Recompute the cumulative series from ni and hi.
Ni_asc  <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc  <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))

k <- length(Lis)  # number of classes

# --- Plot 6: ascending and descending ogives (counts) for PM2.5 ---
# The ascending curve is drawn at the upper class limits and the descending
# one at the lower limits. xlim spans both sets of limits: by default plot()
# takes the x range from Lss alone, which leaves the first descending point
# (at Lis[1]) outside the plotting region.
plot(Lss, Ni_asc,
     type = "b",
     main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - PM2.5",
     xlab = "PM2.5 (µg/m3)",
     ylab = "Cantidad",
     pch = 19,
     col = "turquoise",
     xlim = range(Lis, Lss),
     ylim = c(0, max(Ni_asc)))

lines(Lis, Ni_desc, type = "b", col = "red", pch = 19)

# --- Plot 7: ascending and descending ogives (percent) for PM2.5 ---
# The ascending curve is drawn at the upper class limits and the descending
# one at the lower limits. xlim spans both sets of limits: by default plot()
# takes the x range from Lss alone, which leaves the first descending point
# (at Lis[1]) outside the plotting region.
plot(Lss, Hi_asc,
     type = "b",
     main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - PM2.5",
     xlab = "PM2.5 (µg/m3)",
     ylab = "Porcentaje (%)",
     pch = 19,
     col = "blue",
     xlim = range(Lis, Lss),
     ylim = c(0, max(Hi_asc, Hi_desc)))

lines(Lis, Hi_desc, type = "b", col = "red", pch = 19)