# UNIVERSIDAD CENTRAL DEL ECUADOR
#
# PROYECTO: ESTUDIO ESTADÍSTICO DE LA CALIDAD DEL AIRE EN INDIA
#
# FECHA: 21/11/2025

# Estadística Descriptiva
# 09/12/2025
# Ariana Viteri

# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)

# ======= LOAD DATA =======
# Read city_day.csv into `datos`: the first row holds the column names,
# fields are comma-separated and "." is the decimal mark.
datos <- read.csv(file = "~/ariana tercer semestre/Estadistica/city_day.csv",
                  dec = ".", sep = ",", header = TRUE)

# ======= CLEAN THE NO VARIABLE =======
# Keep only usable NO readings: drop the "-" placeholder and missing values,
# then convert to numeric. NA has to be filtered explicitly because
# `x[x != "-"]` keeps NA positions (NA != "-" evaluates to NA), and a single
# NA would make the min()/max() calls further down return NA.
NO <- as.numeric(datos$NO[!is.na(datos$NO) & datos$NO != "-"])
# Entries that could not be parsed as numbers become NA in the conversion
# (as.numeric() warns about them); drop those as well.
NO <- NO[!is.na(NO)]
# Print the number of readings kept
length(NO)

## [1] 25949

# ======= MIN, MAX, RANGE =======
min_NO <- min(NO)
max_NO <- max(NO)
R <- max_NO - min_NO

# ======= NUMBER OF CLASSES (k = 16) AND CLASS WIDTH =======
k <- 16
A <- R / k

# ======= CLASS LIMITS =======
# Build ONE vector of k + 1 class boundaries and take both the lower (Li)
# and the upper (Ls) limits from it. When Li and Ls are generated by two
# separate sequences, floating-point error can round a shared boundary
# differently on each side (e.g. 341.847 as an upper limit but 341.848 as
# the next lower limit), leaving a gap that no class covers.
limites <- min_NO + (0:k) * A
limites[k + 1] <- max_NO # pin the last boundary to the observed maximum
Li <- limites[-(k + 1)]
Ls <- limites[-1]
MC <- (Li + Ls) / 2 # class marks (midpoints), from the unrounded limits

# ======= FREQUENCY COUNTS =======
# Data and limits are rounded to 3 decimals before counting, so the counts
# are taken against the same limits that are shown in the table.
NO <- round(NO, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)

# Classes are [Li, Ls), except the last one, which is closed on the right so
# that the maximum is counted. findInterval() with rightmost.closed = TRUE
# applies exactly that rule; tabulate() then counts the readings per class.
clase <- findInterval(NO, c(Li, Ls[k]), rightmost.closed = TRUE)
ni <- as.numeric(tabulate(clase, nbins = k))

# ======= COMPLEMENTARY CALCULATIONS =======
# Total number of observations counted across all classes
N <- sum(ni)

# Relative frequency (%) per class, rounded to 3 decimals. After rounding,
# the column may add up to slightly more or less than 100; that leftover is
# assigned to the class with the largest share, where it distorts least,
# so the column total is exactly 100.000.
hi_raw <- 100 * (ni / N)
hi <- round(hi_raw, 3)

# Leftover after rounding (small; may be positive or negative)
diff_hi <- round(100 - sum(hi), 3)

if (diff_hi != 0) {
  idx_max <- which.max(hi)
  hi[idx_max] <- round(hi[idx_max] + diff_hi, 3)
}

# Cumulative absolute (Ni) and relative (Hi) frequencies. The descending
# versions accumulate from the last class back towards the first.
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))

# ======= INTERVAL LABELS =======
# Every class is written "[lower - upper)"; the last label is then replaced
# by the right-closed form "[lower - upper]".
Intervalo <- paste0("[", round(Li, 3), " - ", round(Ls, 3), ")")
ultimo <- length(Intervalo)
Intervalo[ultimo] <- paste0(
  "[", round(Li[length(Li)], 3), " - ", round(Ls[length(Ls)], 3), "]"
)

# ======= FREQUENCY TABLE =======
# One row per class: label, class mark, absolute and relative frequency,
# and the ascending/descending cumulative frequencies.
TDF_NO <- data.frame(
  Intervalo = Intervalo,
  MC = round(MC, 3),
  ni = ni,
  hi = hi,
  Ni_ascendente = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente = round(Hi_asc, 3),
  Hi_descendente = round(Hi_desc, 3)
)

# ======= TOTALS ROW =======
# Only ni and hi have a meaningful total; every other column gets "-".
totales <- data.frame(
  Intervalo = "Totales", MC = "-",
  ni = sum(ni), hi = sum(hi),
  Ni_ascendente = "-", Ni_descendente = "-",
  Hi_ascendente = "-", Hi_descendente = "-"
)

# Append the totals row below the class rows
TDF_NO <- rbind(TDF_NO, totales)

# ======= FORMATTED TABLE =======
# Render Table 1 with gt: title and subtitle, source note, 2px solid black
# left/right borders on body and column-label cells, black table/heading
# borders, gray horizontal lines and striped body rows.
TDF_NO %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    subtitle = md("**Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Vertical rules: same border on both sides of every body and label cell
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    # outer table borders
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    # heading and column-label borders
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # table body
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )

## Intervalo	MC	ni	hi	Ni_ascendente	Ni_descendente	Hi_ascendente	Hi_descendente
## Tabla Nro. 1
## Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India
## [0.02 - 24.436)	12.228	20815	80.213	20815	25949	80.213	100
## [24.436 - 48.853)	36.644	3209	12.367	24024	5134	92.58	19.787
## [48.853 - 73.269)	61.061	1025	3.950	25049	1925	96.53	7.42
## [73.269 - 97.685)	85.477	471	1.815	25520	900	98.345	3.47
## [97.685 - 122.101)	109.893	215	0.829	25735	429	99.174	1.655
## [122.101 - 146.518)	134.309	119	0.459	25854	214	99.633	0.826
## [146.518 - 170.934)	158.726	49	0.189	25903	95	99.822	0.367
## [170.934 - 195.35)	183.142	17	0.066	25920	46	99.888	0.178
## [195.35 - 219.766)	207.558	12	0.046	25932	29	99.934	0.112
## [219.766 - 244.183)	231.974	5	0.019	25937	17	99.953	0.066
## [244.183 - 268.599)	256.391	3	0.012	25940	12	99.965	0.047
## [268.599 - 293.015)	280.807	5	0.019	25945	9	99.984	0.035
## [293.015 - 317.431)	305.223	1	0.004	25946	4	99.988	0.016
## [317.431 - 341.847)	329.639	0	0.000	25946	3	99.988	0.012
## [341.848 - 366.264)	354.056	1	0.004	25947	3	99.992	0.012
## [366.264 - 390.68]	378.472	2	0.008	25949	2	100	0.008
## Totales	-	25949	100.000	-	-	-	-
## Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india

# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE NO

library(gt)
library(dplyr)

# Rebuild the NO vector from the raw column: drop the "-" placeholder and
# missing values, then convert to numeric. NA has to be filtered explicitly
# because `x[x != "-"]` keeps NA positions, and a single NA would make
# min()/max() below return NA.
NO <- as.numeric(datos$NO[!is.na(datos$NO) & datos$NO != "-"])
# Entries that could not be parsed as numbers become NA; drop those as well
NO <- NO[!is.na(NO)]

n <- length(NO)
min_NO <- min(NO)
max_NO <- max(NO)
R <- max_NO - min_NO

# 12 classes and their common width
k <- 12
A <- R / k

# Build ONE vector of k + 1 class boundaries and take both the lower (Lis)
# and the upper (Lss) limits from it. Generating them with two separate
# sequences lets floating-point error round a shared boundary differently
# on each side, which leaves a gap that no class covers.
limites <- min_NO + (0:k) * A
limites[k + 1] <- max_NO # pin the last boundary to the observed maximum
Lis <- limites[-(k + 1)]
Lss <- limites[-1]

# Class marks (midpoints), from the unrounded limits
MCs <- (Lis + Lss) / 2

# Data and limits are rounded to 3 decimals before counting, so the counts
# are taken against the same limits that are shown in the table.
NO <- round(NO, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)

# Classes are [Lis, Lss), except the last one, which is closed on the right
# so that the maximum is counted. findInterval() with rightmost.closed = TRUE
# applies exactly that rule; tabulate() then counts the readings per class.
clase <- findInterval(NO, c(Lis, Lss[k]), rightmost.closed = TRUE)
ni <- as.numeric(tabulate(clase, nbins = k))

# -------- RELATIVE FREQUENCIES THAT ADD UP TO EXACTLY 100 --------

# Relative frequency (%) per class, rounded to 3 decimals
hi <- round((ni / sum(ni)) * 100, 3)

# Rounding can leave the column a few thousandths away from 100. Assign that
# leftover to the class with the largest share, where it distorts least.
# Putting it on the last class instead can change a tiny tail frequency by a
# large relative amount (e.g. 0.008 reported as 0.007).
ajuste_hi <- round(100 - sum(hi), 3)
if (ajuste_hi != 0) {
  idx_max <- which.max(hi)
  hi[idx_max] <- round(hi[idx_max] + ajuste_hi, 3)
}

# ------- CUMULATIVE FREQUENCIES -------
# Rounded to 3 decimals to strip floating-point noise from the running sums
Hi_asc <- round(cumsum(hi), 3)
Hi_desc <- round(rev(cumsum(rev(hi))), 3)

# Force the full-range totals to be exactly 100: the last ascending value
# and the first descending value
Hi_asc[length(Hi_asc)] <- 100
Hi_desc[1] <- 100

Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))

# -------- BUILD THE TABLE --------
# Classes are left-closed, "[a - b)". The last one is written "[a - b]"
# because its count includes the upper limit (the maximum), so labelling it
# as half-open would misdescribe what was counted.
cierre <- rep(")", length(Lis))
cierre[length(cierre)] <- "]"

# check.names = FALSE keeps the display names ("hi(%)", "Hi_asc (%)", ...)
# exactly as written instead of converting them to syntactic names.
TDF_NO <- data.frame(
  Intervalo = paste0("[", Lis, " - ", Lss, cierre),
  MC = round(MCs, 3),
  ni = ni,
  `hi(%)` = hi,
  Ni_asc = Ni_asc,
  `Hi_asc (%)` = Hi_asc,
  Ni_desc = Ni_desc,
  `Hi_desc (%)` = Hi_desc,
  check.names = FALSE
)

# -------- TOTALS ROW --------
# Only ni and hi(%) have a meaningful total; every other column gets "-".
# Built with the same column names as the table so rbind() can match them.
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  `hi(%)` = 100,
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-",
  check.names = FALSE
)

TDF_NO <- rbind(TDF_NO, totales)

# -------- GT TABLE --------
# Render Table 2 with gt: title and subtitle, source note, 2px solid black
# left/right borders on body and column-label cells, black table/heading
# borders, gray horizontal lines and striped body rows.
TDF_NO %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de NO**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Vertical rules: same border on both sides of every body and label cell
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    # outer table borders
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    # heading and column-label borders
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # table body
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )

## Intervalo	MC	ni	hi(%)	Ni_asc	Hi_asc (%)	Ni_desc	Hi_desc (%)
## Tabla Nro. 2
## Distribución de frecuencia simplificada de la concentración de NO
## [0.02 - 32.575)	16.298	22463	86.566	22463	86.566	25949	100
## [32.575 - 65.13)	48.852	2330	8.979	24793	95.545	3486	13.434
## [65.13 - 97.685)	81.408	727	2.802	25520	98.347	1156	4.455
## [97.685 - 130.24)	113.963	261	1.006	25781	99.353	429	1.653
## [130.24 - 162.795)	146.518	113	0.435	25894	99.788	168	0.647
## [162.795 - 195.35)	179.073	26	0.100	25920	99.888	55	0.212
## [195.35 - 227.905)	211.628	16	0.062	25936	99.95	29	0.112
## [227.905 - 260.46)	244.183	2	0.008	25938	99.958	13	0.05
## [260.46 - 293.015)	276.737	7	0.027	25945	99.985	11	0.042
## [293.015 - 325.57)	309.293	1	0.004	25946	99.989	4	0.015
## [325.57 - 358.125)	341.847	1	0.004	25947	99.993	3	0.011
## [358.125 - 390.68)	374.403	2	0.007	25949	100	2	0.007
## Totales	-	25949	100.000	-	-	-	-
## Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india

# ========= PLOT — HISTOGRAM OF NO =========
# Compute the histogram once without drawing it, so its bin boundaries and
# counts can be reused below. `breaks = 11` is only a suggestion to hist();
# the bins actually used are the ones stored in `Histograma_NO$breaks`.
Histograma_NO <- hist(NO, breaks = 11, plot = FALSE)

# The y-axis limit is taken from this histogram's own counts. The class
# counts `ni` of the frequency table are based on different bins, so max(ni)
# can be lower than the tallest bar drawn here and would cut it off.
hist(NO, breaks = 11,
     main = "Gráfica N°1: Distribución de la Concentración de NO
     presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
     xlab = " NO (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, max(Histograma_NO$counts)),
     col = "lightskyblue",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")     # default x-axis suppressed; drawn explicitly below
# x-axis with one tick at every bin boundary
axis(1, at = Histograma_NO$breaks,
     labels = Histograma_NO$breaks,
     las = 1,
     cex.axis = 0.9)

# Same histogram drawn against a y-axis that runs from 0 to the number of
# observations, so bar heights can be read relative to the whole sample.
# The default x-axis is suppressed and redrawn with one tick at every bin
# boundary stored in `Histograma_NO`.
hist(
  NO,
  breaks = 11,
  xaxt = "n",
  col = "lightskyblue",
  ylim = c(0, length(NO)),
  xlab = "NO (µg/m3)",
  ylab = "Cantidad",
  main = "Gráfica N°2: Distribución de la Concentración de NO
     presente en el estudio sobre calidad del aire en India entre 2015-2020",
  cex.main = 1,
  cex.lab = 1,
  cex.axis = 0.9
)
axis(
  side = 1,
  at = Histograma_NO$breaks,
  labels = Histograma_NO$breaks,
  las = 1,
  cex.axis = 0.9
)

# Convert the MC and hi(%) columns to numeric. The totals row holds "-" in
# MC, so that cell becomes NA; it is excluded from the plot below.
TDF_NO[["MC"]] <- as.numeric(TDF_NO[["MC"]])
TDF_NO[["hi(%)"]] <- as.numeric(TDF_NO[["hi(%)"]])

# Number of bars: every row except the totals row
n <- nrow(TDF_NO) - 1

# --- PLOT ---
# Relative frequency per class on a fixed 0-100 % scale, bars touching
# (space = 0). Bar names are left blank; the x-axis is added afterwards.
bp <- barplot(
  height = TDF_NO[["hi(%)"]][1:n],
  names.arg = rep("", n),
  space = 0,
  ylim = c(0, 100),
  las = 1,
  col = "skyblue",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  main = "Gráfica N°3: Distribución de la Concentración de NO,\nestudio calidad del aire en India, 2015-2020"
)

# --- X-AXIS WITH ROUNDED CLASS MARKS ---
# Label each bar, at its position returned by barplot(), with its class
# mark rounded to a whole number.
MC_enteros <- round(TDF_NO[["MC"]][1:n], 0)
axis(side = 1, at = bp, labels = MC_enteros)

# Number of rows in the table, totals row included
n <- nrow(TDF_NO)

# Make sure MC and hi(%) are numeric (the "-" of the totals row becomes NA)
TDF_NO[["MC"]] <- as.numeric(TDF_NO[["MC"]])
TDF_NO[["hi(%)"]] <- as.numeric(TDF_NO[["hi(%)"]])

# --- PLOT ---
# Relative frequency per class, bars touching (space = 0). No ylim is set,
# so the y-axis adapts to the data. The totals row (last row) is left out
# and bar names are blank; the x-axis is added afterwards.
bp <- barplot(
  height = TDF_NO[["hi(%)"]][1:(n - 1)],
  names.arg = rep("", n - 1),
  space = 0,
  las = 1,
  col = "skyblue",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  main = "Gráfica N°4: Distribución de la concentración de NO
  en el estudio calidad del aire en India, 2015-2020"
)

# --- X-AXIS WITH ROUNDED CLASS MARKS ---
# Label each bar, at its position returned by barplot(), with its class
# mark rounded to a whole number.
MC_enteros <- round(TDF_NO[["MC"]][1:(n - 1)], 0)
axis(side = 1, at = bp, labels = MC_enteros)

# Horizontal box plot of the NO readings. The value returned by boxplot()
# is kept in `CajaNO`.
CajaNO <- boxplot(
  NO,
  main = "Gráfica No. 5: Distribución de la concentración de NO,
  estudio calidad del aire en India desde 2015-2020",
  xlab = "NO (µg/m3)",
  horizontal = TRUE,
  border = "black",
  col = "turquoise"
)

# Recompute the cumulative frequencies (one value per class)
Ni_asc  <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc  <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
k <- length(Lis)  # number of classes

# The ascending curve is drawn at the upper class limits (Lss) and the
# descending curve at the lower ones (Lis). The x-range therefore has to
# cover both sets of limits: if it is taken from Lss alone, the first point
# of the descending curve (at the minimum, Lis[1]) falls outside the plot.
rango_x <- range(Lis, Lss)

# 1) Ogive (counts)
plot(Lss, Ni_asc,
     type = "b",
     main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - NO",
     xlab = "NO (µg/m3)",
     ylab = "Cantidad",
     pch = 19,
     col = "turquoise",
     xlim = rango_x,
     ylim = c(0, max(Ni_asc)))
lines(Lis, Ni_desc, type = "b", col = "red", pch = 19)
# Tell the two curves apart
legend("right",
       legend = c("Ascendente", "Descendente"),
       col = c("turquoise", "red"),
       pch = 19,
       lty = 1,
       bty = "n")

# 2) Ogive (percentage)
plot(Lss, Hi_asc,
     type = "b",
     main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - NO",
     xlab = "NO (µg/m3)",
     ylab = "Porcentaje (%)",
     pch = 19,
     col = "blue",
     xlim = rango_x,
     ylim = c(0, max(Hi_asc, Hi_desc)))
lines(Lis, Hi_desc, type = "b", col = "red", pch = 19)
legend("right",
       legend = c("Ascendente", "Descendente"),
       col = c("blue", "red"),
       pch = 19,
       lty = 1,
       bty = "n")