UNIVERSIDAD CENTRAL DEL ECUADOR

PROYECTO: ESTUDIO ESTADÍSTICO DE LA CALIDAD DEL AIRE EN INDIA

FECHA: 21/11/2025

# Estadística Descriptiva
# 09/12/2025
# Ariana Viteri
# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)
# ======= LOAD DATA =======
# Read the CSV file: the first row holds the column names, fields are
# separated by commas and "." is the decimal mark.
datos <- read.csv(
  file = "~/ariana tercer semestre/Estadistica/city_day.csv",
  sep = ",", dec = ".", header = TRUE
)
# ======= CLEAN THE NO VARIABLE =======
# Missing readings may arrive as the text "-" or as NA. Comparing NA with "-"
# yields NA, and subsetting with NA keeps an NA element, which would turn
# min()/max() further down into NA; so NA is filtered out explicitly.
NO <- datos$NO
if (is.factor(NO)) {
  # as.numeric() on a factor returns level codes, not the values
  NO <- as.character(NO)
}
NO <- NO[!is.na(NO) & NO != "-"]
NO <- as.numeric(NO)  # any remaining non-numeric text becomes NA
NO <- NO[!is.na(NO)]  # drop whatever could not be parsed as a number
length(NO)   # REAL sample size after cleaning
## [1] 25949
# ======= MIN, MAX, RANGE =======
min_NO <- min(NO)
max_NO <- max(NO)
R <- max_NO - min_NO
# ======= NUMBER OF CLASSES: k = 16 (fixed) =======
k <- 16
# Class width
A <- R / k
# ======= CLASS LIMITS =======
# Build the k + 1 boundaries once and slice them into lower/upper limits.
# Two independent seq() calls accumulate floating-point error separately, so
# after rounding an upper limit and the next lower limit can differ by 0.001
# (leaving a gap), and seq(..., by = A) can return k - 1 or k + 1 values.
limites <- min_NO + A * (0:k)
limites[k + 1] <- max_NO  # pin the last boundary to the exact maximum
Li <- limites[-(k + 1)]   # lower limits: boundaries 1..k
Ls <- limites[-1]         # upper limits: boundaries 2..k+1
MC <- (Li + Ls) / 2  # class mark
# ======= FREQUENCY COUNT =======
# Work at 3 decimals so values and limits are compared on the same grid.
NO <- round(NO, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)
# Classes are [Li[i], Li[i + 1]) and the last one is closed: [Li[k], Ls[k]].
# Using the lower limits plus the final upper limit as the only cut points
# guarantees that no value is lost when a rounded upper limit and the next
# rounded lower limit differ in the last decimal.
cortes <- c(Li, Ls[length(Ls)])
# findInterval() returns the class index of each value (0 when below the
# first cut, length(cortes) when above the last); rightmost.closed = TRUE
# places values equal to the maximum in the last class.
clase <- findInterval(NO, cortes, rightmost.closed = TRUE)
# tabulate() ignores indices outside 1..nbins and keeps empty classes as 0.
ni <- as.numeric(tabulate(clase, nbins = length(Li)))
# ======= DERIVED QUANTITIES =======
# N: total count; hi: relative frequency in percent; *_asc / *_desc:
# ascending and descending cumulative counts and percentages.
N <- sum(ni)
hi <- 100 * (ni / N)
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# ======= INTERVAL LABELS =======
# Every class is shown half-open "[a - b)" except the last, which is closed.
Intervalo <- paste0(
  "[", round(Li, 2), " - ", round(Ls, 2),
  ifelse(seq_along(Li) == length(Li), "]", ")")
)
# ======= TOTALS ROW =======
# "-" marks the columns for which a total is not meaningful.
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = sum(hi),
  Ni_ascendente = "-",
  Ni_descendente = "-",
  Hi_ascendente = "-",
  Hi_descendente = "-"
)
# ======= FINAL TABLE =======
# One row per class followed by the totals row.
TDF_NO <- rbind(
  data.frame(
    Intervalo = Intervalo,
    MC = round(MC, 2),
    ni = ni,
    hi = round(hi, 2),
    Ni_ascendente = Ni_asc,
    Ni_descendente = Ni_desc,
    Hi_ascendente = round(Hi_asc, 2),
    Hi_descendente = round(Hi_desc, 2)
  ),
  totales
)
# ======= QUICK CHECK =======
# Sanity checks on the class limits and counts. Lines starting with "##" are
# the console output recorded when the script was run.
# Both limit vectors must have k = 16 elements.
length(Li)
## [1] 16
length(Ls)
## [1] 16
# The largest upper limit must equal the largest observation.
max(NO)
## [1] 390.68
max(Ls)
## [1] 390.68
range(NO)
## [1]   0.02 390.68
summary(NO)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.02    5.63    9.89   17.57   19.95  390.68
# Lower limit, upper limit and absolute frequency of each class.
# NOTE(review): in the recorded output below, row 14 ends at 341.847 while
# row 15 starts at 341.848, so the rounded limits are not contiguous there.
cbind(Li, Ls, ni)
##            Li      Ls    ni
##  [1,]   0.020  24.436 20815
##  [2,]  24.436  48.853  3209
##  [3,]  48.853  73.269  1025
##  [4,]  73.269  97.685   471
##  [5,]  97.685 122.101   215
##  [6,] 122.101 146.518   119
##  [7,] 146.518 170.934    49
##  [8,] 170.934 195.350    17
##  [9,] 195.350 219.766    12
## [10,] 219.766 244.183     5
## [11,] 244.183 268.599     3
## [12,] 268.599 293.015     5
## [13,] 293.015 317.431     1
## [14,] 317.431 341.847     0
## [15,] 341.848 366.264     1
## [16,] 366.264 390.680     2
# ======= FORMATTED TABLE WITH gt() =======
# Render the frequency table with a title, a source note and black borders.
# The subtitle names India, matching the project title and the data source
# (it previously said "China").
TDF_NO %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    subtitle = md("**Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Left and right borders on both the body cells and the column labels,
  # set in one call instead of four near-identical ones.
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
Tabla Nro. 1
Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India
Intervalo MC ni hi Ni_ascendente Ni_descendente Hi_ascendente Hi_descendente
[0.02 - 24.44) 12.23 20815 80.22 20815 25949 80.22 100
[24.44 - 48.85) 36.64 3209 12.37 24024 5134 92.58 19.78
[48.85 - 73.27) 61.06 1025 3.95 25049 1925 96.53 7.42
[73.27 - 97.68) 85.48 471 1.82 25520 900 98.35 3.47
[97.68 - 122.1) 109.89 215 0.83 25735 429 99.18 1.65
[122.1 - 146.52) 134.31 119 0.46 25854 214 99.63 0.82
[146.52 - 170.93) 158.73 49 0.19 25903 95 99.82 0.37
[170.93 - 195.35) 183.14 17 0.07 25920 46 99.89 0.18
[195.35 - 219.77) 207.56 12 0.05 25932 29 99.93 0.11
[219.77 - 244.18) 231.97 5 0.02 25937 17 99.95 0.07
[244.18 - 268.6) 256.39 3 0.01 25940 12 99.97 0.05
[268.6 - 293.02) 280.81 5 0.02 25945 9 99.98 0.03
[293.02 - 317.43) 305.22 1 0.00 25946 4 99.99 0.02
[317.43 - 341.85) 329.64 0 0.00 25946 3 99.99 0.01
[341.85 - 366.26) 354.06 1 0.00 25947 3 99.99 0.01
[366.26 - 390.68] 378.47 2 0.01 25949 2 100 0.01
Totales - 25949 100.00 - - - -
Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india
# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE NO
library(gt)
library(dplyr)
# ========= CLEAN THE NO VARIABLE ==========
# Start again from the raw column. Missing readings may arrive as the text
# "-" or as NA; NA must be filtered explicitly because NA != "-" is NA and
# subsetting with NA keeps an NA element, which would break min()/max().
NO <- datos$NO
if (is.factor(NO)) {
  # as.numeric() on a factor returns level codes, not the values
  NO <- as.character(NO)
}
NO <- NO[!is.na(NO) & NO != "-"]
NO <- as.numeric(NO)  # any remaining non-numeric text becomes NA
NO <- NO[!is.na(NO)]  # drop whatever could not be parsed as a number
# ========= NUMBER OF OBSERVATIONS ==========
n <- length(NO)
# ========= MIN, MAX, RANGE ==========
min_NO <- min(NO)
max_NO <- max(NO)
R <- max_NO - min_NO
# ========= NUMBER OF CLASSES (12) ==========
k <- 12
A <- R / k   # class width
# ========= CLASS LIMITS ==========
# Build the k + 1 boundaries once and slice them into lower/upper limits.
# Two independent seq() calls accumulate floating-point error separately, so
# after rounding an upper limit and the next lower limit can disagree in the
# last decimal, and seq(..., by = A) can return k - 1 or k + 1 values.
limites <- min_NO + A * (0:k)
limites[k + 1] <- max_NO  # pin the last boundary to the exact maximum
Lis <- limites[-(k + 1)]  # lower limits: boundaries 1..k
Lss <- limites[-1]        # upper limits: boundaries 2..k+1
# ========= CLASS MARK ==========
MCs <- (Lis + Lss) / 2
# ========= FREQUENCIES ==========
# Work at 3 decimals so values and limits are compared on the same grid.
NO <- round(NO, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)
# Classes are [Lis[i], Lis[i + 1]) and the last one is closed:
# [Lis[k], Lss[k]]. Using the lower limits plus the final upper limit as the
# only cut points guarantees that no value is lost if a rounded upper limit
# and the next rounded lower limit differ in the last decimal.
cortes <- c(Lis, Lss[length(Lss)])
# findInterval() returns the class index of each value (0 when below the
# first cut, length(cortes) when above the last); rightmost.closed = TRUE
# places values equal to the maximum in the last class.
clase <- findInterval(NO, cortes, rightmost.closed = TRUE)
# tabulate() ignores indices outside 1..nbins and keeps empty classes as 0.
ni <- as.numeric(tabulate(clase, nbins = length(Lis)))
# ========= RELATIVE FREQUENCIES (%) =========
# Keep the unrounded percentages for accumulation. Accumulating values that
# were already rounded to 2 decimals adds up the rounding errors, so the
# cumulative percentage can overshoot 100 (e.g. 100.01).
hi_exacta <- (ni / sum(ni)) * 100
hi <- round(hi_exacta, 2)  # rounded copy, for display in the table
# ========= CUMULATIVE ==========
Ni_asc <- cumsum(ni)
Hi_asc <- round(cumsum(hi_exacta), 2)             # round only after summing
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- round(rev(cumsum(rev(hi_exacta))), 2)  # round only after summing
# ========= BUILD THE TABLE ==========
# One row per class. Every interval is labelled half-open "[a - b)" except
# the last one, which is labelled closed "[a - b]" because the frequency
# count includes the maximum value in the last class.
TDF_NO <- data.frame(
  Intervalo = paste0(
    "[", Lis, " - ", Lss,
    ifelse(seq_along(Lis) == length(Lis), "]", ")")
  ),
  MC = round(MCs, 3),
  ni = ni,
  hi = hi,
  Ni_ascendente = Ni_asc,
  Hi_ascendente = Hi_asc,
  Ni_descendente = Ni_desc,
  Hi_descendente = Hi_desc
)
# Display names for the columns (set afterwards because data.frame() would
# alter names that contain spaces or parentheses).
colnames(TDF_NO) <- c(
  "Intervalo",
  "MC",
  "ni",
  "hi(%)",
  "Ni_asc",
  "Hi_asc (%)",
  "Ni_desc",
  "Hi_desc (%)"
)
# ========= TOTALS ROW ==========
# "-" marks the columns for which a total is not meaningful.
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  # Total percentage from the unrounded relative frequencies: summing the
  # already-rounded `hi` accumulates rounding error (it gave 100.01).
  hi = round(sum((ni / sum(ni)) * 100), 2),
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-"
)
# Align the column names with the table so rbind() can match them
colnames(totales) <- colnames(TDF_NO)
# Append as the last row
TDF_NO <- rbind(TDF_NO, totales)
# ========= gt TABLE ==========
# Render the simplified frequency table with a title, a source note and
# black borders.
TDF_NO %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de NO**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Left and right borders on both the body cells and the column labels.
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    # outer table frame
    table.border.top.style = "solid",
    table.border.top.color = "black",
    table.border.bottom.style = "solid",
    table.border.bottom.color = "black",
    # heading and column-label separators
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # body rows
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
Tabla Nro. 2
Distribución de frecuencia simplificada de la concentración de NO
Intervalo MC ni hi(%) Ni_asc Hi_asc (%) Ni_desc Hi_desc (%)
[0.02 - 32.575) 16.298 22463 86.57 22463 86.57 25949 100.01
[32.575 - 65.13) 48.852 2330 8.98 24793 95.55 3486 13.44
[65.13 - 97.685) 81.408 727 2.80 25520 98.35 1156 4.46
[97.685 - 130.24) 113.963 261 1.01 25781 99.36 429 1.66
[130.24 - 162.795) 146.518 113 0.44 25894 99.8 168 0.65
[162.795 - 195.35) 179.073 26 0.10 25920 99.9 55 0.21
[195.35 - 227.905) 211.628 16 0.06 25936 99.96 29 0.11
[227.905 - 260.46) 244.183 2 0.01 25938 99.97 13 0.05
[260.46 - 293.015) 276.737 7 0.03 25945 100 11 0.04
[293.015 - 325.57) 309.293 1 0.00 25946 100 4 0.01
[325.57 - 358.125) 341.847 1 0.00 25947 100 3 0.01
[358.125 - 390.68) 374.403 2 0.01 25949 100.01 2 0.01
Totales - 25949 100.01 - - - -
Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india
# ========= PLOT — HISTOGRAM OF NO =========
# Compute the histogram once without drawing it, to get its breaks and counts.
Histograma_NO <- hist(NO, breaks = 11, plot = FALSE)
hist(NO, breaks = 11,
     main = "Gráfica N°1: Distribución de la Concentración de NO
     presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
     xlab = " NO (µg/m3)",
     ylab = "Cantidad",
     # A single number in `breaks` is only a suggestion, so the bins chosen by
     # hist() need not match the classes of the frequency table. The y axis
     # is therefore scaled by the histogram's own tallest bar; scaling it by
     # max(ni) from the table can clip the first bar.
     ylim = c(0, max(Histograma_NO$counts)),
     col = "lightskyblue",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")  # suppress the default x axis; drawn below at the breaks
axis(1, at = Histograma_NO$breaks,
     labels = Histograma_NO$breaks,
     las = 1,
     cex.axis = 0.9)

# Same histogram on a global scale: the y axis runs up to the total number
# of observations.
hist(
  x = NO,
  breaks = 11,
  col = "lightskyblue",
  ylim = c(0, length(NO)),
  xaxt = "n",  # default x axis suppressed; drawn below at the bin breaks
  main = "Gráfica N°2: Distribución de la Concentración de NO
     presente en el estudio sobre calidad del aire en India entre 2015-2020",
  xlab = "NO (µg/m3)",
  ylab = "Cantidad",
  cex.main = 1,
  cex.lab = 1,
  cex.axis = 0.9
)
# x axis with one tick per break of the precomputed histogram object
axis(
  side = 1,
  at = Histograma_NO$breaks,
  labels = Histograma_NO$breaks,
  cex.axis = 0.9,
  las = 1
)

# Make sure the percentage column is numeric before plotting. The column is
# named "hi(%)" (no space); assigning to "hi (%)" would add a stray extra
# column to the table instead of converting the existing one.
TDF_NO$`hi(%)` <- as.numeric(TDF_NO$`hi(%)`)
# Bar chart of the relative frequencies on a 0-100 % scale; the last row of
# the table (totals) is excluded and bars are labelled with the class marks.
barplot(
  TDF_NO$`hi(%)`[1:(nrow(TDF_NO)-1)],
  space = 0,
  col = "skyblue",
  main = "Gráfica N°3: Distribución de la Concentración de NO,
  estudio calidad del aire en India, 2015-2020",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = TDF_NO$MC[1:(nrow(TDF_NO)-1)],
  ylim = c(0, 100)
)

# Number of rows in the table; the last row holds the totals and is left out
# of the chart.
n <- as.numeric(nrow(TDF_NO))
# Bar chart of the relative frequencies with the default (local) y scale,
# labelled with the class marks.
barplot(
  height = TDF_NO[["hi(%)"]][1:(n-1)],
  names.arg = TDF_NO[["MC"]][1:(n-1)],
  main = "Gráfica N°4: Distribución de la concentración de NO
  en el estudio calidad del aire en India, 2015-2020",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  col = "skyblue",
  space = 0
)

# Horizontal box plot of NO; the statistics returned by boxplot() are kept
# in CajaNO.
CajaNO <- boxplot(
  x = NO,
  main = "Gráfica No. 5: Distribución de la concentración de NO,
  estudio calidad del aire en India desde 2015-2020",
  xlab = "NO (µg/m3)",
  border = "black",
  col = "turquoise",
  horizontal = TRUE
)

# Recompute the cumulative series (length = k).
# The percentages are accumulated from unrounded relative frequencies: `hi`
# was rounded to 2 decimals when the table was built, and accumulating
# rounded values makes the curve end at 100.01 instead of 100.
Ni_asc  <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
hi_exacta <- (ni / sum(ni)) * 100
Hi_asc  <- cumsum(hi_exacta)
Hi_desc <- rev(cumsum(rev(hi_exacta)))
k <- length(Lis)  # number of classes
# 1) Ogive (counts): the ascending curve is plotted against the upper class
#    limits (Lss) and the descending curve against the lower limits (Lis).
plot(Lss, Ni_asc,
     type = "b",
     main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - NO",
     xlab = "NO (µg/m3)",
     ylab = "Cantidad",
     pch = 19,
     col = "turquoise",
     ylim = c(0, max(Ni_asc)))
lines(Lis, Ni_desc, type = "b", col = "red", pch = 19)

# 2) Ogive (percentage), built the same way
plot(Lss, Hi_asc,
     type = "b",
     main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - NO",
     xlab = "NO (µg/m3)",
     ylab = "Porcentaje (%)",
     pch = 19,
     col = "blue",
     ylim = c(0, max(Hi_asc, Hi_desc)))
lines(Lis, Hi_desc, type = "b", col = "red", pch = 19)