# FECHA: 21/11/2025
# Estadística Descriptiva
# 09/12/2025
# Sebastian Chiluisa
# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)
# ======= LOAD DATA =======
# Read the CSV file into the data frame `datos`: the first row holds the
# column names, fields are comma-separated and "." is the decimal mark.
datos <- read.csv(
  file = "~/ariana tercer semestre/Estadistica/city_day.csv",
  dec = ".",
  sep = ",",
  header = TRUE
)
# ======= CLEANING OF THE NOx VARIABLE =======
# Keep only usable NOx readings: entries equal to "-" are dropped, and so are
# NA entries. The explicit is.na() test matters because `NA != "-"` is NA, and
# indexing with NA keeps an NA element in the result, which would make the
# later min()/max() calls return NA.
NOx_valido <- !is.na(datos$NOx) & datos$NOx != "-"
NOx <- as.numeric(datos$NOx[NOx_valido])
# Number of readings that remain after cleaning.
length(NOx)
## [1] 25346
# ======= MIN, MAX, RANGE =======
# Extremes of the cleaned NOx vector and the distance between them.
max_NOx <- max(NOx)
min_NOx <- min(NOx)
R <- diff(c(min_NOx, max_NOx))
# ======= WE USE k = 16 =======
# Number of classes and the resulting class width.
k <- 16
A <- R / k
# ======= INTERVAL GENERATION =======
# Lower limits start at the minimum and advance in steps of the class width A.
Li <- seq(min_NOx, max_NOx - A, by = A)
# Upper limits are the same grid shifted one width to the right; the overall
# maximum is appended as the upper limit of the last class.
Ls_internos <- seq(min_NOx + A, max_NOx - A, by = A)
Ls <- c(Ls_internos, max_NOx)
# Class marks: midpoint of each interval.
MC <- (Ls + Li) / 2
# ======= FREQUENCY COUNT =======
# Data and class limits are rounded to the same 3 decimals so the edge
# comparisons below are made on equally-rounded values.
NOx <- round(NOx, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)
# Absolute frequency of each class. Every class is [Li, Ls) except the last
# one, which is closed on the right so the maximum value is counted.
# seq_along() replaces 1:length(Li), and vapply() guarantees one numeric
# count per class.
ni <- vapply(
  seq_along(Li),
  function(i) {
    desde_li <- NOx >= Li[i]
    if (i < length(Li)) {
      sum(desde_li & NOx < Ls[i])
    } else {
      sum(desde_li & NOx <= Ls[i])
    }
  },
  numeric(1)
)
# ======= COMPLEMENTARY CALCULATIONS =======
# Total number of counted observations and relative frequency (in %) per class.
N <- sum(ni)
hi <- 100 * (ni / N)
# Ascending accumulations (absolute and relative).
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
# Descending accumulations: accumulate from the last class backwards, then
# restore the original class order.
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# ======= INTERVAL LABELS =======
# Text label of each class with limits shown to 2 decimals. All classes are
# written right-open, "[a - b)", except the last one, written closed "[a - b]".
Intervalo <- paste0(
  "[", round(Li, 2), " - ", round(Ls, 2),
  c(rep(")", length(Li) - 1), "]")
)
# ======= TOTALS ROW =======
# Single-row data frame appended at the bottom of the table; columns where a
# total is not shown carry "-".
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = sum(hi),
  Ni_ascendente = "-",
  Ni_descendente = "-",
  Hi_ascendente = "-",
  Hi_descendente = "-"
)
# ======= FINAL TABLE =======
# Frequency distribution table (one row per class, values rounded to 2
# decimals for display) followed by the totals row.
TDF_NOx <- rbind(
  data.frame(
    Intervalo = Intervalo,
    MC = round(MC, 2),
    ni = ni,
    hi = round(hi, 2),
    Ni_ascendente = Ni_asc,
    Ni_descendente = Ni_desc,
    Hi_ascendente = round(Hi_asc, 2),
    Hi_descendente = round(Hi_desc, 2)
  ),
  totales
)
# ======= FORMATTED TABLE WITH gt() =======
# Solid 2px black side borders, reused for body cells and column labels.
borde_izq <- cell_borders(sides = "left", color = "black", weight = px(2), style = "solid")
borde_der <- cell_borders(sides = "right", color = "black", weight = px(2), style = "solid")
# Build the gt table step by step: header, source note, borders, options.
tabla_1 <- gt(TDF_NOx)
tabla_1 <- tab_header(
  tabla_1,
  title = md("*Tabla Nro. 1*"),
  subtitle = md("**Distribución de frecuencia de la concentración de NOx, estudio de calidad del aire en India**")
)
tabla_1 <- tab_source_note(
  tabla_1,
  source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
)
tabla_1 <- tab_style(tabla_1, style = borde_izq, locations = cells_body())
tabla_1 <- tab_style(tabla_1, style = borde_der, locations = cells_body())
tabla_1 <- tab_style(tabla_1, style = borde_izq, locations = cells_column_labels())
tabla_1 <- tab_style(tabla_1, style = borde_der, locations = cells_column_labels())
tabla_1 <- tab_options(
  tabla_1,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  table.border.top.style = "solid",
  table.border.bottom.style = "solid",
  column_labels.border.top.color = "black",
  column_labels.border.bottom.color = "black",
  column_labels.border.bottom.width = px(2),
  row.striping.include_table_body = TRUE,
  heading.border.bottom.color = "black",
  heading.border.bottom.width = px(2),
  table_body.hlines.color = "gray",
  table_body.border.bottom.color = "black"
)
# Evaluating the object at top level prints (renders) the table.
tabla_1
| Tabla Nro. 1 |
| Distribución de frecuencia de la concentración de NOx, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi |
Ni_ascendente |
Ni_descendente |
Hi_ascendente |
Hi_descendente |
| [0 - 29.23) |
14.61 |
15554 |
61.37 |
15554 |
25346 |
61.37 |
100 |
| [29.23 - 58.45) |
43.84 |
6269 |
24.73 |
21823 |
9792 |
86.1 |
38.63 |
| [58.45 - 87.68) |
73.07 |
1961 |
7.74 |
23784 |
3523 |
93.84 |
13.9 |
| [87.68 - 116.91) |
102.29 |
830 |
3.27 |
24614 |
1562 |
97.11 |
6.16 |
| [116.91 - 146.13) |
131.52 |
401 |
1.58 |
25015 |
732 |
98.69 |
2.89 |
| [146.13 - 175.36) |
160.75 |
191 |
0.75 |
25206 |
331 |
99.45 |
1.31 |
| [175.36 - 204.59) |
189.97 |
76 |
0.30 |
25282 |
140 |
99.75 |
0.55 |
| [204.59 - 233.82) |
219.2 |
35 |
0.14 |
25317 |
64 |
99.89 |
0.25 |
| [233.82 - 263.04) |
248.43 |
18 |
0.07 |
25335 |
29 |
99.96 |
0.11 |
| [263.04 - 292.27) |
277.66 |
5 |
0.02 |
25340 |
11 |
99.98 |
0.04 |
| [292.27 - 321.5) |
306.88 |
2 |
0.01 |
25342 |
6 |
99.98 |
0.02 |
| [321.5 - 350.72) |
336.11 |
0 |
0.00 |
25342 |
4 |
99.98 |
0.02 |
| [350.72 - 379.95) |
365.34 |
2 |
0.01 |
25344 |
4 |
99.99 |
0.02 |
| [379.95 - 409.18) |
394.56 |
1 |
0.00 |
25345 |
2 |
100 |
0.01 |
| [409.18 - 438.4) |
423.79 |
0 |
0.00 |
25345 |
1 |
100 |
0 |
| [438.4 - 467.63] |
453.02 |
1 |
0.00 |
25346 |
1 |
100 |
0 |
| Totales |
- |
25346 |
100.00 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE NOx
# ============================================
library(gt)
library(dplyr)
# Rebuild the NOx vector from `datos`, keeping only usable readings: entries
# equal to "-" are dropped, and so are NA entries. The is.na() test is needed
# because `NA != "-"` is NA and indexing with NA keeps an NA element, which
# would make min()/max() return NA.
NOx <- datos$NOx[!is.na(datos$NOx) & datos$NOx != "-"]
NOx <- as.numeric(NOx)
# Sample size, extremes and range of the cleaned vector.
n <- length(NOx)
min_NOx <- min(NOx)
max_NOx <- max(NOx)
R <- max_NOx - min_NOx
# Simplified table: 12 classes of equal width A.
k <- 12
A <- R / k
# Lower limits of the simplified classes: from the minimum in steps of A.
Lis <- seq(min_NOx, max_NOx - A, by = A)
# Upper limits: the same grid shifted by one width, closed with the maximum.
Lss_internos <- seq(min_NOx + A, max_NOx - A, by = A)
Lss <- c(Lss_internos, max_NOx)
# Class marks (interval midpoints).
MCs <- (Lss + Lis) / 2
# Round data and limits to the same 3 decimals so the edge comparisons are
# made on equally-rounded values.
NOx <- round(NOx, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)
# Absolute frequency of each class. Classes are [Lis, Lss) except the last,
# which is closed on the right so the maximum is counted. seq_along()
# replaces 1:length(Lis), and vapply() guarantees one numeric count per class.
ni <- vapply(
  seq_along(Lis),
  function(i) {
    desde_li <- NOx >= Lis[i]
    if (i < length(Lis)) {
      sum(desde_li & NOx < Lss[i])
    } else {
      sum(desde_li & NOx <= Lss[i])
    }
  },
  numeric(1)
)
# RELATIVE FREQUENCIES
# Percentages are kept at full precision and rounded only when shown in the
# table. Rounding first and then forcing the last class to
# "100 - sum(other classes)" dumps the accumulated rounding error on the last
# class, which can turn its percentage (and the descending accumulations)
# negative.
hi <- (ni / sum(ni)) * 100
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# TABLE CREATION
# All classes are labelled right-open except the last one, labelled closed
# because its count includes the upper limit.
cierre <- c(rep(")", length(Lis) - 1), "]")
# check.names = FALSE keeps the column names exactly as written; without it
# data.frame() rewrites names such as "hi(%)" into "hi...".
TDF_NOx <- data.frame(
  Intervalo = paste0("[", Lis, " - ", Lss, cierre),
  MC = round(MCs, 3),
  ni = ni,
  `hi(%)` = round(hi, 2),
  Ni_asc = Ni_asc,
  `Hi_asc (%)` = round(Hi_asc, 2),
  Ni_desc = Ni_desc,
  `Hi_desc (%)` = round(Hi_desc, 2),
  check.names = FALSE
)
# Totals row; columns where a total is not shown carry "-". It must use the
# same column names as the table for rbind() to match them.
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  `hi(%)` = round(sum(hi), 2),
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-",
  check.names = FALSE
)
# Name of the relative-frequency column (the one starting with "hi").
col_hi <- grep("^hi", colnames(TDF_NOx), value = TRUE)
# MC and the hi column are already numeric here, so no conversion is needed
# before appending the totals row.
TDF_NOx <- rbind(TDF_NOx, totales)
# FORMATTED TABLE
# Helper returning a solid 2px black border for the requested cell side.
borde <- function(lado) {
  cell_borders(sides = lado, color = "black", weight = px(2), style = "solid")
}
# Render the simplified table with gt: header, source note, side borders on
# body cells and column labels, then the general table options.
TDF_NOx %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de NOx, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  tab_style(style = borde("left"), locations = cells_body()) %>%
  tab_style(style = borde("right"), locations = cells_body()) %>%
  tab_style(style = borde("left"), locations = cells_column_labels()) %>%
  tab_style(style = borde("right"), locations = cells_column_labels()) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
| Tabla Nro. 2 |
| Distribución de frecuencia simplificada de la concentración de NOx, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi... |
Ni_asc |
Hi_asc.... |
Ni_desc |
Hi_desc.... |
| [0 - 38.969) |
19.485 |
18733 |
73.91 |
18733 |
73.91 |
25346 |
100 |
| [38.969 - 77.938) |
58.454 |
4597 |
18.14 |
23330 |
92.05 |
6613 |
26.09 |
| [77.938 - 116.908) |
97.423 |
1284 |
5.07 |
24614 |
97.12 |
2016 |
7.95 |
| [116.908 - 155.877) |
136.392 |
488 |
1.93 |
25102 |
99.05 |
732 |
2.88 |
| [155.877 - 194.846) |
175.361 |
156 |
0.62 |
25258 |
99.67 |
244 |
0.95 |
| [194.846 - 233.815) |
214.33 |
59 |
0.23 |
25317 |
99.9 |
88 |
0.33 |
| [233.815 - 272.784) |
253.3 |
20 |
0.08 |
25337 |
99.98 |
29 |
0.1 |
| [272.784 - 311.753) |
292.269 |
5 |
0.02 |
25342 |
100 |
9 |
0.02 |
| [311.753 - 350.722) |
331.238 |
0 |
0.00 |
25342 |
100 |
4 |
0 |
| [350.722 - 389.692) |
370.207 |
3 |
0.01 |
25345 |
100.01 |
4 |
0 |
| [389.692 - 428.661) |
409.176 |
0 |
0.00 |
25345 |
100.01 |
1 |
-0.01 |
| [428.661 - 467.63) |
448.145 |
1 |
-0.01 |
25346 |
100 |
1 |
-0.01 |
| Totales |
- |
25346 |
100.00 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ========= BUILD THE HISTOGRAM OBJECT TO OBTAIN THE BREAKS ==========
# Computed without plotting; its breaks label the x-axis and its counts set
# the y-axis limit of the plot below.
Histograma_NOx <- hist(NOx, breaks = 11, plot = FALSE)
# ========= PLOT — HISTOGRAM OF NOx =========
# The y-axis limit comes from this histogram's own bin counts. hist() chooses
# its own bins (a single number for `breaks` is only a suggestion), so they
# are not the classes of the frequency table; using max(ni) from the table
# could cut off the tallest bar.
hist(NOx, breaks = 11,
  main = "Gráfica N°1: Distribución de la Concentración de NOx
presente en el estudio sobre calidad del aire en India entre 2015-2020",
  xlab = "NOx (µg/m3)",
  ylab = "Cantidad",
  ylim = c(0, max(Histograma_NOx$counts)),
  col = "#FFC300", # warm yellow
  cex.main = 0.9,
  cex.lab = 1,
  cex.axis = 0.9,
  xaxt = "n") # default x-axis suppressed; drawn below at the breaks
axis(1, at = Histograma_NOx$breaks,
  labels = Histograma_NOx$breaks,
  las = 1,
  cex.axis = 0.9)

# Histogram of NOx on a global scale: the y-axis spans from 0 to the total
# number of observations. The default x-axis is suppressed and redrawn below
# with ticks at the breaks of the precomputed histogram object.
hist(
  NOx,
  breaks = 11,
  col = "#FF5733",
  xaxt = "n",
  ylim = c(0, length(NOx)),
  xlab = "NOx (µg/m3)",
  ylab = "Cantidad",
  main = "Gráfica N°2: Distribución de la Concentración de NOx
presente en el estudio sobre calidad del aire en India entre 2015-2020",
  cex.main = 1,
  cex.lab = 1,
  cex.axis = 0.9
)
# X-axis ticks and labels placed at the histogram breaks.
cortes_hist <- Histograma_NOx$breaks
axis(side = 1, at = cortes_hist, labels = cortes_hist, cex.axis = 0.9, las = 1)

# --- Number of table rows, not counting the totals row ---
n <- nrow(TDF_NOx) - 1
# --- Class marks of the data rows, as numbers ---
MC_num <- as.numeric(head(TDF_NOx$MC, n))
# --- Locate the relative-frequency column by the "hi" pattern in its name ---
col_hi <- grep("hi", names(TDF_NOx), value = TRUE)
hi_num <- as.numeric(head(TDF_NOx[[col_hi]], n))
# --- Gráfica 3: bars of relative frequency on a fixed 0-100 scale ---
# Bar labels are left empty here; the x-axis is drawn afterwards.
bp <- barplot(
  hi_num,
  names.arg = character(n),
  space = 0,
  ylim = c(0, 100),
  las = 1,
  col = "#33C3FF",
  xlab = "NOx (µg/m3)",
  ylab = "Porcentaje (%)",
  main = "Gráfica N°3: Distribución de la concentración de NOx\nEstudio calidad del aire en India, 2015-2020"
)
# --- X-axis: class marks rounded to whole numbers, placed at the bar centres ---
axis(1, at = bp, labels = round(MC_num, 0), cex.axis = 0.8, las = 2)

# --- Number of table rows, not counting the totals row ---
n <- nrow(TDF_NOx) - 1
# --- Locate the relative-frequency column by the "hi" pattern in its name ---
col_hi <- grep("hi", names(TDF_NOx), value = TRUE)
# --- Numeric class marks and percentages of the data rows ---
hi_num <- as.numeric(head(TDF_NOx[[col_hi]], n))
MC_num <- as.numeric(head(TDF_NOx$MC, n))
# --- Gráfica 4: same bars with the y-axis scale left to barplot() ---
# Bar labels are left empty here; the x-axis is drawn afterwards.
bp4 <- barplot(
  hi_num,
  names.arg = character(n),
  space = 0,
  las = 1,
  col = "#33FF99",
  xlab = "NOx (µg/m3)",
  ylab = "Porcentaje (%)",
  main = "Gráfica N°4: Distribución de la concentración de NOx\nEstudio calidad del aire en India, 2015-2020"
)
# --- X-axis: class marks rounded to whole numbers, placed at the bar centres ---
axis(1, at = bp4, labels = round(MC_num, 0), cex.axis = 0.8, las = 2)

# --- Gráfica 5: horizontal boxplot of NOx ---
# The value returned by boxplot() is stored in CajaNOx.
CajaNOx <- boxplot(
  NOx,
  main = "Gráfica No. 5: Distribución de la concentración de NOx\nEstudio calidad del aire en India 2015-2020",
  xlab = "NOx (µg/m3)",
  border = "black",
  col = "#FFB347",
  horizontal = TRUE
)

# --- Requires Lis, Lss, ni and hi (class-level vectors, no totals row) ---
# Number of classes.
k <- length(Lis)
# Recompute the accumulated frequencies: ascending first, then descending
# (accumulated from the last class backwards).
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# --- Gráfica 6: ogive of counts for NOx ---
# Ascending curve plotted against the upper class limits.
plot(
  x = Lss,
  y = Ni_asc,
  type = "b",
  pch = 19,
  col = "#33C3FF",
  ylim = c(0, max(Ni_asc)),
  xlab = "NOx (µg/m3)",
  ylab = "Cantidad",
  main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - NOx"
)
# Descending curve added against the lower class limits.
lines(x = Lis, y = Ni_desc, pch = 19, col = "#FF5733", type = "b")

# --- Gráfica 7: ogive of percentages for NOx ---
# Ascending curve (blue) plotted against the upper class limits; the y-axis
# reaches the largest accumulated percentage of either curve.
plot(
  x = Lss,
  y = Hi_asc,
  type = "b",
  pch = 19,
  col = "blue",
  ylim = c(0, max(Hi_asc, Hi_desc)),
  xlab = "NOx (µg/m3)",
  ylab = "Porcentaje (%)",
  main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - NOx"
)
# Descending curve added against the lower class limits.
lines(x = Lis, y = Hi_desc, pch = 19, col = "#FF5733", type = "b")
