FECHA: 21/11/2025
# Estadística Descriptiva
# 09/12/2025
# Ariana Viteri
# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)
# ======= CARGAR DATOS =======
datos <- read.csv(
"~/ariana tercer semestre/Estadistica/city_day.csv",
header = TRUE,
sep = ",",
dec = "."
)
# ======= LIMPIEZA DE LA VARIABLE PM10 =======
PM10 <- datos$PM10[datos$PM10 != "-"]
PM10 <- as.numeric(PM10)
length(PM10)
## [1] 18391
# ======= MIN, MAX, RANGO =======
min_PM10 <- min(PM10)
max_PM10 <- max(PM10)
R <- max_PM10 - min_PM10
# ======= USAMOS k = 16 =======
k <- 16
A <- R / k
# ======= GENERACIÓN DE INTERVALOS =======
Li <- seq(from = min_PM10, to = max_PM10 - A, by = A)
Ls <- c(seq(from = min_PM10 + A, to = max_PM10 - A, by = A), max_PM10)
MC <- (Li + Ls) / 2
# ======= CÁLCULO DE FRECUENCIAS =======
PM10 <- round(PM10, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)
ni <- numeric(length(Li))
for (i in 1:length(Li)) {
if (i < length(Li)) {
ni[i] <- sum(PM10 >= Li[i] & PM10 < Ls[i])
} else {
ni[i] <- sum(PM10 >= Li[i] & PM10 <= Ls[i])
}
}
# ======= CÁLCULOS COMPLEMENTARIOS =======
N <- sum(ni)
hi <- (ni / N) * 100
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
# ======= FORMATO DE INTERVALOS =======
Intervalo <- paste0("[", round(Li,2), " - ", round(Ls,2), ")")
Intervalo[length(Intervalo)] <- paste0("[", round(Li[length(Li)],2), " - ",
round(Ls[length(Ls)],2), "]")
# ======= TABLA FINAL =======
TDF_PM10 <- data.frame(
Intervalo = Intervalo,
MC = round(MC, 2),
ni = ni,
hi = round(hi, 2),
Ni_ascendente = Ni_asc,
Ni_descendente = Ni_desc,
Hi_ascendente = round(Hi_asc, 2),
Hi_descendente = round(Hi_desc, 2)
)
# ======= AGREGAR FILA DE TOTALES =======
totales <- data.frame(
Intervalo = "Totales",
MC = "-",
ni = sum(ni),
hi = sum(hi),
Ni_ascendente = "-",
Ni_descendente = "-",
Hi_ascendente = "-",
Hi_descendente = "-"
)
TDF_PM10 <- rbind(TDF_PM10, totales)
# ======= TABLA BONITA CON gt() =======
TDF_PM10 %>%
gt() %>%
tab_header(
title = md("*Tabla Nro. 1*"),
subtitle = md("**Distribución de frecuencia de la concentración de PM10, estudio de calidad del aire en India**")
) %>%
tab_source_note(
source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
) %>%
tab_style(
style = cell_borders(sides = "left", color = "black", weight = px(2), style = "solid"),
locations = cells_body()
) %>%
tab_style(
style = cell_borders(sides = "right", color = "black", weight = px(2), style = "solid"),
locations = cells_body()
) %>%
tab_style(
style = cell_borders(sides = "left", color = "black", weight = px(2), style = "solid"),
locations = cells_column_labels()
) %>%
tab_style(
style = cell_borders(sides = "right", color = "black", weight = px(2), style = "solid"),
locations = cells_column_labels()
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
table.border.top.style = "solid",
table.border.bottom.style = "solid",
column_labels.border.top.color = "black",
column_labels.border.bottom.color = "black",
column_labels.border.bottom.width = px(2),
row.striping.include_table_body = TRUE,
heading.border.bottom.color = "black",
heading.border.bottom.width = px(2),
table_body.hlines.color = "gray",
table_body.border.bottom.color = "black"
)
| Tabla Nro. 1 |
| Distribución de frecuencia de la concentración de PM10, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi |
Ni_ascendente |
Ni_descendente |
Hi_ascendente |
Hi_descendente |
| [0.01 - 62.51) |
31.26 |
5395 |
29.34 |
5395 |
18391 |
29.34 |
100 |
| [62.51 - 125.01) |
93.76 |
6760 |
36.76 |
12155 |
12996 |
66.09 |
70.66 |
| [125.01 - 187.51) |
156.26 |
3335 |
18.13 |
15490 |
6236 |
84.23 |
33.91 |
| [187.51 - 250.01) |
218.76 |
1346 |
7.32 |
16836 |
2901 |
91.54 |
15.77 |
| [250.01 - 312.51) |
281.26 |
700 |
3.81 |
17536 |
1555 |
95.35 |
8.46 |
| [312.51 - 375.01) |
343.76 |
425 |
2.31 |
17961 |
855 |
97.66 |
4.65 |
| [375.01 - 437.51) |
406.26 |
228 |
1.24 |
18189 |
430 |
98.9 |
2.34 |
| [437.51 - 500) |
468.76 |
112 |
0.61 |
18301 |
202 |
99.51 |
1.1 |
| [500 - 562.5) |
531.25 |
49 |
0.27 |
18350 |
90 |
99.78 |
0.49 |
| [562.5 - 625) |
593.75 |
18 |
0.10 |
18368 |
41 |
99.87 |
0.22 |
| [625 - 687.5) |
656.25 |
8 |
0.04 |
18376 |
23 |
99.92 |
0.13 |
| [687.5 - 750) |
718.75 |
5 |
0.03 |
18381 |
15 |
99.95 |
0.08 |
| [750 - 812.5) |
781.25 |
5 |
0.03 |
18386 |
10 |
99.97 |
0.05 |
| [812.5 - 875) |
843.75 |
1 |
0.01 |
18387 |
5 |
99.98 |
0.03 |
| [875 - 937.5) |
906.25 |
1 |
0.01 |
18388 |
4 |
99.98 |
0.02 |
| [937.5 - 1000] |
968.75 |
3 |
0.02 |
18391 |
3 |
100 |
0.02 |
| Totales |
- |
18391 |
100.00 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE PM10
# ============================================
library(gt)
library(dplyr)
PM10 <- datos$PM10[datos$PM10 != "-"]
PM10 <- as.numeric(PM10)
n <- length(PM10)
min_PM10 <- min(PM10)
max_PM10 <- max(PM10)
R <- max_PM10 - min_PM10
k <- 12
A <- R / k
Lis <- seq(from = min_PM10, to = max_PM10 - A, by = A)
Lss <- c(seq(from = min_PM10 + A, to = max_PM10 - A, by = A), max_PM10)
MCs <- (Lis + Lss) / 2
PM10 <- round(PM10, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)
ni <- numeric(length(Lis))
for (i in 1:length(Lis)) {
if (i < length(Lis)) {
ni[i] <- sum(PM10 >= Lis[i] & PM10 < Lss[i])
} else {
ni[i] <- sum(PM10 >= Lis[i] & PM10 <= Lss[i])
}
}
# FRECUENCIAS RELATIVAS
hi <- round((ni / sum(ni)) * 100, 2)
hi[length(hi)] <- 100 - sum(hi[-length(hi)])
Ni_asc <- cumsum(ni)
Hi_asc <- round(cumsum(hi), 2)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- round(rev(cumsum(rev(hi))), 2)
# CREACIÓN DE TABLA
TDF_PM10 <- data.frame(
Intervalo = paste0("[", Lis, " - ", Lss, ")"),
MC = round(MCs, 3),
ni = ni,
`hi(%)` = hi,
Ni_asc = Ni_asc,
`Hi_asc (%)` = Hi_asc,
Ni_desc = Ni_desc,
`Hi_desc (%)` = Hi_desc
)
totales <- data.frame(
Intervalo = "Totales",
MC = "-",
ni = sum(ni),
`hi(%)` = sum(hi),
Ni_asc = "-",
`Hi_asc (%)` = "-",
Ni_desc = "-",
`Hi_desc (%)` = "-"
)
# Detectar nombre REAL de la columna hi
col_hi <- grep("^hi", colnames(TDF_PM10), value = TRUE)
# Convertir a numérico donde corresponda
TDF_PM10$MC <- suppressWarnings(as.numeric(TDF_PM10$MC))
TDF_PM10[[col_hi]] <- suppressWarnings(as.numeric(TDF_PM10[[col_hi]]))
TDF_PM10 <- rbind(TDF_PM10, totales)
#tabla
TDF_PM10 %>%
gt() %>%
tab_header(
title = md("*Tabla Nro. 2*"),
subtitle = md("**Distribución de frecuencia simplificada de la concentración de PM10, estudio de calidad del aire en India**")
) %>%
tab_source_note(
source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
) %>%
tab_style(
style = cell_borders(sides = "left", color = "black", weight = px(2), style = "solid"),
locations = cells_body()
) %>%
tab_style(
style = cell_borders(sides = "right", color = "black", weight = px(2), style = "solid"),
locations = cells_body()
) %>%
tab_style(
style = cell_borders(sides = "left", color = "black", weight = px(2), style = "solid"),
locations = cells_column_labels()
) %>%
tab_style(
style = cell_borders(sides = "right", color = "black", weight = px(2), style = "solid"),
locations = cells_column_labels()
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
table.border.top.style = "solid",
table.border.bottom.style = "solid",
column_labels.border.top.color = "black",
column_labels.border.bottom.color = "black",
column_labels.border.bottom.width = px(2),
row.striping.include_table_body = TRUE,
heading.border.bottom.color = "black",
heading.border.bottom.width = px(2),
table_body.hlines.color = "gray",
table_body.border.bottom.color = "black"
)
| Tabla Nro. 2 |
| Distribución de frecuencia simplificada de la concentración de PM10, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi... |
Ni_asc |
Hi_asc.... |
Ni_desc |
Hi_desc.... |
| [0.01 - 83.342) |
41.676 |
7848 |
42.67 |
7848 |
42.67 |
18391 |
100 |
| [83.342 - 166.675) |
125.009 |
6850 |
37.25 |
14698 |
79.92 |
10543 |
57.33 |
| [166.675 - 250.007) |
208.341 |
2138 |
11.63 |
16836 |
91.55 |
3693 |
20.08 |
| [250.007 - 333.34) |
291.674 |
864 |
4.70 |
17700 |
96.25 |
1555 |
8.45 |
| [333.34 - 416.672) |
375.006 |
434 |
2.36 |
18134 |
98.61 |
691 |
3.75 |
| [416.672 - 500.005) |
458.339 |
167 |
0.91 |
18301 |
99.52 |
257 |
1.39 |
| [500.005 - 583.338) |
541.671 |
53 |
0.29 |
18354 |
99.81 |
90 |
0.48 |
| [583.338 - 666.67) |
625.004 |
21 |
0.11 |
18375 |
99.92 |
37 |
0.19 |
| [666.67 - 750.002) |
708.336 |
6 |
0.03 |
18381 |
99.95 |
16 |
0.08 |
| [750.002 - 833.335) |
791.669 |
5 |
0.03 |
18386 |
99.98 |
10 |
0.05 |
| [833.335 - 916.667) |
875.001 |
1 |
0.01 |
18387 |
99.99 |
5 |
0.02 |
| [916.667 - 1000) |
958.334 |
4 |
0.01 |
18391 |
100 |
4 |
0.01 |
| Totales |
- |
18391 |
100.00 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ========= CREAR HISTOGRAMA PARA OBTENER LOS BREAKS ==========
Histograma_PM10 <- hist(PM10, breaks = 11, plot = FALSE)
# ========= GRÁFICA — HISTOGRAMA DE PM10 =========
hist(PM10, breaks = 11,
main = "Gráfica N°1: Distribución de la Concentración de PM10
presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
xlab = "PM10 (µg/m3)",
ylab = "Cantidad",
ylim = c(0, max(ni)),
col = "yellow",
cex.main = 0.9,
cex.lab = 1,
cex.axis = 0.9,
xaxt = "n")
axis(1, at = Histograma_PM10$breaks,
labels = Histograma_PM10$breaks,
las = 1,
cex.axis = 0.9)

hist(PM10, breaks = 11,
main = "Gráfica N°2: Distribución de la Concentración de PM10
presente en el estudio sobre calidad del aire en India entre 2015-2020",
xlab = "PM10 (µg/m3)",
ylab = "Cantidad",
ylim = c(0, length(PM10)),
col = "orange",
cex.main = 1,
cex.lab = 1,
cex.axis = 0.9,
xaxt = "n")
axis(1, at = Histograma_PM10$breaks,
labels = Histograma_PM10$breaks,
las = 1,
cex.axis = 0.9)

# --- Número de filas sin contar totales ---
n <- nrow(TDF_PM10) - 1
# --- Asegurar que MC sea numérico solo para filas de datos ---
MC_num <- as.numeric(TDF_PM10$MC[1:n])
# --- Detectar columna hi(%) automáticamente ---
col_hi <- grep("hi", names(TDF_PM10), value = TRUE)
hi_num <- as.numeric(TDF_PM10[[col_hi]][1:n])
# --- Crear la gráfica ---
bp <- barplot(
height = hi_num,
space = 0,
col = "blue",
main = "Gráfica N°3: Distribución de la concentración de PM10\nEstudio calidad del aire en India, 2015-2020",
xlab = "PM10 (µg/m3)",
ylab = "Porcentaje (%)",
names.arg = rep("", n), # temporal
ylim = c(0, 100),
las = 1
)
# --- Eje X con MC enteros ---
axis(
side = 1,
at = bp,
labels = round(MC_num, 0), # redondea a enteros
las = 2,
cex.axis = 0.8
)

# --- Número de filas sin contar totales ---
n <- nrow(TDF_PM10) - 1
# --- Asegurar que MC sea numérico solo para filas de datos ---
MC_num <- as.numeric(TDF_PM10$MC[1:n])
# --- Detectar columna hi(%) automáticamente ---
col_hi <- grep("hi", names(TDF_PM10), value = TRUE)
hi_num <- as.numeric(TDF_PM10[[col_hi]][1:n])
# --- Crear la Gráfica 4 ---
bp4 <- barplot(
height = hi_num,
space = 0,
col = "skyblue",
main = "Gráfica N°4: Distribución de la concentración de PM10\nEstudio calidad del aire en India, 2015-2020",
xlab = "PM10 (µg/m3)",
ylab = "Porcentaje (%)",
names.arg = rep("", n), # temporal
las = 1
)
# --- Eje X con MC enteros ---
axis(
side = 1,
at = bp4,
labels = round(MC_num, 0), # redondea a enteros
las = 2,
cex.axis = 0.8
)

# --- Gráfica 5: Boxplot de PM10 ---
CajaPM10 <- boxplot(
PM10,
horizontal = TRUE,
col = "turquoise",
border = "black",
main = "Gráfica No. 5: Distribución de la concentración de PM10\nEstudio calidad del aire en India 2015-2020",
xlab = "PM10 (µg/m3)"
)

# --- Asegurarse de tener Lis, Lss, ni, hi (sin fila de totales) ---
# Recalcular acumuladas
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
k <- length(Lis) # número de clases
# --- Gráfica 6: Ojiva (cantidad) para PM10 ---
plot(Lss, Ni_asc,
type = "b",
main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - PM10",
xlab = "PM10 (µg/m3)",
ylab = "Cantidad",
pch = 19,
col = "turquoise",
ylim = c(0, max(Ni_asc)))
lines(Lis, Ni_desc, type = "b", col = "red", pch = 19)

# --- Gráfica 7: Ojiva (porcentaje) para PM10 ---
plot(Lss, Hi_asc,
type = "b",
main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - PM10",
xlab = "PM10 (µg/m3)",
ylab = "Porcentaje (%)",
pch = 19,
col = "blue",
ylim = c(0, max(Hi_asc, Hi_desc)))
lines(Lis, Hi_desc, type = "b", col = "red", pch = 19)
