# FECHA: 21/11/2025
# Estadística Descriptiva
# 09/12/2025
# Ariana Viteri
# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)
# ======= LOAD DATA =======
# Read the CSV with the first row as header, "," as field separator and "."
# as decimal mark.
ruta_datos <- "~/ariana tercer semestre/Estadistica/city_day.csv"
datos <- read.csv(file = ruta_datos, header = TRUE, sep = ",", dec = ".")
# ======= CLEANING OF THE PM2.5 VARIABLE =======
# Keep only usable readings: drop NA, empty cells and the "-" placeholder
# before converting to numeric. A plain `x != "-"` filter returns NA for
# missing values, which would leak NA entries into PM25 and make min()/max()
# return NA.
pm25_crudo <- datos$PM2.5
pm25_valido <- !is.na(pm25_crudo) &
  !(trimws(as.character(pm25_crudo)) %in% c("-", ""))
PM25 <- as.numeric(as.character(pm25_crudo[pm25_valido]))
stopifnot(length(PM25) > 0)
length(PM25)
## [1] 24933
# ======= MIN, MAX, RANGE =======
min_PM25 <- min(PM25)
max_PM25 <- max(PM25)
R <- max_PM25 - min_PM25
# ======= WE USE k = 16 CLASSES =======
k <- 16
A <- R / k
# ======= CLASS INTERVALS =======
# Build the k + 1 class limits in one call with `length.out`, so the lower
# and upper limit vectors always have exactly k elements each. Stepping with
# `by = A` up to `max - A` can drop or add a class through floating-point
# accumulation.
limites <- seq(from = min_PM25, to = max_PM25, length.out = k + 1)
Li <- limites[-length(limites)]
Ls <- limites[-1]
MC <- (Li + Ls) / 2
# ======= FREQUENCY COUNTS =======
# Data and limits are rounded to 3 decimals so that values sitting on a class
# boundary compare consistently.
PM25 <- round(PM25, 3)
limites <- round(limites, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)
# Classes are [Li, Ls) except the last one, which is closed: [Li, Ls].
# findInterval() assigns each value its class index; tabulate() counts them.
clase <- findInterval(PM25, limites, rightmost.closed = TRUE)
ni <- as.numeric(tabulate(clase, nbins = k))
# ======= DERIVED FREQUENCIES =======
N <- sum(ni)
hi <- (ni / N) * 100            # relative frequency, in percent
Ni_asc <- cumsum(ni)            # ascending cumulative count
Ni_desc <- rev(cumsum(rev(ni))) # descending cumulative count
Hi_asc <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
# ======= INTERVAL LABELS =======
# Every label is half-open "[a - b)" except the last, which is closed "[a - b]".
cierre <- c(rep(")", k - 1), "]")
Intervalo <- paste0("[", round(Li, 2), " - ", round(Ls, 2), cierre)
# ======= FINAL TABLE WITH TOTALS ROW =======
# The totals row carries "-" in the columns where a total is not meaningful;
# binding it to the body turns those columns into character columns.
dos_decimales <- function(x) round(x, 2)
sin_total <- "-"
totales <- data.frame(
  Intervalo = "Totales",
  MC = sin_total,
  ni = sum(ni),
  hi = sum(hi),
  Ni_ascendente = sin_total,
  Ni_descendente = sin_total,
  Hi_ascendente = sin_total,
  Hi_descendente = sin_total
)
cuerpo_tabla <- data.frame(
  Intervalo = Intervalo,
  MC = dos_decimales(MC),
  ni = ni,
  hi = dos_decimales(hi),
  Ni_ascendente = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente = dos_decimales(Hi_asc),
  Hi_descendente = dos_decimales(Hi_desc)
)
TDF_PM25 <- rbind(cuerpo_tabla, totales)
# ======= FORMATTED TABLE WITH gt() =======
# Solid black 2px border on the requested side of a cell.
borde_negro <- function(lado) {
  cell_borders(sides = lado, color = "black", weight = px(2), style = "solid")
}
tabla_1 <- gt(TDF_PM25)
tabla_1 <- tab_header(
  tabla_1,
  title = md("*Tabla Nro. 1*"),
  subtitle = md("**Distribución de frecuencia de la concentración de PM2.5, estudio de calidad del aire en India**")
)
tabla_1 <- tab_source_note(
  tabla_1,
  source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
)
# Vertical borders on body cells first, then on the column labels.
tabla_1 <- tab_style(tabla_1, style = borde_negro("left"), locations = cells_body())
tabla_1 <- tab_style(tabla_1, style = borde_negro("right"), locations = cells_body())
tabla_1 <- tab_style(tabla_1, style = borde_negro("left"), locations = cells_column_labels())
tabla_1 <- tab_style(tabla_1, style = borde_negro("right"), locations = cells_column_labels())
tabla_1 <- tab_options(
  tabla_1,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  table.border.top.style = "solid",
  table.border.bottom.style = "solid",
  column_labels.border.top.color = "black",
  column_labels.border.bottom.color = "black",
  column_labels.border.bottom.width = px(2),
  row.striping.include_table_body = TRUE,
  heading.border.bottom.color = "black",
  heading.border.bottom.width = px(2),
  table_body.hlines.color = "gray",
  table_body.border.bottom.color = "black"
)
# Evaluate at top level so the table is rendered.
tabla_1
| Tabla Nro. 1 |
| Distribución de frecuencia de la concentración de PM2.5, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi |
Ni_ascendente |
Ni_descendente |
Hi_ascendente |
Hi_descendente |
| [0.04 - 59.41) |
29.73 |
15069 |
60.44 |
15069 |
24933 |
60.44 |
100 |
| [59.41 - 118.78) |
89.1 |
6572 |
26.36 |
21641 |
9864 |
86.8 |
39.56 |
| [118.78 - 178.16) |
148.47 |
1772 |
7.11 |
23413 |
3292 |
93.9 |
13.2 |
| [178.16 - 237.53) |
207.84 |
820 |
3.29 |
24233 |
1520 |
97.19 |
6.1 |
| [237.53 - 296.9) |
267.21 |
401 |
1.61 |
24634 |
700 |
98.8 |
2.81 |
| [296.9 - 356.27) |
326.59 |
138 |
0.55 |
24772 |
299 |
99.35 |
1.2 |
| [356.27 - 415.64) |
385.96 |
75 |
0.30 |
24847 |
161 |
99.66 |
0.65 |
| [415.64 - 475.02) |
445.33 |
39 |
0.16 |
24886 |
86 |
99.81 |
0.34 |
| [475.02 - 534.39) |
504.7 |
16 |
0.06 |
24902 |
47 |
99.88 |
0.19 |
| [534.39 - 593.76) |
564.07 |
11 |
0.04 |
24913 |
31 |
99.92 |
0.12 |
| [593.76 - 653.13) |
623.44 |
5 |
0.02 |
24918 |
20 |
99.94 |
0.08 |
| [653.13 - 712.5) |
682.82 |
2 |
0.01 |
24920 |
15 |
99.95 |
0.06 |
| [712.5 - 771.87) |
742.19 |
2 |
0.01 |
24922 |
13 |
99.96 |
0.05 |
| [771.87 - 831.25) |
801.56 |
2 |
0.01 |
24924 |
11 |
99.96 |
0.04 |
| [831.25 - 890.62) |
860.93 |
3 |
0.01 |
24927 |
9 |
99.98 |
0.04 |
| [890.62 - 949.99] |
920.3 |
6 |
0.02 |
24933 |
6 |
100 |
0.02 |
| Totales |
- |
24933 |
100.00 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE PM2.5
# ============================================
library(gt)
library(dplyr)
# Clean PM2.5: drop NA, empty cells and the "-" placeholder before converting
# to numeric. A plain `x != "-"` filter returns NA for missing values and
# would leak NA entries into PM25.
pm25_crudo <- datos$PM2.5
pm25_valido <- !is.na(pm25_crudo) &
  !(trimws(as.character(pm25_crudo)) %in% c("-", ""))
PM25 <- as.numeric(as.character(pm25_crudo[pm25_valido]))
stopifnot(length(PM25) > 0)
n <- length(PM25)
min_PM25 <- min(PM25)
max_PM25 <- max(PM25)
R <- max_PM25 - min_PM25
# Simplified table: 12 classes of equal width A.
k <- 12
A <- R / k
# Build the k + 1 class limits in one call so the lower and upper limit
# vectors always have exactly k elements each. Stepping with `by = A` can
# drop or add a class through floating-point accumulation.
limites <- seq(from = min_PM25, to = max_PM25, length.out = k + 1)
Lis <- limites[-length(limites)]
Lss <- limites[-1]
MCs <- (Lis + Lss) / 2
# Round data and limits to 3 decimals so boundary values compare consistently.
PM25 <- round(PM25, 3)
limites <- round(limites, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)
# Classes are [Lis, Lss) except the last one, which is closed: [Lis, Lss].
clase <- findInterval(PM25, limites, rightmost.closed = TRUE)
ni <- as.numeric(tabulate(clase, nbins = k))
# RELATIVE FREQUENCIES (percent, 2 decimals)
hi <- round((ni / sum(ni)) * 100, 2)
# Put the rounding residue into the last class so the column adds up to 100;
# round again to avoid floating-point noise such as 0.0300000000001.
hi[length(hi)] <- round(100 - sum(hi[-length(hi)]), 2)
Ni_asc <- cumsum(ni)
Hi_asc <- round(cumsum(hi), 2)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- round(rev(cumsum(rev(hi))), 2)
# TABLE CONSTRUCTION
# check.names = FALSE keeps headers such as "hi(%)" as written; by default
# data.frame() rewrites them into syntactic names like "hi...".
# The last interval is labelled as closed, matching how it is counted.
cierre <- c(rep(")", k - 1), "]")
TDF_PM25 <- data.frame(
  Intervalo = paste0("[", Lis, " - ", Lss, cierre),
  MC = round(MCs, 3),
  ni = ni,
  `hi(%)` = hi,
  Ni_asc = Ni_asc,
  `Hi_asc (%)` = Hi_asc,
  Ni_desc = Ni_desc,
  `Hi_desc (%)` = Hi_desc,
  check.names = FALSE
)
# Totals row: "-" where a total is not meaningful (this makes those columns
# character after rbind()).
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  `hi(%)` = sum(hi),
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-",
  check.names = FALSE
)
# Name of the relative-frequency column.
col_hi <- grep("^hi", colnames(TDF_PM25), value = TRUE)[1]
TDF_PM25 <- rbind(TDF_PM25, totales)
# Formatted table (Table 2) rendered with gt().
# Fixes the typo "simploificada" -> "simplificada" in the subtitle.
TDF_PM25 %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de PM2.5, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Vertical black borders on body cells.
  tab_style(
    style = cell_borders(sides = "left", color = "black", weight = px(2), style = "solid"),
    locations = cells_body()
  ) %>%
  tab_style(
    style = cell_borders(sides = "right", color = "black", weight = px(2), style = "solid"),
    locations = cells_body()
  ) %>%
  # Vertical black borders on the column labels.
  tab_style(
    style = cell_borders(sides = "left", color = "black", weight = px(2), style = "solid"),
    locations = cells_column_labels()
  ) %>%
  tab_style(
    style = cell_borders(sides = "right", color = "black", weight = px(2), style = "solid"),
    locations = cells_column_labels()
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
| Tabla Nro. 2 |
| Distribución de frecuencia simplificada de la concentración de PM2.5, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi... |
Ni_asc |
Hi_asc.... |
Ni_desc |
Hi_desc.... |
| [0.04 - 79.203) |
39.621 |
18527 |
74.31 |
18527 |
74.31 |
24933 |
100 |
| [79.203 - 158.365) |
118.784 |
4426 |
17.75 |
22953 |
92.06 |
6406 |
25.69 |
| [158.365 - 237.528) |
197.946 |
1280 |
5.13 |
24233 |
97.19 |
1980 |
7.94 |
| [237.528 - 316.69) |
277.109 |
464 |
1.86 |
24697 |
99.05 |
700 |
2.81 |
| [316.69 - 395.853) |
356.271 |
134 |
0.54 |
24831 |
99.59 |
236 |
0.95 |
| [395.853 - 475.015) |
435.434 |
55 |
0.22 |
24886 |
99.81 |
102 |
0.41 |
| [475.015 - 554.178) |
514.596 |
21 |
0.08 |
24907 |
99.89 |
47 |
0.19 |
| [554.178 - 633.34) |
593.759 |
8 |
0.03 |
24915 |
99.92 |
26 |
0.11 |
| [633.34 - 712.503) |
672.921 |
5 |
0.02 |
24920 |
99.94 |
18 |
0.08 |
| [712.503 - 791.665) |
752.084 |
2 |
0.01 |
24922 |
99.95 |
13 |
0.06 |
| [791.665 - 870.828) |
831.246 |
5 |
0.02 |
24927 |
99.97 |
11 |
0.05 |
| [870.828 - 949.99) |
910.409 |
6 |
0.03 |
24933 |
100 |
6 |
0.03 |
| Totales |
- |
24933 |
100.00 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ========= COMPUTE THE HISTOGRAM (NO PLOT) TO OBTAIN BREAKS AND COUNTS ==========
Histograma_PM25 <- hist(PM25, breaks = 11, plot = FALSE)

# Draw the PM2.5 histogram with a custom x axis labelled at the bin breaks.
#   titulo     plot title
#   limite_y   y-axis limits, c(min, max)
#   tam_titulo character expansion for the title (cex.main)
dibujar_histograma_PM25 <- function(titulo, limite_y, tam_titulo) {
  hist(PM25, breaks = 11,
       main = titulo,
       xlab = "PM2.5 (µg/m3)",
       ylab = "Cantidad",
       ylim = limite_y,
       col = "lightskyblue",
       cex.main = tam_titulo,
       cex.lab = 1,
       cex.axis = 0.9,
       xaxt = "n") # default x axis suppressed; drawn below at the breaks
  axis(1, at = Histograma_PM25$breaks,
       labels = Histograma_PM25$breaks,
       las = 1,
       cex.axis = 0.9)
  invisible(NULL)
}

# ========= GRAPH 1 — HISTOGRAM, LOCAL SCALE =========
# The y limit comes from the histogram's own bin counts. The frequency-table
# counts (`ni`) belong to different class limits, so using max(ni) could clip
# the tallest bar.
dibujar_histograma_PM25(
  titulo = "Gráfica N°1: Distribución de la Concentración de PM2.5\npresente en el estudio sobre calidad del aire en India entre 2015-2020 ",
  limite_y = c(0, max(Histograma_PM25$counts)),
  tam_titulo = 0.9
)

# ========= GRAPH 2 — HISTOGRAM, GLOBAL SCALE (0 .. number of observations) =========
dibujar_histograma_PM25(
  titulo = "Gráfica N°2: Distribución de la Concentración de PM2.5\npresente en el estudio sobre calidad del aire en India entre 2015-2020",
  limite_y = c(0, length(PM25)),
  tam_titulo = 1
)

# --- Number of data rows (the last row of TDF_PM25 is the totals row) ---
n <- nrow(TDF_PM25) - 1
filas_datos <- seq_len(n)
# --- Class marks as numbers, data rows only (the totals row holds "-") ---
MC_num <- as.numeric(TDF_PM25$MC[filas_datos])
# --- Locate the relative-frequency column ---
# Anchored pattern and first match only: an unanchored "hi" could match
# several columns and make `[[` fail.
col_hi <- grep("^hi", names(TDF_PM25), value = TRUE)[1]
stopifnot(!is.na(col_hi))
hi_num <- as.numeric(TDF_PM25[[col_hi]][filas_datos])

# Bar plot of the relative frequencies with the rounded class marks as x labels.
#   titulo    plot title
#   limite_y  y-axis limits; NULL lets barplot() choose them from the data
# Returns (invisibly) the bar midpoints given by barplot().
graficar_barras_PM25 <- function(titulo, limite_y = NULL) {
  posiciones <- barplot(
    height = hi_num,
    space = 0,
    col = "skyblue",
    main = titulo,
    xlab = "PM2.5 (µg/m3)",
    ylab = "Porcentaje (%)",
    names.arg = rep("", n), # blank labels; the real ones are added by axis()
    ylim = limite_y,
    las = 1
  )
  # X axis with class marks rounded to integers.
  axis(
    side = 1,
    at = posiciones,
    labels = round(MC_num, 0),
    las = 2,
    cex.axis = 0.8
  )
  invisible(posiciones)
}

# --- Graph 3: global scale (0-100 %) ---
bp <- graficar_barras_PM25(
  "Gráfica N°3: Distribución de la concentración de PM2.5\nEstudio calidad del aire en India, 2015-2020",
  limite_y = c(0, 100)
)

# --- Graph 4: local scale (y limits chosen by barplot) ---
bp4 <- graficar_barras_PM25(
  "Gráfica N°4: Distribución de la concentración de PM2.5\nEstudio calidad del aire en India, 2015-2020"
)

# --- Graph 5: horizontal boxplot of PM2.5 ---
# The object returned by boxplot() is kept in CajaPM25.
titulo_caja <- "Gráfica No. 5: Distribución de la concentración de PM2.5\nEstudio calidad del aire en India 2015-2020"
CajaPM25 <- boxplot(
  PM25,
  main = titulo_caja,
  xlab = "PM2.5 (µg/m3)",
  col = "turquoise",
  border = "black",
  horizontal = TRUE
)

# --- Requires Lis, Lss, ni, hi (data rows only, no totals row) ---
# Recompute the cumulative frequencies.
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
k <- length(Lis) # number of classes
# The ascending ogive is drawn at the upper limits and the descending one at
# the lower limits, so the x range must cover both. Otherwise the first point
# of the descending curve (at Lis[1]) falls outside the plot region.
rango_x <- range(c(Lis, Lss))
# --- Graph 6: ogives (counts) for PM2.5 ---
plot(Lss, Ni_asc,
     type = "b",
     main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - PM2.5",
     xlab = "PM2.5 (µg/m3)",
     ylab = "Cantidad",
     pch = 19,
     col = "turquoise",
     xlim = rango_x,
     ylim = c(0, max(Ni_asc)))
lines(Lis, Ni_desc, type = "b", col = "red", pch = 19)
legend("right",
       legend = c("Ascendente", "Descendente"),
       col = c("turquoise", "red"),
       pch = 19, lty = 1, bty = "n")

# --- Graph 7: ogives (percentage) for PM2.5 ---
plot(Lss, Hi_asc,
     type = "b",
     main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - PM2.5",
     xlab = "PM2.5 (µg/m3)",
     ylab = "Porcentaje (%)",
     pch = 19,
     col = "blue",
     xlim = rango_x,
     ylim = c(0, max(Hi_asc, Hi_desc)))
lines(Lis, Hi_desc, type = "b", col = "red", pch = 19)
legend("right",
       legend = c("Ascendente", "Descendente"),
       col = c("blue", "red"),
       pch = 19, lty = 1, bty = "n")
