FECHA: 21/11/2025
# Estadística Descriptiva
# 09/12/2025
# Ariana Viteri
# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)
# ======= LOAD DATA =======
# Read city_day.csv into `datos`: comma-separated fields, "." as the decimal
# mark, and the first row used as column names.
datos <- read.csv(
  file = "~/ariana tercer semestre/Estadistica/city_day.csv",
  sep = ",",
  dec = ".",
  header = TRUE
)
# ======= CLEAN THE NO VARIABLE =======
# Missing readings may arrive as "-", as empty strings, or as NA. Comparing
# with `!= "-"` alone keeps NA entries (the comparison itself yields NA), which
# would then poison min(), max() and every count further down. Mark all
# placeholders as NA first and drop every non-numeric entry in one step.
NO <- datos$NO
NO[NO %in% c("-", "")] <- NA
NO <- as.numeric(NO)
NO <- NO[!is.na(NO)]
length(NO) # actual sample size after cleaning
## [1] 25949
# ======= NUMBER OF CLASSES (fixed at k = 16) =======
k <- 16
# ======= EXTREMES AND RANGE OF THE DATA =======
max_NO <- max(NO)
min_NO <- min(NO)
R <- max_NO - min_NO # range
# Class width: the range split evenly across the k classes
A <- R / k
# ======= CLASS INTERVALS =======
# All k + 1 class boundaries come from ONE sequence, so the upper limit of a
# class is exactly the lower limit of the next one. Building Li and Ls from two
# separate seq() calls and rounding each on its own can leave a gap between
# neighbouring classes (e.g. Ls = 341.847 followed by Li = 341.848), and a
# value falling in that gap would be counted in no class at all.
limites <- seq(from = min_NO, to = max_NO, length.out = k + 1)
MC <- (limites[-length(limites)] + limites[-1]) / 2 # class mark (midpoint)
# ======= FREQUENCY COUNTS =======
# Data and limits are compared at 3 decimals.
NO <- round(NO, 3)
limites <- round(limites, 3)
Li <- limites[-length(limites)] # lower limits
Ls <- limites[-1]               # upper limits
# Classes are [Li, Ls) except the last one, which is closed ([Li, Ls]) so the
# maximum is counted. findInterval() assigns each value its class index and
# tabulate() counts how many values landed in each of the classes.
ni <- as.numeric(tabulate(
  findInterval(NO, limites, rightmost.closed = TRUE),
  nbins = length(Li)
))
# ======= DERIVED FREQUENCIES =======
N <- sum(ni)         # total number of counted observations
hi <- (ni / N) * 100 # relative frequency of each class, in percent
# Running totals from the first class upwards ...
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
# ... and from the last class downwards
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# ======= INTERVAL LABELS =======
# Every class is written half-open "[a - b)"; only the last one is closed
# "[a - b]".
Intervalo <- paste0(
  "[", round(Li, 2), " - ", round(Ls, 2),
  c(rep(")", length(Li) - 1), "]")
)
# ======= FREQUENCY TABLE =======
# One row per class; values are rounded to 2 decimals for display only.
TDF_NO <- data.frame(
  Intervalo      = Intervalo,
  MC             = round(MC, 2),
  ni             = ni,
  hi             = round(hi, 2),
  Ni_ascendente  = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente  = round(Hi_asc, 2),
  Hi_descendente = round(Hi_desc, 2)
)
# ======= TOTALS ROW =======
# Only ni and hi have a meaningful total; the other columns get a "-"
# placeholder, which turns those columns into character once the row is bound.
totales <- data.frame(
  Intervalo      = "Totales",
  MC             = "-",
  ni             = sum(ni),
  hi             = sum(hi),
  Ni_ascendente  = "-",
  Ni_descendente = "-",
  Hi_ascendente  = "-",
  Hi_descendente = "-"
)
TDF_NO <- rbind(TDF_NO, totales)
# ======= QUICK CHECK =======
# Sanity checks on the class limits and counts. Lines starting with "##" are
# the console output recorded when the script was run.
# Both limit vectors should hold one entry per class (k = 16).
length(Li)
## [1] 16
length(Ls)
## [1] 16
# The largest upper limit should coincide with the data maximum.
max(NO)
## [1] 390.68
max(Ls)
## [1] 390.68
range(NO)
## [1] 0.02 390.68
summary(NO)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02 5.63 9.89 17.57 19.95 390.68
# Limits and absolute frequency of each class, side by side.
# NOTE(review): in the recorded output below, row 14 ends at 341.847 while
# row 15 starts at 341.848, i.e. the rounded limits of neighbouring classes do
# not meet there.
cbind(Li, Ls, ni)
## Li Ls ni
## [1,] 0.020 24.436 20815
## [2,] 24.436 48.853 3209
## [3,] 48.853 73.269 1025
## [4,] 73.269 97.685 471
## [5,] 97.685 122.101 215
## [6,] 122.101 146.518 119
## [7,] 146.518 170.934 49
## [8,] 170.934 195.350 17
## [9,] 195.350 219.766 12
## [10,] 219.766 244.183 5
## [11,] 244.183 268.599 3
## [12,] 268.599 293.015 5
## [13,] 293.015 317.431 1
## [14,] 317.431 341.847 0
## [15,] 341.848 366.264 1
## [16,] 366.264 390.680 2
# ======= FORMATTED TABLE WITH gt() =======
# Renders TDF_NO with a title, a source note and black cell borders.
# The subtitle names India: the data source cited below and every plot title
# in this script refer to the India air-quality study, not China.
TDF_NO %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    subtitle = md("**Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Left and right borders for both the body cells and the column labels,
  # set in one call instead of four near-identical ones.
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
| Tabla Nro. 1 |
| Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi |
Ni_ascendente |
Ni_descendente |
Hi_ascendente |
Hi_descendente |
| [0.02 - 24.44) |
12.23 |
20815 |
80.22 |
20815 |
25949 |
80.22 |
100 |
| [24.44 - 48.85) |
36.64 |
3209 |
12.37 |
24024 |
5134 |
92.58 |
19.78 |
| [48.85 - 73.27) |
61.06 |
1025 |
3.95 |
25049 |
1925 |
96.53 |
7.42 |
| [73.27 - 97.68) |
85.48 |
471 |
1.82 |
25520 |
900 |
98.35 |
3.47 |
| [97.68 - 122.1) |
109.89 |
215 |
0.83 |
25735 |
429 |
99.18 |
1.65 |
| [122.1 - 146.52) |
134.31 |
119 |
0.46 |
25854 |
214 |
99.63 |
0.82 |
| [146.52 - 170.93) |
158.73 |
49 |
0.19 |
25903 |
95 |
99.82 |
0.37 |
| [170.93 - 195.35) |
183.14 |
17 |
0.07 |
25920 |
46 |
99.89 |
0.18 |
| [195.35 - 219.77) |
207.56 |
12 |
0.05 |
25932 |
29 |
99.93 |
0.11 |
| [219.77 - 244.18) |
231.97 |
5 |
0.02 |
25937 |
17 |
99.95 |
0.07 |
| [244.18 - 268.6) |
256.39 |
3 |
0.01 |
25940 |
12 |
99.97 |
0.05 |
| [268.6 - 293.02) |
280.81 |
5 |
0.02 |
25945 |
9 |
99.98 |
0.03 |
| [293.02 - 317.43) |
305.22 |
1 |
0.00 |
25946 |
4 |
99.99 |
0.02 |
| [317.43 - 341.85) |
329.64 |
0 |
0.00 |
25946 |
3 |
99.99 |
0.01 |
| [341.85 - 366.26) |
354.06 |
1 |
0.00 |
25947 |
3 |
99.99 |
0.01 |
| [366.26 - 390.68] |
378.47 |
2 |
0.01 |
25949 |
2 |
100 |
0.01 |
| Totales |
- |
25949 |
100.00 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE NO
library(gt)
library(dplyr)
# ========= CLEAN THE NO VARIABLE ==========
# Start again from the raw column. Placeholders ("-" or empty strings) are
# marked as NA and every non-numeric entry is dropped; filtering with
# `!= "-"` alone would keep NA entries and break min(), max() and the counts.
NO <- datos$NO
NO[NO %in% c("-", "")] <- NA
NO <- as.numeric(NO)
NO <- NO[!is.na(NO)]
# ========= NUMBER OF OBSERVATIONS ==========
n <- length(NO)
# ========= NUMBER OF CLASSES (12) ==========
k <- 12
# ========= EXTREMES AND RANGE ==========
max_NO <- max(NO)
min_NO <- min(NO)
R <- max_NO - min_NO
# Class width: the range split evenly across the k classes
A <- R / k
# ========= CLASS LIMITS ==========
# All k + 1 boundaries come from ONE sequence, so the upper limit of a class
# is exactly the lower limit of the next. Generating the lower and upper
# limits with two separate seq() calls and rounding them independently can
# leave a small gap between neighbouring classes, and a value falling in that
# gap would be counted in no class.
limites <- seq(from = min_NO, to = max_NO, length.out = k + 1)
# ========= CLASS MARK ==========
MCs <- (limites[-length(limites)] + limites[-1]) / 2
# ========= FREQUENCIES ==========
# Data and limits are compared at 3 decimals.
NO <- round(NO, 3)
limites <- round(limites, 3)
Lis <- limites[-length(limites)] # lower limits
Lss <- limites[-1]               # upper limits
# Classes are [Lis, Lss) except the last one, which is closed so the maximum
# is counted. findInterval() gives each value its class index and tabulate()
# counts the values per class.
ni <- as.numeric(tabulate(
  findInterval(NO, limites, rightmost.closed = TRUE),
  nbins = length(Lis)
))
# ========= RELATIVE FREQUENCIES (%) ==========
# Kept at full precision. Rounding here, before accumulating, makes the
# rounding errors add up and the cumulative/total percentage read 100.01
# instead of 100; rounding is applied only when the table is built.
hi <- (ni / sum(ni)) * 100
# ========= CUMULATIVE FREQUENCIES ==========
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# ========= BUILD THE TABLE ==========
# Classes are half-open "[a - b)" except the last one, which is counted with
# `<=` on its upper limit and is therefore labelled closed "[a - b]".
TDF_NO <- data.frame(
  Intervalo = paste0(
    "[", Lis, " - ", Lss,
    c(rep(")", length(Lis) - 1), "]")
  ),
  MC = round(MCs, 3),
  ni = ni,
  hi = round(hi, 2),
  Ni_ascendente = Ni_asc,
  Hi_ascendente = round(Hi_asc, 2),
  Ni_descendente = Ni_desc,
  Hi_descendente = round(Hi_desc, 2)
)
colnames(TDF_NO) <- c(
  "Intervalo",
  "MC",
  "ni",
  "hi(%)",
  "Ni_asc",
  "Hi_asc (%)",
  "Ni_desc",
  "Hi_desc (%)"
)
# ========= TOTALS ROW ==========
# Only ni and hi have a meaningful total; the remaining columns get "-".
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = round(sum(hi), 2),
  Ni_asc = "-",
  Hi_asc = "-",
  Ni_desc = "-",
  Hi_desc = "-"
)
# Give the totals row the display column names so rbind() can match them
colnames(totales) <- colnames(TDF_NO)
# Append at the bottom
TDF_NO <- rbind(TDF_NO, totales)
# ========= gt TABLE ==========
# Renders the simplified TDF_NO with a title, a source note and black borders.
TDF_NO %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de NO**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  # Left and right borders on body cells and on column labels, in one call
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black",
    row.striping.include_table_body = TRUE
  )
| Tabla Nro. 2 |
| Distribución de frecuencia simplificada de la concentración de NO |
| Intervalo |
MC |
ni |
hi(%) |
Ni_asc |
Hi_asc (%) |
Ni_desc |
Hi_desc (%) |
| [0.02 - 32.575) |
16.298 |
22463 |
86.57 |
22463 |
86.57 |
25949 |
100.01 |
| [32.575 - 65.13) |
48.852 |
2330 |
8.98 |
24793 |
95.55 |
3486 |
13.44 |
| [65.13 - 97.685) |
81.408 |
727 |
2.80 |
25520 |
98.35 |
1156 |
4.46 |
| [97.685 - 130.24) |
113.963 |
261 |
1.01 |
25781 |
99.36 |
429 |
1.66 |
| [130.24 - 162.795) |
146.518 |
113 |
0.44 |
25894 |
99.8 |
168 |
0.65 |
| [162.795 - 195.35) |
179.073 |
26 |
0.10 |
25920 |
99.9 |
55 |
0.21 |
| [195.35 - 227.905) |
211.628 |
16 |
0.06 |
25936 |
99.96 |
29 |
0.11 |
| [227.905 - 260.46) |
244.183 |
2 |
0.01 |
25938 |
99.97 |
13 |
0.05 |
| [260.46 - 293.015) |
276.737 |
7 |
0.03 |
25945 |
100 |
11 |
0.04 |
| [293.015 - 325.57) |
309.293 |
1 |
0.00 |
25946 |
100 |
4 |
0.01 |
| [325.57 - 358.125) |
341.847 |
1 |
0.00 |
25947 |
100 |
3 |
0.01 |
| [358.125 - 390.68) |
374.403 |
2 |
0.01 |
25949 |
100.01 |
2 |
0.01 |
| Totales |
- |
25949 |
100.01 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ========= PLOT — HISTOGRAM OF NO =========
# Compute the histogram once without drawing it, to reuse its break points
# (x-axis ticks) and its counts (y-axis limit).
Histograma_NO <- hist(NO, breaks = 11, plot = FALSE)
# Graph 1: local scale. The y limit comes from the histogram's own counts:
# with `breaks = 11` hist() chooses its own break points, so its tallest bar
# does not have to match max(ni) from the frequency table and could otherwise
# be cut off at the top of the plot.
hist(NO, breaks = 11,
  main = "Gráfica N°1: Distribución de la Concentración de NO
presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
  xlab = " NO (µg/m3)",
  ylab = "Cantidad",
  ylim = c(0, max(Histograma_NO$counts)),
  col = "lightskyblue",
  cex.main = 0.9,
  cex.lab = 1,
  cex.axis = 0.9,
  xaxt = "n") # default x axis suppressed; drawn below at the break points
axis(1, at = Histograma_NO$breaks,
  labels = Histograma_NO$breaks,
  las = 1,
  cex.axis = 0.9)

# Graph 2: the same histogram on a global scale, with the y axis running from
# 0 to the total number of observations.
hist(
  NO,
  breaks = 11,
  col = "lightskyblue",
  main = "Gráfica N°2: Distribución de la Concentración de NO
presente en el estudio sobre calidad del aire en India entre 2015-2020",
  xlab = "NO (µg/m3)",
  ylab = "Cantidad",
  ylim = c(0, length(NO)),
  xaxt = "n", # default x axis suppressed; drawn below at the break points
  cex.main = 1,
  cex.lab = 1,
  cex.axis = 0.9
)
axis(
  side = 1,
  at = Histograma_NO$breaks,
  labels = Histograma_NO$breaks,
  cex.axis = 0.9,
  las = 1
)

# Graph 3: relative frequency (%) per class on a global 0-100 scale.
# Make sure the existing `hi(%)` column is numeric. Assigning to `hi (%)`
# (with a space) would silently add a new, unused column to TDF_NO instead.
TDF_NO$`hi(%)` <- as.numeric(TDF_NO$`hi(%)`)
# The last row of TDF_NO is the totals row, so it is left out of the bars.
barplot(
  TDF_NO$`hi(%)`[seq_len(nrow(TDF_NO) - 1)],
  space = 0,
  col = "skyblue",
  main = "Gráfica N°3: Distribución de la Concentración de NO,
estudio calidad del aire en India, 2015-2020",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = TDF_NO$MC[seq_len(nrow(TDF_NO) - 1)],
  ylim = c(0, 100)
)

# Graph 4: relative frequency (%) per class on a local scale (y axis chosen
# by barplot() itself).
# NOTE(review): from here on `n` holds the number of table rows, replacing the
# sample size stored in `n` earlier in the script.
n <- as.numeric(nrow(TDF_NO))
# The last row is the totals row; seq_len() keeps the index empty (instead of
# counting backwards like 1:0) if there were no class rows.
barplot(
  TDF_NO$`hi(%)`[seq_len(n - 1)],
  space = 0,
  main = "Gráfica N°4: Distribución de la concentración de NO
en el estudio calidad del aire en India, 2015-2020",
  ylab = "Porcentaje (%)",
  xlab = "NO (µg/m3)",
  names.arg = TDF_NO$MC[seq_len(n - 1)],
  col = "skyblue"
)

# Graph 5: horizontal box plot of NO. The statistics returned by boxplot()
# are kept in CajaNO.
CajaNO <- boxplot(
  NO,
  main = "Gráfica No. 5: Distribución de la concentración de NO,
estudio calidad del aire en India desde 2015-2020",
  xlab = "NO (µg/m3)",
  horizontal = TRUE,
  border = "black",
  col = "turquoise"
)

# Recompute the cumulative series (one value per class)
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
# Cumulative percentages are derived from the cumulative counts rather than by
# accumulating `hi`: when `hi` has already been rounded, its running sum drifts
# and ends at 100.01 instead of 100.
Hi_asc <- Ni_asc / sum(ni) * 100
Hi_desc <- Ni_desc / sum(ni) * 100
k <- length(Lis) # number of classes
# 1) Ogive (counts). The ascending curve is drawn against the upper class
# limits (Lss) and the descending curve against the lower limits (Lis).
# xlim spans both sets of limits: left to the default, the axis would be fitted
# to Lss only and the first descending point, at Lis[1], would fall outside
# the plotting region.
plot(Lss, Ni_asc,
  type = "b",
  main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - NO",
  xlab = "NO (µg/m3)",
  ylab = "Cantidad",
  pch = 19,
  col = "turquoise",
  xlim = range(Lis, Lss),
  ylim = c(0, max(Ni_asc)))
lines(Lis, Ni_desc, type = "b", col = "red", pch = 19)
# Identify the two curves
legend("right",
  legend = c("Ascendente", "Descendente"),
  col = c("turquoise", "red"),
  pch = 19,
  lty = 1,
  bty = "n")

# 2) Ogive (percentage). Ascending curve against the upper class limits (Lss),
# descending curve against the lower limits (Lis).
# xlim spans both sets of limits so the first descending point, at Lis[1], is
# not left outside the plotting region.
plot(Lss, Hi_asc,
  type = "b",
  main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - NO",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  pch = 19,
  col = "blue",
  xlim = range(Lis, Lss),
  ylim = c(0, max(Hi_asc, Hi_desc)))
lines(Lis, Hi_desc, type = "b", col = "red", pch = 19)
# Identify the two curves
legend("right",
  legend = c("Ascendente", "Descendente"),
  col = c("blue", "red"),
  pch = 19,
  lty = 1,
  bty = "n")
