# FECHA: 21/11/2025
# Estadística Descriptiva
# 09/12/2025
# Ariana Viteri
# ======= CARGA DE PAQUETES =======
library(gt)
library(dplyr)
# ======= LOAD DATA =======
# Read city_day.csv into the data frame `datos`: comma-separated fields,
# column names taken from the first row, "." as the decimal mark.
datos <- read.csv(
  file = "~/ariana tercer semestre/Estadistica/city_day.csv",
  dec = ".",
  sep = ",",
  header = TRUE
)
# ======= CLEAN THE NO VARIABLE =======
# Keep only usable NO readings. Entries equal to "-" are discarded (presumably
# the file's missing-value marker -- confirm against the CSV), and so are NA
# entries: without the explicit is.na() guard, `NA != "-"` evaluates to NA and
# those rows would be carried into the subset as NA, turning min()/max()
# further down into NA.
NO <- datos$NO[!is.na(datos$NO) & datos$NO != "-"]
NO <- as.numeric(NO)
# Drop anything that could not be converted to a number (e.g. empty strings).
NO <- NO[!is.na(NO)]
# Number of valid observations.
length(NO)
## [1] 25949
# ======= MIN, MAX, RANGE =======
min_NO <- min(NO)
max_NO <- max(NO)
R <- max_NO - min_NO
# ======= USE k = 16 CLASSES =======
k <- 16
A <- R / k # class width
# ======= BUILD THE CLASS INTERVALS =======
# All class limits are taken from ONE vector of k + 1 equally spaced
# breakpoints, so the upper limit of a class is exactly the same number as the
# lower limit of the next class. Building Li and Ls with two independent
# seq(by = A) calls lets the shared limit differ by floating-point noise; once
# both are rounded to 3 decimals they can land on different values
# (e.g. 341.847 vs 341.848), which leaves a gap where an observation would be
# counted in no class.
limites <- seq(from = min_NO, to = max_NO, length.out = k + 1)
Li <- limites[-length(limites)] # lower class limits
Ls <- limites[-1] # upper class limits (the last one is max_NO)
MC <- (Li + Ls) / 2 # class marks (midpoints)
# ======= FREQUENCY COUNTS =======
# Data and class limits are compared at 3-decimal precision.
NO <- round(NO, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)
# Absolute frequency of every class. Classes are half-open, [Li, Ls), except
# the last one, which is closed, [Li, Ls], so that the maximum is counted.
ni <- vapply(
  seq_along(Li),
  function(clase) {
    desde_li <- NO >= Li[clase]
    if (clase < length(Li)) {
      sum(desde_li & NO < Ls[clase])
    } else {
      sum(desde_li & NO <= Ls[clase])
    }
  },
  numeric(1)
)
# ======= COMPLEMENTARY CALCULATIONS =======
N <- sum(ni)
# Relative frequencies in percent, rounded to 3 decimals and then corrected so
# that they add up to exactly 100.000.
hi_raw <- 100 * (ni / N)
hi <- round(hi_raw, 3)
# Rounding residue (small; may be positive or negative).
diff_hi <- round(100 - sum(hi), 3)
# A non-zero residue is absorbed by the class with the largest hi, where it
# distorts the value the least.
if (diff_hi != 0) {
  idx_max <- which.max(hi)
  hi[idx_max] <- round(hi[idx_max] + diff_hi, 3)
}
# Cumulative frequencies: ascending (accumulated from the first class) and
# descending (accumulated from the last class).
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# ======= INTERVAL LABELS =======
# "[Li - Ls)" for every class; the last class is closed, so it ends in "]".
Intervalo <- paste0(
  "[", round(Li, 3), " - ", round(Ls, 3),
  ifelse(seq_along(Li) < length(Li), ")", "]")
)
# ======= FINAL TABLE =======
# One row per class: label, class mark, absolute and relative frequency, and
# the four cumulative columns.
TDF_NO <- data.frame(
  Intervalo = Intervalo,
  MC = round(MC, 3),
  ni = ni,
  hi = hi,
  Ni_ascendente = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente = round(Hi_asc, 3),
  Hi_descendente = round(Hi_desc, 3)
)
# ======= TOTALS ROW =======
# Only ni and hi have a meaningful total; every other column shows "-".
# Binding this row turns those columns into character columns.
sin_total <- "-"
totales <- data.frame(
  Intervalo = "Totales",
  MC = sin_total,
  ni = sum(ni),
  hi = sum(hi),
  Ni_ascendente = sin_total,
  Ni_descendente = sin_total,
  Hi_ascendente = sin_total,
  Hi_descendente = sin_total
)
TDF_NO <- rbind(TDF_NO, totales)
# ======= FORMATTED TABLE =======
# Render TDF_NO with gt: title and subtitle, source note, 2px solid black
# left/right borders on body cells and column labels, and black table/heading
# rules with striped body rows.
gt(TDF_NO) %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    subtitle = md("**Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  tab_style(
    # Same border on both vertical sides of body cells and column labels.
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black",
    row.striping.include_table_body = TRUE
  )
| Tabla Nro. 1 |
| Distribución de frecuencia de la concentración de NO, estudio de calidad del aire en India |
| Intervalo |
MC |
ni |
hi |
Ni_ascendente |
Ni_descendente |
Hi_ascendente |
Hi_descendente |
| [0.02 - 24.436) |
12.228 |
20815 |
80.213 |
20815 |
25949 |
80.213 |
100 |
| [24.436 - 48.853) |
36.644 |
3209 |
12.367 |
24024 |
5134 |
92.58 |
19.787 |
| [48.853 - 73.269) |
61.061 |
1025 |
3.950 |
25049 |
1925 |
96.53 |
7.42 |
| [73.269 - 97.685) |
85.477 |
471 |
1.815 |
25520 |
900 |
98.345 |
3.47 |
| [97.685 - 122.101) |
109.893 |
215 |
0.829 |
25735 |
429 |
99.174 |
1.655 |
| [122.101 - 146.518) |
134.309 |
119 |
0.459 |
25854 |
214 |
99.633 |
0.826 |
| [146.518 - 170.934) |
158.726 |
49 |
0.189 |
25903 |
95 |
99.822 |
0.367 |
| [170.934 - 195.35) |
183.142 |
17 |
0.066 |
25920 |
46 |
99.888 |
0.178 |
| [195.35 - 219.766) |
207.558 |
12 |
0.046 |
25932 |
29 |
99.934 |
0.112 |
| [219.766 - 244.183) |
231.974 |
5 |
0.019 |
25937 |
17 |
99.953 |
0.066 |
| [244.183 - 268.599) |
256.391 |
3 |
0.012 |
25940 |
12 |
99.965 |
0.047 |
| [268.599 - 293.015) |
280.807 |
5 |
0.019 |
25945 |
9 |
99.984 |
0.035 |
| [293.015 - 317.431) |
305.223 |
1 |
0.004 |
25946 |
4 |
99.988 |
0.016 |
| [317.431 - 341.847) |
329.639 |
0 |
0.000 |
25946 |
3 |
99.988 |
0.012 |
| [341.848 - 366.264) |
354.056 |
1 |
0.004 |
25947 |
3 |
99.992 |
0.012 |
| [366.264 - 390.68] |
378.472 |
2 |
0.008 |
25949 |
2 |
100 |
0.008 |
| Totales |
- |
25949 |
100.000 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ============================================
# PROCESO DE SIMPLIFICACIÓN — VARIABLE NO
library(gt)
library(dplyr)
# Rebuild NO from the raw column. Entries equal to "-" are discarded
# (presumably the file's missing-value marker -- confirm against the CSV), and
# so are NA entries: `NA != "-"` is NA, so without the is.na() guard those rows
# would stay in the subset and make min()/max() return NA.
NO <- datos$NO[!is.na(datos$NO) & datos$NO != "-"]
NO <- as.numeric(NO)
# Drop anything that could not be converted to a number (e.g. empty strings).
NO <- NO[!is.na(NO)]
n <- length(NO)
min_NO <- min(NO)
max_NO <- max(NO)
R <- max_NO - min_NO
# 12 intervals
k <- 12
A <- R / k # class width
# Lower and upper class limits come from ONE vector of k + 1 equally spaced
# breakpoints, so the upper limit of a class is exactly the lower limit of the
# next one. Two independent seq(by = A) calls can differ by floating-point
# noise and, after rounding to 3 decimals, leave a gap between classes.
limites_s <- seq(from = min_NO, to = max_NO, length.out = k + 1)
Lis <- limites_s[-length(limites_s)] # lower class limits
Lss <- limites_s[-1] # upper class limits (the last one is max_NO)
MCs <- (Lis + Lss) / 2 # class marks (midpoints)
# Data and class limits are compared at 3-decimal precision.
NO <- round(NO, 3)
Lis <- round(Lis, 3)
Lss <- round(Lss, 3)
# Absolute frequency per class: [Lis, Lss) for every class except the last,
# which also includes its upper limit.
ni <- vapply(
  seq_along(Lis),
  function(j) {
    bajo_tope <- if (j < length(Lis)) NO < Lss[j] else NO <= Lss[j]
    sum(NO >= Lis[j] & bajo_tope)
  },
  numeric(1)
)
# -------- ADJUST hi SO THAT hi AND Hi ADD UP TO 100 --------
# Relative frequencies in percent, rounded to 3 decimals.
hi <- round((ni / sum(ni)) * 100, 3)
# Rounding residue needed for the column to total exactly 100.
ajuste_hi <- round(100 - sum(hi), 3)
# The residue goes to the class with the largest hi, the same rule used for
# the 16-class table. Adding it to the last class instead changes that class
# the most in relative terms whenever the last class is a small one.
clase_mayor <- which.max(hi)
hi[clase_mayor] <- round(hi[clase_mayor] + ajuste_hi, 3)
# ------- CUMULATIVE FREQUENCIES -------
# Rounding removes floating-point noise from the running sums, so the last
# ascending value and the first descending value are exactly 100 without
# having to overwrite them by hand.
Hi_asc <- round(cumsum(hi), 3)
Hi_desc <- round(rev(cumsum(rev(hi))), 3)
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
# -------- BUILD THE TABLE --------
# Interval labels: "[Lis - Lss)" for every class except the last one, which is
# labelled "[Lis - Lss]" because the frequency count includes its upper limit
# (NO <= Lss) and the 16-class table labels its last class the same way.
TDF_NO <- data.frame(
  Intervalo = paste0(
    "[", Lis, " - ", Lss,
    ifelse(seq_along(Lis) < length(Lis), ")", "]")
  ),
  MC = round(MCs, 3),
  ni = ni,
  hi = hi,
  Ni_ascendente = Ni_asc,
  Hi_ascendente = Hi_asc,
  Ni_descendente = Ni_desc,
  Hi_descendente = Hi_desc
)
# Display names for the columns (set afterwards because data.frame() would
# otherwise rewrite names containing spaces or parentheses).
colnames(TDF_NO) <- c(
  "Intervalo",
  "MC",
  "ni",
  "hi(%)",
  "Ni_asc",
  "Hi_asc (%)",
  "Ni_desc",
  "Hi_desc (%)"
)
# -------- TOTALS ROW --------
# Only ni and hi carry a total; the remaining columns show "-". The row gets
# the same column names as the table so rbind() can match the columns.
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = 100,
  Ni_asc = "-",
  `Hi_asc (%)` = "-",
  Ni_desc = "-",
  `Hi_desc (%)` = "-"
)
colnames(totales) <- colnames(TDF_NO)
TDF_NO <- rbind(TDF_NO, totales)
# -------- GT TABLE --------
# Render the simplified table with gt: title and subtitle, source note, 2px
# solid black left/right borders on body cells and column labels, and black
# table/heading rules with striped body rows.
gt(TDF_NO) %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    subtitle = md("**Distribución de frecuencia simplificada de la concentración de NO**")
  ) %>%
  tab_source_note(
    source_note = md("Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  tab_style(
    # Same border on both vertical sides of body cells and column labels.
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2),
      style = "solid"
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.color = "black",
    table.border.bottom.style = "solid",
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black",
    row.striping.include_table_body = TRUE
  )
| Tabla Nro. 2 |
| Distribución de frecuencia simplificada de la concentración de NO |
| Intervalo |
MC |
ni |
hi(%) |
Ni_asc |
Hi_asc (%) |
Ni_desc |
Hi_desc (%) |
| [0.02 - 32.575) |
16.298 |
22463 |
86.566 |
22463 |
86.566 |
25949 |
100 |
| [32.575 - 65.13) |
48.852 |
2330 |
8.979 |
24793 |
95.545 |
3486 |
13.434 |
| [65.13 - 97.685) |
81.408 |
727 |
2.802 |
25520 |
98.347 |
1156 |
4.455 |
| [97.685 - 130.24) |
113.963 |
261 |
1.006 |
25781 |
99.353 |
429 |
1.653 |
| [130.24 - 162.795) |
146.518 |
113 |
0.435 |
25894 |
99.788 |
168 |
0.647 |
| [162.795 - 195.35) |
179.073 |
26 |
0.100 |
25920 |
99.888 |
55 |
0.212 |
| [195.35 - 227.905) |
211.628 |
16 |
0.062 |
25936 |
99.95 |
29 |
0.112 |
| [227.905 - 260.46) |
244.183 |
2 |
0.008 |
25938 |
99.958 |
13 |
0.05 |
| [260.46 - 293.015) |
276.737 |
7 |
0.027 |
25945 |
99.985 |
11 |
0.042 |
| [293.015 - 325.57) |
309.293 |
1 |
0.004 |
25946 |
99.989 |
4 |
0.015 |
| [325.57 - 358.125) |
341.847 |
1 |
0.004 |
25947 |
99.993 |
3 |
0.011 |
| [358.125 - 390.68) |
374.403 |
2 |
0.007 |
25949 |
100 |
2 |
0.007 |
| Totales |
- |
25949 |
100.000 |
- |
- |
- |
- |
| Fuente: Datos obtenidos y procesados por medio de https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
# ========= PLOT — HISTOGRAM OF NO =========
# Compute the histogram once without drawing it, to obtain the break points
# and bin counts that hist() actually uses.
Histograma_NO <- hist(NO, breaks = 11, plot = FALSE)
# Local-scale histogram. The y axis is sized from the histogram's own counts:
# `breaks = 11` is only a suggestion to hist(), so its bins need not match the
# classes of the frequency table, and using max(ni) from the table as the
# upper limit can clip the tallest bar.
hist(NO, breaks = 11,
main = "Gráfica N°1: Distribución de la Concentración de NO
presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
xlab = " NO (µg/m3)",
ylab = "Cantidad",
ylim = c(0, max(Histograma_NO$counts)),
col = "lightskyblue",
cex.main = 0.9,
cex.lab = 1,
cex.axis = 0.9,
xaxt = "n")
# The default x axis is suppressed above; draw one with a tick at every break.
axis(1, at = Histograma_NO$breaks,
labels = Histograma_NO$breaks,
las = 1,
cex.axis = 0.9)

# Global-scale histogram: same bins, but the y axis runs up to the total
# number of observations.
hist(
  NO,
  breaks = 11,
  col = "lightskyblue",
  main = "Gráfica N°2: Distribución de la Concentración de NO\npresente en el estudio sobre calidad del aire en India entre 2015-2020",
  xlab = "NO (µg/m3)",
  ylab = "Cantidad",
  ylim = c(0, length(NO)),
  xaxt = "n",
  cex.main = 1,
  cex.lab = 1,
  cex.axis = 0.9
)
# The default x axis is suppressed above; draw one with a tick at every break
# of the precomputed histogram.
axis(
  side = 1,
  at = Histograma_NO$breaks,
  labels = Histograma_NO$breaks,
  las = 1,
  cex.axis = 0.9
)

# Work on the class rows only. The "Totales" row holds "-" in MC, so calling
# as.numeric() on the whole column would produce an NA with a coercion
# warning. Using a copy also leaves the display table TDF_NO untouched and
# does not overwrite `n`, which holds the number of observations.
clases_g3 <- TDF_NO[TDF_NO[["Intervalo"]] != "Totales", ]
MC_g3 <- as.numeric(clases_g3[["MC"]])
hi_g3 <- as.numeric(clases_g3[["hi(%)"]])
# --- PLOT ---
# Bar chart of hi (%) on a fixed 0-100 scale; bars touch each other
# (space = 0) and the default bar names are blanked out.
bp <- barplot(
  hi_g3,
  space = 0,
  col = "skyblue",
  main = "Gráfica N°3: Distribución de la Concentración de NO,\nestudio calidad del aire en India, 2015-2020",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  names.arg = rep("", length(hi_g3)),
  ylim = c(0, 100),
  las = 1
)
# --- X AXIS WITH ROUNDED CLASS MARKS ---
# barplot() returns the bar midpoints; label each bar with its class mark
# rounded to an integer.
MC_enteros <- round(MC_g3, 0)
axis(
  side = 1,
  at = bp,
  labels = MC_enteros
)

# Work on the class rows only. The "Totales" row holds "-" in MC, so calling
# as.numeric() on the whole column would produce an NA with a coercion
# warning. Using a copy also leaves the display table TDF_NO untouched and
# does not overwrite `n`, which holds the number of observations.
clases_g4 <- TDF_NO[TDF_NO[["Intervalo"]] != "Totales", ]
MC_g4 <- as.numeric(clases_g4[["MC"]])
hi_g4 <- as.numeric(clases_g4[["hi(%)"]])
# --- PLOT ---
# Bar chart of hi (%) with the y axis left to barplot()'s default scaling;
# bars touch each other (space = 0) and the default bar names are blanked out.
bp <- barplot(
  hi_g4,
  space = 0,
  main = "Gráfica N°4: Distribución de la concentración de NO\nen el estudio calidad del aire en India, 2015-2020",
  ylab = "Porcentaje (%)",
  xlab = "NO (µg/m3)",
  names.arg = rep("", length(hi_g4)),
  col = "skyblue",
  las = 1
)
# --- X AXIS WITH ROUNDED CLASS MARKS ---
# barplot() returns the bar midpoints; label each bar with its class mark
# rounded to an integer.
MC_enteros <- round(MC_g4, 0)
axis(
  side = 1,
  at = bp,
  labels = MC_enteros
)

# Horizontal box plot of the NO values; the object returned by boxplot() is
# kept in CajaNO.
CajaNO <- boxplot(
  NO,
  main = "Gráfica No. 5: Distribución de la concentración de NO,\nestudio calidad del aire en India desde 2015-2020",
  xlab = "NO (µg/m3)",
  border = "black",
  col = "turquoise",
  horizontal = TRUE
)

# Recompute the cumulative frequencies (one value per class).
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
k <- length(Lis) # number of classes
# 1) Ogive (counts). The ascending curve is drawn at the upper class limits
# (Lss) and the descending curve at the lower class limits (Lis).
# xlim must span both sets of limits: by default plot() sizes the x axis from
# Lss alone, which leaves the first descending point (at Lis[1], the minimum)
# outside the plotting region.
plot(Lss, Ni_asc,
  type = "b",
  main = "Gráfica N°6: Ojiva ascendente y descendente (Cantidad) - NO",
  xlab = "NO (µg/m3)",
  ylab = "Cantidad",
  pch = 19,
  col = "turquoise",
  xlim = range(Lis, Lss),
  ylim = c(0, max(Ni_asc))
)
lines(Lis, Ni_desc, type = "b", col = "red", pch = 19)

# 2) Ogive (percentage). The ascending curve is drawn at the upper class
# limits (Lss) and the descending curve at the lower class limits (Lis).
# xlim must span both sets of limits: by default plot() sizes the x axis from
# Lss alone, which leaves the first descending point (at Lis[1]) outside the
# plotting region.
plot(Lss, Hi_asc,
  type = "b",
  main = "Gráfica N°7: Ojiva ascendente y descendente (Porcentaje) - NO",
  xlab = "NO (µg/m3)",
  ylab = "Porcentaje (%)",
  pch = 19,
  col = "blue",
  xlim = range(Lis, Lss),
  ylim = c(0, max(Hi_asc, Hi_desc))
)
lines(Lis, Hi_desc, type = "b", col = "red", pch = 19)
