#Variable Cuantitativa Continua
#PM2.5
#Autor: Ariana Viteri
#Fecha:31/05/2026
#Carga de Librerias
library(gt)
## Warning: package 'gt' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
# Load the data (semicolon-separated file, "." as decimal mark)
datos <- read.csv(
  "~/ariana tercer semestre/Estadistica/city_day.csv",
  header = TRUE, dec = ".", sep = ";"
)
# ==============================================================================
# Variable selection: PM2.5, where "-" marks a missing reading.
# `!=` returns NA for NA inputs, and logical indexing with NA injects NA
# elements into the result, so NA entries are excluded explicitly.
pm25 <- datos$PM2.5
pm25 <- pm25[!is.na(pm25) & pm25 != "-"]
# Conversion to numeric. Entries that are not valid numbers become NA (R emits
# a coercion warning); they are dropped so min()/max() below stay defined.
pm25 <- as.numeric(pm25)
pm25 <- pm25[!is.na(pm25)]
# ==============================================================================
# Parameters of the frequency distribution
min_pm25 <- min(pm25)
max_pm25 <- max(pm25)
n <- length(pm25) # Sample size
# Range of the observed values
R <- diff(c(min_pm25, max_pm25))
# Number of classes from Sturges' rule, rounded up to a whole class
k_detallado <- ceiling(3.322 * log10(n) + 1)
# Class width
A <- R / k_detallado
# Report the number of classes
cat(paste("Número de intervalos (k):", k_detallado, "\n"))
## Número de intervalos (k): 16
# Class limits.
# The limits are built by index (min + j * A) rather than with
# seq(..., by = A): with a floating-point step seq() can stop one element
# short or long, which would leave Li and Ls with different lengths.
Li <- min_pm25 + A * (0:(k_detallado - 1))
Ls <- min_pm25 + A * (1:k_detallado)
# Pin the last upper limit to the observed maximum so it is always counted
Ls[k_detallado] <- max_pm25
# Round data and limits to 3 decimals before comparing values against limits
pm25 <- round(pm25, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)
# Class marks (MC): midpoint of each class
MC <- (Li + Ls) / 2
# Absolute frequencies (ni).
# Every class is right-open [Li, Ls) except the last one, which is closed
# [Li, Ls] so that the maximum is included.
ni <- vapply(
  seq_along(Li),
  function(i) {
    bajo_techo <- if (i < length(Li)) pm25 < Ls[i] else pm25 <= Ls[i]
    sum(pm25 >= Li[i] & bajo_techo)
  },
  numeric(1)
)
# Relative frequencies (percent) and cumulative frequencies,
# ascending (top-down) and descending (bottom-up)
hi <- (ni / n) * 100
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
# Interval labels: right-open "[a - b)" for every class ...
Intervalo <- paste0("[", round(Li, 2), " - ", round(Ls, 2), ")")
# ... except the last one, which is closed "[a - b]"
Intervalo[length(Intervalo)] <- paste0("[", round(Li[length(Li)], 2), " - ",
                                       round(Ls[length(Ls)], 2), "]")
# Frequency distribution table (numeric columns, without the totals row)
TDF_pm25 <- data.frame(
  Intervalo = Intervalo,
  MC = round(MC, 2),
  ni = ni,
  hi = round(hi, 2),
  Ni_ascendente = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente = round(Hi_asc, 2),
  Hi_descendente = round(Hi_desc, 2)
)
# Totals row; "-" marks the columns for which a total is not meaningful
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = round(sum(hi), 2),
  Ni_ascendente = "-",
  Ni_descendente = "-",
  Hi_ascendente = "-",
  Hi_descendente = "-"
)
# The "-" placeholders force these columns to character when the rows are
# bound. Formatting them explicitly first keeps a fixed number of decimals
# (otherwise values print as "86.8" or "100") and avoids scientific notation
# for large counts.
TDF_pm25_completa <- rbind(
  transform(
    TDF_pm25,
    MC = sprintf("%.2f", MC),
    Ni_ascendente = sprintf("%.0f", Ni_ascendente),
    Ni_descendente = sprintf("%.0f", Ni_descendente),
    Hi_ascendente = sprintf("%.2f", Hi_ascendente),
    Hi_descendente = sprintf("%.2f", Hi_descendente)
  ),
  totales
)
# DETAILED FREQUENCY TABLE
# Renders the complete table (classes + totals row) with gt: header, source
# note, 2px black vertical borders on body and column-label cells, and black
# outer/heading borders with striped body rows.
gt(TDF_pm25_completa) %>%
  tab_header(
    title = "Tabla Nro. 1",
    subtitle = "Distribución de frecuencia de concentración de Materia Particulada (PM2.5), estudio calidad del aire en India entre 2015-2020"
  ) %>%
  tab_source_note(
    source_note = md("Grupo: 1 <br> Fuente: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india ")
  ) %>%
  # Left and right borders for both the body cells and the column labels
  tab_style(
    style = cell_borders(
      sides = c("left", "right"),
      color = "black",
      weight = px(2)
    ),
    locations = list(cells_body(), cells_column_labels())
  ) %>%
  tab_options(
    # Outer table borders
    table.border.top.style = "solid",
    table.border.top.color = "black",
    table.border.bottom.style = "solid",
    table.border.bottom.color = "black",
    # Heading and column-label borders
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    # Body appearance
    row.striping.include_table_body = TRUE,
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
| Tabla Nro. 1 | |||||||
| Distribución de frecuencia de concentración de Materia Particulada (PM2.5), estudio calidad del aire en India entre 2015-2020 | |||||||
| Intervalo | MC | ni | hi | Ni_ascendente | Ni_descendente | Hi_ascendente | Hi_descendente |
|---|---|---|---|---|---|---|---|
| [0.04 - 59.41) | 29.73 | 15069 | 60.44 | 15069 | 24933 | 60.44 | 100 |
| [59.41 - 118.78) | 89.1 | 6572 | 26.36 | 21641 | 9864 | 86.8 | 39.56 |
| [118.78 - 178.16) | 148.47 | 1772 | 7.11 | 23413 | 3292 | 93.9 | 13.2 |
| [178.16 - 237.53) | 207.84 | 820 | 3.29 | 24233 | 1520 | 97.19 | 6.1 |
| [237.53 - 296.9) | 267.21 | 401 | 1.61 | 24634 | 700 | 98.8 | 2.81 |
| [296.9 - 356.27) | 326.59 | 138 | 0.55 | 24772 | 299 | 99.35 | 1.2 |
| [356.27 - 415.64) | 385.96 | 75 | 0.30 | 24847 | 161 | 99.66 | 0.65 |
| [415.64 - 475.02) | 445.33 | 39 | 0.16 | 24886 | 86 | 99.81 | 0.34 |
| [475.02 - 534.39) | 504.7 | 16 | 0.06 | 24902 | 47 | 99.88 | 0.19 |
| [534.39 - 593.76) | 564.07 | 11 | 0.04 | 24913 | 31 | 99.92 | 0.12 |
| [593.76 - 653.13) | 623.44 | 5 | 0.02 | 24918 | 20 | 99.94 | 0.08 |
| [653.13 - 712.5) | 682.82 | 2 | 0.01 | 24920 | 15 | 99.95 | 0.06 |
| [712.5 - 771.87) | 742.19 | 2 | 0.01 | 24922 | 13 | 99.96 | 0.05 |
| [771.87 - 831.25) | 801.56 | 2 | 0.01 | 24924 | 11 | 99.96 | 0.04 |
| [831.25 - 890.62) | 860.93 | 3 | 0.01 | 24927 | 9 | 99.98 | 0.04 |
| [890.62 - 949.99] | 920.3 | 6 | 0.02 | 24933 | 6 | 100 | 0.02 |
| Totales | - | 24933 | 100.00 | - | - | - | - |
| Grupo: 1 Fuente: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
|||||||
# ==============================================================================
# Table 1 has a large number of classes, so Table 2 condenses the
# distribution into only 10 classes.
# ==============================================================================
# TABLE 2
k_tabla2 <- 10
# New class width
A2 <- R / k_tabla2
# New class limits.
# Built by index (min + j * A2) rather than with seq(..., by = A2): with a
# floating-point step seq() can stop one element short or long, which would
# leave Li2 and Ls2 with different lengths.
Li2 <- min_pm25 + A2 * (0:(k_tabla2 - 1))
Ls2 <- min_pm25 + A2 * (1:k_tabla2)
# Pin the last upper limit to the observed maximum so it is always counted
Ls2[k_tabla2] <- max_pm25
# Round the limits to 3 decimals
Li2 <- round(Li2, 3)
Ls2 <- round(Ls2, 3)
# Class marks: midpoint of each class
MC2 <- (Li2 + Ls2) / 2
# Absolute frequencies.
# Every class is right-open [Li2, Ls2) except the last one, which is closed.
ni2 <- vapply(
  seq_along(Li2),
  function(i) {
    bajo_techo <- if (i < length(Li2)) pm25 < Ls2[i] else pm25 <= Ls2[i]
    sum(pm25 >= Li2[i] & bajo_techo)
  },
  numeric(1)
)
# Relative frequencies (percent) and cumulative frequencies,
# ascending (top-down) and descending (bottom-up)
hi2 <- (ni2 / n) * 100
Ni2_asc <- cumsum(ni2)
Ni2_desc <- rev(cumsum(rev(ni2)))
Hi2_asc <- cumsum(hi2)
Hi2_desc <- rev(cumsum(rev(hi2)))
# Interval labels: right-open "[a - b)" for every class, closed "[a - b]" for
# the last one
Intervalo2 <- paste0("[", round(Li2, 2), " - ", round(Ls2, 2), ")")
Intervalo2[length(Intervalo2)] <- paste0(
  "[",
  round(Li2[length(Li2)], 2),
  " - ",
  round(Ls2[length(Ls2)], 2),
  "]"
)
# Table 2 (numeric columns, without the totals row)
TDF_pm25_10 <- data.frame(
  Intervalo = Intervalo2,
  MC = round(MC2, 2),
  ni = ni2,
  hi = round(hi2, 2),
  Ni_ascendente = Ni2_asc,
  Ni_descendente = Ni2_desc,
  Hi_ascendente = round(Hi2_asc, 2),
  Hi_descendente = round(Hi2_desc, 2)
)
# Totals row; "-" marks the columns for which a total is not meaningful
totales2 <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni2),
  hi = round(sum(hi2), 2),
  Ni_ascendente = "-",
  Ni_descendente = "-",
  Hi_ascendente = "-",
  Hi_descendente = "-"
)
# The "-" placeholders force these columns to character when the rows are
# bound. Formatting them explicitly first keeps a fixed number of decimals
# (otherwise values print as "80.8" or "100") and avoids scientific notation
# for large counts. `ni` and `hi` stay numeric.
TDF_pm25_10_completa <- rbind(
  transform(
    TDF_pm25_10,
    MC = sprintf("%.2f", MC),
    Ni_ascendente = sprintf("%.0f", Ni_ascendente),
    Ni_descendente = sprintf("%.0f", Ni_descendente),
    Hi_ascendente = sprintf("%.2f", Hi_ascendente),
    Hi_descendente = sprintf("%.2f", Hi_descendente)
  ),
  totales2
)
# TABLE 2: frequency distribution of PM2.5 with 10 classes
# Renders the condensed table with gt, using the same layout as the detailed
# table. The title is numbered "Tabla Nro. 2": it previously repeated
# "Tabla Nro. 1", duplicating the number of the detailed table.
TDF_pm25_10_completa %>%
  gt() %>%
  tab_header(
    title = "Tabla Nro. 2",
    subtitle = "Distribución de frecuencia de concentración de Materia Particulada (PM2.5),\nestudio calidad del aire en India entre 2015-2020"
  ) %>%
  tab_source_note(
    source_note = md("Grupo: 1 <br> Fuente: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india ")
  ) %>%
  # 2px black vertical borders on the body cells
  tab_style(
    style = cell_borders(sides = "left", color = "black", weight = px(2)),
    locations = cells_body()
  ) %>%
  tab_style(
    style = cell_borders(sides = "right", color = "black", weight = px(2)),
    locations = cells_body()
  ) %>%
  # Same vertical borders on the column labels
  tab_style(
    style = cell_borders(sides = "left", color = "black", weight = px(2)),
    locations = cells_column_labels()
  ) %>%
  tab_style(
    style = cell_borders(sides = "right", color = "black", weight = px(2)),
    locations = cells_column_labels()
  ) %>%
  # Outer, heading and body border colours; striped body rows
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
| Tabla Nro. 2 | |||||||
| Distribución de frecuencia de concentración de Materia Particulada (PM2.5), estudio calidad del aire en India entre 2015-2020 | |||||||
| Intervalo | MC | ni | hi | Ni_ascendente | Ni_descendente | Hi_ascendente | Hi_descendente |
|---|---|---|---|---|---|---|---|
| [0.04 - 95.03) | 47.54 | 20146 | 80.80 | 20146 | 24933 | 80.8 | 100 |
| [95.03 - 190.03) | 142.53 | 3475 | 13.94 | 23621 | 4787 | 94.74 | 19.2 |
| [190.03 - 285.02) | 237.53 | 954 | 3.83 | 24575 | 1312 | 98.56 | 5.26 |
| [285.02 - 380.02) | 332.52 | 235 | 0.94 | 24810 | 358 | 99.51 | 1.44 |
| [380.02 - 475.02) | 427.52 | 76 | 0.30 | 24886 | 123 | 99.81 | 0.49 |
| [475.02 - 570.01) | 522.51 | 23 | 0.09 | 24909 | 47 | 99.9 | 0.19 |
| [570.01 - 665) | 617.51 | 9 | 0.04 | 24918 | 24 | 99.94 | 0.1 |
| [665 - 760) | 712.5 | 4 | 0.02 | 24922 | 15 | 99.96 | 0.06 |
| [760 - 855) | 807.5 | 3 | 0.01 | 24925 | 11 | 99.97 | 0.04 |
| [855 - 949.99] | 902.49 | 8 | 0.03 | 24933 | 8 | 100 | 0.03 |
| Totales | - | 24933 | 100.00 | - | - | - | - |
| Grupo: 1 Fuente: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
|||||||
#===========================
# Histogram of absolute frequencies (y axis scaled to the tallest bar)
# Step 1: compute the histogram object without drawing it, so that its breaks
# and counts are available for the axis and the y limit
Histograma_pm25 <- hist(pm25, breaks = 13, plot = FALSE)
# Step 2: draw the precomputed histogram; the default x axis is suppressed
# and redrawn below with one tick per break
plot(
  Histograma_pm25,
  col = "darkseagreen3",
  main = "Grafica Nro.1 de distribución de frecuencias de concentración de PM2.5\nen el estudio calidad del aire en India de 2015-2020",
  xlab = "PM2.5 (\u00B5g/m\u00B3)",
  ylab = "Cantidad",
  ylim = c(0, max(Histograma_pm25$counts)),
  xaxt = "n",
  cex.main = 0.9,
  cex.lab = 1,
  cex.axis = 0.9
)
# X axis: ticks at the histogram breaks, labelled with whole numbers
with(
  Histograma_pm25,
  axis(side = 1, at = breaks, labels = round(breaks), las = 1, cex.axis = 0.9)
)
grid()
#================================
# Histogram of absolute frequencies on a fixed 0-25000 y scale
# Compute the histogram object (not drawn) to reuse its breaks on the x axis
Histograma_pm25 <- hist(pm25, breaks = 13, plot = FALSE)
# Move the axis titles further from the axis (first mgp component)
par(mgp = c(3.2, 1, 0))
# Draw the precomputed histogram with both default axes suppressed
plot(
  Histograma_pm25,
  col = "darkseagreen3",
  main = "Grafica Nro.2 de distribución de frecuencias de concentración de PM2.5\nen el estudio calidad del aire en India de 2015-2020",
  xlab = "PM2.5 (µg/m³)",
  ylab = "Cantidad",
  ylim = c(0, 25000),
  xaxt = "n",
  yaxt = "n",
  cex.main = 0.9,
  cex.lab = 1,
  cex.axis = 0.9
)
# X axis: ticks at the histogram breaks, labelled with whole numbers
with(
  Histograma_pm25,
  axis(side = 1, at = breaks, labels = round(breaks), las = 1, cex.axis = 0.9)
)
# Y axis: ticks every 5000 observations, horizontal labels
local({
  marcas_y <- seq(0, 25000, by = 5000)
  axis(side = 2, at = marcas_y, labels = marcas_y, las = 1, cex.axis = 0.9)
})
grid()
## Percentage histograms
# ================================
# Percentage histogram (y axis scaled to the tallest bar)
# Bars are the relative frequencies (%) of the 10-class table, drawn without
# gaps (space = 0) so they read as a histogram; barplot() returns the bar
# midpoints.
bp <- barplot(hi2,
  space = 0,
  names.arg = FALSE,
  xaxt = "n",
  main = "Grafica Nro.3 de distribución porcentual de PM2.5\nen el estudio calidad del aire en India de 2015-2020",
  xlab = "PM2.5 (µg/m³)",
  ylab = "Porcentaje (%)",
  col = "darkseagreen3",
  border = "black",
  ylim = c(0, max(hi2) + 5),
  cex.main = 0.9,
  cex.lab = 1,
  cex.axis = 0.9
)
# Bar edges: half a bar width either side of the midpoints
bordes <- c(bp[1] - 0.5, bp + 0.5)
# Label each edge with the actual class limit it represents. The class width
# is R / 10, not 100, so a fixed 0, 100, ..., 1000 scale would mislabel the
# edges (e.g. the fifth edge is about 380, not 400).
axis(1,
  at = bordes,
  labels = round(c(Li2[1], Ls2), 0),
  las = 1,
  tck = -0.03
)
grid()
# ================================
# Percentage histogram on the full 0-100 % scale
# Bars are the relative frequencies (%) of the 10-class table, drawn without
# gaps (space = 0); both default axes are suppressed and redrawn below.
bp <- barplot(hi2,
  space = 0,
  names.arg = FALSE,
  xaxt = "n",
  yaxt = "n",
  main = "Grafica Nro.4 de distribución porcentual de PM2.5\nen el estudio calidad del aire en India de 2015-2020",
  xlab = "PM2.5 (µg/m³)",
  ylab = "Porcentaje (%)",
  col = "darkseagreen3",
  border = "black",
  ylim = c(0, 100), # Up to 100 %
  cex.main = 0.9,
  cex.lab = 1,
  cex.axis = 0.9
)
# X axis: bar edges sit half a bar width either side of the midpoints
bordes <- c(bp[1] - 0.5, bp + 0.5)
# Label each edge with the actual class limit it represents. The class width
# is R / 10, not 100, so a fixed 0, 100, ..., 1000 scale would mislabel the
# edges.
axis(1,
  at = bordes,
  labels = round(c(Li2[1], Ls2), 0),
  las = 1,
  tck = -0.03
)
# Y axis: ticks every 20 %
axis(2,
  at = seq(0, 100, by = 20),
  labels = paste0(seq(0, 100, by = 20)),
  las = 1
)
grid()
# Box-and-whisker plot
# Horizontal boxplot of PM2.5; both default axes are suppressed and only a
# custom x axis is drawn afterwards.
boxplot(
  x = pm25,
  horizontal = TRUE,
  col = "turquoise3",
  border = "black",
  main = "Gráfica Nro.5: Diagrama de caja de la concentración de PM2.5\nen el estudio calidad del aire en India de 2015-2020",
  xlab = "PM2.5 (µg/m³)",
  xaxt = "n",
  yaxt = "n",
  cex.main = 0.9,
  cex.lab = 1,
  cex.axis = 0.9
)
# Custom x axis: one tick every 100 units from 0 to 1000
local({
  marcas_x <- seq(0, 1000, by = 100)
  axis(side = 1, at = marcas_x, labels = marcas_x, las = 1)
})
grid()
# ==============================================================================
# ASCENDING AND DESCENDING OGIVE (absolute cumulative frequencies)
# Ascending ogive: Ni2_asc[i] counts the observations below the upper limit of
# class i, so it is plotted against the upper limits (Ls2).
# xlim starts at 0 so that the first lower limit, used by the descending
# ogive, is inside the plotting region.
plot(Ls2, Ni2_asc,
  type = "b",
  pch = 16,
  col = "turquoise3",
  lwd = 1,
  ylim = c(0, n),
  xlim = c(0, 1000),
  xaxt = "n",
  xlab = "PM2.5",
  ylab = "Cantidad",
  main = "Gráfica N°6: Ojiva ascendente y descendente de la\nconcentración de Materia Particulada (PM2.5)",
  cex.main = 1
)
# X axis with a tick every 200 units
axis(1, at = seq(0, 1000, by = 200))
# Descending ogive: Ni2_desc[i] counts the observations at or above the lower
# limit of class i, so it must be plotted against the lower limits (Li2).
# Plotting it against Ls2 shifts the curve one class width to the right.
lines(Li2, Ni2_desc,
  type = "b",
  pch = 16,
  col = "black",
  lwd = 1
)
grid()
box()
# ==============================================================================
# ASCENDING AND DESCENDING PERCENTAGE OGIVE
# Ascending ogive: Hi2_asc[i] is the percentage of observations below the
# upper limit of class i, so it is plotted against the upper limits (Ls2).
# xlim starts at 0 so that the first lower limit, used by the descending
# ogive, is inside the plotting region.
plot(Ls2, Hi2_asc,
  type = "b",
  pch = 16,
  col = "turquoise3",
  lwd = 1,
  ylim = c(0, 100),
  xlim = c(0, 1000),
  xaxt = "n",
  xlab = "PM2.5",
  ylab = "Porcentaje (%)",
  main = "Gráfica N°7: Ojiva porcentual ascendente y descendente de la\nconcentración de Materia Particulada (PM2.5)",
  cex.main = 1
)
# X axis with a tick every 200 units
axis(1, at = seq(0, 1000, by = 200))
# Descending percentage ogive: Hi2_desc[i] is the percentage of observations
# at or above the lower limit of class i, so it must be plotted against the
# lower limits (Li2) rather than the upper limits.
lines(Li2, Hi2_desc,
  type = "b",
  pch = 16,
  col = "black",
  lwd = 1
)
grid()
box()
# Summary indicators computed on pm25
# =========================================================
# Package providing skewness() and kurtosis()
library(e1071)
# Central tendency
X <- mean(pm25, na.rm = TRUE) # Mean
Me <- median(pm25, na.rm = TRUE) # Median
# Mode (Mo): reported as the modal class of the 10-class table, i.e. the
# interval with the highest absolute frequency. The last row of the table is
# the totals row, so it is left out of the search.
moda_index <- which.max(head(TDF_pm25_10_completa[["ni"]], -1))
Mo <- TDF_pm25_10_completa[["Intervalo"]][moda_index]
# Dispersion
desv <- sd(pm25, na.rm = TRUE) # Standard deviation
CV <- 100 * (desv / X) # Coefficient of variation (%)
# Shape
As <- skewness(pm25, na.rm = TRUE) # Skewness
K <- kurtosis(pm25, na.rm = TRUE) # Kurtosis
# One-row data frame holding all the indicators
Tabla_indicadores <- data.frame(
  Variable = "PM2.5",
  Rango = paste0(
    "[", round(min(pm25), 2),
    " - ", round(max(pm25), 2), "]"
  ),
  Media = X,
  Mediana = Me,
  Moda = Mo,
  DesvEst = desv,
  CV = CV,
  Asimetria = As,
  Curtosis = K
)
# Render the indicators table with gt
library(gt)
gt(Tabla_indicadores) %>%
  # Display labels for each column
  cols_label(
    Variable = "Variable",
    Rango = "Rango",
    Media = "Media (X)",
    Mediana = "Mediana (Me)",
    Moda = "Moda (Mo)",
    DesvEst = "Desv. Est. (sd)",
    CV = "CV (%)",
    Asimetria = "Asimetria (As)",
    Curtosis = "Curtosis (K)"
  ) %>%
  tab_header(
    title = "Tabla Nro. 3",
    subtitle = "Indicadores Estadisticos de la concentracion PM2.5, estudio calidad del aire en India entre 2015-2020"
  ) %>%
  tab_source_note(
    source_note = "Autor: Grupo 1 | Fuente: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india"
  ) %>%
  # Group the indicator columns under three spanners
  tab_spanner(label = "Tendencia Central", columns = c(Media, Mediana, Moda)) %>%
  tab_spanner(label = "Dispersion", columns = c(DesvEst, CV)) %>%
  tab_spanner(label = "Forma", columns = c(Asimetria, Curtosis)) %>%
  # Two decimals for the indicator columns
  fmt_number(
    columns = c(Media, Mediana, Moda, DesvEst, CV, Asimetria, Curtosis),
    decimals = 2
  ) %>%
  # 1px black border on every side of body cells, column labels and spanners
  tab_style(
    style = cell_borders(sides = "all", color = "black", weight = px(1)),
    locations = list(
      cells_body(),
      cells_column_labels(),
      cells_column_spanners()
    )
  ) %>%
  tab_options(
    # Outer borders
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.left.color = "black",
    table.border.right.color = "black",
    # Inner body lines
    table_body.hlines.color = "black",
    table_body.vlines.color = "black",
    column_labels.border.bottom.width = px(2)
  )
| Tabla Nro. 3 | ||||||||
| Indicadores Estadisticos de la concentracion PM2.5, estudio calidad del aire en India entre 2015-2020 | ||||||||
| Variable | Rango |
Tendencia Central
|
Dispersion
|
Forma
|
||||
|---|---|---|---|---|---|---|---|---|
| Media (X) | Mediana (Me) | Moda (Mo) | Desv. Est. (sd) | CV (%) | Asimetria (As) | Curtosis (K) | ||
| PM2.5 | [0.04 - 949.99] | 67.45 | 48.57 | [0.04 - 95.03) | 64.66 | 95.86 | 3.37 | 21.13 |
| Autor: Grupo 1 | Fuente: https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india | ||||||||
En conclusión:
La variable material particulado PM2.5
fluctúa entre un mínimo de 0.04 y un máximo de 949.99 microgramos por
metro cúbico (µg/m³), con una media de 67.45 µg/m³ y una desviación
estándar de 64.66 µg/m³. Debido a un coeficiente de variación del 95.86%,
se concluye que es un conjunto de valores heterogéneos con una alta
dispersión. Los datos se acumulan de manera predominante en el primer
intervalo [0.04 - 95.03). No obstante, se han detectado 1,982 valores
atípicos que se extienden hasta alcanzar picos críticos. Esta distribución
evidencia una condición medianamente mala para el medio ambiente.