Univercidad Central del Eciador
FIGEMPA-Ingeniería Ambiental
# Cargar datos
datos <- read.csv(
"city_day.csv",
header = TRUE,
sep = ",",
dec = "."
)
# Crear vector ozono sin guiones
tolueno<- datos$Toluene[datos$Toluene != "-"]
tolueno<- as.numeric(tolueno)
#Estadística descriptiva
#5/12/2025
#Lorien Arcentales
#Carga de paquetes
library(gt)
library(dplyr)
datos<-read.csv("city_day.csv", header = TRUE, dec = ".",
sep = ",")
#Extraccion de los "-" de la variable ozono porque son valores inexistentes,
# para un mejor analisis, cambia tamaño muestral de 29531 a 21490
tolueno<-datos$Toluene[datos$Toluene != "-"]
length(tolueno)
## [1] 21490
tolueno <- as.numeric(tolueno)
min<-min(tolueno)
max<-max(tolueno)
R=max-min
k=1+(3.3)*log(length(tolueno))
k<-floor(k)
A<-R/k
#Generación de intervalos
Li <- seq(from = min, to = max - A, by = A)
Ls <- c (seq(from = min + A, to = max - A, by = A), max) # último límite = max
MC<-(Li+Ls)/2
#Creación de ni
tolueno <- round(tolueno, 3) # redondear los datos de ozono
Li <- round(Li, 3) # redondear límites inferiores
Ls <- round(Ls, 3)
ni <- numeric(length(Li))
for (i in 1:length(Li)) {
if (i < length(Li)) {
ni[i] <- sum(tolueno >= Li[i] & tolueno < Ls[i])
} else {
ni[i] <- sum(tolueno >= Li[i] & tolueno <= Ls[i]) # Último intervalo cerrado
}
}
N <- sum(ni)
hi <- (ni / N) * 100
Ni_asc <- cumsum(ni)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_asc <- cumsum(hi)
Hi_desc <- rev(cumsum(rev(hi)))
Intervalo <- paste0("[", round(Li,2), " - ", round(Ls,2), ")")
Intervalo[length(Intervalo)] <- paste0("[", round(Li[length(Li)],2), " - ",
round(Ls[length(Ls)],2), "]")
TDF_tolueno <- data.frame(
Intervalo = Intervalo,
MC = round(MC, 2),
ni = ni,
hi = round(hi, 2),
Ni_ascendente = Ni_asc,
Ni_descendente = Ni_desc,
Hi_ascendente = round(Hi_asc, 2),
Hi_descendente = round(Hi_desc, 2)
)
# Crear fila de totales
totales <- data.frame(
Intervalo = "Totales",
MC = "-",
ni = sum(ni),
hi = sum(hi),
Ni_ascendente = "-",
Ni_descendente = "-",
Hi_ascendente = "-",
Hi_descendente = "-"
)
# Agregar al final del data.frame
TDF_tolueno<- rbind(TDF_tolueno, totales)
library(gt)
library(dplyr)
TDF_tolueno %>%
gt() %>%
tab_header(
title = md("*Tabla Nro. 1*"),
subtitle = md("**Distribucion de frecuencia de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020 **")
) %>%
tab_source_note(
source_note = md("Autor: Grupo 2\n Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
table.border.top.style = "solid",
table.border.bottom.style = "solid",
column_labels.border.top.color = "black",
column_labels.border.bottom.color = "black",
column_labels.border.bottom.width = px(2),
row.striping.include_table_body = TRUE,
heading.border.bottom.color = "black",
heading.border.bottom.width = px(2),
table_body.hlines.color = "gray",
table_body.border.bottom.color = "black"
)
| Tabla Nro. 1 |
| **Distribucion de frecuencia de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020 ** |
| Intervalo |
MC |
ni |
hi |
Ni_ascendente |
Ni_descendente |
Hi_ascendente |
Hi_descendente |
| [0 - 13.78) |
6.89 |
17720 |
82.46 |
17720 |
21490 |
82.46 |
100 |
| [13.78 - 27.57) |
20.68 |
1940 |
9.03 |
19660 |
3770 |
91.48 |
17.54 |
| [27.57 - 41.35) |
34.46 |
1160 |
5.40 |
20820 |
1830 |
96.88 |
8.52 |
| [41.35 - 55.13) |
48.24 |
279 |
1.30 |
21099 |
670 |
98.18 |
3.12 |
| [55.13 - 68.92) |
62.03 |
151 |
0.70 |
21250 |
391 |
98.88 |
1.82 |
| [68.92 - 82.7) |
75.81 |
113 |
0.53 |
21363 |
240 |
99.41 |
1.12 |
| [82.7 - 96.48) |
89.59 |
48 |
0.22 |
21411 |
127 |
99.63 |
0.59 |
| [96.48 - 110.27) |
103.38 |
18 |
0.08 |
21429 |
79 |
99.72 |
0.37 |
| [110.27 - 124.05) |
117.16 |
6 |
0.03 |
21435 |
61 |
99.74 |
0.28 |
| [124.05 - 137.83) |
130.94 |
7 |
0.03 |
21442 |
55 |
99.78 |
0.26 |
| [137.83 - 151.62) |
144.73 |
4 |
0.02 |
21446 |
48 |
99.8 |
0.22 |
| [151.62 - 165.4) |
158.51 |
5 |
0.02 |
21451 |
44 |
99.82 |
0.2 |
| [165.4 - 179.18) |
172.29 |
3 |
0.01 |
21454 |
39 |
99.83 |
0.18 |
| [179.18 - 192.97) |
186.07 |
1 |
0.00 |
21455 |
36 |
99.84 |
0.17 |
| [192.97 - 206.75) |
199.86 |
4 |
0.02 |
21459 |
35 |
99.86 |
0.16 |
| [206.75 - 220.53) |
213.64 |
2 |
0.01 |
21461 |
31 |
99.87 |
0.14 |
| [220.53 - 234.32) |
227.43 |
1 |
0.00 |
21462 |
29 |
99.87 |
0.13 |
| [234.32 - 248.1) |
241.21 |
1 |
0.00 |
21463 |
28 |
99.87 |
0.13 |
| [248.1 - 261.88) |
254.99 |
1 |
0.00 |
21464 |
27 |
99.88 |
0.13 |
| [261.88 - 275.67) |
268.78 |
3 |
0.01 |
21467 |
26 |
99.89 |
0.12 |
| [275.67 - 289.45) |
282.56 |
1 |
0.00 |
21468 |
23 |
99.9 |
0.11 |
| [289.45 - 303.23) |
296.34 |
0 |
0.00 |
21468 |
22 |
99.9 |
0.1 |
| [303.23 - 317.02) |
310.12 |
0 |
0.00 |
21468 |
22 |
99.9 |
0.1 |
| [317.02 - 330.8) |
323.91 |
0 |
0.00 |
21468 |
22 |
99.9 |
0.1 |
| [330.8 - 344.58) |
337.69 |
0 |
0.00 |
21468 |
22 |
99.9 |
0.1 |
| [344.58 - 358.37) |
351.48 |
0 |
0.00 |
21468 |
22 |
99.9 |
0.1 |
| [358.37 - 372.15) |
365.26 |
1 |
0.00 |
21469 |
22 |
99.9 |
0.1 |
| [372.15 - 385.93) |
379.04 |
0 |
0.00 |
21469 |
21 |
99.9 |
0.1 |
| [385.93 - 399.72) |
392.83 |
0 |
0.00 |
21469 |
21 |
99.9 |
0.1 |
| [399.72 - 413.5) |
406.61 |
3 |
0.01 |
21472 |
21 |
99.92 |
0.1 |
| [413.5 - 427.28) |
420.39 |
2 |
0.01 |
21474 |
18 |
99.93 |
0.08 |
| [427.28 - 441.07) |
434.17 |
10 |
0.05 |
21484 |
16 |
99.97 |
0.07 |
| [441.07 - 454.85] |
447.96 |
6 |
0.03 |
21490 |
6 |
100 |
0.03 |
| Totales |
- |
21490 |
100.00 |
- |
- |
- |
- |
| Autor: Grupo 2
Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
#PROCESO DE SIMPLIFICACIÓN
#Histograma(tolueno)
#Elemnetos simplificados
Lis<-Histograma_tolueno$breaks [1:10]
Lss<-Histograma_tolueno$breaks [2:11]
MCs<-(Lis+Lis)/2
nis<-Histograma_tolueno$counts
his <- (nis / N) * 100
Nis_asc <- cumsum(nis)
His_asc <- cumsum(his)
Nis_desc <- rev(cumsum(rev(nis)))
His_desc <- rev(cumsum(rev(his)))
Intervalos <- paste0("[", round(Lis,2), " - ", round(Lss,2), ")")
Intervalos[length(Intervalos)] <- paste0("[", round(Lis[length(Lis)],2),
" - ", round(Lss[length(Lss)],2), "]")
TDF_toluenosimplificado <- data.frame(
Intervalo = Intervalos,
MC = round(MCs, 2),
ni = nis,
hi= round(his, 2),
Ni_ascendente = Nis_asc,
Hi_ascendente = round(His_asc, 2),
Ni_descendente = Nis_desc,
Hi_descendente = round(His_desc, 2)
)
colnames(TDF_toluenosimplificado) <- c(
"Intervalo",
"MC",
"ni",
"hi(%)",
"Ni_asc",
"Hi_asc (%)",
"Ni_desc",
"Hi_desc (%)"
)
totaless <- data.frame(
Intervalo = "Totales",
MC = "-",
ni = sum(nis), # suma total de ni
hi = sum(his), # suma total de hi (%)
Ni_ascendente = "-",
Ni_descendente = "-",
Hi_ascendente = "-",
Hi_descendente = "-"
)
colnames(totaless) <- c(
"Intervalo",
"MC",
"ni",
"hi(%)",
"Ni_asc",
"Hi_asc (%)",
"Ni_desc",
"Hi_desc (%)"
)
# Agregar al final de la tabla
TDF_toluenosimplificado <- rbind(TDF_toluenosimplificado, totaless)
#Tabla 2
TDF_toluenosimplificado %>%
gt() %>%
tab_header(
title = md("*Tabla Nro. 2*"),
subtitle = md("**Distribucion de frecuencia simplificado de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020 **")
) %>%
tab_source_note(
source_note = md("Autor: Grupo 2\n Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
table.border.top.style = "solid",
table.border.bottom.style = "solid",
column_labels.border.top.color = "black",
column_labels.border.bottom.color = "black",
column_labels.border.bottom.width = px(2),
row.striping.include_table_body = TRUE,
heading.border.bottom.color = "black",
heading.border.bottom.width = px(2),
table_body.hlines.color = "gray",
table_body.border.bottom.color = "black"
)
| Tabla Nro. 2 |
| **Distribucion de frecuencia simplificado de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020 ** |
| Intervalo |
MC |
ni |
hi(%) |
Ni_asc |
Hi_asc (%) |
Ni_desc |
Hi_desc (%) |
| [0 - 50) |
0 |
21021 |
97.82 |
21021 |
97.82 |
21490 |
100 |
| [50 - 100) |
50 |
392 |
1.82 |
21413 |
99.64 |
469 |
2.18 |
| [100 - 150) |
100 |
33 |
0.15 |
21446 |
99.8 |
77 |
0.36 |
| [150 - 200) |
150 |
11 |
0.05 |
21457 |
99.85 |
44 |
0.2 |
| [200 - 250) |
200 |
6 |
0.03 |
21463 |
99.87 |
33 |
0.15 |
| [250 - 300) |
250 |
5 |
0.02 |
21468 |
99.9 |
27 |
0.13 |
| [300 - 350) |
300 |
0 |
0.00 |
21468 |
99.9 |
22 |
0.1 |
| [350 - 400) |
350 |
1 |
0.00 |
21469 |
99.9 |
22 |
0.1 |
| [400 - 450) |
400 |
19 |
0.09 |
21488 |
99.99 |
21 |
0.1 |
| [450 - 500] |
450 |
2 |
0.01 |
21490 |
100 |
2 |
0.01 |
| Totales |
- |
21490 |
100.00 |
- |
- |
- |
- |
| Autor: Grupo 2
Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |
#GRAFICAS
#Histogramas local
hist(tolueno, breaks = 11,
main = "Gráfica N°1: Distribución de la Concentración de Tolueno
presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
xlab = " Tolueno (µg/m3)",
ylab = "Cantidad",
ylim = c(0, max(nis)),
col = "purple",
cex.main = 0.9,
cex.lab = 1,
cex.axis = 0.9,
xaxt = "n")
axis(1, at = Histograma_tolueno$breaks,
labels = Histograma_tolueno$breaks, las = 1,
cex.axis = 0.9)

#Histograma global
hist(tolueno, breaks = 11,
main = "Gráfica N°2:Distribución de la Concentración de Tolueno
presente en el estudio sobre calidad del aire en India entre 2015-2020",
xlab = "Tolueno (µg/m3)",
ylab = "Cantidad",
ylim = c(0, length(tolueno)),
col = "purple",
cex.main = 1,
cex.lab = 1,
cex.axis = 0.9,
xaxt = "n")
axis(1, at = Histograma_tolueno$breaks,
labels = Histograma_tolueno$breaks, las = 1,
cex.axis = 0.9)

#Histograma porcentual global
TDF_toluenosimplificado$`hi (%)` <- as.numeric(TDF_toluenosimplificado$`hi(%)`)
post<-barplot(TDF_toluenosimplificado$`hi(%)`[1:(nrow(TDF_toluenosimplificado)-1)],
space = 0,
col = "purple",
main = "Gráfica N°3:Distribución de la Concentración de Tolueno, estudio
calidad del aire en India, 2015-2020",
xlab = "Tolueno (µg/m3)",
ylab = "Porcentaje (%)",
names.arg = TDF_toluenosimplificado$MC[1:(nrow(TDF_toluenosimplificado)-1)],
ylim = c(0,100), xaxt = "n"
)
axis(side = 1,
at = post,
labels = TDF_toluenosimplificado$MC[1:(nrow(TDF_toluenosimplificado)-1)],
tck = -0.02)

#Histograma porcentual local
n <- as.numeric(nrow(TDF_toluenosimplificado))
pos<-barplot(
TDF_toluenosimplificado$`hi(%)`[1:(n-1)],
space = 0,
main = "Gráfica No. 4:Distribución concentración de Tolueno en el estudio
calidad del aire en India, 2015-2020",
ylab = "Porcentaje (%)",
xlab = "Tolueno (µg/m3)",
names.arg = TDF_toluenosimplificado$MC[1:(n-1)],
col = "purple"
)
axis(side = 1,
at = pos,
labels = TDF_toluenosimplificado$MC[1:(nrow(TDF_toluenosimplificado)-1)],
tck = -0.04, # controla la longitud de la rayita
las = 1) # rota las etiquetas si quieres verticales

#Box plot
Cajatolueno<-boxplot(tolueno, horizontal = T,col = "green", border = "black",
main= "Gráfica No. 5: Distribución de la concentración de Tolueno,
estudio calidad del aire en India desde 2015-2020",
xlab="Tolueno (µg/m3)")

#Ojivas locales
# Usar un único eje X para ambas ojivas
x <- Lss
plot(
x, Nis_asc,
type = "b",
col = "orange",
pch = 19,
main = "Gráfica N°6:Distribución de concentración de Tolueno en el
estudio calidad del aire en India 2015-2020",
xlab = "Tolueno (µg/m3)",
ylab = "Cantidad",
ylim = c(0, max(c(Nis_asc, Nis_desc))) # 👈 Eje Y iniciando en 0
)
# Ojiva descendente
lines(
x, Nis_desc,
type = "b",
col = "blue",
pch = 19
)

#Ojiva porcentual
# Usar un único eje X para ambas ojivas
x <- Lss
# Ojiva ascendente porcentual
plot(
x, His_asc,
type = "b",
main ="Gráfica N°7:Distribución de concentración de Tolueno en el
estudio calidad del aire en India 2015-2020",
xlab = "Tolueno (µg/m3)",
ylab = "Porcentaje (%)",
col = "blue",
pch = 19,
ylim = c(0, 100)
)
# Ojiva descendente porcentual
lines(
x, His_desc,
type = "b",
col = "orange",
pch = 19
)

#INDICADORES
#Indicadores de Tendencia Central
# Mediana
Me <- median(tolueno)
Me
## [1] 2.97
# Media
X <- mean(tolueno)
X
## [1] 8.700972
# Moda
Mo <- "[0,50]"
Mo
## [1] "[0,50]"
#Indicadores de Dispersión
# Varianza
var(tolueno)
## [1] 398.7675
# Desviación estandar
desv<-round(sd(tolueno), 2)
# Coeficiente de variación
CV <- (sd(tolueno)/X)*100
CV
## [1] 229.505
#Indicadores de Forma
# Coeficiente de Asimetría
library(e1071)
As <- skewness(tolueno)
As
## [1] 11.6645
# Curtosis
K <- kurtosis(tolueno)
K
## [1] 216.6744
valoresatipicos<-"2427,[44,21367]"
Variable <- "Tolueno"
Rango <- "[0,455.85]"
Tabla_indicadores <- data.frame(Variable,Rango,round(X,3),Me,Mo,round(desv,2),round(CV,2),round(As,2),round(K,2),
valoresatipicos)
colnames(Tabla_indicadores) <- c("Variable","Rango","X", "Me", "Mo","sd","CV","As","K","Valores atípicos")
library(gt)
library(dplyr)
Tabla_indicadores %>%
gt() %>%
tab_header(
title = md("*Tabla Nro. 3*"),
subtitle = md("**Indicadores Estadísticos de concentración de Xyleno,estudio calidad del aire en India entre 2015-2020 **")
) %>%
tab_source_note(
source_note = md("Autor: Grupo 2\n Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
table.border.top.style = "solid",
table.border.bottom.style = "solid",
column_labels.border.top.color = "black",
column_labels.border.bottom.color = "black",
column_labels.border.bottom.width = px(2),
row.striping.include_table_body = TRUE,
heading.border.bottom.color = "black",
heading.border.bottom.width = px(2),
table_body.hlines.color = "gray",
table_body.border.bottom.color = "black"
)
| Tabla Nro. 3 |
| **Indicadores Estadísticos de concentración de Xyleno,estudio calidad del aire en India entre 2015-2020 ** |
| Variable |
Rango |
X |
Me |
Mo |
sd |
CV |
As |
K |
Valores atípicos |
| Tolueno |
[0,455.85] |
8.701 |
2.97 |
[0,50] |
19.97 |
229.5 |
11.66 |
216.67 |
2427,[44,21367] |
| Autor: Grupo 2
Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india |