Universidad Central del Ecuador
FIGEMPA-Ingeniería Ambiental
# Load the air-quality data set (comma separated, "." as decimal mark).
datos <- read.csv("city_day.csv", header = TRUE, sep = ",", dec = ".")
# Build the toluene vector without the "-" placeholders.
# NA entries are excluded explicitly: `NA != "-"` evaluates to NA, and an NA
# index would leave NA elements in the result instead of dropping them.
tolueno <- datos$Toluene
tolueno <- tolueno[!is.na(tolueno) & tolueno != "-"]
tolueno <- as.numeric(tolueno)
#Estadística descriptiva
#5/12/2025
#Lorien Arcentales
#Carga de paquetes
library(gt)
library(dplyr)
# Read the air-quality data set (comma separated, "." as decimal mark).
datos <- read.csv("city_day.csv", header = TRUE, dec = ".", sep = ",")
# Remove the "-" entries of the toluene variable: they stand for values that
# do not exist. This changes the sample size from 29531 to 21490.
# NA entries are excluded explicitly as well, because `NA != "-"` evaluates to
# NA and an NA index would leave NA elements in the result.
tolueno <- datos$Toluene
tolueno <- tolueno[!is.na(tolueno) & tolueno != "-"]
length(tolueno)
## [1] 21490
tolueno <- as.numeric(tolueno)
# Extremes and range of the data.
# NOTE: `min` and `max` are also the names of base functions. The variable
# names are kept because the interval construction further down reads them;
# calls such as min(x) keep working because R ignores non-function objects
# when it looks up the function of a call.
min <- min(tolueno)
max <- max(tolueno)
R <- max - min
# Number of classes by Sturges' rule: k = 1 + 3.3 * log10(n).
# log() in R is the natural logarithm, so the base-10 logarithm has to be
# requested explicitly; with log() the rule yields far too many classes.
k <- floor(1 + 3.3 * log10(length(tolueno)))
# Class width
A <- R / k
# Interval generation
# Lower and upper limits are derived from the same k cut points, so there are
# always exactly k classes and every upper limit is the next lower limit.
# This also covers k == 1, where building the upper limits with a separate
# seq(min + A, max - A, by = A) call stops with "wrong sign in 'by' argument".
Li <- min + A * (seq_len(k) - 1)
Ls <- c(Li[-1], max) # last limit = max
# Class marks (interval midpoints)
MC <- (Li + Ls) / 2
# Absolute frequencies (ni)
# The data and both limit vectors are rounded to 3 decimals before counting.
tolueno <- round(tolueno, 3)
Li <- round(Li, 3)
Ls <- round(Ls, 3)
# Every class is counted as [Li, Ls); only the last class also includes its
# upper limit, [Li, Ls].
ni <- vapply(
  seq_along(Li),
  function(j) {
    sobre_li <- tolueno >= Li[j]
    bajo_ls <- if (j < length(Li)) tolueno < Ls[j] else tolueno <= Ls[j]
    as.numeric(sum(sobre_li & bajo_ls))
  },
  numeric(1)
)
# Total count and relative frequencies in percent
N <- sum(ni)
hi <- ni / N * 100
# Ascending cumulative frequencies
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
# Descending cumulative frequencies (accumulated from the last class back)
Ni_desc <- rev(cumsum(rev(ni)))
Hi_desc <- rev(cumsum(rev(hi)))
# Interval labels: "[Li - Ls)" for every class, "[Li - Ls]" for the last one
Intervalo <- paste0(
  "[", round(Li, 2), " - ", round(Ls, 2),
  rep(c(")", "]"), times = c(length(Li) - 1, 1))
)
# Frequency distribution table; unnamed arguments keep their variable name
# as column name.
TDF_tolueno <- data.frame(
  Intervalo,
  MC = round(MC, 2),
  ni,
  hi = round(hi, 2),
  Ni_ascendente = Ni_asc,
  Ni_descendente = Ni_desc,
  Hi_ascendente = round(Hi_asc, 2),
  Hi_descendente = round(Hi_desc, 2)
)
# Totals row: only ni and hi carry a sum, every other cell shows "-"
totales <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(ni),
  hi = sum(hi)
)
totales[c(
  "Ni_ascendente", "Ni_descendente",
  "Hi_ascendente", "Hi_descendente"
)] <- "-"
# Append the totals row at the end of the table
TDF_tolueno <- rbind(TDF_tolueno, totales)
library(gt)
library(dplyr)
# Table 1: render the frequency distribution with gt.
TDF_tolueno %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 1*"),
    # The closing ** must follow the text directly; with a space in front of
    # it Markdown does not treat it as bold and prints the asterisks.
    subtitle = md("**Distribucion de frecuencia de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020**")
  ) %>%
  # One source note per line: a plain "\n" inside md() is rendered as a
  # space, so author and source are added as two separate notes.
  tab_source_note(
    source_note = md("Autor: Grupo 2")
  ) %>%
  tab_source_note(
    source_note = md("Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
| Tabla Nro. 1 | |||||||
| **Distribucion de frecuencia de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020** | |||||||
| Intervalo | MC | ni | hi | Ni_ascendente | Ni_descendente | Hi_ascendente | Hi_descendente |
|---|---|---|---|---|---|---|---|
| [0 - 13.78) | 6.89 | 17720 | 82.46 | 17720 | 21490 | 82.46 | 100 |
| [13.78 - 27.57) | 20.68 | 1940 | 9.03 | 19660 | 3770 | 91.48 | 17.54 |
| [27.57 - 41.35) | 34.46 | 1160 | 5.40 | 20820 | 1830 | 96.88 | 8.52 |
| [41.35 - 55.13) | 48.24 | 279 | 1.30 | 21099 | 670 | 98.18 | 3.12 |
| [55.13 - 68.92) | 62.03 | 151 | 0.70 | 21250 | 391 | 98.88 | 1.82 |
| [68.92 - 82.7) | 75.81 | 113 | 0.53 | 21363 | 240 | 99.41 | 1.12 |
| [82.7 - 96.48) | 89.59 | 48 | 0.22 | 21411 | 127 | 99.63 | 0.59 |
| [96.48 - 110.27) | 103.38 | 18 | 0.08 | 21429 | 79 | 99.72 | 0.37 |
| [110.27 - 124.05) | 117.16 | 6 | 0.03 | 21435 | 61 | 99.74 | 0.28 |
| [124.05 - 137.83) | 130.94 | 7 | 0.03 | 21442 | 55 | 99.78 | 0.26 |
| [137.83 - 151.62) | 144.73 | 4 | 0.02 | 21446 | 48 | 99.8 | 0.22 |
| [151.62 - 165.4) | 158.51 | 5 | 0.02 | 21451 | 44 | 99.82 | 0.2 |
| [165.4 - 179.18) | 172.29 | 3 | 0.01 | 21454 | 39 | 99.83 | 0.18 |
| [179.18 - 192.97) | 186.07 | 1 | 0.00 | 21455 | 36 | 99.84 | 0.17 |
| [192.97 - 206.75) | 199.86 | 4 | 0.02 | 21459 | 35 | 99.86 | 0.16 |
| [206.75 - 220.53) | 213.64 | 2 | 0.01 | 21461 | 31 | 99.87 | 0.14 |
| [220.53 - 234.32) | 227.43 | 1 | 0.00 | 21462 | 29 | 99.87 | 0.13 |
| [234.32 - 248.1) | 241.21 | 1 | 0.00 | 21463 | 28 | 99.87 | 0.13 |
| [248.1 - 261.88) | 254.99 | 1 | 0.00 | 21464 | 27 | 99.88 | 0.13 |
| [261.88 - 275.67) | 268.78 | 3 | 0.01 | 21467 | 26 | 99.89 | 0.12 |
| [275.67 - 289.45) | 282.56 | 1 | 0.00 | 21468 | 23 | 99.9 | 0.11 |
| [289.45 - 303.23) | 296.34 | 0 | 0.00 | 21468 | 22 | 99.9 | 0.1 |
| [303.23 - 317.02) | 310.12 | 0 | 0.00 | 21468 | 22 | 99.9 | 0.1 |
| [317.02 - 330.8) | 323.91 | 0 | 0.00 | 21468 | 22 | 99.9 | 0.1 |
| [330.8 - 344.58) | 337.69 | 0 | 0.00 | 21468 | 22 | 99.9 | 0.1 |
| [344.58 - 358.37) | 351.48 | 0 | 0.00 | 21468 | 22 | 99.9 | 0.1 |
| [358.37 - 372.15) | 365.26 | 1 | 0.00 | 21469 | 22 | 99.9 | 0.1 |
| [372.15 - 385.93) | 379.04 | 0 | 0.00 | 21469 | 21 | 99.9 | 0.1 |
| [385.93 - 399.72) | 392.83 | 0 | 0.00 | 21469 | 21 | 99.9 | 0.1 |
| [399.72 - 413.5) | 406.61 | 3 | 0.01 | 21472 | 21 | 99.92 | 0.1 |
| [413.5 - 427.28) | 420.39 | 2 | 0.01 | 21474 | 18 | 99.93 | 0.08 |
| [427.28 - 441.07) | 434.17 | 10 | 0.05 | 21484 | 16 | 99.97 | 0.07 |
| [441.07 - 454.85] | 447.96 | 6 | 0.03 | 21490 | 6 | 100 | 0.03 |
| Totales | - | 21490 | 100.00 | - | - | - | - |
| Autor: Grupo 2 Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india | |||||||
# SIMPLIFICATION PROCESS
# Histogram object that supplies the simplified classes. It is created here
# only when it does not exist yet, because no assignment to
# Histograma_tolueno appears earlier in this script.
# NOTE(review): hist() counts right-closed classes (a, b] by default, while
# the labels built below read "[a - b)" - confirm the intended convention.
if (!exists("Histograma_tolueno")) {
  Histograma_tolueno <- hist(tolueno, plot = FALSE)
}
# Simplified elements
# Limits are taken from all breaks, whatever their number
Lis <- head(Histograma_tolueno$breaks, -1)
Lss <- Histograma_tolueno$breaks[-1]
# Class mark = midpoint between the lower and the upper limit
MCs <- (Lis + Lss) / 2
nis <- Histograma_tolueno$counts
# Relative frequencies in percent; N is the total count of the full table
his <- (nis / N) * 100
Nis_asc <- cumsum(nis)
His_asc <- cumsum(his)
Nis_desc <- rev(cumsum(rev(nis)))
His_desc <- rev(cumsum(rev(his)))
# Interval labels; the last one is written as a closed interval
Intervalos <- paste0("[", round(Lis, 2), " - ", round(Lss, 2), ")")
Intervalos[length(Intervalos)] <- paste0(
  "[", round(Lis[length(Lis)], 2),
  " - ", round(Lss[length(Lss)], 2), "]"
)
# Simplified frequency table. The display names are given directly;
# check.names = FALSE keeps names such as "hi(%)" exactly as written.
TDF_toluenosimplificado <- data.frame(
  `Intervalo` = Intervalos,
  `MC` = round(MCs, 2),
  `ni` = nis,
  `hi(%)` = round(his, 2),
  `Ni_asc` = Nis_asc,
  `Hi_asc (%)` = round(His_asc, 2),
  `Ni_desc` = Nis_desc,
  `Hi_desc (%)` = round(His_desc, 2),
  check.names = FALSE
)
# Totals row with the same column names: sums for ni and hi, "-" elsewhere
totaless <- data.frame(
  `Intervalo` = "Totales",
  `MC` = "-",
  `ni` = sum(nis),
  `hi(%)` = sum(his),
  `Ni_asc` = "-",
  `Hi_asc (%)` = "-",
  `Ni_desc` = "-",
  `Hi_desc (%)` = "-",
  check.names = FALSE
)
# Append the totals row at the end of the table
TDF_toluenosimplificado <- rbind(TDF_toluenosimplificado, totaless)
# Table 2: render the simplified frequency distribution with gt.
TDF_toluenosimplificado %>%
  gt() %>%
  tab_header(
    title = md("*Tabla Nro. 2*"),
    # The closing ** must follow the text directly; with a space in front of
    # it Markdown does not treat it as bold and prints the asterisks.
    subtitle = md("**Distribucion de frecuencia simplificado de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020**")
  ) %>%
  # One source note per line: a plain "\n" inside md() is rendered as a
  # space, so author and source are added as two separate notes.
  tab_source_note(
    source_note = md("Autor: Grupo 2")
  ) %>%
  tab_source_note(
    source_note = md("Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india")
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    table.border.top.style = "solid",
    table.border.bottom.style = "solid",
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    row.striping.include_table_body = TRUE,
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black"
  )
| Tabla Nro. 2 | |||||||
| **Distribucion de frecuencia simplificado de concentración de Tolueno,estudio calidad del aire en India entre 2015-2020** | |||||||
| Intervalo | MC | ni | hi(%) | Ni_asc | Hi_asc (%) | Ni_desc | Hi_desc (%) |
|---|---|---|---|---|---|---|---|
| [0 - 50) | 0 | 21021 | 97.82 | 21021 | 97.82 | 21490 | 100 |
| [50 - 100) | 50 | 392 | 1.82 | 21413 | 99.64 | 469 | 2.18 |
| [100 - 150) | 100 | 33 | 0.15 | 21446 | 99.8 | 77 | 0.36 |
| [150 - 200) | 150 | 11 | 0.05 | 21457 | 99.85 | 44 | 0.2 |
| [200 - 250) | 200 | 6 | 0.03 | 21463 | 99.87 | 33 | 0.15 |
| [250 - 300) | 250 | 5 | 0.02 | 21468 | 99.9 | 27 | 0.13 |
| [300 - 350) | 300 | 0 | 0.00 | 21468 | 99.9 | 22 | 0.1 |
| [350 - 400) | 350 | 1 | 0.00 | 21469 | 99.9 | 22 | 0.1 |
| [400 - 450) | 400 | 19 | 0.09 | 21488 | 99.99 | 21 | 0.1 |
| [450 - 500] | 450 | 2 | 0.01 | 21490 | 100 | 2 | 0.01 |
| Totales | - | 21490 | 100.00 | - | - | - | - |
| Autor: Grupo 2 Fuente:https://www.kaggle.com/datasets/rohanrao/air-quality-data-in-india | |||||||
# PLOTS
# Local histogram: the y axis ends at the largest class count.
# The bars are drawn on the breaks stored in Histograma_tolueno, the same
# ones used for the axis ticks below. A single number such as `breaks = 11`
# is only a suggestion to hist(), so it cannot guarantee that bars and ticks
# line up.
hist(tolueno, breaks = Histograma_tolueno$breaks,
     main = "Gráfica N°1: Distribución de la Concentración de Tolueno
presente en el estudio sobre calidad del aire en India entre 2015-2020 ",
     xlab = " Tolueno (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, max(nis)),
     col = "purple",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")
# x axis drawn by hand with one tick per class limit
axis(1, at = Histograma_tolueno$breaks,
     labels = Histograma_tolueno$breaks, las = 1,
     cex.axis = 0.9)
# Global histogram: same bars, but the y axis spans the whole sample size
hist(tolueno, breaks = Histograma_tolueno$breaks,
     main = "Gráfica N°2:Distribución de la Concentración de Tolueno
presente en el estudio sobre calidad del aire en India entre 2015-2020",
     xlab = "Tolueno (µg/m3)",
     ylab = "Cantidad",
     ylim = c(0, length(tolueno)),
     col = "purple",
     cex.main = 1,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")
axis(1, at = Histograma_tolueno$breaks,
     labels = Histograma_tolueno$breaks, las = 1,
     cex.axis = 0.9)
# Global percentage histogram (y axis fixed to 0-100 %)
# Convert the existing `hi(%)` column in place. Writing to a differently
# spelled name such as `hi (%)` would append an extra column to the table
# instead of converting the one that is plotted.
TDF_toluenosimplificado$`hi(%)` <- as.numeric(TDF_toluenosimplificado$`hi(%)`)
# The last row holds the totals, so it is left out of the bars
post <- barplot(TDF_toluenosimplificado$`hi(%)`[1:(nrow(TDF_toluenosimplificado) - 1)],
                space = 0,
                col = "purple",
                main = "Gráfica N°3:Distribución de la Concentración de Tolueno, estudio
calidad del aire en India, 2015-2020",
                xlab = "Tolueno (µg/m3)",
                ylab = "Porcentaje (%)",
                names.arg = TDF_toluenosimplificado$MC[1:(nrow(TDF_toluenosimplificado) - 1)],
                ylim = c(0, 100), xaxt = "n"
)
# x axis drawn by hand: one tick per bar centre, labelled with the class mark
axis(side = 1,
     at = post,
     labels = TDF_toluenosimplificado$MC[1:(nrow(TDF_toluenosimplificado) - 1)],
     tck = -0.02)
# Local percentage histogram (y axis chosen by barplot)
# n = number of table rows; the last row is the totals row and is skipped
n <- as.numeric(nrow(TDF_toluenosimplificado))
pos <- barplot(
  TDF_toluenosimplificado$`hi(%)`[seq_len(n - 1)],
  names.arg = TDF_toluenosimplificado$MC[seq_len(n - 1)],
  col = "purple",
  space = 0,
  xlab = "Tolueno (µg/m3)",
  ylab = "Porcentaje (%)",
  main = "Gráfica No. 4:Distribución concentración de Tolueno en el estudio
calidad del aire en India, 2015-2020"
)
# Axis with one tick per bar centre, labelled with the class marks
axis(
  side = 1,
  at = pos,
  labels = TDF_toluenosimplificado$MC[seq_len(n - 1)],
  las = 1, # las = 1 writes the labels horizontally
  tck = -0.04 # tck sets the length of the tick marks
)
# Box plot
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
Cajatolueno <- boxplot(tolueno, horizontal = TRUE, col = "green", border = "black",
                       main = "Gráfica No. 5: Distribución de la concentración de Tolueno,
estudio calidad del aire en India desde 2015-2020",
                       xlab = "Tolueno (µg/m3)")
# Ogives of the absolute frequencies
# The axis limits cover both series. plot() would otherwise size the axes
# from the ascending series alone, and the descending ogive added afterwards
# with lines() would fall outside the plotting region.
# Ascending ogive (red), plotted against the upper class limits
plot(Lss, Nis_asc, type = "b", main = "Gráfica N°6:Ojiva ascendente y descendente de la
distribución local del concentración de Tolueno",
     xlab = "Tolueno (µg/m3)",
     ylab = "Cantidad", pch = 19, col = "red",
     xlim = range(Lis, Lss),
     ylim = range(Nis_asc, Nis_desc))
# Descending ogive (black), plotted against the lower class limits
lines(Lis, Nis_desc, type = "b", col = "black", pch = 19)
# Ogives of the percentage frequencies, with the same shared axis limits
# Ascending ogive (blue)
plot(Lss, His_asc,
     type = "b",
     main = " Gráfica N°7:Ojiva ascendete y descendete de la distribución
de la concentración de Tolueno",
     xlab = "Tolueno(µg/m3)",
     ylab = "Porcentaje %",
     col = "blue",
     pch = 19,
     xlim = range(Lis, Lss),
     ylim = range(His_asc, His_desc))
# Descending ogive (orange)
lines(Lis, His_desc,
      type = "b",
      col = "orange",
      pch = 19)