VARIABLE CUANTITATIVA CONTINUA

#UNIVERSIDAD CENTRAL DEL ECUADOR #ANALISIS ESTADÍSTICO SOBRE LA CALIDAD DE AIRE EN LA INDIA #FECHA: 05/12/2025

#Estadística Descriptiva
#Variable Continua LONGITUD
#Llumitasig Daniela 


#Cargar librerias
library(gt)
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(e1071)

#Cargar datos 
# Read the air-quality data set from the working directory.
city_day_2_ <- read.csv(file = "city_day (2).csv")
# NOTE(review): second copy read from a machine-specific path; `datos` is not
# used in the visible part of this script -- confirm it is still needed.
datos <- read.csv(file = "~/Documentos R/TABLAS A/city_day (2).csv")


# Keep the longitude values that are present and non-negative.
lon_cruda <- city_day_2_$Longitud
Longitud <- lon_cruda[!is.na(lon_cruda) & lon_cruda >= 0]

# Number of observations kept
length(Longitud)

## [1] 29531

# Calcular el mínimo y máximo para la longitud
# Extremes of the longitude sample, obtained in a single pass
limites_lon <- range(Longitud)
min_lon <- limites_lon[1]
max_lon <- limites_lon[2]

min_lon

## [1] 73.0104

max_lon

## [1] 91.69974

# Range of the data (maximum minus minimum)
R <- diff(limites_lon)

# Number of classes: 1 + 3.33 * log10(n), truncated to an integer
n_obs <- length(Longitud)
K <- floor(1 + 3.33 * log10(n_obs))
K

## [1] 15

#Calcular amplitud 
# Class width
A <- R / K

# Class edges rounded to 2 decimals. They are built once, so every upper limit
# is exactly the next lower limit and there are always K classes.
bordes <- round(min_lon + (0:K) * A, 2)
# Plain rounding could move the first edge above the minimum or the last edge
# below the maximum, silently dropping the extreme observations from the
# counts. Push the outer edges outwards to the enclosing hundredth instead.
bordes[1] <- floor(min_lon * 100) / 100
bordes[K + 1] <- ceiling(max_lon * 100) / 100

# Lower limits
Li <- bordes[1:K]
# Upper limits
Ls <- bordes[2:(K + 1)]
# Class marks (midpoint of each class)
Mc <- (Li + Ls) / 2
Mc

##  [1] 73.635 74.880 76.125 77.370 78.615 79.865 81.110 82.355 83.600 84.845
## [11] 86.095 87.340 88.585 89.830 91.075

# Vector vacío para guardar las frecuencias de cada clase
# Absolute frequency of each class. Classes are left-closed [Li, Ls), except
# the last one, which also includes its upper limit.
ni <- vapply(
  seq_len(K),
  function(i) {
    bajo_techo <- if (i < K) Longitud < Ls[i] else Longitud <= Ls[i]
    sum(Longitud >= Li[i] & bajo_techo, na.rm = TRUE)
  },
  integer(1)
)

# Cumulate from the last class backwards (descending cumulative frequency)
acumular_desc <- function(v) rev(cumsum(rev(v)))

# Total, relative (%) and cumulative frequencies
N <- sum(ni)
hi <- ni / N * 100
Ni_asc <- cumsum(ni)
Hi_asc <- cumsum(hi)
Ni_desc <- acumular_desc(ni)
Hi_desc <- acumular_desc(hi)

# Interval labels: ")" closes every class but the last, which gets "]"
n_clases <- length(Li)
cierre <- rep(c(")", "]"), times = c(n_clases - 1, 1))
Intervalo <- paste0("[", round(Li, 2), " - ", round(Ls, 2), cierre)

# Totals row for the longitude table
totales <- data.frame(
  Intervalo = "Totales",
  Mc = "-",
  ni = sum(ni),
  hi = round(sum(hi), 2),
  Ni_asc = "-",
  Ni_desc = "-",
  Hi_asc = "-",
  Hi_desc = "-"
)

# Frequency distribution table with the totals row appended
TDF_Longitud <- rbind(
  data.frame(
    Intervalo = Intervalo,
    Mc = round(Mc, 2),
    ni = ni,
    hi = round(hi, 2),
    Ni_asc = Ni_asc,
    Ni_desc = Ni_desc,
    Hi_asc = round(Hi_asc, 2),
    Hi_desc = round(Hi_desc, 2)
  ),
  totales
)



# Tabla 1

# Render Table 1 with gt, one step at a time: header, source note, then the
# black/gray border styling. The last line prints the finished table.
tabla_1 <- gt(TDF_Longitud)
tabla_1 <- tab_header(
  tabla_1,
  title = md("*Tabla Nro. 1*"),
  subtitle = md("*Distribución de frecuencias de la longitud en el estudio de calidad del aire en la India*")
)
tabla_1 <- tab_source_note(
  tabla_1,
  source_note = md("Fuente:Datos procesados por el autor a partir del archivo *city_day_(2).csv*")
)
tabla_1 <- tab_options(
  tabla_1,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  table.border.top.style = "solid",
  table.border.bottom.style = "solid",
  column_labels.border.top.color = "black",
  column_labels.border.bottom.color = "black",
  column_labels.border.bottom.width = px(2),
  row.striping.include_table_body = TRUE,
  heading.border.bottom.color = "black",
  heading.border.bottom.width = px(2),
  table_body.hlines.color = "gray",
  table_body.border.bottom.color = "black"
)
tabla_1

Intervalo	Mc	ni	hi	Ni_asc	Ni_desc	Hi_asc	Hi_desc
Tabla Nro. 1
Distribución de frecuencias de la longitud en el estudio de calidad del aire en la India
[73.01 - 74.26)	73.64	2009	6.80	2009	29531	6.8	100
[74.26 - 75.5)	74.88	2025	6.86	4034	27522	13.66	93.2
[75.5 - 76.75)	76.12	2390	8.09	6424	25497	21.75	86.34
[76.75 - 77.99)	77.37	12492	42.30	18916	23107	64.05	78.25
[77.99 - 79.24)	78.61	2298	7.78	21214	10615	71.84	35.95
[79.24 - 80.49)	79.86	1112	3.77	22326	8317	75.6	28.16
[80.49 - 81.73)	81.11	2009	6.80	24335	7205	82.4	24.4
[81.73 - 82.98)	82.36	951	3.22	25286	5196	85.63	17.6
[82.98 - 84.22)	83.6	1772	6.00	27058	4245	91.63	14.37
[84.22 - 85.47)	84.84	1971	6.67	29029	2473	98.3	8.37
[85.47 - 86.72)	86.1	0	0.00	29029	502	98.3	1.7
[86.72 - 87.96)	87.34	0	0.00	29029	502	98.3	1.7
[87.96 - 89.21)	88.58	0	0.00	29029	502	98.3	1.7
[89.21 - 90.45)	89.83	0	0.00	29029	502	98.3	1.7
[90.45 - 91.7]	91.08	502	1.70	29531	502	100	1.7
Totales	-	29531	100.00	-	-	-	-
Fuente:Datos procesados por el autor a partir del archivo city_day_(2).csv

# PROCESO DE SIMPLIFICACIÓN PARA la longitud

#Histograma longitud
# Simplified frequency table built on the class breaks chosen by hist().
# right = FALSE makes hist() count left-closed classes [a, b), with the last
# class closed on both sides, which is what the interval labels below state.
# (The default counts right-closed classes and contradicts the labels.)
histo_longitud <- hist(Longitud, right = FALSE, plot = FALSE)

# Lower/upper limits taken from the breaks, however many there are
cortes <- histo_longitud$breaks
Lis <- cortes[-length(cortes)]
Lss <- cortes[-1]
# Class mark = midpoint between the lower and the upper limit
MCs <- (Lis + Lss) / 2
# Absolute, relative (%) and cumulative frequencies
nis <- histo_longitud$counts
his <- (nis / sum(nis)) * 100
Nis_asc <- cumsum(nis)
His_asc <- cumsum(his)
Nis_desc <- rev(cumsum(rev(nis)))
His_desc <- rev(cumsum(rev(his)))

# Interval labels; the last class is closed on the right
Intervalos <- paste0("[", round(Lis, 2), " - ", round(Lss, 2), ")")
Intervalos[length(Intervalos)] <- paste0(
  "[", round(Lis[length(Lis)], 2), " - ", round(Lss[length(Lss)], 2), "]"
)

# Display names shared by the body of the table and its totals row
nombres_columnas <- c(
  "Intervalo",
  "MC",
  "ni",
  "hi(%)",
  "Ni_asc",
  "Hi_asc (%)",
  "Ni_desc",
  "Hi_desc (%)"
)

TDF_lonsimplificado <- data.frame(
  Intervalo = Intervalos,
  MC = round(MCs, 2),
  ni = nis,
  hi = round(his, 2),
  Ni_ascendente = Nis_asc,
  Hi_ascendente = round(His_asc, 2),
  Ni_descendente = Nis_desc,
  Hi_descendente = round(His_desc, 2)
)
colnames(TDF_lonsimplificado) <- nombres_columnas

# Totals row, with its columns in the same order as the table body so the
# shared display names land on the right columns
totaless <- data.frame(
  Intervalo = "Totales",
  MC = "-",
  ni = sum(nis),
  hi = round(sum(his), 2),
  Ni_ascendente = "-",
  Hi_ascendente = "-",
  Ni_descendente = "-",
  Hi_descendente = "-"
)
colnames(totaless) <- nombres_columnas

# Append the totals row at the end of the table
TDF_lonsimplificado <- rbind(TDF_lonsimplificado, totaless)


#Tabla 2 

# Render Table 2 with gt, one step at a time: header, source note, then the
# black/gray border styling. The last line prints the finished table.
tabla_2 <- gt(TDF_lonsimplificado)
tabla_2 <- tab_header(
  tabla_2,
  title = md("*Tabla Nro. 2*"),
  subtitle = md("*Distribución de la variable Longitud registrada
                  en el estudio de calidad del aire en India*")
)
tabla_2 <- tab_source_note(
  tabla_2,
  source_note = md("Fuente: Datos procesados por el autor a partir del archivo *city_day (2)*")
)
tabla_2 <- tab_options(
  tabla_2,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  table.border.top.style = "solid",
  table.border.bottom.style = "solid",
  column_labels.border.top.color = "black",
  column_labels.border.bottom.color = "black",
  column_labels.border.bottom.width = px(2),
  row.striping.include_table_body = TRUE,
  heading.border.bottom.color = "black",
  heading.border.bottom.width = px(2),
  table_body.hlines.color = "gray",
  table_body.border.bottom.color = "black"
)
tabla_2

Intervalo	MC	ni	hi(%)	Ni_asc	Hi_asc (%)	Ni_desc	Hi_desc (%)
Tabla Nro. 2
Distribución de la variable Longitud registrada en el estudio de calidad del aire en India
[73 - 74)	73	2009	6.80	2009	6.8	29531	100
[74 - 75)	74	1863	6.31	3872	13.11	27522	93.2
[75 - 76)	75	2552	8.64	6424	21.75	25659	86.89
[76 - 77)	76	1276	4.32	7700	26.07	23107	78.25
[77 - 78)	77	11216	37.98	18916	64.05	21831	73.93
[78 - 79)	78	289	0.98	19205	65.03	10615	35.95
[79 - 80)	79	3121	10.57	22326	75.6	10326	34.97
[80 - 81)	80	0	0.00	22326	75.6	7205	24.4
[81 - 82)	81	2960	10.02	25286	85.63	7205	24.4
[82 - 83)	82	0	0.00	25286	85.63	4245	14.37
[83 - 84)	83	1772	6.00	27058	91.63	4245	14.37
[84 - 85)	84	0	0.00	27058	91.63	2473	8.37
[85 - 86)	85	1971	6.67	29029	98.3	2473	8.37
[86 - 87)	86	0	0.00	29029	98.3	502	1.7
[87 - 88)	87	0	0.00	29029	98.3	502	1.7
[88 - 89)	88	0	0.00	29029	98.3	502	1.7
[89 - 90)	89	0	0.00	29029	98.3	502	1.7
[90 - 91)	90	0	0.00	29029	98.3	502	1.7
[91 - 92]	91	502	1.70	29531	100	502	1.7
Totales	-	29531	100.00	-	-	-	-
Fuente: Datos procesados por el autor a partir del archivo city_day (2)

#GRAFICAS

#Histogramas local

# Absolute-frequency histograms. Both reuse histo_longitud$breaks, so the bars
# line up with the axis ticks drawn below and with the limit on the y axis.
# (A bare `breaks = 20` is only a suggestion and need not give these breaks.)

# Local histogram: y axis scaled to the most frequent class
hist(Longitud, breaks = histo_longitud$breaks,
     main = "Gráfica N°1: Distribución de la variable Longitud
     registrada en el estudio de calidad del aire en India",
     xlab = " Longitud",
     ylab = "Cantidad",
     ylim = c(0, max(nis)),
     col = "pink",
     cex.main = 0.9,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")
# x axis drawn by hand with one tick per class break
axis(1, at = histo_longitud$breaks,
     labels = histo_longitud$breaks, las = 1,
     cex.axis = 0.9)

# Global histogram: y axis scaled to the total number of observations
hist(Longitud, breaks = histo_longitud$breaks,
     main = "Gráfica N°2:Distribución de la variable Longitud
     registrada en el estudio de calidad del aire en India",
     xlab = "Longitud",
     ylab = "Cantidad",
     ylim = c(0, length(Longitud)),
     col = "pink",
     cex.main = 1,
     cex.lab = 1,
     cex.axis = 0.9,
     xaxt = "n")
axis(1, at = histo_longitud$breaks,
     labels = histo_longitud$breaks, las = 1,
     cex.axis = 0.9)

#Histograma porcentual local

# hi(%) local desde tu tabla simplificada (sin la fila Totales)
# Class percentages from the simplified table, dropping the totals row
hi_loc <- head(TDF_lonsimplificado$`hi(%)`, -1)

# Same class breaks as the simplified table
breaks_longitud_simplificado <- c(Lis[1], Lss)
histo_longitud_simplificado <- hist(
  Longitud,
  breaks = breaks_longitud_simplificado,
  plot = FALSE
)

# Empty canvas with a percentage y axis, no box and no automatic x axis
plot(NA,
     xlim = range(breaks_longitud_simplificado),
     ylim = c(0, max(hi_loc) * 1.1),
     xlab = "Longitud",
     ylab = "Porcentaje (%)",
     main = "Gráfica N°3: Histograma porcentual local de longitud",
     xaxt = "n",
     bty = "n")

# One bar per class, drawn in a single vectorised rect() call:
# bar k spans break k to break k + 1 and is hi_loc[k] high
barras <- seq_along(hi_loc)
rect(xleft   = breaks_longitud_simplificado[barras],
     ybottom = 0,
     xright  = breaks_longitud_simplificado[barras + 1],
     ytop    = hi_loc,
     col     = "pink",
     border  = "black")

# x axis labelled at every second break
idx <- seq(1, length(histo_longitud_simplificado$breaks), by = 2)
axis(1,
     at     = histo_longitud_simplificado$breaks[idx],
     labels = round(histo_longitud_simplificado$breaks[idx], 0),
     las    = 1,
     cex.axis = 0.9)

# Global percentage histogram of Longitud (y axis fixed at 0-100 %)

# Histogram computed without plotting, to obtain breaks and counts
histo_longitud_global <- hist(Longitud, breaks = 20, plot = FALSE)

# Absolute frequencies and their share of the total, in percent
ni_global <- histo_longitud_global$counts
hi_global <- prop.table(ni_global) * 100

# Empty canvas: y axis from 0 to 100 %, no box, no automatic x axis
plot(NA,
     xlim = range(histo_longitud_global$breaks),
     ylim = c(0, 100),
     xlab = "Longitud",
     ylab = "Porcentaje (%)",
     main = "Gráfica N°4: Histograma porcentual global de Longitud",
     xaxt = "n",
     bty = "n"
)

# One bar per class, drawn in a single vectorised rect() call:
# bar k spans break k to break k + 1 and is hi_global[k] high
barras_global <- seq_along(hi_global)
rect(
  xleft   = histo_longitud_global$breaks[barras_global],
  ybottom = 0,
  xright  = histo_longitud_global$breaks[barras_global + 1],
  ytop    = hi_global,
  col     = "pink",
  border  = "black"
)

# x axis labelled at every second break
idx <- seq(1, length(histo_longitud_global$breaks), by = 2)

axis(1,
     at     = histo_longitud_global$breaks[idx],
     labels = round(histo_longitud_global$breaks[idx], 0),
     las    = 1,
     cex.axis = 0.9)

#Diagrama de caja
# Horizontal box plot of Longitud; the statistics returned by boxplot() are
# kept in Cajalong. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
Cajalong <- boxplot(Longitud, horizontal = TRUE, col = "red", border = "black",
                 main= "Gráfica N°5: Distribución de la variable Longitud
     registrada en el estudio de calidad del aire en India",
                 xlab="Longitud")

#Ojivas locales

# Usar un único eje X para ambas ojivas
# Ogives in absolute frequencies.
# The ascending cumulative count of class i covers the data up to the upper
# limit Lss[i], so it is plotted there. The descending count of class i covers
# the data from the lower limit Lis[i] onwards, so it is plotted at Lis[i].
# (Plotting both at Lss shifted the descending curve one class to the right.)
x <- Lss
x_desc <- Lis

# Ascending ogive; xlim leaves room for the first lower limit
plot(
  x, Nis_asc,
  type = "b",
  col = "orange",
  pch = 19,
  main = "Gráfica N°6:Distribución de la variable Longitud
     registrada en el estudio de calidad del aire en India",
  xlab = "Longitud",
  ylab = "Cantidad",
  xlim = range(c(x_desc, x)),
  ylim = c(0, max(c(Nis_asc, Nis_desc)))
)

# Descending ogive
lines(
  x_desc, Nis_desc,
  type = "b",
  col = "blue",
  pch = 19
)

# Ogives in percentages, with the same x positions as above

# Ascending percentage ogive
plot(
  x, His_asc,
  type = "b",
  main ="Gráfica N°7:Distribución de la variable Longitud
     registrada en el estudio de calidad del aire en India",
  xlab = "Longitud",
  ylab = "Porcentaje (%)",
  col = "orange",
  pch = 19,
  xlim = range(c(x_desc, x)),
  ylim = c(0, 100)
)

# Descending percentage ogive
lines(
  x_desc, His_desc,
  type = "b",
  col = "blue",
  pch = 19
)

#INDICADORES DE POSICION 
# Arithmetic mean
X <- sum(Longitud) / length(Longitud)
X

## [1] 78.44855

# Median
Me <- median(Longitud)
Me

## [1] 77.3796

# Mode: the modal class of the simplified table, i.e. the class with the
# highest absolute frequency. It is derived from the frequencies instead of
# being typed by hand, so it follows the data if they change.
clase_modal <- which.max(nis)
Mo <- paste0("[", Lis[clase_modal], ",", Lss[clase_modal], "]")
Mo

## [1] "[77,78]"

# In the recorded run the modal class is class number 5, [77,78], whose
# frequency (11216) is the highest in the simplified table.

#INDICADORES DE DISPERSION 
# Variance
# NOTE: the results are stored as `var` and `sd`, the same names as the stats
# functions. Calls such as var(x) still work because R skips non-function
# bindings when it looks up a function. The names are kept because later code
# reads `sd`.
var <- stats::var(Longitud)
var

## [1] 12.37411

# Standard deviation: square root of the variance computed above
sd <- sqrt(var)
sd

## [1] 3.517685

# Coefficient of variation (%)
CV <- sd / X * 100
CV

## [1] 4.484067

#INDICADORES DE FORMA

# Skewness coefficient
# e1071 provides skewness() and kurtosis(); it is attached once here.
# The package is no longer installed from inside the analysis script: that
# needs network access on every run and only produced a "package is in use"
# warning. Install it once, outside the script, if it is missing.
library(e1071)

As <- skewness(Longitud)
As

## [1] 1.211761

# Kurtosis
Cu <- kurtosis(Longitud)
Cu

## [1] 2.076345

#outliers

# Box-plot statistics computed without drawing anything
cajaBigotes <- boxplot(Longitud, plot = FALSE)

# Values reported by boxplot() in its `out` component
outliers <- cajaBigotes[["out"]]

# Smallest outlier
min(outliers)

## [1] 83.9153

# Largest outlier
max(outliers)

## [1] 91.69974

# Number of outliers
length(outliers)

## [1] 2783

# Name of the variable described by the summary row
Variable <- "Longitud"

# Observed range [min,max], computed from the data. The previous hard-coded
# "[0,18.68]" did not match this variable (min 73.01, max 91.7).
Rango <- paste0(
  "[", round(min(Longitud), 2), ",", round(max(Longitud), 2), "]"
)

# One-row summary with the position, dispersion and shape indicators
Tabla_indicadores <- data.frame(
  Variable = Variable,
  Rango    = Rango,
  X   = round(X, 3),
  Me   = Me,
  Mo       = Mo,
  sd   = round(sd, 2),
  CV   = round(CV, 2),
  As   = round(As, 2),
  Cu   = round(Cu, 2)
)

#Tabla indicadores

library(gt)
library(dplyr)
# Render Table 3 with gt, one step at a time: header, source note, then the
# black/gray border styling. The last line prints the finished table.
tabla_3 <- gt(Tabla_indicadores)
tabla_3 <- tab_header(
  tabla_3,
  title = md("Tabla Nro. 3"),
  subtitle = md("*Indicadores Estadísticos de la variable Longitud*")
)
tabla_3 <- tab_source_note(
  tabla_3,
  source_note = md("Fuente: Datos procesados por el autor a partir del archivo *city_day (2)*")
)
tabla_3 <- tab_options(
  tabla_3,
  table.border.top.color = "black",
  table.border.bottom.color = "black",
  table.border.top.style = "solid",
  table.border.bottom.style = "solid",
  column_labels.border.top.color = "black",
  column_labels.border.bottom.color = "black",
  column_labels.border.bottom.width = px(2),
  row.striping.include_table_body = TRUE,
  heading.border.bottom.color = "black",
  heading.border.bottom.width = px(2),
  table_body.hlines.color = "gray",
  table_body.border.bottom.color = "black"
)
tabla_3

Variable	Rango	X	Me	Mo	sd	CV	As	Cu
Tabla Nro. 3
Indicadores Estadísticos de la variable Longitud
Longitud	[0,18.68]	78.449	77.3796	[77,78]	3.52	4.48	1.21	2.08
Fuente: Datos procesados por el autor a partir del archivo city_day (2)