Continua_Produccion-de-gas.R

#==============================ENCABEZADO===================================
# TEMA: EI Variables Continua - TOTAL GAS PRODUCTION BY 2023
# AUTOR: GRUPO 4
# FECHA: 08-02-2026

# PASO 0: CARGA Y LIMPIEZA DE DATOS
setwd("C:/Users/HP/Documents/PROYECTO ESTADISTICA/RStudio")
datos <- read.csv("tablap.csv", header = TRUE, dec = ".", sep = ";") 

cat("\n>>> PASO 0: PREPARACION DE DATOS <<<\n")

## 
## >>> PASO 0: PREPARACION DE DATOS <<<

prod_raw <- as.numeric(gsub("[^0-9.]", "", as.character(datos$Total.gas.production.by.2023)))
prod_raw <- na.omit(prod_raw)

# PASO 1: JUSTIFICACION Y DOMINIO
cat("1. JUSTIFICACION: Variable continua (Produccion acumulada). Dominio: [0, +inf).\n")

## 1. JUSTIFICACION: Variable continua (Produccion acumulada). Dominio: [0, +inf).

# PASO 2: GRAFICA NRO. 1 (EVIDENCIA ORIGINAL)
hist(prod_raw, col = "green", 
     main = "GRAFICA Nro. 1: MODELO DE PROBABILIDAD DE TOTAL GAS PRODUCTION BY 2023",
     xlab = "TOTAL GAS PRODUCTION BY 2023", ylab = "DENSIDAD DE PROBABILIDAD", 
     freq = FALSE, las = 1)

# PASO 3: FILTRADO (SUBSET)
caja_p <- boxplot(prod_raw, plot = FALSE)
prod_v <- prod_raw[prod_raw >= caja_p$stats[1] & prod_raw <= caja_p$stats[5]]

# PASO 4: CONFIGURACION DE RANGOS (REGLA DE STURGES)
n <- length(prod_v)
k <- round(1 + 3.322 * log10(n)) 
rango_total <- range(prod_v)
amplitud <- diff(rango_total) / k
cortes <- seq(rango_total[1], rango_total[1] + (k * amplitud), by = amplitud)

# PASO 5: TABLA TECNICA DE 9 COLUMNAS
h_p <- hist(prod_v, breaks = cortes, plot = FALSE)
li <- h_p$breaks[-length(h_p$breaks)]; ls <- h_p$breaks[-1]
xi <- (li + ls) / 2
ni <- h_p$counts
hi <- ni / n 
Ni <- cumsum(ni); Hi <- cumsum(hi)
Ni_desc <- n - Ni + ni; Hi_desc <- 1 - Hi + hi

TDF_9_COL <- data.frame(LI=li, LS=ls, Xi=xi, ni=ni, hi=round(hi,4), 
                        Ni=Ni, Hi=round(Hi,4), Ni_desc=Ni_desc, Hi_desc=round(Hi_desc,4))
cat("\n>>> PASO 5: TABLA DE FRECUENCIAS (9 COLUMNAS) <<<\n")

## 
## >>> PASO 5: TABLA DE FRECUENCIAS (9 COLUMNAS) <<<

print(TDF_9_COL)

##           LI        LS        Xi   ni     hi    Ni     Hi Ni_desc Hi_desc
## 1    25946.0  316940.1  171443.1 1869 0.1628  1869 0.1628   11480  1.0000
## 2   316940.1  607934.3  462437.2 2004 0.1746  3873 0.3374    9611  0.8372
## 3   607934.3  898928.4  753431.4 1856 0.1617  5729 0.4990    7607  0.6626
## 4   898928.4 1189922.6 1044425.5 1476 0.1286  7205 0.6276    5751  0.5010
## 5  1189922.6 1480916.7 1335419.6 1101 0.0959  8306 0.7235    4275  0.3724
## 6  1480916.7 1771910.9 1626413.8  776 0.0676  9082 0.7911    3174  0.2765
## 7  1771910.9 2062905.0 1917407.9  573 0.0499  9655 0.8410    2398  0.2089
## 8  2062905.0 2353899.1 2208402.1  430 0.0375 10085 0.8785    1825  0.1590
## 9  2353899.1 2644893.3 2499396.2  371 0.0323 10456 0.9108    1395  0.1215
## 10 2644893.3 2935887.4 2790390.4  283 0.0247 10739 0.9355    1024  0.0892
## 11 2935887.4 3226881.6 3081384.5  240 0.0209 10979 0.9564     741  0.0645
## 12 3226881.6 3517875.7 3372378.6  194 0.0169 11173 0.9733     501  0.0436
## 13 3517875.7 3808869.9 3663372.8  156 0.0136 11329 0.9868     307  0.0267
## 14 3808869.9 4099864.0 3954366.9  151 0.0132 11480 1.0000     151  0.0132

# PASO 6: ESTIMACION DE PARAMETROS (LOG-NORMAL)
log_prod <- log(prod_v[prod_v > 0]) 
media_log <- mean(log_prod)
sd_log <- sd(log_prod)

# PASO 7: HISTOGRAMA DE DENSIDAD Y MODELO
# GRAFICA Nro. 2: MODELO DE PROBABILIDAD
hist(prod_v, freq = FALSE, breaks = cortes, col = "yellow",
     main = "GRAFICA Nro. 2: MODELO DE PROBABILIDAD DE TOTAL GAS PRODUCTION BY 2023",
     xlab = "TOTAL GAS PRODUCTION BY 2023", ylab = "DENSIDAD DE PROBABILIDAD", las = 1)
curve(dlnorm(x, meanlog = media_log, sdlog = sd_log), col = "red", lwd = 2, add = TRUE)

# PASO 7.5: RESUMEN DE PARAMETROS
media_original <- exp(media_log + (sd_log^2)/2)

cat("\n>>> PASO 7.5: RESUMEN DE PARAMETROS <<<\n")

## 
## >>> PASO 7.5: RESUMEN DE PARAMETROS <<<

resumen_inf <- data.frame(
  Indicador = c("Media Logaritmica (mu)", "Desv. Est. Logaritmica (sigma)", 
                "Media Escala Original"),
  Valor = c(round(media_log, 4), round(sd_log, 4), round(media_original, 2))
)
print(resumen_inf, row.names = FALSE)

##                       Indicador        Valor
##          Media Logaritmica (mu)      13.5929
##  Desv. Est. Logaritmica (sigma)       0.9454
##           Media Escala Original 1251456.7200

# PASO 8: VALIDACION (ESTRATEGIA hi)
FO_rel <- hi 
Pi <- sapply(1:length(li), function(i) {
  plnorm(ls[i], media_log, sd_log) - plnorm(li[i], media_log, sd_log)
})
FE_rel <- Pi 
FE_rel[FE_rel < 0.000001] <- 0.000001 

r_pearson <- cor(FO_rel, FE_rel)
x2_valor <- sum((FO_rel - FE_rel)^2 / FE_rel)
df_libertad <- length(FO_rel) - 1 - 2 
vc_chi <- qchisq(0.999, df_libertad)

# GRAFICA Nro. 3: COMPARACION DEL MODELO DE POISON CON LA REALIDAD
plot(FO_rel, FE_rel, 
     main = "GRAFICA Nro. 3: COMPARACION DEL MODELO DE POISON CON LA REALIDAD",
     xlab = "TGP (OBSERVADO)", 
     ylab = "TGP (ESPERADO)", 
     pch = 19, col = "blue")
abline(0, 1, col = "red")

cat("\n>>> PASO 8: TABLA VEREDICTO FINAL (BASADO EN hi) <<<\n")

## 
## >>> PASO 8: TABLA VEREDICTO FINAL (BASADO EN hi) <<<

tabla_veredicto <- data.frame(
  Prueba = c("Coeficiente de Pearson (r)", "Prueba Chi-Cuadrado (X2)"),
  Valor_Obtenido = c(round(r_pearson, 4), round(x2_valor, 6)),
  Criterio_Referencia = c("r >= 0.90", paste("X2 <", round(vc_chi, 2))),
  Resultado = c(ifelse(r_pearson >= 0.90, "APROBADO", "REPROBADO"), 
                ifelse(x2_valor < vc_chi, "APROBADO", "REPROBADO"))
)
print(tabla_veredicto, row.names = FALSE)

##                      Prueba Valor_Obtenido Criterio_Referencia Resultado
##  Coeficiente de Pearson (r)       0.982000           r >= 0.90  APROBADO
##    Prueba Chi-Cuadrado (X2)       0.036757          X2 < 31.26  APROBADO

Continua_Produccion-de-gas.R

HP

2026-02-09