ANÁLISIS ESTADÍSTICO

1. CARGA DE LIBRERÍAS Y DATOS

#==============================ENCABEZADO================================
# TEMA: MODELOS PROBABILISTICOS- PRODUCCION TOTAL DE GAS
# AUTOR: GRUPO 3
# FECHA: 03-2026
#========================================================================
library(dplyr)
library(knitr)
library(gt)

setwd("C:/Users/HP/Documents/PROYECTO ESTADISTICA/RStudio")
datos <- read.csv("tablap.csv", header = TRUE, dec = ",", sep = ";")

2. TABLA DE DISTRIBUCIÓN DE CANTIDAD

options(scipen = 999) #permmite trabajar sin notacion cientifica
produccion_gas <- as.numeric(datos$Total.gas.production.by.2023)
produccion_gas <- na.omit(produccion_gas)
produccion_gas <- produccion_gas[produccion_gas > 0] 

caja <- boxplot(produccion_gas, plot = FALSE)
limite_sup <- caja$stats[5]
limite_inf <- caja$stats[1]

gas_outliers <- produccion_gas[produccion_gas < limite_inf | produccion_gas > limite_sup]
gas_sin_outliers <- produccion_gas[produccion_gas >= limite_inf & produccion_gas <= limite_sup]

cat("Cantidad con outliers:", length(produccion_gas), "\n")

## Cantidad con outliers: 12561

cat("Cantidad sin outliers:", length(gas_sin_outliers), "\n")

## Cantidad sin outliers: 11480

histograma_gas <- hist(gas_sin_outliers, plot = FALSE)
ni_gas <- histograma_gas$counts
hi_gas <- ni_gas / sum(ni_gas) * 100

intervalos_gas <- paste0("[", round(histograma_gas$breaks[-length(histograma_gas$breaks)], 2),
                         ", ", round(histograma_gas$breaks[-1], 2), ")")

tabla_base <- data.frame(Intervalo = intervalos_gas, ni = ni_gas, hi = round(hi_gas, 2))
fila_total <- data.frame(Intervalo = "TOTAL", ni = sum(tabla_base$ni), hi = round(sum(tabla_base$hi)))
tabla_final_c <- rbind(tabla_base, fila_total)

tabla_final_c %>% 
  gt() %>%
  cols_label(Intervalo = "Intervalo", ni = "ni", hi = "hi (%)") %>%
  tab_header(title = md("Tabla N°1. Distribución de frecuencias de producción de gas en los pozos de gas natural en Nuevo Mexico")) %>%
  tab_style(
    style = list(cell_fill(color = "lightgray"), cell_text(weight = "bold")),
    locations = cells_body(rows = Intervalo == "TOTAL")
  ) %>%
  cols_align(align = "center", columns = c(ni, hi)) %>%
  tab_options(table.width = pct(80), heading.title.font.size = px(20), column_labels.font.weight = "bold")%>%
  
# PIE DE PAGINA 
  tab_source_note(
    source_note = md("Las unidades de Produccion de Gas se expresan en pies cubicos")
  )

Intervalo	ni	hi (%)
Tabla N°1. Distribución de frecuencias de producción de gas en los pozos de gas natural en Nuevo Mexico
[0, 200000)	1031	8.98
[200000, 400000)	1445	12.59
[400000, 600000)	1349	11.75
[600000, 800000)	1280	11.15
[800000, 1000000)	1163	10.13
[1000000, 1200000)	986	8.59
[1200000, 1400000)	805	7.01
[1400000, 1600000)	595	5.18
[1600000, 1800000)	491	4.28
[1800000, 2000000)	396	3.45
[2000000, 2200000)	334	2.91
[2200000, 2400000)	263	2.29
[2400000, 2600000)	260	2.26
[2600000, 2800000)	234	2.04
[2800000, 3000000)	159	1.39
[3000000, 3200000)	168	1.46
[3200000, 3400000)	139	1.21
[3400000, 3600000)	122	1.06
[3600000, 3800000)	106	0.92
[3800000, 4000000)	107	0.93
[4000000, 4200000)	47	0.41
TOTAL	11480	100.00
Las unidades de Produccion de Gas se expresan en pies cubicos

3. GRÁFICA DE DISTRIBUCIÓN DE CANTIDAD

hist(gas_sin_outliers, freq = TRUE,
     main = "Grafica 1. Distribucion de cantidad de la produccion 
     de gas en pozos de gas natural en Nuevo Mexico",
     xlab = "Produccion de Gas", ylab = "Cantidad", col = "indianred1")

4. CONJETURA DEL MODELO

Debido a la similitud de las barras asociamos con el modelo de probabilidad log-normal

histograma <- hist(gas_sin_outliers, freq = FALSE,
                   main = "Grafica Nº2. Comparacion de la realidad con el modelo
                   de la produccion de gas de los pozos de gas natural de
                   Nuevo Mexico",
                   xlab = "Produccion de Gas", ylab = "Densidad de probabilidad", col = "indianred1")

medialog <- mean(log(gas_sin_outliers)) 
medialog

## [1] 13.59292

sd_log <- sd(log(gas_sin_outliers)) 
sd_log

## [1] 0.9454061

x_seq <- seq(min(gas_sin_outliers), max(gas_sin_outliers), length.out = 1000) 
curve(dlnorm(x, meanlog = medialog, sdlog = sd_log), add = TRUE, col = "black", lwd = 3)

5. TESTS DE APROBACIÓN

#----------------
#TEST DE PEARSON
#----------------
n<-length(gas_sin_outliers)
n

## [1] 11480

Fo <- histograma$counts

P <- c(0) 
for (i in 1:length(Fo)) {
  P[i] <- plnorm(histograma$breaks[i+1], medialog, sd_log) - plnorm(histograma$breaks[i], medialog, sd_log)
}
Fe <- P * length(gas_sin_outliers)

Fo_perc <- (Fo / length(gas_sin_outliers)) * 100
Fo_perc

##  [1]  8.9808362 12.5871080 11.7508711 11.1498258 10.1306620  8.5888502
##  [7]  7.0121951  5.1829268  4.2770035  3.4494774  2.9094077  2.2909408
## [13]  2.2648084  2.0383275  1.3850174  1.4634146  1.2108014  1.0627178
## [19]  0.9233449  0.9320557  0.4094077

Fe_perc <- (Fe / length(gas_sin_outliers)) * 100
Fe_perc

##  [1]  7.1196800 16.0350251 14.8680880 11.9537686  9.3301459  7.2713574
##  [7]  5.7074415  4.5239597  3.6230166  2.9306712  2.3930438  1.9711757
## [13]  1.6368053  1.3692698  1.1533143  0.9775596  0.8334296  0.7143957
## [19]  0.6154405  0.5326726  0.4630483

#Correlacionar Fo y Fe
plot(Fo_perc,Fe_perc,main="Gráfica Nº3: Correlación de frecuencias 
     en el modelo log-normal de la produccion de gas",
     xlab="Frecuencia Observada(%)",
     ylab="Frecuencia esperada(%)",
     col="blue3")

abline(a = 0, b = 1, col="red",lwd=2)

Correlacion <- cor(Fo_perc, Fe_perc) * 100
Correlacion

## [1] 97.20808

#---------------------
#TEST DE CHI-CUADRADO
#---------------------
grados_libertad <- length(Fo) - 1
grados_libertad

## [1] 20

Fe_perc[Fe_perc == 0] <- 1e-9
x2 <- sum((Fe_perc - Fo_perc)^2 / Fe_perc)
x2

## [1] 4.666674

umbral_aceptacion <- qchisq(0.95, grados_libertad)
umbral_aceptacion

## [1] 31.41043

x2<umbral_aceptacion

## [1] TRUE

cat("Tabla Resumen")

## Tabla Resumen

tabla_resumen <- data.frame(Variable = "Produccion Total Gas", 
                            Pearson = round(Correlacion, 2), 
                            Chi = round(x2, 2), 
                            Umbral = round(umbral_aceptacion, 2))

tabla_resumen %>% gt() %>%
  cols_label(Variable = "Variable", Pearson = "Test Pearson (%)", Chi = "Chi Cuadrado", Umbral = "Umbral de aceptacion") %>%
  tab_header(title = md("Tabla N°2. Resumen de test de bondad al modelo de probabilidad")) %>%
  cols_align(align = "center") %>%
  tab_options(table.width = pct(80), column_labels.font.weight = "bold")

Variable	Test Pearson (%)	Chi Cuadrado	Umbral de aceptacion
Tabla N°2. Resumen de test de bondad al modelo de probabilidad
Produccion Total Gas	97.21	4.67	31.41

6. CÁLCULO DE PROBABILIDADES

##PREGUNTA DE PROBABILIDAD 

plot.new()
plot.window(xlim = c(0, 100), ylim = c(0, 100))

# Dibujamos los textos
text(50, 55, "Cual es la probabilidad de que la produccion total", cex = 1.25, font = 2) 
text(50, 35, "de gas se encuentre entre 0 y 1,000,000 de unidades?", cex = 1.25, font = 2) 
# Diseñamos el marco
rect(0, 15, 100, 75, border = "#2A9D8F", lwd = 3)

# PROBABILIDAD ENTRE 0 y 1000000 
probabilidad_Gas <- plnorm(1000000, meanlog = medialog, sdlog = sd_log) - 
                    plnorm(0, meanlog = medialog, sdlog = sd_log)

# En porcentaje
probabilidad_Gas * 100

## [1] 59.30671

# Rango para la curva 
x <- seq(min(gas_sin_outliers), max(gas_sin_outliers), length.out = 1000)

# Curva log-normal
plot(x, dlnorm(x, meanlog = medialog, sdlog = sd_log),
     col = "skyblue3", 
     lwd = 2, 
     type = "l",
     main = "Grafica Nº4: Calculo de probabilidades de la
     produccion total de gas en los pozos de gas natural
     de Nuevo Mexico", 
     ylab = "Densidad de probabilidad",
     xlab = "Produccion de Gas")

# Rango del área de probabilidad
x_area <- seq(0, 1000000, length.out = 500) 
y_area <- dlnorm(x_area, meanlog = medialog, sdlog = sd_log)

# Línea del área
lines(x_area, y_area, col = "red", lwd = 2)

# Área sombreada 
polygon(c(x_area, rev(x_area)), 
        c(y_area, rep(0, length(y_area))),
        col = rgb(1, 0, 0, 0.5), border = "black")

# Leyenda 
legend("topright",
       legend = c("Modelo Log-normal", "Area de Probabilidad"),
       col = c("skyblue3", "red"),
       lwd = 2, 
       pch = c(NA, 15), 
       cex = 0.5)

# TEXTO DE LA PROBABILIDAD EN LA GRÁFICA
texto_prob <- paste0("Probabilidad: ", 
                     round(probabilidad_Gas * 100, 2), " %")

text(x = max(gas_sin_outliers) * 0.7,
     y = max(dlnorm(x, medialog, sd_log)) * 0.8,
     labels = texto_prob,
     col = "black",
     cex = 0.7,
     font = 2)

7. INTERVALOS DE CONFIANZA

# Media aritmetica
media_original <- exp(medialog + (sd_log^2)/2)
media_original

## [1] 1251457

# Desviacion estandar
sigma_original <- sqrt((exp(sd_log^2) - 1) * exp(2 * medialog + sd_log^2))
sigma_original

## [1] 1504032

#P(x-2e<u<x+2e)=95%
e <- sigma_original / sqrt(length(gas_sin_outliers))
e

## [1] 14037.38

li <- media_original - 2 * e
li

## [1] 1223382

ls <- media_original + 2 * e
ls

## [1] 1279531

tabla_media <- data.frame(Variable = "Produccion Total Gas",
                          li = round(li, 2), 
                          media = round(media_original, 2), 
                          ls = round(ls, 2))

tabla_media %>% gt() %>%
  cols_label(Variable = "Variable", li = "Limite inferior", media = "Media poblacional", ls = "Limite superior") %>%
  tab_header(title = md("Tabla Nº3. Media poblacional")) %>%
  cols_align(align = "center") %>%
  tab_options(table.width = pct(80), column_labels.font.weight = "bold")

Variable	Limite inferior	Media poblacional	Limite superior
Tabla Nº3. Media poblacional
Produccion Total Gas	1223382	1251457	1279531

8. CONCLUSIÓN

## La variable produccion total de gas se explica a traves del modelo log-normal siendo la media aritmetica de 1251456.72 y una desviacion estandar de 1504031.77. De esta manera logramos calcular probabilidades como por ejemplo, que al seleccionar aleatoriamente cualquier area donde la produccion se encuentre entre 0 y 1,000,000 es de 59.31 %. Mediante el teorema de limite central, sabemos que la media aritmetica poblacional de la produccion de gas se encuentra entre 1223381.96 y 1279531.48 con un 95% de confianza.

ANÁLISIS DE POZOS DE GAS NATURAL EN NUEVO MÉXICO: UN ENFOQUE BASADO EN DATOS

Grupo Nro. 3

03-2026

ANÁLISIS ESTADÍSTICO

1. CARGA DE LIBRERÍAS Y DATOS

2. TABLA DE DISTRIBUCIÓN DE CANTIDAD

3. GRÁFICA DE DISTRIBUCIÓN DE CANTIDAD

4. CONJETURA DEL MODELO

Debido a la similitud de las barras asociamos con el modelo de probabilidad log-normal

5. TESTS DE APROBACIÓN

6. CÁLCULO DE PROBABILIDADES

7. INTERVALOS DE CONFIANZA

8. CONCLUSIÓN