Introducción

Análisis de la variable STATUS del dataset petrolero.

Configuración y Carga de Datos

library(readxl)
library(dplyr)
library(ggplot2)
library(gt)
library(scales)
library(knitr)
datos    <- read_excel("dataset_mundial_petro.xlsx")
head(datos)
## # A tibble: 6 × 23
##   `Unit ID` `Unit Name`   Unit name local scri…¹ `Fuel type` `Unit type` Country
##   <chr>     <chr>         <chr>                  <chr>       <chr>       <chr>  
## 1 OG0000001 Matzen        <NA>                   oil and gas field       Austria
## 2 OG0000002 Abalone       Abalone                oil and gas field       Brazil 
## 3 OG0000006 Aguilhada     Aguilhada              oil and gas field       Brazil 
## 4 OG0000007 Agulha        Agulha                 oil and gas field       Brazil 
## 5 OG0000008 Albacora      Albacora               oil and gas field       Brazil 
## 6 OG0000009 Albacora Les… Albacora Leste         oil and gas field       Brazil 
## # ℹ abbreviated name: ¹​`Unit name local script`
## # ℹ 17 more variables: `Subnational unit (province, state)` <chr>,
## #   Latitude <dbl>, Longitude <dbl>, `Location accuracy` <chr>, Status <chr>,
## #   `Status year` <dbl>, `Discovery year` <dbl>, `FID Year` <chr>,
## #   `Production start year` <chr>, Operator <chr>, Owner <chr>, Parent <chr>,
## #   Basin <chr>, `Concession / block` <chr>, `Project or complex` <chr>,
## #   `Government unit ID` <chr>, `Wiki URL` <chr>

Extracción de la Variable STATUS

status <- datos %>%
  select(Status) %>%
  filter(!is.na(Status))

unique(status$Status)
## [1] "operating"      "discovered"     "in development" "decommissioned"
## [5] "UGS"            "abandoned"      "shut in"        "cancelled"     
## [9] "exploration"

Transformación a Variable Ordinal

niveles <- c(
  "Exploration",
  "Discovery",
  "Development",
  "Operating",
  "Closed",
  "Decommissioned",
  "Abandoned",
  "Underground Storage",
  "Cancelled"
)

status$Status <- factor(status$Status,
                        levels = niveles,
                        ordered = TRUE)

status_num <- as.numeric(status$Status)

Tabla de Distribución de Frecuencias

tabla <- data.frame(
  Estado = levels(status$Status),
  Frecuencia = as.vector(table(status$Status))
)

tabla <- tabla %>%
  mutate(
    Frecuencia_Relativa = Frecuencia/sum(Frecuencia),
    Porcentaje = Frecuencia_Relativa*100
  )

tabla
##                Estado Frecuencia Frecuencia_Relativa Porcentaje
## 1         Exploration          0                 NaN        NaN
## 2           Discovery          0                 NaN        NaN
## 3         Development          0                 NaN        NaN
## 4           Operating          0                 NaN        NaN
## 5              Closed          0                 NaN        NaN
## 6      Decommissioned          0                 NaN        NaN
## 7           Abandoned          0                 NaN        NaN
## 8 Underground Storage          0                 NaN        NaN
## 9           Cancelled          0                 NaN        NaN
tabla %>% gt()
Estado Frecuencia Frecuencia_Relativa Porcentaje
Exploration 0 NaN NaN
Discovery 0 NaN NaN
Development 0 NaN NaN
Operating 0 NaN NaN
Closed 0 NaN NaN
Decommissioned 0 NaN NaN
Abandoned 0 NaN NaN
Underground Storage 0 NaN NaN
Cancelled 0 NaN NaN

Análisis Gráfico

ggplot(tabla, aes(Estado, Frecuencia)) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Conjetura del Modelo Probabilístico

media_status <- mean(status_num)
p <- 1/media_status

media_status
## [1] NA
p
## [1] NA

Frecuencias Esperadas

k <- 1:length(tabla$Frecuencia)

prob_geo <- dgeom(k - 1, p)

esperadas <- prob_geo * sum(tabla$Frecuencia)

tabla_modelo <- data.frame(
  Categoria = k,
  Observada = tabla$Frecuencia,
  Esperada = round(esperadas,2)
)

tabla_modelo
##   Categoria Observada Esperada
## 1         1         0       NA
## 2         2         0       NA
## 3         3         0       NA
## 4         4         0       NA
## 5         5         0       NA
## 6         6         0       NA
## 7         7         0       NA
## 8         8         0       NA
## 9         9         0       NA

Test de Pearson

pearson <- sum(
(tabla_modelo$Observada - tabla_modelo$Esperada)^2 /
tabla_modelo$Esperada
)

pearson
## [1] NA

Test Chi-cuadrado

gl <- nrow(tabla_modelo) - 2
chi_critico <- qchisq(0.95, df = gl)

chi_critico
## [1] 14.06714

Tabla Resumen

resultado <- ifelse(
pearson < chi_critico,
"No se rechaza H0",
"Se rechaza H0"
)

resumen <- data.frame(
Estadistico = pearson,
Chi_Critico = chi_critico,
Decision = resultado
)

resumen %>% gt()
Estadistico Chi_Critico Decision
NA 14.06714 NA

Probabilidades

probabilidades <- data.frame(
Categoria = k,
Probabilidad = round(prob_geo,4)
)

probabilidades %>% gt()
Categoria Probabilidad
1 NA
2 NA
3 NA
4 NA
5 NA
6 NA
7 NA
8 NA
9 NA