Se cargan las librerías necesarias y el dataset Global Oil and Gas Extraction Tracker (GOGET), que contiene registros de unidades de extracción de petróleo y gas a nivel mundial.
library(readxl)
library(dplyr)
library(gt)
library(ggplot2)
library(scales)
library(e1071)
# Load the GOGET workbook chosen interactively by the user.
Datos <- read_excel(file.choose())

# Fail early with a clear message: every later step (min/max, cut, Sturges'
# rule) needs a numeric Longitude column with at least one valid value.
if (!"Longitude" %in% names(Datos)) {
  stop("No se encontró la columna 'Longitude' en el archivo seleccionado.",
       call. = FALSE)
}
if (!is.numeric(Datos$Longitude)) {
  stop("La columna 'Longitude' debe ser numérica.", call. = FALSE)
}

# Drop missing coordinates; N is the number of valid records.
Variable <- na.omit(Datos$Longitude)
N <- length(Variable)
if (N == 0) {
  # log10(0) is -Inf, so the class count below could not be computed.
  stop("La columna 'Longitude' no contiene valores válidos.", call. = FALSE)
}
cat("Registros válidos:", N, "\n")
## Registros válidos: 7537
## Variables: 32
Se extrae la variable Longitude (Longitud Geográfica). Es una variable de escala de razón: admite el cero absoluto y todos los indicadores estadísticos son aplicables. Sus valores oscilan entre −180° y +180°, donde valores negativos corresponden al hemisferio occidental y positivos al oriental.
## Variable analizada: Longitude
## Total de observaciones (n): 7537
## Mínimo: -152.129 °
## Máximo: 174.361 °
Se construyen dos versiones de la tabla: con límites decimales (cortes exactos sobre los datos) y con límites enteros (redondeados a múltiplos de 10), usando la regla de Sturges: k = 1 + 3.322 · log₁₀(n).
# ── DECIMAL LIMITS ─────────────────────────────────────────────
# Number of classes from Sturges' rule, truncated to an integer.
k_dec <- floor(1 + 3.322 * log10(N))
min_dec <- min(Variable)
max_dec <- max(Variable)

# k_dec + 1 equally spaced breaks spanning the observed range; the last break
# is nudged just past the maximum.
cortes_dec <- seq(min_dec, max_dec, length.out = k_dec + 1)
cortes_dec[k_dec + 1] <- max_dec + 0.0001

# Left-closed classes [Li, Ls).
inter_dec <- cut(Variable, breaks = cortes_dec, right = FALSE, include.lowest = TRUE)
ni_dec <- as.vector(table(inter_dec))
hi_dec <- (ni_dec / N) * 100

# Frequency table: limits, class mark, absolute/relative frequencies and
# their ascending/descending cumulative versions.
TDF_Decimal <- data.frame(
  Li = head(cortes_dec, -1),
  Ls = tail(cortes_dec, -1)
) %>%
  mutate(
    MC = (Li + Ls) / 2,
    ni = ni_dec,
    hi = hi_dec,
    Ni_asc = cumsum(ni),
    Ni_desc = rev(cumsum(rev(ni))),
    Hi_asc = cumsum(hi),
    Hi_desc = rev(cumsum(rev(hi)))
  )
# ── INTEGER LIMITS ─────────────────────────────────────────────
# Class limits and class width are snapped to multiples of BASE.
BASE <- 10
min_int <- floor(min(Variable) / BASE) * BASE
max_int <- ceiling(max(Variable) / BASE) * BASE
k_int_sug <- floor(1 + 3.322 * log10(N))
Rango_int <- max_int - min_int

# Round the width up to a multiple of BASE. BASE is used here (instead of a
# literal 10) so the width stays aligned with the limits if BASE is changed.
Amplitud_int <- ceiling((Rango_int / k_int_sug) / BASE) * BASE
if (Amplitud_int == 0) {
  Amplitud_int <- BASE
}

cortes_int <- seq(from = min_int, by = Amplitud_int, length.out = k_int_sug + 1)

# Add one more break if the sequence stops short of the maximum.
if (max(cortes_int) < max(Variable)) {
  cortes_int <- c(cortes_int, max(cortes_int) + Amplitud_int)
}

# Trim trailing breaks while the second-to-last break already reaches the
# maximum; the final class is closed on the right (include.lowest = TRUE with
# right = FALSE), so the maximum is still counted.
while (length(cortes_int) > 2 &&
       cortes_int[length(cortes_int) - 1] >= max(Variable)) {
  cortes_int <- cortes_int[-length(cortes_int)]
}

K_real <- length(cortes_int) - 1
inter_int <- cut(Variable, breaks = cortes_int, include.lowest = TRUE, right = FALSE)
ni_int <- as.vector(table(inter_int))
hi_int <- (ni_int / N) * 100

# Frequency table with the same columns as the decimal-limit version.
TDF_Enteros <- data.frame(
  Li = cortes_int[seq_len(K_real)],
  Ls = cortes_int[seq_len(K_real) + 1],
  MC = (cortes_int[seq_len(K_real)] + cortes_int[seq_len(K_real) + 1]) / 2,
  ni = ni_int,
  hi = hi_int,
  Ni_asc = cumsum(ni_int),
  Ni_desc = rev(cumsum(rev(ni_int))),
  Hi_asc = cumsum(hi_int),
  Hi_desc = rev(cumsum(rev(hi_int)))
)

cat("Clases decimales (k):", k_dec, "\n")
## Clases decimales (k): 13
## Clases enteras (k): 12
## Verificación — Σnᵢ: 7537 (debe ser 7537 )
# Source note reused by the tables and charts.
fuente_nota <- paste0(
  "n = ", format(N, big.mark = ","),
  " | Fuente: Global Energy Monitor — GOGET 2023"
)

#' Apply the report's common look to a gt table
#'
#' @param tabla_gt A gt table object.
#' @return The gt table with the shared table options applied, white bold
#'   title/subtitle text and bold column labels.
estilo_gt <- function(tabla_gt) {
  tabla_gt %>%
    tab_options(
      table.width = pct(95),
      table.font.size = px(13),
      table.font.names = "Arial",
      heading.title.font.size = px(15),
      heading.subtitle.font.size = px(12),
      heading.align = "center",
      heading.background.color = "#AAAAAA",
      column_labels.font.weight = "bold",
      column_labels.background.color = "#FFFFFF",
      column_labels.border.top.color = "#AAAAAA",
      column_labels.border.bottom.color = "#AAAAAA",
      table.border.top.color = "#AAAAAA",
      table.border.bottom.color = "#AAAAAA",
      data_row.padding = px(5)
    ) %>%
    tab_style(
      style = cell_text(color = "white", weight = "bold"),
      locations = cells_title(groups = c("title", "subtitle"))
    ) %>%
    tab_style(
      style = cell_text(weight = "bold"),
      locations = cells_column_labels()
    )
}

# Display copy of the decimal-limit table, rounded to two decimals.
TDF_Dec_gt <- TDF_Decimal %>%
  mutate(
    across(c(Li, Ls, MC), ~ round(., 2)),
    hi = round(hi, 2),
    Hi_asc = round(Hi_asc, 2),
    Hi_desc = round(Hi_desc, 2)
  )

# Totals row. The percentage total is taken from the unrounded hi values:
# summing the rounded ones can drift away from 100 (e.g. 99.99).
fila_total <- data.frame(
  Li = NA, Ls = NA, MC = NA,
  ni = sum(TDF_Decimal$ni),
  hi = round(sum(TDF_Decimal$hi), 2),
  Ni_asc = NA, Ni_desc = NA, Hi_asc = NA, Hi_desc = NA
)

bind_rows(TDF_Dec_gt, fila_total) %>%
  gt() %>%
  tab_header(
    title = md("**Tabla N. 1**"),
    subtitle = md("Distribución de frecuencias de longitud geográfica — límites decimales")
  ) %>%
  cols_label(
    Li = md("**Lím. Inf (°)**"), Ls = md("**Lím. Sup (°)**"),
    MC = md("**Marca clase**"), ni = md("**nᵢ**"),
    hi = md("**hᵢ (%)**"), Ni_asc = md("**Nᵢ ↑**"), Ni_desc = md("**Nᵢ ↓**"),
    Hi_asc = md("**Hᵢ ↑ (%)**"), Hi_desc = md("**Hᵢ ↓ (%)**")
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  fmt_number(columns = c(Li, Ls, MC, hi, Hi_asc, Hi_desc), decimals = 2) %>%
  fmt_number(columns = c(ni, Ni_asc, Ni_desc), decimals = 0, use_seps = TRUE) %>%
  # Blank out the cells of the totals row that have no value instead of "NA".
  sub_missing(columns = everything(), missing_text = "") %>%
  tab_source_note(fuente_nota) %>%
  # The totals row is the last one: shaded and bold.
  tab_style(
    style = cell_fill(color = "#F5F5F5"),
    locations = cells_body(rows = nrow(TDF_Dec_gt) + 1)
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = nrow(TDF_Dec_gt) + 1)
  ) %>%
  estilo_gt()
| Tabla N. 1 | ||||||||
| Distribución de frecuencias de longitud geográfica — límites decimales | ||||||||
| Lím. Inf (°) | Lím. Sup (°) | Marca clase | nᵢ | hᵢ (%) | Nᵢ ↑ | Nᵢ ↓ | Hᵢ ↑ (%) | Hᵢ ↓ (%) |
|---|---|---|---|---|---|---|---|---|
| −152.13 | −127.01 | −139.57 | 37 | 0.49 | 37 | 7,537 | 0.49 | 100.00 |
| −127.01 | −101.90 | −114.46 | 2,327 | 30.87 | 2,364 | 7,500 | 31.37 | 99.51 |
| −101.90 | −76.79 | −89.34 | 1,991 | 26.42 | 4,355 | 5,173 | 57.78 | 68.63 |
| −76.79 | −51.67 | −64.23 | 719 | 9.54 | 5,074 | 3,182 | 67.32 | 42.22 |
| −51.67 | −26.56 | −39.11 | 105 | 1.39 | 5,179 | 2,463 | 68.71 | 32.68 |
| −26.56 | −1.44 | −14.00 | 66 | 0.88 | 5,245 | 2,358 | 69.59 | 31.29 |
| −1.44 | 23.67 | 11.12 | 1,157 | 15.35 | 6,402 | 2,292 | 84.94 | 30.41 |
| 23.67 | 48.79 | 36.23 | 306 | 4.06 | 6,708 | 1,135 | 89.00 | 15.06 |
| 48.79 | 73.90 | 61.35 | 354 | 4.70 | 7,062 | 829 | 93.70 | 11.00 |
| 73.90 | 99.02 | 86.46 | 148 | 1.96 | 7,210 | 475 | 95.66 | 6.30 |
| 99.02 | 124.13 | 111.57 | 275 | 3.65 | 7,485 | 327 | 99.31 | 4.34 |
| 124.13 | 149.25 | 136.69 | 41 | 0.54 | 7,526 | 52 | 99.85 | 0.69 |
| 149.25 | 174.36 | 161.80 | 11 | 0.15 | 7,537 | 11 | 100.00 | 0.15 |
| NA | NA | NA | 7,537 | 100.00 | NA | NA | NA | NA |
| n = 7,537 | Fuente: Global Energy Monitor — GOGET 2023 | ||||||||
# Display copy of the integer-limit table with percentages rounded to two
# decimals.
TDF_Int_gt <- TDF_Enteros %>%
  mutate(
    hi = round(hi, 2),
    Hi_asc = round(Hi_asc, 2),
    Hi_desc = round(Hi_desc, 2)
  )

# Totals row. The percentage total is taken from the unrounded hi values:
# summing the rounded ones drifts away from 100 (it rendered as 99.99).
fila_total_int <- data.frame(
  Li = NA, Ls = NA, MC = NA,
  ni = sum(TDF_Enteros$ni),
  hi = round(sum(TDF_Enteros$hi), 2),
  Ni_asc = NA, Ni_desc = NA, Hi_asc = NA, Hi_desc = NA
)

bind_rows(TDF_Int_gt, fila_total_int) %>%
  gt() %>%
  tab_header(
    title = md("**Tabla N. 2**"),
    subtitle = md("Distribución de frecuencias de longitud geográfica — límites enteros")
  ) %>%
  cols_label(
    Li = md("**Lím. Inf (°)**"), Ls = md("**Lím. Sup (°)**"),
    MC = md("**Marca clase**"), ni = md("**nᵢ**"),
    hi = md("**hᵢ (%)**"), Ni_asc = md("**Nᵢ ↑**"), Ni_desc = md("**Nᵢ ↓**"),
    Hi_asc = md("**Hᵢ ↑ (%)**"), Hi_desc = md("**Hᵢ ↓ (%)**")
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  fmt_number(columns = c(Li, Ls, MC, hi, Hi_asc, Hi_desc), decimals = 2) %>%
  fmt_number(columns = c(ni, Ni_asc, Ni_desc), decimals = 0, use_seps = TRUE) %>%
  # Blank out the cells of the totals row that have no value instead of "NA".
  sub_missing(columns = everything(), missing_text = "") %>%
  tab_source_note(fuente_nota) %>%
  # The totals row is the last one: shaded and bold.
  tab_style(
    style = cell_fill(color = "#F5F5F5"),
    locations = cells_body(rows = nrow(TDF_Int_gt) + 1)
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = nrow(TDF_Int_gt) + 1)
  ) %>%
  estilo_gt()
| Tabla N. 2 | ||||||||
| Distribución de frecuencias de longitud geográfica — límites enteros | ||||||||
| Lím. Inf (°) | Lím. Sup (°) | Marca clase | nᵢ | hᵢ (%) | Nᵢ ↑ | Nᵢ ↓ | Hᵢ ↑ (%) | Hᵢ ↓ (%) |
|---|---|---|---|---|---|---|---|---|
| −160.00 | −130.00 | −145.00 | 37 | 0.49 | 37 | 7,537 | 0.49 | 100.00 |
| −130.00 | −100.00 | −115.00 | 2,689 | 35.68 | 2,726 | 7,500 | 36.17 | 99.51 |
| −100.00 | −70.00 | −85.00 | 2,018 | 26.77 | 4,744 | 4,811 | 62.94 | 63.83 |
| −70.00 | −40.00 | −55.00 | 403 | 5.35 | 5,147 | 2,793 | 68.29 | 37.06 |
| −40.00 | −10.00 | −25.00 | 51 | 0.68 | 5,198 | 2,390 | 68.97 | 31.71 |
| −10.00 | 20.00 | 5.00 | 1,130 | 14.99 | 6,328 | 2,339 | 83.96 | 31.03 |
| 20.00 | 50.00 | 35.00 | 431 | 5.72 | 6,759 | 1,209 | 89.68 | 16.04 |
| 50.00 | 80.00 | 65.00 | 384 | 5.09 | 7,143 | 778 | 94.77 | 10.32 |
| 80.00 | 110.00 | 95.00 | 183 | 2.43 | 7,326 | 394 | 97.20 | 5.23 |
| 110.00 | 140.00 | 125.00 | 178 | 2.36 | 7,504 | 211 | 99.56 | 2.80 |
| 140.00 | 170.00 | 155.00 | 26 | 0.34 | 7,530 | 33 | 99.91 | 0.44 |
| 170.00 | 200.00 | 185.00 | 7 | 0.09 | 7,537 | 7 | 100.00 | 0.09 |
| NA | NA | NA | 7,537 | 99.99 | NA | NA | NA | NA |
| n = 7,537 | Fuente: Global Energy Monitor — GOGET 2023 | ||||||||
# Chart palette: one fill colour for bars and one line colour per ogive.
color_ojiva2 <- "#C0392B"
color_ojiva1 <- "#2E86C1"
color_barras <- "#2E86C1"

# Base ggplot2 theme: minimal theme, no legend, white background, only
# horizontal major grid lines, slanted x-axis labels.
tema_base <- theme_minimal(base_size = 12) +
  # Text elements.
  theme(
    plot.title = element_text(face = "bold", size = 13),
    plot.caption = element_text(color = "#888888", size = 9, hjust = 0),
    axis.title = element_text(face = "bold", size = 11),
    axis.text.x = element_text(angle = 25, hjust = 1, size = 8)
  ) +
  # Legend, grid and background.
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "#EEEEEE"),
    plot.background = element_rect(fill = "white", color = NA)
  )
# Class labels as a factor whose levels follow the table order, so the bars
# keep the class order on the x axis.
df_graf <- TDF_Enteros %>%
  mutate(intervalo = factor(paste0(Li, " — ", Ls),
                            levels = paste0(Li, " — ", Ls)))

# Chart 1: absolute frequency per class.
ggplot(df_graf, aes(x = intervalo, y = ni)) +
  geom_col(fill = color_barras, color = "white", width = 0.95) +
  # trim = TRUE: without it format() pads every label to a common width and
  # the leading blanks push the labels off-centre over their bars.
  geom_text(aes(label = format(ni, big.mark = ",", trim = TRUE)),
            vjust = -0.4, size = 3, fontface = "bold") +
  scale_y_continuous(labels = label_comma(),
                     expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Gráfica N. 1: Distribución de cantidad de unidades petroleras por longitud",
       x = "Longitud (°)", y = "Frecuencia Absoluta (nᵢ)",
       caption = fuente_nota) +
  tema_base

# Chart 2: relative frequency (%) per class.
ggplot(df_graf, aes(x = intervalo, y = hi)) +
  geom_col(fill = color_barras, color = "white", width = 0.95) +
  geom_text(aes(label = paste0(round(hi, 2), "%")),
            vjust = -0.4, size = 3, fontface = "bold") +
  scale_y_continuous(labels = function(x) paste0(x, "%"),
                     expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Gráfica N. 2: Distribución porcentual de unidades petroleras por longitud",
       x = "Longitud (°)", y = "Frecuencia Relativa (%)",
       caption = fuente_nota) +
  tema_base

# Chart 3: horizontal boxplot of the raw longitudes.
ggplot(data.frame(x = Variable), aes(x = x)) +
  geom_boxplot(fill = color_barras, color = "#1A5276",
               outlier.color = "#C0392B", outlier.size = 1.5) +
  labs(title = "Gráfica N. 3: Distribución de la longitud en unidades petroleras",
       x = "Longitud (°)", y = "",
       caption = fuente_nota) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_text(face = "bold", size = 13),
    plot.caption = element_text(color = "#888888", size = 9),
    plot.background = element_rect(fill = "white", color = NA)
  )

# Chart 4: ogives. The ascending curve is drawn over the upper limits and the
# descending one over the lower limits. Each curve also gets its 0 % anchor
# (first lower limit / last upper limit) so it reaches the axis.
n_clases <- nrow(TDF_Enteros)
ojiva_df <- data.frame(
  x = c(TDF_Enteros$Li[1], TDF_Enteros$Ls,
        TDF_Enteros$Li, TDF_Enteros$Ls[n_clases]),
  y = c(0, TDF_Enteros$Hi_asc,
        TDF_Enteros$Hi_desc, 0),
  tipo = rep(c("Ascendente", "Descendente"), each = n_clases + 1)
)
ggplot(ojiva_df, aes(x = x, y = y, color = tipo, group = tipo)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_color_manual(values = c("Ascendente" = color_ojiva1,
                                "Descendente" = color_ojiva2)) +
  scale_y_continuous(labels = function(x) paste0(x, "%"),
                     limits = c(0, 105),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(title = "Gráfica N. 4: Ojivas ascendente y descendente de longitud en unidades petroleras",
       x = "Longitud (°)", y = "Frecuencia Acumulada (%)",
       color = NULL, caption = fuente_nota) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 13),
    plot.caption = element_text(color = "#888888", size = 9),
    legend.position = "bottom",
    plot.background = element_rect(fill = "white", color = NA)
  )
La variable Longitud es cuantitativa continua de escala de razón, por lo que se calculan todos los indicadores: tendencia central, dispersión, forma y valores atípicos.
# Central tendency. The mode is approximated by the class mark of the most
# frequent class in the integer-limit table.
media <- round(mean(Variable), 2)
mediana <- round(median(Variable), 2)
moda_val <- TDF_Enteros$MC[which.max(TDF_Enteros$ni)]

# Dispersion. The CV is computed from the unrounded sd and mean so that
# rounding error is not carried into the ratio.
varianza <- round(var(Variable), 2)
sd_val <- round(sd(Variable), 2)
cv <- round((sd(Variable) / abs(mean(Variable))) * 100, 2)

# Shape. Both estimators use the same e1071 `type` so skewness and kurtosis
# are reported under one convention.
asim <- round(skewness(Variable, type = 2), 4)
kurt <- round(kurtosis(Variable, type = 2), 4)

# Outliers by the 1.5 * IQR rule.
Q1 <- quantile(Variable, 0.25)
Q3 <- quantile(Variable, 0.75)
IQR_val <- Q3 - Q1
outliers_data <- Variable[Variable < (Q1 - 1.5 * IQR_val) |
                            Variable > (Q3 + 1.5 * IQR_val)]
num_out <- length(outliers_data)

# "<count> [<min>; <max>]" of the outliers, or a fixed text when there are none.
out_txt <- if (num_out > 0) {
  paste0(num_out, " [", round(min(outliers_data), 2),
         "; ", round(max(outliers_data), 2), "]")
} else {
  "0 [Sin outliers]"
}

# One-row summary table of all indicators.
data.frame(
  Variable = "Longitud (°)",
  Rango = paste0("[", round(min(Variable), 2), "; ", round(max(Variable), 2), "]"),
  Media = media,
  Mediana = mediana,
  Moda = round(moda_val, 2),
  Varianza = varianza,
  Desv_Est = sd_val,
  CV = cv,
  Asimetria = asim,
  Curtosis = kurt,
  Outliers = out_txt,
  check.names = FALSE
) %>%
  gt() %>%
  tab_header(
    title = md("**Tabla N°3 — Indicadores Estadísticos: Longitud Geográfica de Yacimientos**")
  ) %>%
  cols_label(
    Variable = md("**Variable**"),
    Rango = md("**Rango**"),
    Media = md("**Media (X̄)**"),
    Mediana = md("**Mediana (Me)**"),
    Moda = md("**Moda (Mo)**"),
    Varianza = md("**Varianza (S²)**"),
    Desv_Est = md("**Desv. Est. (S)**"),
    CV = md("**C.V. (%)**"),
    Asimetria = md("**Asimetría (As)**"),
    Curtosis = md("**Curtosis (K)**"),
    Outliers = md("**Outliers [Intervalo]**")
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  tab_source_note("Autor: Grupo 5") %>%
  estilo_gt()
| Tabla N°3 — Indicadores Estadísticos: Longitud Geográfica de Yacimientos | ||||||||||
| Variable | Rango | Media (X̄) | Mediana (Me) | Moda (Mo) | Varianza (S²) | Desv. Est. (S) | C.V. (%) | Asimetría (As) | Curtosis (K) | Outliers [Intervalo] |
|---|---|---|---|---|---|---|---|---|---|---|
| Longitud (°) | [-152.13; 174.36] | -54.65 | -93.4 | -115 | 4610.38 | 67.9 | 124.25 | 1.0723 | -0.0522 | 7 [173.31; 174.36] |
| Autor: Grupo 5 | ||||||||||
La variable longitud geográfica de los yacimientos de petróleo y gas presenta una media de -54.65° y una mediana de -93.4°; que la media sea mayor que la mediana sugiere una distribución con asimetría positiva (cola hacia la derecha). La desviación estándar de 67.9° refleja una alta dispersión geográfica, consistente con la distribución global de los yacimientos. El coeficiente de variación de 124.25% confirma la heterogeneidad de la muestra. La clase modal se ubica alrededor de -115°, que corresponde a la zona de mayor concentración de yacimientos registrados. Se identificaron 7 valores atípicos, todos en el extremo superior de la distribución (entre 173.31° y 174.36°).