Se cargan las librerías necesarias y el dataset Global Oil and Gas Extraction Tracker (GOGET), que contiene registros de unidades de extracción de petróleo y gas a nivel mundial.
library(readxl)
library(dplyr)
library(gt)
library(ggplot2)
library(scales)
library(e1071)
# Let the user pick the workbook through a file dialog and read it with readxl.
Datos <- read_excel(file.choose())
# Keep only the non-missing entries of the Latitude column.
Variable <- na.omit(Datos$Latitude)
# Number of non-missing latitude values; used as the denominator for all
# relative frequencies computed later.
N <- length(Variable)
# Print the count of valid records (the trailing "##" text is rendered output).
cat("Registros válidos:", N, "\n")## Registros válidos: 7537
## Variables: 32
Se extrae la variable Latitude (Latitud Geográfica). Es una variable de escala de razón: admite el cero absoluto y todos los indicadores estadísticos son aplicables. Sus valores oscilan entre −90° y +90°, donde valores negativos corresponden al hemisferio sur y positivos al hemisferio norte.
## Variable analizada: Latitude
## Total de observaciones (n): 7537
## Mínimo: -53.9713 °
## Máximo: 73.4344 °
Se construyen dos versiones de la tabla: con límites decimales (cortes exactos sobre los datos) y con límites enteros (redondeados a múltiplos de 10), usando la regla de Sturges: k = 1 + 3.322 · log₁₀(n).
# ---- Decimal class limits ----------------------------------------------
# Equal-width classes spanning the exact observed range. The number of
# classes comes from Sturges' rule, truncated to an integer.
min_dec <- min(Variable)
max_dec <- max(Variable)
k_dec <- floor(1 + 3.322 * log10(N))

# k_dec + 1 evenly spaced break points from the minimum to the maximum.
cortes_dec <- seq(min_dec, max_dec, length.out = k_dec + 1)
# Push the last break slightly above the maximum so the largest value is
# always inside the final class.
cortes_dec[k_dec + 1] <- max_dec + 0.0001

# Left-closed classes [Li, Ls); include.lowest closes the last one on the right.
inter_dec <- cut(
  Variable,
  breaks = cortes_dec,
  right = FALSE,
  include.lowest = TRUE
)

# Absolute frequency per class and its percentage of N.
ni_dec <- tabulate(inter_dec, nbins = nlevels(inter_dec))
hi_dec <- ni_dec / N * 100

# Frequency table: limits, class mark, absolute/relative frequencies and
# their ascending/descending cumulative versions.
TDF_Decimal <- data.frame(
  Li = head(cortes_dec, -1),
  Ls = tail(cortes_dec, -1),
  MC = (head(cortes_dec, -1) + tail(cortes_dec, -1)) / 2,
  ni = ni_dec,
  hi = hi_dec,
  Ni_asc = cumsum(ni_dec),
  Ni_desc = rev(cumsum(rev(ni_dec))),
  Hi_asc = cumsum(hi_dec),
  Hi_desc = rev(cumsum(rev(hi_dec)))
)
# ---- Integer class limits ----------------------------------------------
# Classes whose limits and width are multiples of BASE. The class count
# suggested by Sturges' rule is only a starting point: the width is rounded
# up to a multiple of BASE and surplus empty classes at the top are dropped,
# so the final number of classes (K_real) can be smaller.
BASE <- 10
min_int <- floor(min(Variable) / BASE) * BASE
max_int <- ceiling(max(Variable) / BASE) * BASE
k_int_sug <- floor(1 + 3.322 * log10(N))
Rango_int <- max_int - min_int

# Round the class width up to a multiple of BASE. The original hard-coded 10
# here, which silently disagreed with BASE whenever BASE was changed.
Amplitud_int <- ceiling((Rango_int / k_int_sug) / BASE) * BASE
if (Amplitud_int == 0) {
  # Degenerate case (all values in one BASE-wide slot): fall back to BASE.
  Amplitud_int <- BASE
}

cortes_int <- seq(from = min_int, by = Amplitud_int, length.out = k_int_sug + 1)
# Extend by one class if the breaks do not reach the maximum.
if (max(cortes_int) < max(Variable)) {
  cortes_int <- c(cortes_int, max(cortes_int) + Amplitud_int)
}
# Drop trailing breaks while the second-to-last one already covers the
# maximum, i.e. remove classes that would be empty at the top.
while (length(cortes_int) > 2 &&
       cortes_int[length(cortes_int) - 1] >= max(Variable)) {
  cortes_int <- cortes_int[-length(cortes_int)]
}
K_real <- length(cortes_int) - 1

# Left-closed classes [Li, Ls); include.lowest closes the last one on the right.
inter_int <- cut(Variable, breaks = cortes_int, include.lowest = TRUE, right = FALSE)
ni_int <- as.vector(table(inter_int))
hi_int <- (ni_int / N) * 100

# Frequency table: limits, class mark, absolute/relative frequencies and
# their ascending/descending cumulative versions.
idx_inf <- seq_len(K_real)
TDF_Enteros <- data.frame(
  Li = cortes_int[idx_inf],
  Ls = cortes_int[idx_inf + 1],
  MC = (cortes_int[idx_inf] + cortes_int[idx_inf + 1]) / 2,
  ni = ni_int,
  hi = hi_int,
  Ni_asc = cumsum(ni_int),
  Ni_desc = rev(cumsum(rev(ni_int))),
  Hi_asc = cumsum(hi_int),
  Hi_desc = rev(cumsum(rev(hi_int)))
)
# Report the number of classes used by the decimal-limit table.
# The "##" lines below are the rendered console output of this chunk.
cat("Clases decimales (k):", k_dec, "\n")
## Clases decimales (k): 13
## Clases enteras (k): 7
## Verificación — Σnᵢ: 7537 (debe ser 7537 )
# Source note shared by every table and chart: sample size with a thousands
# separator followed by the data source.
fuente_nota <- local({
  n_txt <- format(N, big.mark = ",")
  paste0("n = ", n_txt, " | Fuente: Global Energy Monitor — GOGET 2023")
})
#' Apply the report's shared visual style to a gt table.
#'
#' Sets table width, fonts, a grey heading band with bold white title text,
#' grey borders, and bold column labels.
#'
#' @param tabla_gt A gt table object.
#' @return The same gt table with the styling applied.
estilo_gt <- function(tabla_gt) {
  tabla_gt %>%
    tab_options(
      table.width = pct(95),
      table.font.size = px(13),
      table.font.names = "Arial",
      heading.title.font.size = px(15),
      heading.subtitle.font.size = px(12),
      heading.align = "center",
      heading.background.color = "#AAAAAA",
      column_labels.font.weight = "bold",
      column_labels.background.color = "#FFFFFF",
      column_labels.border.top.color = "#AAAAAA",
      column_labels.border.bottom.color = "#AAAAAA",
      table.border.top.color = "#AAAAAA",
      table.border.bottom.color = "#AAAAAA",
      data_row.padding = px(5)
    ) %>%
    tab_style(
      style = cell_text(color = "white", weight = "bold"),
      locations = cells_title(groups = c("title", "subtitle"))
    ) %>%
    tab_style(
      style = cell_text(weight = "bold"),
      locations = cells_column_labels()
    )
}

# Display copy of the decimal-limit table, rounded to two decimals.
TDF_Dec_gt <- TDF_Decimal %>%
  mutate(across(c(Li, Ls, MC), ~ round(., 2)),
         hi = round(hi, 2), Hi_asc = round(Hi_asc, 2), Hi_desc = round(Hi_desc, 2))

# Totals row. The percentage total is summed from the UNROUNDED class
# percentages: summing the rounded display values can drift to 99.99/100.01.
fila_total <- data.frame(Li = NA, Ls = NA, MC = NA,
                         ni = sum(TDF_Decimal$ni),
                         hi = round(sum(TDF_Decimal$hi), 2),
                         Ni_asc = NA, Ni_desc = NA, Hi_asc = NA, Hi_desc = NA)

# Render Table 1; the appended totals row (last row) is shaded and bold.
bind_rows(TDF_Dec_gt, fila_total) %>%
  gt() %>%
  tab_header(title = md("**Tabla N. 1**"),
             subtitle = md("Distribución de frecuencias de latitud geográfica — límites decimales")) %>%
  cols_label(Li = md("**Lím. Inf (°)**"), Ls = md("**Lím. Sup (°)**"),
             MC = md("**Marca clase**"), ni = md("**nᵢ**"),
             hi = md("**hᵢ (%)**"), Ni_asc = md("**Nᵢ ↑**"), Ni_desc = md("**Nᵢ ↓**"),
             Hi_asc = md("**Hᵢ ↑ (%)**"), Hi_desc = md("**Hᵢ ↓ (%)**")) %>%
  cols_align(align = "center", columns = everything()) %>%
  fmt_number(columns = c(Li, Ls, MC, hi, Hi_asc, Hi_desc), decimals = 2) %>%
  fmt_number(columns = c(ni, Ni_asc, Ni_desc), decimals = 0, use_seps = TRUE) %>%
  tab_source_note(fuente_nota) %>%
  tab_style(style = cell_fill(color = "#F5F5F5"),
            locations = cells_body(rows = nrow(TDF_Dec_gt) + 1)) %>%
  tab_style(style = cell_text(weight = "bold"),
            locations = cells_body(rows = nrow(TDF_Dec_gt) + 1)) %>%
  estilo_gt()
| Tabla N. 1 | ||||||||
| Distribución de frecuencias de latitud geográfica — límites decimales | ||||||||
| Lím. Inf (°) | Lím. Sup (°) | Marca clase | nᵢ | hᵢ (%) | Nᵢ ↑ | Nᵢ ↓ | Hᵢ ↑ (%) | Hᵢ ↓ (%) |
|---|---|---|---|---|---|---|---|---|
| −53.97 | −44.17 | −49.07 | 79 | 1.05 | 79 | 7,537 | 1.05 | 100.00 |
| −44.17 | −34.37 | −39.27 | 125 | 1.66 | 204 | 7,458 | 2.71 | 98.95 |
| −34.37 | −24.57 | −29.47 | 53 | 0.70 | 257 | 7,333 | 3.41 | 97.29 |
| −24.57 | −14.77 | −19.67 | 99 | 1.31 | 356 | 7,280 | 4.72 | 96.59 |
| −14.77 | −4.97 | −9.87 | 162 | 2.15 | 518 | 7,181 | 6.87 | 95.28 |
| −4.97 | 4.83 | −0.07 | 373 | 4.95 | 891 | 7,019 | 11.82 | 93.13 |
| 4.83 | 14.63 | 9.73 | 528 | 7.01 | 1,419 | 6,646 | 18.83 | 88.18 |
| 14.63 | 24.43 | 19.53 | 269 | 3.57 | 1,688 | 6,118 | 22.40 | 81.17 |
| 24.43 | 34.23 | 29.33 | 2,407 | 31.94 | 4,095 | 5,849 | 54.33 | 77.60 |
| 34.23 | 44.03 | 39.13 | 782 | 10.38 | 4,877 | 3,442 | 64.71 | 45.67 |
| 44.03 | 53.83 | 48.93 | 1,614 | 21.41 | 6,491 | 2,660 | 86.12 | 35.29 |
| 53.83 | 63.63 | 58.73 | 904 | 11.99 | 7,395 | 1,046 | 98.12 | 13.88 |
| 63.63 | 73.43 | 68.53 | 142 | 1.88 | 7,537 | 142 | 100.00 | 1.88 |
| NA | NA | NA | 7,537 | 100.00 | NA | NA | NA | NA |
| n = 7,537 | Fuente: Global Energy Monitor — GOGET 2023 | ||||||||
# Display copy of the integer-limit table with percentages rounded to two
# decimals (the limits are already whole numbers).
TDF_Int_gt <- TDF_Enteros %>%
  mutate(hi = round(hi, 2), Hi_asc = round(Hi_asc, 2), Hi_desc = round(Hi_desc, 2))

# Totals row. The percentage total is summed from the UNROUNDED class
# percentages: summing the rounded display values can drift to 99.99/100.01.
fila_total_int <- data.frame(Li = NA, Ls = NA, MC = NA,
                             ni = sum(TDF_Enteros$ni),
                             hi = round(sum(TDF_Enteros$hi), 2),
                             Ni_asc = NA, Ni_desc = NA, Hi_asc = NA, Hi_desc = NA)

# Render Table 2; the appended totals row (last row) is shaded and bold.
bind_rows(TDF_Int_gt, fila_total_int) %>%
  gt() %>%
  tab_header(title = md("**Tabla N. 2**"),
             subtitle = md("Distribución de frecuencias de latitud geográfica — límites enteros")) %>%
  cols_label(Li = md("**Lím. Inf (°)**"), Ls = md("**Lím. Sup (°)**"),
             MC = md("**Marca clase**"), ni = md("**nᵢ**"),
             hi = md("**hᵢ (%)**"), Ni_asc = md("**Nᵢ ↑**"), Ni_desc = md("**Nᵢ ↓**"),
             Hi_asc = md("**Hᵢ ↑ (%)**"), Hi_desc = md("**Hᵢ ↓ (%)**")) %>%
  cols_align(align = "center", columns = everything()) %>%
  fmt_number(columns = c(Li, Ls, MC, hi, Hi_asc, Hi_desc), decimals = 2) %>%
  fmt_number(columns = c(ni, Ni_asc, Ni_desc), decimals = 0, use_seps = TRUE) %>%
  tab_source_note(fuente_nota) %>%
  tab_style(style = cell_fill(color = "#F5F5F5"),
            locations = cells_body(rows = nrow(TDF_Int_gt) + 1)) %>%
  tab_style(style = cell_text(weight = "bold"),
            locations = cells_body(rows = nrow(TDF_Int_gt) + 1)) %>%
  estilo_gt()
| Tabla N. 2 | ||||||||
| Distribución de frecuencias de latitud geográfica — límites enteros | ||||||||
| Lím. Inf (°) | Lím. Sup (°) | Marca clase | nᵢ | hᵢ (%) | Nᵢ ↑ | Nᵢ ↓ | Hᵢ ↑ (%) | Hᵢ ↓ (%) |
|---|---|---|---|---|---|---|---|---|
| −60.00 | −40.00 | −50.00 | 79 | 1.05 | 79 | 7,537 | 1.05 | 100.00 |
| −40.00 | −20.00 | −30.00 | 248 | 3.29 | 327 | 7,458 | 4.34 | 98.95 |
| −20.00 | 0.00 | −10.00 | 309 | 4.10 | 636 | 7,210 | 8.44 | 95.66 |
| 0.00 | 20.00 | 10.00 | 956 | 12.68 | 1,592 | 6,901 | 21.12 | 91.56 |
| 20.00 | 40.00 | 30.00 | 2,971 | 39.42 | 4,563 | 5,945 | 60.54 | 78.88 |
| 40.00 | 60.00 | 50.00 | 2,666 | 35.37 | 7,229 | 2,974 | 95.91 | 39.46 |
| 60.00 | 80.00 | 70.00 | 308 | 4.09 | 7,537 | 308 | 100.00 | 4.09 |
| NA | NA | NA | 7,537 | 100.00 | NA | NA | NA | NA |
| n = 7,537 | Fuente: Global Energy Monitor — GOGET 2023 | ||||||||
# ---- Shared chart styling ----------------------------------------------
color_barras <- "#2E86C1"
color_ojiva1 <- "#2E86C1"
color_ojiva2 <- "#C0392B"

# Base theme for the bar charts: no legend, slanted x labels, horizontal
# grid lines only, white background.
tema_base <- theme_minimal(base_size = 12) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 13),
    plot.caption = element_text(color = "#888888", size = 9, hjust = 0),
    axis.title = element_text(face = "bold", size = 11),
    axis.text.x = element_text(angle = 25, hjust = 1, size = 8),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "#EEEEEE"),
    panel.grid.minor = element_blank(),
    plot.background = element_rect(fill = "white", color = NA)
  )

# Integer-limit classes labelled "Li — Ls". The factor levels are given in
# table order so the bars are not re-sorted alphabetically.
df_graf <- TDF_Enteros %>%
  mutate(intervalo = factor(paste0(Li, " — ", Ls),
                            levels = paste0(Li, " — ", Ls)))

# ---- Chart 1: absolute frequency per class -----------------------------
# trim = TRUE stops format() from left-padding the labels to a common width,
# which would otherwise shift the shorter labels off-centre over their bars.
ggplot(df_graf, aes(x = intervalo, y = ni)) +
  geom_col(fill = color_barras, color = "white", width = 0.95) +
  geom_text(aes(label = format(ni, big.mark = ",", trim = TRUE)),
            vjust = -0.4, size = 3, fontface = "bold") +
  scale_y_continuous(labels = label_comma(),
                     expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Gráfica N. 1: Distribución de cantidad de unidades petroleras por latitud",
       x = "Latitud (°)", y = "Frecuencia Absoluta (nᵢ)",
       caption = fuente_nota) +
  tema_base

# ---- Chart 2: relative frequency (%) per class -------------------------
ggplot(df_graf, aes(x = intervalo, y = hi)) +
  geom_col(fill = color_barras, color = "white", width = 0.95) +
  geom_text(aes(label = paste0(round(hi, 2), "%")),
            vjust = -0.4, size = 3, fontface = "bold") +
  scale_y_continuous(labels = function(x) paste0(x, "%"),
                     expand = expansion(mult = c(0, 0.12))) +
  labs(title = "Gráfica N. 2: Distribución porcentual de unidades petroleras por latitud",
       x = "Latitud (°)", y = "Frecuencia Relativa (%)",
       caption = fuente_nota) +
  tema_base

# ---- Chart 3: horizontal boxplot of the raw latitudes ------------------
ggplot(data.frame(x = Variable), aes(x = x)) +
  geom_boxplot(fill = color_barras, color = "#1A5276",
               outlier.color = "#C0392B", outlier.size = 1.5) +
  labs(title = "Gráfica N. 3: Distribución de la latitud en unidades petroleras",
       x = "Latitud (°)", y = "",
       caption = fuente_nota) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_text(face = "bold", size = 13),
    plot.caption = element_text(color = "#888888", size = 9),
    plot.background = element_rect(fill = "white", color = NA)
  )

# ---- Chart 4: ascending and descending ogives --------------------------
# The ascending curve is plotted at the upper class limits and the
# descending curve at the lower class limits.
ojiva_df <- data.frame(
  x = c(TDF_Enteros$Ls, TDF_Enteros$Li),
  y = c(TDF_Enteros$Hi_asc, TDF_Enteros$Hi_desc),
  tipo = rep(c("Ascendente", "Descendente"), each = K_real)
)
ggplot(ojiva_df, aes(x = x, y = y, color = tipo, group = tipo)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_color_manual(values = c("Ascendente" = color_ojiva1,
                                "Descendente" = color_ojiva2)) +
  scale_y_continuous(labels = function(x) paste0(x, "%"),
                     limits = c(0, 105),
                     expand = expansion(mult = c(0, 0.05))) +
  labs(title = "Gráfica N. 4: Ojivas ascendente y descendente de latitud en unidades petroleras",
       x = "Latitud (°)", y = "Frecuencia Acumulada (%)",
       color = NULL, caption = fuente_nota) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 13),
    plot.caption = element_text(color = "#888888", size = 9),
    legend.position = "bottom",
    plot.background = element_rect(fill = "white", color = NA)
  )
La variable Latitud es cuantitativa continua de escala de razón, por lo que se calculan todos los indicadores: tendencia central, dispersión, forma y valores atípicos.
# ---- Summary indicators ------------------------------------------------
# Central tendency.
media <- round(mean(Variable), 2)
mediana <- round(median(Variable), 2)
# Mode approximated by the class mark of the most frequent integer-limit class.
moda_val <- TDF_Enteros$MC[which.max(TDF_Enteros$ni)]

# Dispersion.
varianza <- round(var(Variable), 2)
sd_val <- round(sd(Variable), 2)
# Coefficient of variation from the unrounded mean and sd; dividing the
# already-rounded values compounds rounding error. abs() keeps it positive
# when the mean is negative.
cv <- round((sd(Variable) / abs(mean(Variable))) * 100, 2)

# Shape. Both estimators use type = 2 so skewness and kurtosis share the same
# bias-corrected definition (kurtosis previously used e1071's default type 3).
# NOTE: values can differ in the last decimals from output rendered earlier.
asim <- round(skewness(Variable, type = 2), 4)
kurt <- round(kurtosis(Variable, type = 2), 4)

# Outliers by the 1.5 * IQR rule.
Q1 <- quantile(Variable, 0.25)
Q3 <- quantile(Variable, 0.75)
IQR_val <- Q3 - Q1
outliers_data <- Variable[Variable < (Q1 - 1.5 * IQR_val) | Variable > (Q3 + 1.5 * IQR_val)]
num_out <- length(outliers_data)
# Text "count [min; max]" of the outliers, or a fixed label when there are none.
out_txt <- if (num_out > 0) {
  paste0(num_out, " [", round(min(outliers_data), 2),
         "; ", round(max(outliers_data), 2), "]")
} else {
  "0 [Sin outliers]"
}

# One-row summary rendered as Table 3.
data.frame(
  Variable = "Latitud (°)",
  Rango = paste0("[", round(min(Variable), 2), "; ", round(max(Variable), 2), "]"),
  Media = media,
  Mediana = mediana,
  Moda = round(moda_val, 2),
  Varianza = varianza,
  Desv_Est = sd_val,
  CV = cv,
  Asimetria = asim,
  Curtosis = kurt,
  Outliers = out_txt,
  check.names = FALSE
) %>%
  gt() %>%
  tab_header(
    title = md("**Tabla N°3 — Indicadores Estadísticos: Latitud Geográfica de Yacimientos**")
  ) %>%
  cols_label(
    Variable = md("**Variable**"),
    Rango = md("**Rango**"),
    Media = md("**Media (X̄)**"),
    Mediana = md("**Mediana (Me)**"),
    Moda = md("**Moda (Mo)**"),
    Varianza = md("**Varianza (S²)**"),
    Desv_Est = md("**Desv. Est. (S)**"),
    CV = md("**C.V. (%)**"),
    Asimetria = md("**Asimetría (As)**"),
    Curtosis = md("**Curtosis (K)**"),
    Outliers = md("**Outliers [Intervalo]**")
  ) %>%
  cols_align(align = "center", columns = everything()) %>%
  tab_source_note("Autor: Grupo 5") %>%
  estilo_gt()
| Tabla N°3 — Indicadores Estadísticos: Latitud Geográfica de Yacimientos | ||||||||||
| Variable | Rango | Media (X̄) | Mediana (Me) | Moda (Mo) | Varianza (S²) | Desv. Est. (S) | C.V. (%) | Asimetría (As) | Curtosis (K) | Outliers [Intervalo] |
|---|---|---|---|---|---|---|---|---|---|---|
| Latitud (°) | [-53.97; 73.43] | 32.25 | 32.53 | 30 | 521.16 | 22.83 | 70.79 | -1.2222 | 1.6975 | 430 [-53.97; -7.66] |
| Autor: Grupo 5 | ||||||||||
La variable latitud geográfica de los yacimientos de petróleo y gas presenta una media de 32.25° y una mediana de 32.53°; que la media sea menor que la mediana, junto con un coeficiente de asimetría de −1.22, indica una distribución con asimetría negativa (cola hacia la izquierda). La desviación estándar de 22.83° refleja una dispersión considerable entre hemisferios, consistente con la distribución global de los yacimientos. El coeficiente de variación de 70.79% confirma la heterogeneidad geográfica de la muestra. La clase modal se ubica alrededor de 30°, que corresponde a la zona de mayor concentración de yacimientos registrados. Se identificaron 430 valores atípicos, todos en el extremo inferior de la distribución (latitudes entre −53.97° y −7.66°).