🧱 PARTE 1:
Construcción y análisis técnico de la base
0. Verificación de
archivos
datos_dir <- params$datos_dir
files_expected <- c(
"clean_precios_carbon.csv",
"clean_precios_gas_natural.csv",
"clean_precios_gasolina.csv",
"clean_precios_petroleo.csv",
"IPC.csv"
)
file_paths <- file.path(datos_dir, files_expected)
verif <- tibble(Archivo = files_expected, Ruta = file_paths, Existe = file.exists(file_paths))
kable(verif, caption = "Verificación de archivos requeridos") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Verificación de archivos requeridos
|
Archivo
|
Ruta
|
Existe
|
|
clean_precios_carbon.csv
|
./clean_precios_carbon.csv
|
TRUE
|
|
clean_precios_gas_natural.csv
|
./clean_precios_gas_natural.csv
|
TRUE
|
|
clean_precios_gasolina.csv
|
./clean_precios_gasolina.csv
|
TRUE
|
|
clean_precios_petroleo.csv
|
./clean_precios_petroleo.csv
|
TRUE
|
|
IPC.csv
|
./IPC.csv
|
TRUE
|
datos_dir <- params$datos_dir
files_expected <- c(
"clean_precios_carbon.csv",
"clean_precios_gas_natural.csv",
"clean_precios_gasolina.csv",
"clean_precios_petroleo.csv",
"IPC.csv"
)
file_paths <- file.path(datos_dir, files_expected)
verif <- tibble(Archivo = files_expected, Ruta = file_paths, Existe = file.exists(file_paths))
kable(verif, caption = "Verificación de archivos requeridos") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Verificación de archivos requeridos
|
Archivo
|
Ruta
|
Existe
|
|
clean_precios_carbon.csv
|
./clean_precios_carbon.csv
|
TRUE
|
|
clean_precios_gas_natural.csv
|
./clean_precios_gas_natural.csv
|
TRUE
|
|
clean_precios_gasolina.csv
|
./clean_precios_gasolina.csv
|
TRUE
|
|
clean_precios_petroleo.csv
|
./clean_precios_petroleo.csv
|
TRUE
|
|
IPC.csv
|
./IPC.csv
|
TRUE
|
1. Lectura de
datos
read_maybe <- function(name) {
p <- file.path(datos_dir, name)
if (file.exists(p)) return(read_csv(p, show_col_types = FALSE))
if (file.exists(name)) return(read_csv(name, show_col_types = FALSE))
stop(glue("No se encontró el archivo: {name}"))
}
precio_carbon <- read_maybe("clean_precios_carbon.csv")
precio_gas_natural <- read_maybe("clean_precios_gas_natural.csv")
precio_gasolina <- read_maybe("clean_precios_gasolina.csv")
precio_petroleo <- read_maybe("clean_precios_petroleo.csv")
ipc <- read_maybe("IPC.csv")
# Vista rápida
list(
"Carbon" = head(precio_carbon),
"Gas Natural" = head(precio_gas_natural),
"Gasolina" = head(precio_gasolina),
"Petroleo" = head(precio_petroleo)
)
## $Carbon
## # A tibble: 6 × 2
## fecha precio_carbon
## <date> <dbl>
## 1 2000-01-01 56.8
## 2 2000-01-02 84.1
## 3 2000-01-03 70.7
## 4 2000-01-05 79.7
## 5 2000-01-06 91.5
## 6 2000-01-07 69.8
##
## $`Gas Natural`
## # A tibble: 6 × 2
## fecha precio_gas_natural
## <date> <dbl>
## 1 2000-01-01 134.
## 2 2000-01-02 103.
## 3 2000-01-03 72.7
## 4 2000-01-04 102.
## 5 2000-01-05 134.
## 6 2000-01-06 78.6
##
## $Gasolina
## # A tibble: 6 × 2
## fecha precio_gasolina
## <date> <dbl>
## 1 2000-01-01 85.2
## 2 2000-01-02 58.8
## 3 2000-01-03 62.4
## 4 2000-01-04 48.6
## 5 2000-01-05 43.8
## 6 2000-01-06 41.9
##
## $Petroleo
## # A tibble: 6 × 2
## fecha precio_petroleo
## <date> <dbl>
## 1 2000-01-01 77.7
## 2 2000-01-02 108.
## 3 2000-01-03 119.
## 4 2000-01-04 106.
## 5 2000-01-05 115.
## 6 2000-01-06 101.
2. Función para
completar fechas faltantes (diario)
explicitar_fechas <- function(df, date_col = "fecha", price_col = NULL, item_name = NULL) {
if (!date_col %in% names(df)) {
date_candidates <- names(df)[str_detect(names(df), regex("fecha|date|periodo|mes", ignore_case = TRUE))]
if (length(date_candidates) == 0) stop("No se encontró ninguna columna de fecha en el dataset.")
date_col <- date_candidates[1]
}
if (is.null(price_col) || !price_col %in% names(df)) {
price_candidates <- names(df)[
str_detect(names(df), regex("precio|valor|price|cost", ignore_case = TRUE))
]
if (length(price_candidates) == 0) stop(glue("No se encontró columna de precio en '{item_name}'."))
price_col <- price_candidates[1]
}
df2 <- df %>% mutate(date = as.Date(!!sym(date_col))) %>% arrange(date)
seq_full <- tibble(date = seq.Date(min(df2$date, na.rm = TRUE),
max(df2$date, na.rm = TRUE), by = "day"))
out <- seq_full %>%
left_join(df2, by = "date") %>%
mutate(
missing_price = is.na(!!sym(price_col)),
item = item_name
)
names(out)[names(out) == price_col] <- "price"
out %>% select(date, price, item, everything())
}
ejemplo_explicito <- explicitar_fechas(precio_carbon, "fecha", "precio", "carbon")
head(ejemplo_explicito)
## # A tibble: 6 × 5
## date price item fecha missing_price
## <date> <dbl> <chr> <date> <lgl>
## 1 2000-01-01 56.8 carbon 2000-01-01 FALSE
## 2 2000-01-02 84.1 carbon 2000-01-02 FALSE
## 3 2000-01-03 70.7 carbon 2000-01-03 FALSE
## 4 2000-01-04 NA carbon NA TRUE
## 5 2000-01-05 79.7 carbon 2000-01-05 FALSE
## 6 2000-01-06 91.5 carbon 2000-01-06 FALSE
3. Aplicar a todos
los datasets
carb_explicita <- explicitar_fechas(precio_carbon, "fecha", "precio", "carbon")
gasnat_explicita <- explicitar_fechas(precio_gas_natural, "fecha", "precio", "gas_natural")
gasol_explicita <- explicitar_fechas(precio_gasolina, "fecha", "precio", "gasolina")
pet_explicita <- explicitar_fechas(precio_petroleo, "fecha", "precio", "petroleo")
resumen_explicitos <- tibble(
Combustible = c("Carbon", "Gas Natural", "Gasolina", "Petroleo"),
Fechas_iniciales= c(min(carb_explicita$date), min(gasnat_explicita$date),
min(gasol_explicita$date), min(pet_explicita$date)),
Fechas_finales = c(max(carb_explicita$date), max(gasnat_explicita$date),
max(gasol_explicita$date), max(pet_explicita$date)),
Total_dias = c(nrow(carb_explicita), nrow(gasnat_explicita),
nrow(gasol_explicita), nrow(pet_explicita))
)
kable(resumen_explicitos, caption = "Cobertura temporal y número de observaciones por combustible")
Cobertura temporal y número de observaciones por
combustible
| Carbon |
2000-01-01 |
2024-01-01 |
8767 |
| Gas Natural |
2000-01-01 |
2024-01-01 |
8767 |
| Gasolina |
2000-01-01 |
2024-01-01 |
8767 |
| Petroleo |
2000-01-01 |
2024-01-01 |
8767 |
4. Unir datasets
normizar <- function(df, item_name) {
if ("fecha" %in% names(df) && !"date" %in% names(df)) {
df <- df %>% rename(date = fecha)
}
df <- df[, !duplicated(names(df))]
price_col <- names(df)[str_detect(names(df), regex("precio|valor|price", ignore_case = TRUE))][1]
if (is.na(price_col)) stop("No se encontró columna de precio en ", item_name)
if (price_col != "price") {
df <- df %>% rename(price = !!sym(price_col))
}
df %>%
mutate(item = item_name, date = as.Date(date)) %>%
select(date, price, item, everything()) %>%
arrange(date)
}
consolidado_daily <- bind_rows(
normizar(carb_explicita, "carbon"),
normizar(gasnat_explicita, "gas_natural"),
normizar(gasol_explicita, "gasolina"),
normizar(pet_explicita, "petroleo")
) %>% arrange(item, date)
consolidado_daily %>%
group_by(item) %>%
summarise(
total_obs = n(),
missing = sum(is.na(price)),
pct_missing = round(100 * mean(is.na(price)), 2),
.groups = "drop"
) %>%
kable(caption = "Resumen de datos consolidados diarios (faltantes)")
Resumen de datos consolidados diarios (faltantes)
| carbon |
8767 |
188 |
2.14 |
| gas_natural |
8767 |
188 |
2.14 |
| gasolina |
8767 |
188 |
2.14 |
| petroleo |
8767 |
188 |
2.14 |
5. Agregación mensual
(corte hasta 2023-12-31)
consolidado_daily <- consolidado_daily %>%
mutate(year = year(date), month = month(date), ym = floor_date(date, "month"))
monthly_avg <- consolidado_daily %>%
group_by(item, ym) %>%
summarise(price_month_avg = mean(price, na.rm = TRUE), .groups = "drop")
monthly_filtered <- monthly_avg %>%
filter(ym >= as.Date("2000-01-01") & ym <= as.Date("2023-12-31"))
head(monthly_filtered) %>%
kable(caption = "Ejemplo de precios mensuales promedio (2000–2023)")
Ejemplo de precios mensuales promedio (2000–2023)
| carbon |
2000-01-01 |
77.14800 |
| carbon |
2000-02-01 |
80.84513 |
| carbon |
2000-03-01 |
76.15327 |
| carbon |
2000-04-01 |
74.68102 |
| carbon |
2000-05-01 |
77.61114 |
| carbon |
2000-06-01 |
78.32986 |
6. IPC (corte a
2023)
library(dplyr)
library(lubridate)
library(stringr)
library(readr)
library(scales)
library(knitr)
library(kableExtra)
# ---------------------------
# 1️⃣ Preparar IPC
# ---------------------------
ipc_val_col <- names(ipc)[str_detect(names(ipc), regex("ipc|indice|valor|index|value", ignore_case = TRUE))][1]
ipc_date_col <- names(ipc)[str_detect(names(ipc), regex("fecha|mes|anio|año|periodo|period|date", ignore_case = TRUE))][1]
ipc_prepared <- ipc %>%
rename(ipc_value = !!sym(ipc_val_col)) %>%
mutate(
ym = as.Date(ifelse(
str_detect(as.character(.data[[ipc_date_col]]), "^\\d{4}-\\d{2}$"),
paste0(.data[[ipc_date_col]], "-01"),
as.character(floor_date(parse_date_time(as.character(.data[[ipc_date_col]]),
orders = c("ym","my","Ym","Y-m","Y-m-d","ymd")),
"month"))
))
) %>%
select(ym, ipc_value) %>%
filter(!is.na(ym), !is.na(ipc_value)) %>%
arrange(ym) %>%
filter(ym <= as.Date("2023-12-31"))
# ---------------------------
# 2️⃣ Función para pasar precios nominales a reales
# ---------------------------
transformar_a_reales <- function(df, ipc_df, base_ym="2023-12", price_col="price_month_avg") {
base_first <- as.Date(paste0(base_ym, "-01"))
ipc_base <- ipc_df %>% filter(ym == base_first) %>% pull(ipc_value)
if(length(ipc_base)==0) stop("No se encontró IPC para la base ", base_ym)
df %>%
left_join(ipc_df, by="ym") %>%
mutate(price_real = .data[[price_col]] * (ipc_base / ipc_value))
}
# ---------------------------
# 3️⃣ Transformar todos los combustibles
# ---------------------------
transformados_por_base <- monthly_filtered %>%
group_by(item) %>%
group_modify(~ transformar_a_reales(.x, ipc_prepared)) %>%
ungroup()
# ---------------------------
# 4️⃣ Mostrar tabla ejemplo para Carbon
# ---------------------------
tabla_carbon <- transformados_por_base %>%
filter(item=="carbon") %>%
select(
Combustible = item,
Mes = ym,
Precio_Nominal = price_month_avg,
Precio_Real = price_real
) %>%
slice(1:12) %>%
mutate(
Precio_Nominal = scales::comma(Precio_Nominal, accuracy=0.01),
Precio_Real = scales::comma(Precio_Real, accuracy=0.01),
Mes = format(Mes, "%b %Y")
)
kable(tabla_carbon, caption="Ejemplo precios nominales vs reales (Carbon, base dic-2023)", align="c")
Ejemplo precios nominales vs reales (Carbon, base
dic-2023)
| carbon |
ene. 2000 |
77.15 |
263.64 |
| carbon |
feb. 2000 |
80.85 |
270.05 |
| carbon |
mar. 2000 |
76.15 |
250.13 |
| carbon |
abr. 2000 |
74.68 |
242.86 |
| carbon |
may. 2000 |
77.61 |
251.08 |
| carbon |
jun. 2000 |
78.33 |
253.47 |
| carbon |
jul. 2000 |
77.47 |
250.73 |
| carbon |
ago. 2000 |
74.42 |
240.13 |
| carbon |
sept. 2000 |
74.68 |
239.98 |
| carbon |
oct. 2000 |
77.52 |
248.69 |
| carbon |
nov. 2000 |
78.78 |
251.91 |
| carbon |
dic. 2000 |
74.21 |
236.20 |
🎓 PARTE 2: Taller Big
Data - Análisis y visualización
📊 1. Estadísticas
descriptivas (2000–2023)
stats <- transformados_por_base %>%
group_by(item) %>%
summarise(
Min_Nominal = round(min(price_month_avg, na.rm = TRUE), 2),
Max_Nominal = round(max(price_month_avg, na.rm = TRUE), 2),
Promedio_Nominal = round(mean(price_month_avg, na.rm = TRUE), 2),
SD_Nominal = round(sd(price_month_avg, na.rm = TRUE), 2),
Min_Real = round(min(price_real, na.rm = TRUE), 2),
Max_Real = round(max(price_real, na.rm = TRUE), 2),
Promedio_Real = round(mean(price_real, na.rm = TRUE), 2),
SD_Real = round(sd(price_real, na.rm = TRUE), 2),
.groups = "drop"
)
# Mostrar la tabla profesional
stats %>%
kable(
col.names = c(
"Combustible",
"Mín.", "Máx.", "Prom.", "SD", # Nominal
"Mín.", "Máx.", "Prom.", "SD" # Real
),
align = "c",
caption = "📊 Estadísticas descriptivas por combustible (2000–2023)"
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE, font_size = 13) %>%
add_header_above(c(" " = 1, "Nominal" = 4, "Real" = 4)) %>% # La suma debe ser 9
column_spec(1, bold = TRUE, color = "#2C3E50")
📊 Estadísticas descriptivas por combustible (2000–2023)
|
|
Nominal
|
Real
|
|
Combustible
|
Mín.
|
Máx.
|
Prom.
|
SD
|
Mín.
|
Máx.
|
Prom.
|
SD
|
|
carbon
|
74.21
|
123.16
|
96.96
|
12.70
|
116.17
|
270.05
|
178.50
|
32.56
|
|
gas_natural
|
98.65
|
137.71
|
118.33
|
8.31
|
126.66
|
366.96
|
222.29
|
55.37
|
|
gasolina
|
61.65
|
158.90
|
108.97
|
25.60
|
150.71
|
238.32
|
193.92
|
14.11
|
|
petroleo
|
91.25
|
129.80
|
108.09
|
8.00
|
115.90
|
327.85
|
202.62
|
49.04
|
ggplot(transformados_por_base, aes(x = ym, y = price_real, color = item)) +
geom_line(size = 1) +
scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
scale_y_continuous(labels = scales::comma) +
labs(
title = "📉 Evolución de precios reales de combustibles (2000–2023)",
subtitle = "Ajustados por IPC, base diciembre 2023",
x = "Año",
y = "Precio real promedio mensual",
color = "Combustible"
) +
theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", color = "#2C3E50"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
)

⏳ 3. Serie temporal:
Precios reales por combustible (2000–2023)
reales_long <- transformados_por_base %>% select(ym, item, price_real)
ggplot(reales_long, aes(x = ym, y = price_real, color = item)) +
geom_line(size = 1) +
labs(title = "Evolución de precios reales por combustible (2000–2023, base dic-2023)",
x = "Fecha", y = "Precio real (COP)", color = "Combustible") +
scale_y_continuous(labels = comma_format(big.mark = ".", decimal.mark = ",")) +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 year") +
theme_minimal(base_size = 13) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

💾 4. Exportar
resultados (2000–2023)
output_folder <- "output"
if(!dir.exists(output_folder)) dir.create(output_folder)
outfile <- file.path(output_folder, "precios_reales_dic2023_2000-2023.csv")
write_csv(transformados_por_base, outfile)
tibble(Archivos_Generados = list.files(output_folder, full.names = TRUE)) %>%
kable(caption = "Archivos generados")
Archivos generados
| output/precios_reales_dic2023_2000-2023.csv |