knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readr)
library(knitr)
library(glue)
library(ggplot2)
library(scales)
## 
## Adjuntando el paquete: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(tidyr)
library(stringr)
library(openxlsx)
library(kableExtra)
## 
## Adjuntando el paquete: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
palette_combustibles <- c(
"carbon" = "#1f77b4",
"gas_natural" = "#ff7f0e",
"gasolina" = "#2ca02c",
"petroleo" = "#d62728"
)

1 🧱 PARTE 1: Construcción y análisis técnico de la base

1.1 0. Verificación de archivos

datos_dir <- params$datos_dir
files_expected <- c(
"clean_precios_carbon.csv",
"clean_precios_gas_natural.csv",
"clean_precios_gasolina.csv",
"clean_precios_petroleo.csv",
"IPC.csv"
)
file_paths <- file.path(datos_dir, files_expected)
verif <- tibble(Archivo = files_expected, Ruta = file_paths, Existe = file.exists(file_paths))

kable(verif, caption = "Verificación de archivos requeridos") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Verificación de archivos requeridos
Archivo Ruta Existe
clean_precios_carbon.csv ./clean_precios_carbon.csv TRUE
clean_precios_gas_natural.csv ./clean_precios_gas_natural.csv TRUE
clean_precios_gasolina.csv ./clean_precios_gasolina.csv TRUE
clean_precios_petroleo.csv ./clean_precios_petroleo.csv TRUE
IPC.csv ./IPC.csv TRUE
datos_dir <- params$datos_dir
files_expected <- c(
"clean_precios_carbon.csv",
"clean_precios_gas_natural.csv",
"clean_precios_gasolina.csv",
"clean_precios_petroleo.csv",
"IPC.csv"
)
file_paths <- file.path(datos_dir, files_expected)
verif <- tibble(Archivo = files_expected, Ruta = file_paths, Existe = file.exists(file_paths))

kable(verif, caption = "Verificación de archivos requeridos") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)
Verificación de archivos requeridos
Archivo Ruta Existe
clean_precios_carbon.csv ./clean_precios_carbon.csv TRUE
clean_precios_gas_natural.csv ./clean_precios_gas_natural.csv TRUE
clean_precios_gasolina.csv ./clean_precios_gasolina.csv TRUE
clean_precios_petroleo.csv ./clean_precios_petroleo.csv TRUE
IPC.csv ./IPC.csv TRUE

1.2 1. Lectura de datos

read_maybe <- function(name) {
p <- file.path(datos_dir, name)
if (file.exists(p)) return(read_csv(p, show_col_types = FALSE))
if (file.exists(name)) return(read_csv(name, show_col_types = FALSE))
stop(glue("No se encontró el archivo: {name}"))
}

precio_carbon      <- read_maybe("clean_precios_carbon.csv")
precio_gas_natural <- read_maybe("clean_precios_gas_natural.csv")
precio_gasolina    <- read_maybe("clean_precios_gasolina.csv")
precio_petroleo    <- read_maybe("clean_precios_petroleo.csv")
ipc                <- read_maybe("IPC.csv")

# Vista rápida

list(
"Carbon"      = head(precio_carbon),
"Gas Natural" = head(precio_gas_natural),
"Gasolina"    = head(precio_gasolina),
"Petroleo"    = head(precio_petroleo)
)
## $Carbon
## # A tibble: 6 × 2
##   fecha      precio_carbon
##   <date>             <dbl>
## 1 2000-01-01          56.8
## 2 2000-01-02          84.1
## 3 2000-01-03          70.7
## 4 2000-01-05          79.7
## 5 2000-01-06          91.5
## 6 2000-01-07          69.8
## 
## $`Gas Natural`
## # A tibble: 6 × 2
##   fecha      precio_gas_natural
##   <date>                  <dbl>
## 1 2000-01-01              134. 
## 2 2000-01-02              103. 
## 3 2000-01-03               72.7
## 4 2000-01-04              102. 
## 5 2000-01-05              134. 
## 6 2000-01-06               78.6
## 
## $Gasolina
## # A tibble: 6 × 2
##   fecha      precio_gasolina
##   <date>               <dbl>
## 1 2000-01-01            85.2
## 2 2000-01-02            58.8
## 3 2000-01-03            62.4
## 4 2000-01-04            48.6
## 5 2000-01-05            43.8
## 6 2000-01-06            41.9
## 
## $Petroleo
## # A tibble: 6 × 2
##   fecha      precio_petroleo
##   <date>               <dbl>
## 1 2000-01-01            77.7
## 2 2000-01-02           108. 
## 3 2000-01-03           119. 
## 4 2000-01-04           106. 
## 5 2000-01-05           115. 
## 6 2000-01-06           101.

1.3 2. Función para completar fechas faltantes (diario)

explicitar_fechas <- function(df, date_col = "fecha", price_col = NULL, item_name = NULL) {
  if (!date_col %in% names(df)) {
    date_candidates <- names(df)[str_detect(names(df), regex("fecha|date|periodo|mes", ignore_case = TRUE))]
    if (length(date_candidates) == 0) stop("No se encontró ninguna columna de fecha en el dataset.")
    date_col <- date_candidates[1]
  }
  if (is.null(price_col) || !price_col %in% names(df)) {
    price_candidates <- names(df)[
      str_detect(names(df), regex("precio|valor|price|cost", ignore_case = TRUE))
    ]
    if (length(price_candidates) == 0) stop(glue("No se encontró columna de precio en '{item_name}'."))
    price_col <- price_candidates[1]
  }

  df2 <- df %>% mutate(date = as.Date(!!sym(date_col))) %>% arrange(date)

  seq_full <- tibble(date = seq.Date(min(df2$date, na.rm = TRUE),
                                     max(df2$date, na.rm = TRUE), by = "day"))

  out <- seq_full %>%
    left_join(df2, by = "date") %>%
    mutate(
      missing_price = is.na(!!sym(price_col)),
      item = item_name
    )

  names(out)[names(out) == price_col] <- "price"
  out %>% select(date, price, item, everything())
}

ejemplo_explicito <- explicitar_fechas(precio_carbon, "fecha", "precio", "carbon")
head(ejemplo_explicito)
## # A tibble: 6 × 5
##   date       price item   fecha      missing_price
##   <date>     <dbl> <chr>  <date>     <lgl>        
## 1 2000-01-01  56.8 carbon 2000-01-01 FALSE        
## 2 2000-01-02  84.1 carbon 2000-01-02 FALSE        
## 3 2000-01-03  70.7 carbon 2000-01-03 FALSE        
## 4 2000-01-04  NA   carbon NA         TRUE         
## 5 2000-01-05  79.7 carbon 2000-01-05 FALSE        
## 6 2000-01-06  91.5 carbon 2000-01-06 FALSE

1.4 3. Aplicar a todos los datasets

carb_explicita   <- explicitar_fechas(precio_carbon, "fecha", "precio", "carbon")
gasnat_explicita <- explicitar_fechas(precio_gas_natural, "fecha", "precio", "gas_natural")
gasol_explicita  <- explicitar_fechas(precio_gasolina, "fecha", "precio", "gasolina")
pet_explicita    <- explicitar_fechas(precio_petroleo, "fecha", "precio", "petroleo")

resumen_explicitos <- tibble(
  Combustible     = c("Carbon", "Gas Natural", "Gasolina", "Petroleo"),
  Fechas_iniciales= c(min(carb_explicita$date), min(gasnat_explicita$date),
                      min(gasol_explicita$date), min(pet_explicita$date)),
  Fechas_finales  = c(max(carb_explicita$date), max(gasnat_explicita$date),
                      max(gasol_explicita$date), max(pet_explicita$date)),
  Total_dias      = c(nrow(carb_explicita), nrow(gasnat_explicita),
                      nrow(gasol_explicita), nrow(pet_explicita))
)
kable(resumen_explicitos, caption = "Cobertura temporal y número de observaciones por combustible")
Cobertura temporal y número de observaciones por combustible
Combustible Fechas_iniciales Fechas_finales Total_dias
Carbon 2000-01-01 2024-01-01 8767
Gas Natural 2000-01-01 2024-01-01 8767
Gasolina 2000-01-01 2024-01-01 8767
Petroleo 2000-01-01 2024-01-01 8767

1.5 4. Unir datasets

normizar <- function(df, item_name) {
  if ("fecha" %in% names(df) && !"date" %in% names(df)) {
    df <- df %>% rename(date = fecha)
  }
  df <- df[, !duplicated(names(df))]
  price_col <- names(df)[str_detect(names(df), regex("precio|valor|price", ignore_case = TRUE))][1]
  if (is.na(price_col)) stop("No se encontró columna de precio en ", item_name)
  if (price_col != "price") {
    df <- df %>% rename(price = !!sym(price_col))
  }
  df %>%
    mutate(item = item_name, date = as.Date(date)) %>%
    select(date, price, item, everything()) %>%
    arrange(date)
}

consolidado_daily <- bind_rows(
  normizar(carb_explicita, "carbon"),
  normizar(gasnat_explicita, "gas_natural"),
  normizar(gasol_explicita, "gasolina"),
  normizar(pet_explicita, "petroleo")
) %>% arrange(item, date)

consolidado_daily %>%
  group_by(item) %>%
  summarise(
    total_obs   = n(),
    missing     = sum(is.na(price)),
    pct_missing = round(100 * mean(is.na(price)), 2),
    .groups = "drop"
  ) %>%
  kable(caption = "Resumen de datos consolidados diarios (faltantes)")
Resumen de datos consolidados diarios (faltantes)
item total_obs missing pct_missing
carbon 8767 188 2.14
gas_natural 8767 188 2.14
gasolina 8767 188 2.14
petroleo 8767 188 2.14

1.6 5. Agregación mensual (corte hasta 2023-12-31)

consolidado_daily <- consolidado_daily %>%
  mutate(year = year(date), month = month(date), ym = floor_date(date, "month"))

monthly_avg <- consolidado_daily %>%
  group_by(item, ym) %>%
  summarise(price_month_avg = mean(price, na.rm = TRUE), .groups = "drop")

monthly_filtered <- monthly_avg %>%
  filter(ym >= as.Date("2000-01-01") & ym <= as.Date("2023-12-31"))

head(monthly_filtered) %>% 
  kable(caption = "Ejemplo de precios mensuales promedio (2000–2023)")
Ejemplo de precios mensuales promedio (2000–2023)
item ym price_month_avg
carbon 2000-01-01 77.14800
carbon 2000-02-01 80.84513
carbon 2000-03-01 76.15327
carbon 2000-04-01 74.68102
carbon 2000-05-01 77.61114
carbon 2000-06-01 78.32986

1.7 6. IPC (corte a 2023)

library(dplyr)
library(lubridate)
library(stringr)
library(readr)
library(scales)
library(knitr)
library(kableExtra)

# ---------------------------
# 1️⃣ Preparar IPC
# ---------------------------

ipc_val_col  <- names(ipc)[str_detect(names(ipc), regex("ipc|indice|valor|index|value", ignore_case = TRUE))][1]
ipc_date_col <- names(ipc)[str_detect(names(ipc), regex("fecha|mes|anio|año|periodo|period|date", ignore_case = TRUE))][1]

ipc_prepared <- ipc %>%
  rename(ipc_value = !!sym(ipc_val_col)) %>%
  mutate(
    ym = as.Date(ifelse(
      str_detect(as.character(.data[[ipc_date_col]]), "^\\d{4}-\\d{2}$"),
      paste0(.data[[ipc_date_col]], "-01"),
      as.character(floor_date(parse_date_time(as.character(.data[[ipc_date_col]]),
                                               orders = c("ym","my","Ym","Y-m","Y-m-d","ymd")),
                              "month"))
    ))
  ) %>%
  select(ym, ipc_value) %>%
  filter(!is.na(ym), !is.na(ipc_value)) %>%
  arrange(ym) %>%
  filter(ym <= as.Date("2023-12-31"))

# ---------------------------
# 2️⃣ Función para pasar precios nominales a reales
# ---------------------------

transformar_a_reales <- function(df, ipc_df, base_ym="2023-12", price_col="price_month_avg") {
  base_first <- as.Date(paste0(base_ym, "-01"))
  ipc_base <- ipc_df %>% filter(ym == base_first) %>% pull(ipc_value)
  if(length(ipc_base)==0) stop("No se encontró IPC para la base ", base_ym)
  
  df %>%
    left_join(ipc_df, by="ym") %>%
    mutate(price_real = .data[[price_col]] * (ipc_base / ipc_value))
}

# ---------------------------
# 3️⃣ Transformar todos los combustibles
# ---------------------------

transformados_por_base <- monthly_filtered %>%
  group_by(item) %>%
  group_modify(~ transformar_a_reales(.x, ipc_prepared)) %>%
  ungroup()

# ---------------------------
# 4️⃣ Mostrar tabla ejemplo para Carbon
# ---------------------------

tabla_carbon <- transformados_por_base %>%
  filter(item=="carbon") %>%
  select(
    Combustible = item,
    Mes = ym,
    Precio_Nominal = price_month_avg,
    Precio_Real = price_real
  ) %>%
  slice(1:12) %>%
  mutate(
    Precio_Nominal = scales::comma(Precio_Nominal, accuracy=0.01),
    Precio_Real = scales::comma(Precio_Real, accuracy=0.01),
    Mes = format(Mes, "%b %Y")
  )

kable(tabla_carbon, caption="Ejemplo precios nominales vs reales (Carbon, base dic-2023)", align="c")
Ejemplo precios nominales vs reales (Carbon, base dic-2023)
Combustible Mes Precio_Nominal Precio_Real
carbon ene. 2000 77.15 263.64
carbon feb. 2000 80.85 270.05
carbon mar. 2000 76.15 250.13
carbon abr. 2000 74.68 242.86
carbon may. 2000 77.61 251.08
carbon jun. 2000 78.33 253.47
carbon jul. 2000 77.47 250.73
carbon ago. 2000 74.42 240.13
carbon sept. 2000 74.68 239.98
carbon oct. 2000 77.52 248.69
carbon nov. 2000 78.78 251.91
carbon dic. 2000 74.21 236.20

2 🎓 PARTE 2: Taller Big Data - Análisis y visualización

2.1 📊 1. Estadísticas descriptivas (2000–2023)

stats <- transformados_por_base %>%
  group_by(item) %>%
  summarise(
    Min_Nominal      = round(min(price_month_avg, na.rm = TRUE), 2),
    Max_Nominal      = round(max(price_month_avg, na.rm = TRUE), 2),
    Promedio_Nominal = round(mean(price_month_avg, na.rm = TRUE), 2),
    SD_Nominal       = round(sd(price_month_avg, na.rm = TRUE), 2),
    Min_Real         = round(min(price_real, na.rm = TRUE), 2),
    Max_Real         = round(max(price_real, na.rm = TRUE), 2),
    Promedio_Real    = round(mean(price_real, na.rm = TRUE), 2),
    SD_Real          = round(sd(price_real, na.rm = TRUE), 2),
    .groups = "drop"
  )

# Mostrar la tabla profesional
stats %>%
  kable(
    col.names = c(
      "Combustible", 
      "Mín.", "Máx.", "Prom.", "SD",  # Nominal
      "Mín.", "Máx.", "Prom.", "SD"   # Real
    ),
    align = "c",
    caption = "📊 Estadísticas descriptivas por combustible (2000–2023)"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, font_size = 13) %>%
  add_header_above(c(" " = 1, "Nominal" = 4, "Real" = 4)) %>%  # La suma debe ser 9
  column_spec(1, bold = TRUE, color = "#2C3E50")
📊 Estadísticas descriptivas por combustible (2000–2023)
Nominal
Real
Combustible Mín. Máx. Prom. SD Mín. Máx. Prom. SD
carbon 74.21 123.16 96.96 12.70 116.17 270.05 178.50 32.56
gas_natural 98.65 137.71 118.33 8.31 126.66 366.96 222.29 55.37
gasolina 61.65 158.90 108.97 25.60 150.71 238.32 193.92 14.11
petroleo 91.25 129.80 108.09 8.00 115.90 327.85 202.62 49.04
ggplot(transformados_por_base, aes(x = ym, y = price_real, color = item)) +
  geom_line(size = 1) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "📉 Evolución de precios reales de combustibles (2000–2023)",
    subtitle = "Ajustados por IPC, base diciembre 2023",
    x = "Año",
    y = "Precio real promedio mensual",
    color = "Combustible"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", color = "#2C3E50"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

2.2 ⏳ 3. Serie temporal: Precios reales por combustible (2000–2023)

reales_long <- transformados_por_base %>% select(ym, item, price_real)
ggplot(reales_long, aes(x = ym, y = price_real, color = item)) +
  geom_line(size = 1) +
  labs(title = "Evolución de precios reales por combustible (2000–2023, base dic-2023)",
       x = "Fecha", y = "Precio real (COP)", color = "Combustible") +
  scale_y_continuous(labels = comma_format(big.mark = ".", decimal.mark = ",")) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 year") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.3 💾 4. Exportar resultados (2000–2023)

output_folder <- "output"
if(!dir.exists(output_folder)) dir.create(output_folder)

outfile <- file.path(output_folder, "precios_reales_dic2023_2000-2023.csv")
write_csv(transformados_por_base, outfile)

tibble(Archivos_Generados = list.files(output_folder, full.names = TRUE)) %>% 
  kable(caption = "Archivos generados")
Archivos generados
Archivos_Generados
output/precios_reales_dic2023_2000-2023.csv