1 🧱 PARTE 1: Construcción y análisis técnico de la base
2 🎓 PARTE 2: Taller Big Data - Análisis y visualización

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(readr)
library(knitr)
library(glue)
library(ggplot2)
library(scales)

## 
## Adjuntando el paquete: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(tidyr)
library(stringr)
library(openxlsx)
library(kableExtra)

## 
## Adjuntando el paquete: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

palette_combustibles <- c(
"carbon" = "#1f77b4",
"gas_natural" = "#ff7f0e",
"gasolina" = "#2ca02c",
"petroleo" = "#d62728"
)

1 🧱 PARTE 1: Construcción y análisis técnico de la base

1.1 0. Verificación de archivos

datos_dir <- params$datos_dir
files_expected <- c(
"clean_precios_carbon.csv",
"clean_precios_gas_natural.csv",
"clean_precios_gasolina.csv",
"clean_precios_petroleo.csv",
"IPC.csv"
)
file_paths <- file.path(datos_dir, files_expected)
verif <- tibble(Archivo = files_expected, Ruta = file_paths, Existe = file.exists(file_paths))

kable(verif, caption = "Verificación de archivos requeridos") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)

Verificación de archivos requeridos
Archivo	Ruta	Existe
clean_precios_carbon.csv	./clean_precios_carbon.csv	TRUE
clean_precios_gas_natural.csv	./clean_precios_gas_natural.csv	TRUE
clean_precios_gasolina.csv	./clean_precios_gasolina.csv	TRUE
clean_precios_petroleo.csv	./clean_precios_petroleo.csv	TRUE
IPC.csv	./IPC.csv	TRUE

datos_dir <- params$datos_dir
files_expected <- c(
"clean_precios_carbon.csv",
"clean_precios_gas_natural.csv",
"clean_precios_gasolina.csv",
"clean_precios_petroleo.csv",
"IPC.csv"
)
file_paths <- file.path(datos_dir, files_expected)
verif <- tibble(Archivo = files_expected, Ruta = file_paths, Existe = file.exists(file_paths))

kable(verif, caption = "Verificación de archivos requeridos") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = FALSE)

Verificación de archivos requeridos
Archivo	Ruta	Existe
clean_precios_carbon.csv	./clean_precios_carbon.csv	TRUE
clean_precios_gas_natural.csv	./clean_precios_gas_natural.csv	TRUE
clean_precios_gasolina.csv	./clean_precios_gasolina.csv	TRUE
clean_precios_petroleo.csv	./clean_precios_petroleo.csv	TRUE
IPC.csv	./IPC.csv	TRUE

1.2 1. Lectura de datos

read_maybe <- function(name) {
p <- file.path(datos_dir, name)
if (file.exists(p)) return(read_csv(p, show_col_types = FALSE))
if (file.exists(name)) return(read_csv(name, show_col_types = FALSE))
stop(glue("No se encontró el archivo: {name}"))
}

precio_carbon      <- read_maybe("clean_precios_carbon.csv")
precio_gas_natural <- read_maybe("clean_precios_gas_natural.csv")
precio_gasolina    <- read_maybe("clean_precios_gasolina.csv")
precio_petroleo    <- read_maybe("clean_precios_petroleo.csv")
ipc                <- read_maybe("IPC.csv")

# Vista rápida

list(
"Carbon"      = head(precio_carbon),
"Gas Natural" = head(precio_gas_natural),
"Gasolina"    = head(precio_gasolina),
"Petroleo"    = head(precio_petroleo)
)

## $Carbon
## # A tibble: 6 × 2
##   fecha      precio_carbon
##   <date>             <dbl>
## 1 2000-01-01          56.8
## 2 2000-01-02          84.1
## 3 2000-01-03          70.7
## 4 2000-01-05          79.7
## 5 2000-01-06          91.5
## 6 2000-01-07          69.8
## 
## $`Gas Natural`
## # A tibble: 6 × 2
##   fecha      precio_gas_natural
##   <date>                  <dbl>
## 1 2000-01-01              134. 
## 2 2000-01-02              103. 
## 3 2000-01-03               72.7
## 4 2000-01-04              102. 
## 5 2000-01-05              134. 
## 6 2000-01-06               78.6
## 
## $Gasolina
## # A tibble: 6 × 2
##   fecha      precio_gasolina
##   <date>               <dbl>
## 1 2000-01-01            85.2
## 2 2000-01-02            58.8
## 3 2000-01-03            62.4
## 4 2000-01-04            48.6
## 5 2000-01-05            43.8
## 6 2000-01-06            41.9
## 
## $Petroleo
## # A tibble: 6 × 2
##   fecha      precio_petroleo
##   <date>               <dbl>
## 1 2000-01-01            77.7
## 2 2000-01-02           108. 
## 3 2000-01-03           119. 
## 4 2000-01-04           106. 
## 5 2000-01-05           115. 
## 6 2000-01-06           101.

1.3 2. Función para completar fechas faltantes (diario)

explicitar_fechas <- function(df, date_col = "fecha", price_col = NULL, item_name = NULL) {
  if (!date_col %in% names(df)) {
    date_candidates <- names(df)[str_detect(names(df), regex("fecha|date|periodo|mes", ignore_case = TRUE))]
    if (length(date_candidates) == 0) stop("No se encontró ninguna columna de fecha en el dataset.")
    date_col <- date_candidates[1]
  }
  if (is.null(price_col) || !price_col %in% names(df)) {
    price_candidates <- names(df)[
      str_detect(names(df), regex("precio|valor|price|cost", ignore_case = TRUE))
    ]
    if (length(price_candidates) == 0) stop(glue("No se encontró columna de precio en '{item_name}'."))
    price_col <- price_candidates[1]
  }

  df2 <- df %>% mutate(date = as.Date(!!sym(date_col))) %>% arrange(date)

  seq_full <- tibble(date = seq.Date(min(df2$date, na.rm = TRUE),
                                     max(df2$date, na.rm = TRUE), by = "day"))

  out <- seq_full %>%
    left_join(df2, by = "date") %>%
    mutate(
      missing_price = is.na(!!sym(price_col)),
      item = item_name
    )

  names(out)[names(out) == price_col] <- "price"
  out %>% select(date, price, item, everything())
}

ejemplo_explicito <- explicitar_fechas(precio_carbon, "fecha", "precio", "carbon")
head(ejemplo_explicito)

## # A tibble: 6 × 5
##   date       price item   fecha      missing_price
##   <date>     <dbl> <chr>  <date>     <lgl>        
## 1 2000-01-01  56.8 carbon 2000-01-01 FALSE        
## 2 2000-01-02  84.1 carbon 2000-01-02 FALSE        
## 3 2000-01-03  70.7 carbon 2000-01-03 FALSE        
## 4 2000-01-04  NA   carbon NA         TRUE         
## 5 2000-01-05  79.7 carbon 2000-01-05 FALSE        
## 6 2000-01-06  91.5 carbon 2000-01-06 FALSE

1.4 3. Aplicar a todos los datasets

carb_explicita   <- explicitar_fechas(precio_carbon, "fecha", "precio", "carbon")
gasnat_explicita <- explicitar_fechas(precio_gas_natural, "fecha", "precio", "gas_natural")
gasol_explicita  <- explicitar_fechas(precio_gasolina, "fecha", "precio", "gasolina")
pet_explicita    <- explicitar_fechas(precio_petroleo, "fecha", "precio", "petroleo")

resumen_explicitos <- tibble(
  Combustible     = c("Carbon", "Gas Natural", "Gasolina", "Petroleo"),
  Fechas_iniciales= c(min(carb_explicita$date), min(gasnat_explicita$date),
                      min(gasol_explicita$date), min(pet_explicita$date)),
  Fechas_finales  = c(max(carb_explicita$date), max(gasnat_explicita$date),
                      max(gasol_explicita$date), max(pet_explicita$date)),
  Total_dias      = c(nrow(carb_explicita), nrow(gasnat_explicita),
                      nrow(gasol_explicita), nrow(pet_explicita))
)
kable(resumen_explicitos, caption = "Cobertura temporal y número de observaciones por combustible")

Cobertura temporal y número de observaciones por combustible
Combustible	Fechas_iniciales	Fechas_finales	Total_dias
Carbon	2000-01-01	2024-01-01	8767
Gas Natural	2000-01-01	2024-01-01	8767
Gasolina	2000-01-01	2024-01-01	8767
Petroleo	2000-01-01	2024-01-01	8767

1.5 4. Unir datasets

normizar <- function(df, item_name) {
  if ("fecha" %in% names(df) && !"date" %in% names(df)) {
    df <- df %>% rename(date = fecha)
  }
  df <- df[, !duplicated(names(df))]
  price_col <- names(df)[str_detect(names(df), regex("precio|valor|price", ignore_case = TRUE))][1]
  if (is.na(price_col)) stop("No se encontró columna de precio en ", item_name)
  if (price_col != "price") {
    df <- df %>% rename(price = !!sym(price_col))
  }
  df %>%
    mutate(item = item_name, date = as.Date(date)) %>%
    select(date, price, item, everything()) %>%
    arrange(date)
}

consolidado_daily <- bind_rows(
  normizar(carb_explicita, "carbon"),
  normizar(gasnat_explicita, "gas_natural"),
  normizar(gasol_explicita, "gasolina"),
  normizar(pet_explicita, "petroleo")
) %>% arrange(item, date)

consolidado_daily %>%
  group_by(item) %>%
  summarise(
    total_obs   = n(),
    missing     = sum(is.na(price)),
    pct_missing = round(100 * mean(is.na(price)), 2),
    .groups = "drop"
  ) %>%
  kable(caption = "Resumen de datos consolidados diarios (faltantes)")

Resumen de datos consolidados diarios (faltantes)
item	total_obs	missing	pct_missing
carbon	8767	188	2.14
gas_natural	8767	188	2.14
gasolina	8767	188	2.14
petroleo	8767	188	2.14

1.6 5. Agregación mensual (corte hasta 2023-12-31)

consolidado_daily <- consolidado_daily %>%
  mutate(year = year(date), month = month(date), ym = floor_date(date, "month"))

monthly_avg <- consolidado_daily %>%
  group_by(item, ym) %>%
  summarise(price_month_avg = mean(price, na.rm = TRUE), .groups = "drop")

monthly_filtered <- monthly_avg %>%
  filter(ym >= as.Date("2000-01-01") & ym <= as.Date("2023-12-31"))

head(monthly_filtered) %>% 
  kable(caption = "Ejemplo de precios mensuales promedio (2000–2023)")

Ejemplo de precios mensuales promedio (2000–2023)
item	ym	price_month_avg
carbon	2000-01-01	77.14800
carbon	2000-02-01	80.84513
carbon	2000-03-01	76.15327
carbon	2000-04-01	74.68102
carbon	2000-05-01	77.61114
carbon	2000-06-01	78.32986

1.7 6. IPC (corte a 2023)

library(dplyr)
library(lubridate)
library(stringr)
library(readr)
library(scales)
library(knitr)
library(kableExtra)

# ---------------------------
# 1️⃣ Preparar IPC
# ---------------------------

ipc_val_col  <- names(ipc)[str_detect(names(ipc), regex("ipc|indice|valor|index|value", ignore_case = TRUE))][1]
ipc_date_col <- names(ipc)[str_detect(names(ipc), regex("fecha|mes|anio|año|periodo|period|date", ignore_case = TRUE))][1]

ipc_prepared <- ipc %>%
  rename(ipc_value = !!sym(ipc_val_col)) %>%
  mutate(
    ym = as.Date(ifelse(
      str_detect(as.character(.data[[ipc_date_col]]), "^\\d{4}-\\d{2}$"),
      paste0(.data[[ipc_date_col]], "-01"),
      as.character(floor_date(parse_date_time(as.character(.data[[ipc_date_col]]),
                                               orders = c("ym","my","Ym","Y-m","Y-m-d","ymd")),
                              "month"))
    ))
  ) %>%
  select(ym, ipc_value) %>%
  filter(!is.na(ym), !is.na(ipc_value)) %>%
  arrange(ym) %>%
  filter(ym <= as.Date("2023-12-31"))

# ---------------------------
# 2️⃣ Función para pasar precios nominales a reales
# ---------------------------

transformar_a_reales <- function(df, ipc_df, base_ym="2023-12", price_col="price_month_avg") {
  base_first <- as.Date(paste0(base_ym, "-01"))
  ipc_base <- ipc_df %>% filter(ym == base_first) %>% pull(ipc_value)
  if(length(ipc_base)==0) stop("No se encontró IPC para la base ", base_ym)
  
  df %>%
    left_join(ipc_df, by="ym") %>%
    mutate(price_real = .data[[price_col]] * (ipc_base / ipc_value))
}

# ---------------------------
# 3️⃣ Transformar todos los combustibles
# ---------------------------

transformados_por_base <- monthly_filtered %>%
  group_by(item) %>%
  group_modify(~ transformar_a_reales(.x, ipc_prepared)) %>%
  ungroup()

# ---------------------------
# 4️⃣ Mostrar tabla ejemplo para Carbon
# ---------------------------

tabla_carbon <- transformados_por_base %>%
  filter(item=="carbon") %>%
  select(
    Combustible = item,
    Mes = ym,
    Precio_Nominal = price_month_avg,
    Precio_Real = price_real
  ) %>%
  slice(1:12) %>%
  mutate(
    Precio_Nominal = scales::comma(Precio_Nominal, accuracy=0.01),
    Precio_Real = scales::comma(Precio_Real, accuracy=0.01),
    Mes = format(Mes, "%b %Y")
  )

kable(tabla_carbon, caption="Ejemplo precios nominales vs reales (Carbon, base dic-2023)", align="c")

Ejemplo precios nominales vs reales (Carbon, base dic-2023)
Combustible	Mes	Precio_Nominal	Precio_Real
carbon	ene. 2000	77.15	263.64
carbon	feb. 2000	80.85	270.05
carbon	mar. 2000	76.15	250.13
carbon	abr. 2000	74.68	242.86
carbon	may. 2000	77.61	251.08
carbon	jun. 2000	78.33	253.47
carbon	jul. 2000	77.47	250.73
carbon	ago. 2000	74.42	240.13
carbon	sept. 2000	74.68	239.98
carbon	oct. 2000	77.52	248.69
carbon	nov. 2000	78.78	251.91
carbon	dic. 2000	74.21	236.20

2 🎓 PARTE 2: Taller Big Data - Análisis y visualización

2.1 📊 1. Estadísticas descriptivas (2000–2023)

stats <- transformados_por_base %>%
  group_by(item) %>%
  summarise(
    Min_Nominal      = round(min(price_month_avg, na.rm = TRUE), 2),
    Max_Nominal      = round(max(price_month_avg, na.rm = TRUE), 2),
    Promedio_Nominal = round(mean(price_month_avg, na.rm = TRUE), 2),
    SD_Nominal       = round(sd(price_month_avg, na.rm = TRUE), 2),
    Min_Real         = round(min(price_real, na.rm = TRUE), 2),
    Max_Real         = round(max(price_real, na.rm = TRUE), 2),
    Promedio_Real    = round(mean(price_real, na.rm = TRUE), 2),
    SD_Real          = round(sd(price_real, na.rm = TRUE), 2),
    .groups = "drop"
  )

# Mostrar la tabla profesional
stats %>%
  kable(
    col.names = c(
      "Combustible", 
      "Mín.", "Máx.", "Prom.", "SD",  # Nominal
      "Mín.", "Máx.", "Prom.", "SD"   # Real
    ),
    align = "c",
    caption = "📊 Estadísticas descriptivas por combustible (2000–2023)"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, font_size = 13) %>%
  add_header_above(c(" " = 1, "Nominal" = 4, "Real" = 4)) %>%  # La suma debe ser 9
  column_spec(1, bold = TRUE, color = "#2C3E50")

📊 Estadísticas descriptivas por combustible (2000–2023)
	Nominal				Real
Combustible	Mín.	Máx.	Prom.	SD	Mín.	Máx.	Prom.	SD
carbon	74.21	123.16	96.96	12.70	116.17	270.05	178.50	32.56
gas_natural	98.65	137.71	118.33	8.31	126.66	366.96	222.29	55.37
gasolina	61.65	158.90	108.97	25.60	150.71	238.32	193.92	14.11
petroleo	91.25	129.80	108.09	8.00	115.90	327.85	202.62	49.04

ggplot(transformados_por_base, aes(x = ym, y = price_real, color = item)) +
  geom_line(size = 1) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "📉 Evolución de precios reales de combustibles (2000–2023)",
    subtitle = "Ajustados por IPC, base diciembre 2023",
    x = "Año",
    y = "Precio real promedio mensual",
    color = "Combustible"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold", color = "#2C3E50"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )

2.2 ⏳ 3. Serie temporal: Precios reales por combustible (2000–2023)

reales_long <- transformados_por_base %>% select(ym, item, price_real)
ggplot(reales_long, aes(x = ym, y = price_real, color = item)) +
  geom_line(size = 1) +
  labs(title = "Evolución de precios reales por combustible (2000–2023, base dic-2023)",
       x = "Fecha", y = "Precio real (COP)", color = "Combustible") +
  scale_y_continuous(labels = comma_format(big.mark = ".", decimal.mark = ",")) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 year") +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.3 💾 4. Exportar resultados (2000–2023)

output_folder <- "output"
if(!dir.exists(output_folder)) dir.create(output_folder)

outfile <- file.path(output_folder, "precios_reales_dic2023_2000-2023.csv")
write_csv(transformados_por_base, outfile)

tibble(Archivos_Generados = list.files(output_folder, full.names = TRUE)) %>% 
  kable(caption = "Archivos generados")

Archivos generados
Archivos_Generados
output/precios_reales_dic2023_2000-2023.csv

Ejercicio y Taller Big Data - Precios Reales de Combustibles

Samanta Torres Acosta

2025-10-23

1 🧱 PARTE 1: Construcción y análisis técnico de la base

1.1 0. Verificación de archivos

1.2 1. Lectura de datos

1.3 2. Función para completar fechas faltantes (diario)

1.4 3. Aplicar a todos los datasets

1.5 4. Unir datasets

1.6 5. Agregación mensual (corte hasta 2023-12-31)

1.7 6. IPC (corte a 2023)

2 🎓 PARTE 2: Taller Big Data - Análisis y visualización

2.1 📊 1. Estadísticas descriptivas (2000–2023)

2.2 ⏳ 3. Serie temporal: Precios reales por combustible (2000–2023)

2.3 💾 4. Exportar resultados (2000–2023)