#Instalamos las librerias necesarias
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(lubridate)
library(tidyverse)

# Carga del archivo DF2
df2 <- read_csv("global_climate_energy_2020_2024.csv")
## Rows: 36540 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): country
## dbl  (8): avg_temperature, humidity, co2_emission, energy_consumption, renew...
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Validación de dimensiones iniciales
# Debería mostrar 36540 filas y 10 columnas
dim(df2)
## [1] 36540    10
# Comprobar nombres de columnas exactos
names(df2)
##  [1] "date"                      "country"                  
##  [3] "avg_temperature"           "humidity"                 
##  [5] "co2_emission"              "energy_consumption"       
##  [7] "renewable_share"           "urban_population"         
##  [9] "industrial_activity_index" "energy_price"
# Verificar tipos de datos (Numéricos vs Factores/Fechas)
str(df2)
## spc_tbl_ [36,540 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date                     : Date[1:36540], format: "2020-01-01" "2020-01-02" ...
##  $ country                  : chr [1:36540] "Germany" "Germany" "Germany" "Germany" ...
##  $ avg_temperature          : num [1:36540] 28.3 28.4 28.7 26.7 26.8 ...
##  $ humidity                 : num [1:36540] 31.1 37.9 57.7 51.3 65.4 ...
##  $ co2_emission             : num [1:36540] 213 606 269 167 394 ...
##  $ energy_consumption       : num [1:36540] 11349 4167 4504 3259 7024 ...
##  $ renewable_share          : num [1:36540] 14.42 5.63 14.2 13.84 6.93 ...
##  $ urban_population         : num [1:36540] 76.4 86.3 75.9 63.1 76 ...
##  $ industrial_activity_index: num [1:36540] 51.2 78.3 49 97.4 81.9 ...
##  $ energy_price             : num [1:36540] 83.9 110.4 173.6 89.1 40.6 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_date(format = ""),
##   ..   country = col_character(),
##   ..   avg_temperature = col_double(),
##   ..   humidity = col_double(),
##   ..   co2_emission = col_double(),
##   ..   energy_consumption = col_double(),
##   ..   renewable_share = col_double(),
##   ..   urban_population = col_double(),
##   ..   industrial_activity_index = col_double(),
##   ..   energy_price = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# 1. Validar fechas
range(df2$date, na.rm = TRUE)
## [1] "2020-01-01" "2024-12-31"
# 2. Resumen de calidad (NAs y Outliers)
skim(df2)
Data summary
Name df2
Number of rows 36540
Number of columns 10
_______________________
Column type frequency:
character 1
Date 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
country 0 1 5 14 0 20 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2020-01-01 2024-12-31 2022-07-02 1827

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
avg_temperature 0 1 13.58 10.08 -9.60 5.63 13.79 20.84 38.71 ▂▆▇▅▂
humidity 0 1 59.97 17.30 30.00 45.01 59.99 74.97 90.00 ▇▇▇▇▇
co2_emission 0 1 445.82 234.36 50.15 248.67 422.65 628.42 999.85 ▇▇▇▅▂
energy_consumption 0 1 7295.90 3693.93 1001.89 4184.18 6921.62 10175.11 15998.05 ▇▇▇▅▂
renewable_share 0 1 15.94 5.33 5.00 12.02 15.71 19.84 30.87 ▃▇▇▅▁
urban_population 0 1 74.98 8.65 60.00 67.47 75.03 82.50 90.00 ▇▇▇▇▇
industrial_activity_index 0 1 70.17 17.33 40.00 55.30 70.06 85.26 100.00 ▇▇▇▇▇
energy_price 0 1 115.28 49.18 30.00 72.42 115.24 158.27 200.00 ▇▇▇▇▇
# 3. Verificación de duplicados por país y fecha
df2 %>% 
  group_by(country, date) %>% 
  filter(n() > 1)
## # A tibble: 0 × 10
## # Groups:   country, date [0]
## # ℹ 10 variables: date <date>, country <chr>, avg_temperature <dbl>,
## #   humidity <dbl>, co2_emission <dbl>, energy_consumption <dbl>,
## #   renewable_share <dbl>, urban_population <dbl>,
## #   industrial_activity_index <dbl>, energy_price <dbl>
# Creación de tabla de validación de métricas
df_validado <- df2 %>%
  mutate(
    intensidad_emisiones = co2_emission / energy_consumption,
    indice_transicion = renewable_share / energy_price,
    emisiones_urbanas = co2_emission / urban_population
  )

# Verificar que no haya infinitos (división por cero)
summary(df_validado %>% select(intensidad_emisiones, indice_transicion))
##  intensidad_emisiones indice_transicion
##  Min.   :0.006408     Min.   :0.02564  
##  1st Qu.:0.035920     1st Qu.:0.09457  
##  Median :0.060991     Median :0.13797  
##  Mean   :0.081727     Mean   :0.17745  
##  3rd Qu.:0.101234     3rd Qu.:0.21938  
##  Max.   :0.488175     Max.   :0.96238
# Conteo de países para el mapa coroplético
unique(df2$country)
##  [1] "Germany"        "France"         "Netherlands"    "Italy"         
##  [5] "Spain"          "Sweden"         "Norway"         "Poland"        
##  [9] "Turkey"         "United Kingdom" "United States"  "Canada"        
## [13] "Brazil"         "India"          "China"          "Japan"         
## [17] "Australia"      "South Africa"   "Mexico"         "Indonesia"
# Verificar si tenemos datos suficientes para cada año (2020-2024)
df2 %>% 
  mutate(year = year(date)) %>% 
  count(year)
## # A tibble: 5 × 2
##    year     n
##   <dbl> <int>
## 1  2020  7320
## 2  2021  7300
## 3  2022  7300
## 4  2023  7300
## 5  2024  7320
# 1. Carga y Limpieza Inicial
df_final <- df2 %>%
  # Eliminar duplicados y asegurar tipos de datos
  distinct() %>%
  mutate(
    date = as.Date(date),
    year = year(date),
    month = month(date),
    # Clasificación por estaciones
    season = case_when(
      month %in% c(12, 1, 2) ~ "Invierno",
      month %in% c(3, 4, 5) ~ "Primavera",
      month %in% c(6, 7, 8) ~ "Verano",
      TRUE ~ "Otoño"
    )
  ) %>%
  # 2. Filtros de Calidad: Valores lógicos
  filter(
    renewable_share >= 0 & renewable_share <= 100,
    energy_consumption > 0,
    co2_emission >= 0
  ) %>%
  # 3. Creación de Métricas Derivadas
  mutate(
    emissions_intensity = co2_emission / energy_consumption,
    urban_efficiency = co2_emission / urban_population,
    transition_index = renewable_share / energy_price
  )

# 4. Exportar el archivo final
write_csv(df_final, "clima_energia_final.csv")
print("Archivo 'clima_energia_final.csv' generado con éxito.")
## [1] "Archivo 'clima_energia_final.csv' generado con éxito."