#Instalamos las librerias necesarias
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(lubridate)
library(tidyverse)
# Carga del archivo DF2
df2 <- read_csv("global_climate_energy_2020_2024.csv")
## Rows: 36540 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): country
## dbl (8): avg_temperature, humidity, co2_emission, energy_consumption, renew...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Validación de dimensiones iniciales
# Debería mostrar 36540 filas y 10 columnas
dim(df2)
## [1] 36540 10
# Comprobar nombres de columnas exactos
names(df2)
## [1] "date" "country"
## [3] "avg_temperature" "humidity"
## [5] "co2_emission" "energy_consumption"
## [7] "renewable_share" "urban_population"
## [9] "industrial_activity_index" "energy_price"
# Verificar tipos de datos (Numéricos vs Factores/Fechas)
str(df2)
## spc_tbl_ [36,540 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:36540], format: "2020-01-01" "2020-01-02" ...
## $ country : chr [1:36540] "Germany" "Germany" "Germany" "Germany" ...
## $ avg_temperature : num [1:36540] 28.3 28.4 28.7 26.7 26.8 ...
## $ humidity : num [1:36540] 31.1 37.9 57.7 51.3 65.4 ...
## $ co2_emission : num [1:36540] 213 606 269 167 394 ...
## $ energy_consumption : num [1:36540] 11349 4167 4504 3259 7024 ...
## $ renewable_share : num [1:36540] 14.42 5.63 14.2 13.84 6.93 ...
## $ urban_population : num [1:36540] 76.4 86.3 75.9 63.1 76 ...
## $ industrial_activity_index: num [1:36540] 51.2 78.3 49 97.4 81.9 ...
## $ energy_price : num [1:36540] 83.9 110.4 173.6 89.1 40.6 ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. country = col_character(),
## .. avg_temperature = col_double(),
## .. humidity = col_double(),
## .. co2_emission = col_double(),
## .. energy_consumption = col_double(),
## .. renewable_share = col_double(),
## .. urban_population = col_double(),
## .. industrial_activity_index = col_double(),
## .. energy_price = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# 1. Validar fechas
range(df2$date, na.rm = TRUE)
## [1] "2020-01-01" "2024-12-31"
# 2. Resumen de calidad (NAs y Outliers)
skim(df2)
Data summary
| Name |
df2 |
| Number of rows |
36540 |
| Number of columns |
10 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| Date |
1 |
| numeric |
8 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: Date
| date |
0 |
1 |
2020-01-01 |
2024-12-31 |
2022-07-02 |
1827 |
Variable type: numeric
| avg_temperature |
0 |
1 |
13.58 |
10.08 |
-9.60 |
5.63 |
13.79 |
20.84 |
38.71 |
▂▆▇▅▂ |
| humidity |
0 |
1 |
59.97 |
17.30 |
30.00 |
45.01 |
59.99 |
74.97 |
90.00 |
▇▇▇▇▇ |
| co2_emission |
0 |
1 |
445.82 |
234.36 |
50.15 |
248.67 |
422.65 |
628.42 |
999.85 |
▇▇▇▅▂ |
| energy_consumption |
0 |
1 |
7295.90 |
3693.93 |
1001.89 |
4184.18 |
6921.62 |
10175.11 |
15998.05 |
▇▇▇▅▂ |
| renewable_share |
0 |
1 |
15.94 |
5.33 |
5.00 |
12.02 |
15.71 |
19.84 |
30.87 |
▃▇▇▅▁ |
| urban_population |
0 |
1 |
74.98 |
8.65 |
60.00 |
67.47 |
75.03 |
82.50 |
90.00 |
▇▇▇▇▇ |
| industrial_activity_index |
0 |
1 |
70.17 |
17.33 |
40.00 |
55.30 |
70.06 |
85.26 |
100.00 |
▇▇▇▇▇ |
| energy_price |
0 |
1 |
115.28 |
49.18 |
30.00 |
72.42 |
115.24 |
158.27 |
200.00 |
▇▇▇▇▇ |
# 3. Verificación de duplicados por país y fecha
df2 %>%
group_by(country, date) %>%
filter(n() > 1)
## # A tibble: 0 × 10
## # Groups: country, date [0]
## # ℹ 10 variables: date <date>, country <chr>, avg_temperature <dbl>,
## # humidity <dbl>, co2_emission <dbl>, energy_consumption <dbl>,
## # renewable_share <dbl>, urban_population <dbl>,
## # industrial_activity_index <dbl>, energy_price <dbl>
# Creación de tabla de validación de métricas
df_validado <- df2 %>%
mutate(
intensidad_emisiones = co2_emission / energy_consumption,
indice_transicion = renewable_share / energy_price,
emisiones_urbanas = co2_emission / urban_population
)
# Verificar que no haya infinitos (división por cero)
summary(df_validado %>% select(intensidad_emisiones, indice_transicion))
## intensidad_emisiones indice_transicion
## Min. :0.006408 Min. :0.02564
## 1st Qu.:0.035920 1st Qu.:0.09457
## Median :0.060991 Median :0.13797
## Mean :0.081727 Mean :0.17745
## 3rd Qu.:0.101234 3rd Qu.:0.21938
## Max. :0.488175 Max. :0.96238
# Conteo de países para el mapa coroplético
unique(df2$country)
## [1] "Germany" "France" "Netherlands" "Italy"
## [5] "Spain" "Sweden" "Norway" "Poland"
## [9] "Turkey" "United Kingdom" "United States" "Canada"
## [13] "Brazil" "India" "China" "Japan"
## [17] "Australia" "South Africa" "Mexico" "Indonesia"
# Verificar si tenemos datos suficientes para cada año (2020-2024)
df2 %>%
mutate(year = year(date)) %>%
count(year)
## # A tibble: 5 × 2
## year n
## <dbl> <int>
## 1 2020 7320
## 2 2021 7300
## 3 2022 7300
## 4 2023 7300
## 5 2024 7320
# 1. Carga y Limpieza Inicial
df_final <- df2 %>%
# Eliminar duplicados y asegurar tipos de datos
distinct() %>%
mutate(
date = as.Date(date),
year = year(date),
month = month(date),
# Clasificación por estaciones
season = case_when(
month %in% c(12, 1, 2) ~ "Invierno",
month %in% c(3, 4, 5) ~ "Primavera",
month %in% c(6, 7, 8) ~ "Verano",
TRUE ~ "Otoño"
)
) %>%
# 2. Filtros de Calidad: Valores lógicos
filter(
renewable_share >= 0 & renewable_share <= 100,
energy_consumption > 0,
co2_emission >= 0
) %>%
# 3. Creación de Métricas Derivadas
mutate(
emissions_intensity = co2_emission / energy_consumption,
urban_efficiency = co2_emission / urban_population,
transition_index = renewable_share / energy_price
)
# 4. Exportar el archivo final
write_csv(df_final, "clima_energia_final.csv")
print("Archivo 'clima_energia_final.csv' generado con éxito.")
## [1] "Archivo 'clima_energia_final.csv' generado con éxito."