practica II

#Instalamos las librerias necesarias
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'tibble' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.3

## Warning: package 'readr' was built under R version 4.4.3

## Warning: package 'purrr' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

## Warning: package 'stringr' was built under R version 4.4.3

## Warning: package 'forcats' was built under R version 4.4.3

## Warning: package 'lubridate' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)

## Warning: package 'skimr' was built under R version 4.4.3

library(lubridate)
library(tidyverse)

# Carga del archivo DF2
df2 <- read_csv("global_climate_energy_2020_2024.csv")

## Rows: 36540 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): country
## dbl  (8): avg_temperature, humidity, co2_emission, energy_consumption, renew...
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Validación de dimensiones iniciales
# Debería mostrar 36540 filas y 10 columnas
dim(df2)

## [1] 36540    10

# Comprobar nombres de columnas exactos
names(df2)

##  [1] "date"                      "country"                  
##  [3] "avg_temperature"           "humidity"                 
##  [5] "co2_emission"              "energy_consumption"       
##  [7] "renewable_share"           "urban_population"         
##  [9] "industrial_activity_index" "energy_price"

# Verificar tipos de datos (Numéricos vs Factores/Fechas)
str(df2)

## spc_tbl_ [36,540 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date                     : Date[1:36540], format: "2020-01-01" "2020-01-02" ...
##  $ country                  : chr [1:36540] "Germany" "Germany" "Germany" "Germany" ...
##  $ avg_temperature          : num [1:36540] 28.3 28.4 28.7 26.7 26.8 ...
##  $ humidity                 : num [1:36540] 31.1 37.9 57.7 51.3 65.4 ...
##  $ co2_emission             : num [1:36540] 213 606 269 167 394 ...
##  $ energy_consumption       : num [1:36540] 11349 4167 4504 3259 7024 ...
##  $ renewable_share          : num [1:36540] 14.42 5.63 14.2 13.84 6.93 ...
##  $ urban_population         : num [1:36540] 76.4 86.3 75.9 63.1 76 ...
##  $ industrial_activity_index: num [1:36540] 51.2 78.3 49 97.4 81.9 ...
##  $ energy_price             : num [1:36540] 83.9 110.4 173.6 89.1 40.6 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   date = col_date(format = ""),
##   ..   country = col_character(),
##   ..   avg_temperature = col_double(),
##   ..   humidity = col_double(),
##   ..   co2_emission = col_double(),
##   ..   energy_consumption = col_double(),
##   ..   renewable_share = col_double(),
##   ..   urban_population = col_double(),
##   ..   industrial_activity_index = col_double(),
##   ..   energy_price = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# 1. Validar fechas
range(df2$date, na.rm = TRUE)

## [1] "2020-01-01" "2024-12-31"

# 2. Resumen de calidad (NAs y Outliers)
skim(df2)

Data summary
Name	df2
Number of rows	36540
Number of columns	10
_______________________
Column type frequency:
character	1
Date	1
numeric	8
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
country	0	1	5	14	0	20	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
date	0	1	2020-01-01	2024-12-31	2022-07-02	1827

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
avg_temperature	1	13.58	10.08	-9.60	5.63	13.79	20.84	38.71	▂▆▇▅▂
humidity	1	59.97	17.30	30.00	45.01	59.99	74.97	90.00	▇▇▇▇▇
co2_emission	1	445.82	234.36	50.15	248.67	422.65	628.42	999.85	▇▇▇▅▂
energy_consumption	1	7295.90	3693.93	1001.89	4184.18	6921.62	10175.11	15998.05	▇▇▇▅▂
renewable_share	1	15.94	5.33	5.00	12.02	15.71	19.84	30.87	▃▇▇▅▁
urban_population	1	74.98	8.65	60.00	67.47	75.03	82.50	90.00	▇▇▇▇▇
industrial_activity_index	1	70.17	17.33	40.00	55.30	70.06	85.26	100.00	▇▇▇▇▇
energy_price	1	115.28	49.18	30.00	72.42	115.24	158.27	200.00	▇▇▇▇▇

# 3. Verificación de duplicados por país y fecha
df2 %>% 
  group_by(country, date) %>% 
  filter(n() > 1)

## # A tibble: 0 × 10
## # Groups:   country, date [0]
## # ℹ 10 variables: date <date>, country <chr>, avg_temperature <dbl>,
## #   humidity <dbl>, co2_emission <dbl>, energy_consumption <dbl>,
## #   renewable_share <dbl>, urban_population <dbl>,
## #   industrial_activity_index <dbl>, energy_price <dbl>

# Creación de tabla de validación de métricas
df_validado <- df2 %>%
  mutate(
    intensidad_emisiones = co2_emission / energy_consumption,
    indice_transicion = renewable_share / energy_price,
    emisiones_urbanas = co2_emission / urban_population
  )

# Verificar que no haya infinitos (división por cero)
summary(df_validado %>% select(intensidad_emisiones, indice_transicion))

##  intensidad_emisiones indice_transicion
##  Min.   :0.006408     Min.   :0.02564  
##  1st Qu.:0.035920     1st Qu.:0.09457  
##  Median :0.060991     Median :0.13797  
##  Mean   :0.081727     Mean   :0.17745  
##  3rd Qu.:0.101234     3rd Qu.:0.21938  
##  Max.   :0.488175     Max.   :0.96238

# Conteo de países para el mapa coroplético
unique(df2$country)

##  [1] "Germany"        "France"         "Netherlands"    "Italy"         
##  [5] "Spain"          "Sweden"         "Norway"         "Poland"        
##  [9] "Turkey"         "United Kingdom" "United States"  "Canada"        
## [13] "Brazil"         "India"          "China"          "Japan"         
## [17] "Australia"      "South Africa"   "Mexico"         "Indonesia"

# Verificar si tenemos datos suficientes para cada año (2020-2024)
df2 %>% 
  mutate(year = year(date)) %>% 
  count(year)

## # A tibble: 5 × 2
##    year     n
##   <dbl> <int>
## 1  2020  7320
## 2  2021  7300
## 3  2022  7300
## 4  2023  7300
## 5  2024  7320

# 1. Carga y Limpieza Inicial
df_final <- df2 %>%
  # Eliminar duplicados y asegurar tipos de datos
  distinct() %>%
  mutate(
    date = as.Date(date),
    year = year(date),
    month = month(date),
    # Clasificación por estaciones
    season = case_when(
      month %in% c(12, 1, 2) ~ "Invierno",
      month %in% c(3, 4, 5) ~ "Primavera",
      month %in% c(6, 7, 8) ~ "Verano",
      TRUE ~ "Otoño"
    )
  ) %>%
  # 2. Filtros de Calidad: Valores lógicos
  filter(
    renewable_share >= 0 & renewable_share <= 100,
    energy_consumption > 0,
    co2_emission >= 0
  ) %>%
  # 3. Creación de Métricas Derivadas
  mutate(
    emissions_intensity = co2_emission / energy_consumption,
    urban_efficiency = co2_emission / urban_population,
    transition_index = renewable_share / energy_price
  )

# 4. Exportar el archivo final
write_csv(df_final, "clima_energia_final.csv")
print("Archivo 'clima_energia_final.csv' generado con éxito.")

## [1] "Archivo 'clima_energia_final.csv' generado con éxito."

practica II

Ivan Malagon

2026-01-06