0 Configuración

# knitr::opts_chunk$set(echo = TRUE): sets the default chunk option so that
# R Markdown shows each chunk's source code (echo = TRUE) next to its output.
knitr::opts_chunk$set(echo = TRUE)
# Carga la librería principal para manipulación de datos y visualización (dplyr, ggplot2).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Carga la librería para leer archivos Excel (.xlsx).
library(readxl)
# Carga la librería para crear tablas bonitas y estilizadas en formato HTML.
library(kableExtra)
## 
## Adjuntando el paquete: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Carga la librería para el análisis y la imputación de valores perdidos (NA).
library(VIM)
## Cargando paquete requerido: colorspace
## Cargando paquete requerido: grid
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Adjuntando el paquete: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
# Carga la librería para visualizar matrices de correlación.
library(corrplot)
## corrplot 0.95 loaded
# Carga la librería para formatos de texto especializados (ej: formato de dólar).
library(scales)
## 
## Adjuntando el paquete: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

1 Estructura de Datos

# 1.1 Data dictionary
# Load the data dictionary from "Data Dictionary.xlsx".
diccionario <- read_excel("Data Dictionary.xlsx")

# Table 1: associated SDG goal, variable code and description.
# The goal column is text ("Goal 1. ...", "Goal 10. ..."), so sorting it
# directly is lexicographic and puts Goal 10 and Goal 13 ahead of Goal 2.
# Sort on the goal number instead; categories that are not numbered goals
# (e.g. "Classification types") stay at the top, in alphabetical order.
tabla_variables <- diccionario %>%
  select(`Associated SDG GOAL`, Code, Description) %>%
  arrange(
    # FALSE sorts before TRUE, so non-goal categories come first.
    str_detect(`Associated SDG GOAL`, "^Goal\\s+\\d+"),
    # Numeric goal number (NA for the non-goal categories).
    as.integer(str_match(`Associated SDG GOAL`, "^Goal\\s+(\\d+)")[, 2]),
    # Tie-breaker: alphabetical order of the label itself.
    `Associated SDG GOAL`
  )

# Render the table with kableExtra styling.
kable(tabla_variables, caption = "Variables y ODS asociados") %>%
  kable_styling()
Variables y ODS asociados
Associated SDG GOAL Code Description
Classification types Regime Regime classified considering the competitiveness of access to power as well as existence of liberal principles. Regime type is a weighted index that considers a variety of political metrics.
Classification types Income World Bank assigns the world’s economies to four income groups—low, lower-middle, upper-middle, and high-income countries.
Classification types Region World Region as classified by UN.
Classification types Continent Continent classification according to UN Convention
Economic General NE.EXP.GNFS.ZS Exports of goods and services represent the value of all goods and other market services provided to the rest of the world
Economic General NE.CON.TOTL.ZS Sum of household final consumption expenditure (private consumption) and general government final consumption expenditure (general government consumption).
Economic General NE.DAB.TOTL.ZS Sum of household final consumption expenditure (formerly private consumption), general government final consumption expenditure (formerly general government consumption), and gross capital formation (formerly gross domestic investment).
Economic General NY.GDP.MKTP.CD Current value (in USD) of total goods and services produced within a country.
Economic General NY.GNS.ICTR.ZS Calculated as gross national income less total consumption, plus net transfers
Economic General NE.CON.GOVT.ZS Final consumption expenditure (formerly general government consumption) includes all government current expenditures for purchases of goods and services (including compensation of employees). 
Economic General FP.CPI.TOTL.ZG Rate at which prices of goods and services brought by conusmers rise or fall. Estimated by using consumer price indices.
Economic General NE.IMP.GNFS.ZS Value of all goods and other market services received from the rest of the world
Economic General NE.TRD.GNFS.ZS Proportion of total GDP that is comprised by Trade activity.
Explanatory factors IT.NET.USER.ZS Individuals who have used the Internet (from any location) in the last 3 months
Explanatory factors GH.EM.IC.LUF NA
Explanatory factors SH.HIV.INCD.TL.P3 HIV rates per 1000 people
Explanatory factors IT.NET.USER.ZS % of countrys total population that has access to working internet
Explanatory factors SP.DYN.LE00.IN Average life expectancy for newborn babies
Explanatory factors SP.POP.TOTL Total population
Explanatory factors NY.GDP.TOTL.RT.ZS % Income accrued from natural resources (e.g. Exports) as a percentage of GDP
Explanatory factors SP.URB.TOTL.IN.ZS What proportion of total population live in urban areas?
Explanatory factors SP.RUR.TOTL.ZS Percentage of a countries total population that occupy rural areas.
Goal 1. End poverty in all its forms everywhere SP_ACS_BSRVH2O Percentage of the population who have sustained access to basic water drinking services.
Goal 1. End poverty in all its forms everywhere SI_POV_DAY1 Percentage of the population living on less than $1.90 a day at 2011 international prices. The international poverty line is currently set at $1.90 a day at 2011 international prices.
Goal 10. Reduce inequality within and among countries SI.POV.GINI Score awarded based on how equitably income is dispersed within a country
Goal 10. Reduce inequality within and among countries NY.ADJ.NNTY.PC.KD.ZG Percentage yearly growth of adjusted net national income divided by number of people in country
Goal 13. Take urgent action to combat climate change and its impacts NY.ADJ.SVNX.GN.ZS Measure that monitors whether savings and investment in a country compensate for depreciation of natural and physical capital
Goal 13. Take urgent action to combat climate change and its impacts NY.ADJ.DCO2.GN.ZS Measure that monitors whether savings and investment in a country compensate for depreciation of natural and physical capital
Goal 13. Take urgent action to combat climate change and its impacts NY.ADJ.DRES.GN.ZS Measure that monitors whether savings and investment in a country compensate for depreciation of natural and physical capital
Goal 13. Take urgent action to combat climate change and its impacts NY.ADJ.DPEM.GN.ZS Measure that monitors whether savings and investment in a country compensate for depreciation of natural and physical capital
Goal 13. Take urgent action to combat climate change and its impacts NY.ADJ.DFOR.GN.ZS Measure, calculated as the product of unit resource rents and the excess of roundwood harvest over natural growth.
Goal 2. End hunger, achieve food security and improved nutrition and promote sustainable agriculture SN_ITK_DEFC Percentage of the population whose habitual food consumption is insufficient to provide the dietary energy levels that are required to maintain a normal active and healthy life. 
Goal 3. Ensure healthy lives and promote well-being for all at all ages SP.DYN.LE00.IN Value which captures the expectated age a newborn baby will live to
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.PRM.UNER.ZS Percentage of primary-school-age children who are not enrolled in primary or secondary school. Children in the official primary age group that are in preprimary education should be considered out of school.
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.COM.DURS Duration of compulsory education is the number of years that children are legally obliged to attend school.
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.PRM.CMPT.ZS Number of new entrants (enrollments minus repeaters) in the last grade of primary education, regardless of age, divided by the population at the entrance age for the last grade of primary education.
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.PRE.ENRR Ratio of total enrollment, regardless of age, to the population of the age group that officially corresponds to the level of education shown
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.PRM.ENRR Ratio of total enrollment, regardless of age, to the population of the age group that officially corresponds to the level of education shown
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.SEC.ENRR Ratio of total enrollment, regardless of age, to the population of the age group that officially corresponds to the level of education shown
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.SEC.ENRR.FE Ratio of female enrollment, regardless of age, to the population of the age group that officially corresponds to the level of education shown
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all SE.PRM.ENRL.TC.ZS Average number of pupils per teacher in primary school.
Goal 5. Achieve gender equality and empower all women and girls SG.GEN.PARL.ZS Percentage of parliamentary seats in a single or lower chamber held by women.
Goal 5. Achieve gender equality and empower all women and girls SG.LAW.INDX index measures how laws and regulations affect women’s economic opportunity. Overall scores are calculated by taking the average score of each of the eight areas (Going Places, Starting a Job, Getting Paid, Getting Married, Having Children, Running a Business, Managing Assets and Getting a Pension), with 100 representing the highest possible score.
Goal 7. Ensure access to affordable, reliable, sustainable and modern energy for all EG.FEC.RNEW.ZS Share of renewables energy in total final energy consumption.
Goal 7. Ensure access to affordable, reliable, sustainable and modern energy for all EG.ELC.RNEW.ZS Share of electrity generated by renewable power plants in total electricity generated by all types of plants.
Goal 8. Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all SL_TLF_UEM Share of labour force without work but available and seeking employment (male)
Goal 8. Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all SL_TLF_UEM Share of labour force without work but available and seeking employment (female)
Goal 9. Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation IT_MOB_2GNTWK Percentage of inhabitants living within range of a mobile-cellular signal, irrespective of whether or not they are mobile phone subscribers or users.
Goal 9. Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation IT_MOB_3GNTWK Percentage of inhabitants living within range of a mobile-cellular signal, irrespective of whether or not they are mobile phone subscribers or users. 
Goal 9. Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation EG.ELC.ACCS.ZS Percentage of population with access to electricity. Electrification data are collected from industry, national surveys and international sources.
Goal 9. Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation FB.ATM.TOTL.P5 Number of physical ATMs that allow clients to make financial transactions in a public place.
Goal 9. Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation IC.REG.COST.PC.FE.ZS Cost to register a business is normalized by presenting it as a percentage of the gross national income (GNI) per capita.
Goal 9. Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation IC.REG.COST.PC.MA.ZS Cost to register a business is normalized by presenting it as a percentage of the gross national income (GNI) per capita.
# Table 2: list of the distinct SDG goals / categories found in the dictionary.
# The goal column is text, so a plain sort is lexicographic ("Goal 10" before
# "Goal 2"). Sort on the goal number instead; categories that are not numbered
# goals stay at the top, in alphabetical order.
tabla_ods <- diccionario %>%
  # Keep one row per distinct value of the goal column.
  distinct(`Associated SDG GOAL`) %>%
  arrange(
    # FALSE sorts before TRUE, so non-goal categories come first.
    str_detect(`Associated SDG GOAL`, "^Goal\\s+\\d+"),
    # Numeric goal number (NA for the non-goal categories).
    as.integer(str_match(`Associated SDG GOAL`, "^Goal\\s+(\\d+)")[, 2]),
    # Tie-breaker: alphabetical order of the label itself.
    `Associated SDG GOAL`
  )

# Render the table with kableExtra styling.
kable(tabla_ods, caption = "Listado de ODS") %>%
  kable_styling()
Listado de ODS
Associated SDG GOAL
Classification types
Economic General
Explanatory factors
Goal 1. End poverty in all its forms everywhere
Goal 10. Reduce inequality within and among countries
Goal 13. Take urgent action to combat climate change and its impacts
Goal 2. End hunger, achieve food security and improved nutrition and promote sustainable agriculture
Goal 3. Ensure healthy lives and promote well-being for all at all ages
Goal 4. Ensure inclusive and equitable quality education and promoted lifelong learning oppurtunities for all
Goal 5. Achieve gender equality and empower all women and girls
Goal 7. Ensure access to affordable, reliable, sustainable and modern energy for all
Goal 8. Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all
Goal 9. Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation
## 1.2 Data file
# Load the main dataset from "WorldSustainabilityDataset.csv".
datos <- read_csv("WorldSustainabilityDataset.csv")
## Rows: 3287 Columns: 54
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Country Name, Country Code, Continent, Income Classification (Worl...
## dbl (48): Year, Access to electricity (% of population) - EG.ELC.ACCS.ZS, Ad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Give the identifier and classification columns short, readable names.
# `Continent` already has the desired name, so it is not listed here:
# renaming a column to itself changes nothing.
datos <- datos %>%
  rename(
    Country = `Country Name`,
    CountryCode = `Country Code`,
    Income = `Income Classification (World Bank Definition)`,
    Regime = `Regime Type (RoW Measure Definition)`,
    Region = `World Regions (UN SDG Definition)`
  )

# List the column names after renaming.
names(datos)
##  [1] "Country"                                                                                            
##  [2] "CountryCode"                                                                                        
##  [3] "Year"                                                                                               
##  [4] "Access to electricity (% of population) - EG.ELC.ACCS.ZS"                                           
##  [5] "Adjusted net national income per capita (annual % growth) - NY.ADJ.NNTY.PC.KD.ZG"                   
##  [6] "Adjusted net savings, excluding particulate emission damage (% of GNI) - NY.ADJ.SVNX.GN.ZS"         
##  [7] "Adjusted savings: carbon dioxide damage (% of GNI) - NY.ADJ.DCO2.GN.ZS"                             
##  [8] "Adjusted savings: natural resources depletion (% of GNI) - NY.ADJ.DRES.GN.ZS"                       
##  [9] "Adjusted savings: net forest depletion (% of GNI) - NY.ADJ.DFOR.GN.ZS"                              
## [10] "Adjusted savings: particulate emission damage (% of GNI) - NY.ADJ.DPEM.GN.ZS"                       
## [11] "Automated teller machines (ATMs) (per 100,000 adults) - FB.ATM.TOTL.P5"                             
## [12] "Broad money (% of GDP) - FM.LBL.BMNY.GD.ZS"                                                         
## [13] "Children out of school (% of primary school age) - SE.PRM.UNER.ZS"                                  
## [14] "Compulsory education, duration (years) - SE.COM.DURS"                                               
## [15] "Cost of business start-up procedures, female (% of GNI per capita) - IC.REG.COST.PC.FE.ZS"          
## [16] "Cost of business start-up procedures, male (% of GNI per capita) - IC.REG.COST.PC.MA.ZS"            
## [17] "Exports of goods and services (% of GDP) - NE.EXP.GNFS.ZS"                                          
## [18] "Final consumption expenditure (% of GDP) - NE.CON.TOTL.ZS"                                          
## [19] "GDP (current US$) - NY.GDP.MKTP.CD"                                                                 
## [20] "GDP per capita (current US$) - NY.GDP.PCAP.CD"                                                      
## [21] "General government final consumption expenditure (% of GDP) - NE.CON.GOVT.ZS"                       
## [22] "Gross national expenditure (% of GDP) - NE.DAB.TOTL.ZS"                                             
## [23] "Gross savings (% of GDP) - NY.GNS.ICTR.ZS"                                                          
## [24] "Imports of goods and services (% of GDP) - NE.IMP.GNFS.ZS"                                          
## [25] "Inflation, consumer prices (annual %) - FP.CPI.TOTL.ZG"                                             
## [26] "Primary completion rate, total (% of relevant age group) - SE.PRM.CMPT.ZS"                          
## [27] "Proportion of seats held by women in national parliaments (%) - SG.GEN.PARL.ZS"                     
## [28] "Pupil-teacher ratio, primary - SE.PRM.ENRL.TC.ZS"                                                   
## [29] "Renewable electricity output (% of total electricity output) - EG.ELC.RNEW.ZS"                      
## [30] "Renewable energy consumption (% of total final energy consumption) - EG.FEC.RNEW.ZS"                
## [31] "School enrollment, preprimary (% gross) - SE.PRE.ENRR"                                              
## [32] "School enrollment, primary (% gross) - SE.PRM.ENRR"                                                 
## [33] "School enrollment, secondary (% gross) - SE.SEC.ENRR"                                               
## [34] "Trade (% of GDP) - NE.TRD.GNFS.ZS"                                                                  
## [35] "Women Business and the Law Index Score (scale 1-100) - SG.LAW.INDX"                                 
## [36] "Prevalence of undernourishment (%) - SN_ITK_DEFC"                                                   
## [37] "Proportion of population below international poverty line (%) - SI_POV_DAY1"                        
## [38] "Proportion of population covered by at least a 2G mobile network (%) - IT_MOB_2GNTWK"               
## [39] "Proportion of population covered by at least a 3G mobile network (%) - IT_MOB_3GNTWK"               
## [40] "Proportion of population using basic drinking water services (%) - SP_ACS_BSRVH2O"                  
## [41] "Unemployment rate, male (%) - SL_TLF_UEM"                                                           
## [42] "Unemployment rate, women (%) - SL_TLF_UEM"                                                          
## [43] "Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF"
## [44] "Continent"                                                                                          
## [45] "Gini index (World Bank estimate) - SI.POV.GINI"                                                     
## [46] "Income"                                                                                             
## [47] "Individuals using the Internet (% of population) - IT.NET.USER.ZS"                                  
## [48] "Life expectancy at birth, total (years) - SP.DYN.LE00.IN"                                           
## [49] "Population, total - SP.POP.TOTL"                                                                    
## [50] "Regime"                                                                                             
## [51] "Rural population (% of total population) - SP.RUR.TOTL.ZS"                                          
## [52] "Total natural resources rents (% of GDP) - NY.GDP.TOTL.RT.ZS"                                       
## [53] "Urban population (% of total population) - SP.URB.TOTL.IN.ZS"                                       
## [54] "Region"
# 2 Data types and possible inconsistencies
# 2.1 Quantitative variables and 2.2 Qualitative variables
# Step 1: in every character column, turn empty strings ("") into NA.
# Step 2: convert the six qualitative columns to factors
#         (covers subsections 2.2.1 to 2.2.5).
datos <- datos %>%
  mutate(
    across(
      where(is.character),
      function(columna) na_if(columna, "")
    )
  ) %>%
  mutate(
    across(
      c(Country, CountryCode, Regime, Income, Region, Continent),
      as.factor
    )
  )

# Print the structure of the data to check the resulting column types.
str(datos)
## tibble [3,287 × 54] (S3: tbl_df/tbl/data.frame)
##  $ Country                                                                                            : Factor w/ 173 levels "Albania","Algeria",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ CountryCode                                                                                        : Factor w/ 173 levels "ABW","AGO","ALB",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Year                                                                                               : num [1:3287] 2000 2001 2002 2003 2004 ...
##  $ Access to electricity (% of population) - EG.ELC.ACCS.ZS                                           : num [1:3287] 91.7 100 100 100 100 ...
##  $ Adjusted net national income per capita (annual % growth) - NY.ADJ.NNTY.PC.KD.ZG                   : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Adjusted net savings, excluding particulate emission damage (% of GNI) - NY.ADJ.SVNX.GN.ZS         : num [1:3287] 15.39 11.49 3.89 6.71 9.8 ...
##  $ Adjusted savings: carbon dioxide damage (% of GNI) - NY.ADJ.DCO2.GN.ZS                             : num [1:3287] 2.39 2.51 2.72 2.73 2.68 ...
##  $ Adjusted savings: natural resources depletion (% of GNI) - NY.ADJ.DRES.GN.ZS                       : num [1:3287] 0.00019 0.000214 0.000223 0.000243 0.000256 ...
##  $ Adjusted savings: net forest depletion (% of GNI) - NY.ADJ.DFOR.GN.ZS                              : num [1:3287] 0.00019 0.000214 0.000223 0.000243 0.000256 ...
##  $ Adjusted savings: particulate emission damage (% of GNI) - NY.ADJ.DPEM.GN.ZS                       : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Automated teller machines (ATMs) (per 100,000 adults) - FB.ATM.TOTL.P5                             : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Broad money (% of GDP) - FM.LBL.BMNY.GD.ZS                                                         : num [1:3287] 51.8 53.6 58.5 61.2 57.3 ...
##  $ Children out of school (% of primary school age) - SE.PRM.UNER.ZS                                  : num [1:3287] 1.603 0.323 1.816 3.322 2.177 ...
##  $ Compulsory education, duration (years) - SE.COM.DURS                                               : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Cost of business start-up procedures, female (% of GNI per capita) - IC.REG.COST.PC.FE.ZS          : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Cost of business start-up procedures, male (% of GNI per capita) - IC.REG.COST.PC.MA.ZS            : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Exports of goods and services (% of GDP) - NE.EXP.GNFS.ZS                                          : num [1:3287] 74.4 70.5 64.9 63.1 65.1 ...
##  $ Final consumption expenditure (% of GDP) - NE.CON.TOTL.ZS                                          : num [1:3287] 70.8 74.3 77.6 78.6 74.7 ...
##  $ GDP (current US$) - NY.GDP.MKTP.CD                                                                 : num [1:3287] 1.87e+09 1.92e+09 1.94e+09 2.02e+09 2.23e+09 ...
##  $ GDP per capita (current US$) - NY.GDP.PCAP.CD                                                      : num [1:3287] 20618 20670 20437 20834 22568 ...
##  $ General government final consumption expenditure (% of GDP) - NE.CON.GOVT.ZS                       : num [1:3287] 21.4 24.5 25.2 24.7 23.2 ...
##  $ Gross national expenditure (% of GDP) - NE.DAB.TOTL.ZS                                             : num [1:3287] 96.3 98.1 104.5 107.8 103.5 ...
##  $ Gross savings (% of GDP) - NY.GNS.ICTR.ZS                                                          : num [1:3287] 23.7 19.3 12 14.5 17.1 ...
##  $ Imports of goods and services (% of GDP) - NE.IMP.GNFS.ZS                                          : num [1:3287] 70.7 68.5 69.4 70.9 68.6 ...
##  $ Inflation, consumer prices (annual %) - FP.CPI.TOTL.ZG                                             : num [1:3287] 4.04 2.88 3.32 3.66 2.53 ...
##  $ Primary completion rate, total (% of relevant age group) - SE.PRM.CMPT.ZS                          : num [1:3287] 97.1 94.4 94.8 90.2 90.6 ...
##  $ Proportion of seats held by women in national parliaments (%) - SG.GEN.PARL.ZS                     : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Pupil-teacher ratio, primary - SE.PRM.ENRL.TC.ZS                                                   : num [1:3287] 19.1 18.9 19.1 18.4 18.5 ...
##  $ Renewable electricity output (% of total electricity output) - EG.ELC.RNEW.ZS                      : num [1:3287] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Renewable energy consumption (% of total final energy consumption) - EG.FEC.RNEW.ZS                : num [1:3287] 0.175 0.181 0.181 0.185 0.187 ...
##  $ School enrollment, preprimary (% gross) - SE.PRE.ENRR                                              : num [1:3287] 95.9 97.6 98.4 99.6 98.7 ...
##  $ School enrollment, primary (% gross) - SE.PRM.ENRR                                                 : num [1:3287] 111 109 111 109 111 ...
##  $ School enrollment, secondary (% gross) - SE.SEC.ENRR                                               : num [1:3287] 96.5 98 100.5 99.1 97.3 ...
##  $ Trade (% of GDP) - NE.TRD.GNFS.ZS                                                                  : num [1:3287] 145 139 134 134 134 ...
##  $ Women Business and the Law Index Score (scale 1-100) - SG.LAW.INDX                                 : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Prevalence of undernourishment (%) - SN_ITK_DEFC                                                   : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Proportion of population below international poverty line (%) - SI_POV_DAY1                        : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Proportion of population covered by at least a 2G mobile network (%) - IT_MOB_2GNTWK               : num [1:3287] NA NA NA NA NA 90 99 99 99 NA ...
##  $ Proportion of population covered by at least a 3G mobile network (%) - IT_MOB_3GNTWK               : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Proportion of population using basic drinking water services (%) - SP_ACS_BSRVH2O                  : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Unemployment rate, male (%) - SL_TLF_UEM                                                           : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Unemployment rate, women (%) - SL_TLF_UEM                                                          : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF: num [1:3287] 2.38 2.41 2.44 2.56 2.62 ...
##  $ Continent                                                                                          : Factor w/ 6 levels "Africa","Asia",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Gini index (World Bank estimate) - SI.POV.GINI                                                     : num [1:3287] NA NA NA NA NA NA NA NA NA NA ...
##  $ Income                                                                                             : Factor w/ 4 levels "High income",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Individuals using the Internet (% of population) - IT.NET.USER.ZS                                  : num [1:3287] NA 17.1 18.8 20.8 23 25.4 28 30.9 52 58 ...
##  $ Life expectancy at birth, total (years) - SP.DYN.LE00.IN                                           : num [1:3287] NA 73.9 73.9 74 74.2 ...
##  $ Population, total - SP.POP.TOTL                                                                    : num [1:3287] NA 92892 94992 97016 98744 ...
##  $ Regime                                                                                             : Factor w/ 4 levels "Closed Autocracy",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ Rural population (% of total population) - SP.RUR.TOTL.ZS                                          : num [1:3287] 53.3 53.7 54 54.4 54.8 ...
##  $ Total natural resources rents (% of GDP) - NY.GDP.TOTL.RT.ZS                                       : num [1:3287] NA 0.000712 0.000657 0.0007 0.000685 ...
##  $ Urban population (% of total population) - SP.URB.TOTL.IN.ZS                                       : num [1:3287] NA 46.3 46 45.6 45.2 ...
##  $ Region                                                                                             : Factor w/ 7 levels "Central and Southern Asia",..: 4 4 4 4 4 4 4 4 4 4 ...
# Summarise the Income factor (count per level, plus NAs) to spot possible
# inconsistencies in its levels.
datos %>%
  pull(Income) %>%
  summary()
##         High income          Low income Lower-middle income Upper-middle income 
##                 899                 753                 868                 765 
##                NA's 
##                   2

3 Valores extremos

# 3.1 Inequality (GINI)
# -- GINI: distribution --
# Histogram of the Gini index with 2-unit bins and readable title/axis labels.
datos %>%
  ggplot(aes(x = `Gini index (World Bank estimate) - SI.POV.GINI`)) +
  geom_histogram(fill = "steelblue", binwidth = 2) +
  theme_minimal() +
  labs(
    x = "GINI",
    y = "Frecuencia",
    title = "Distribución del índice GINI"
  )
## Warning: Removed 1984 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Interpretation: the distribution looks roughly normal with a right tail.
# To dampen the influence of extreme values / skewness, the square root of the
# Gini index is stored in a new column, GINI_sqrt.
datos$GINI_sqrt <- sqrt(
  datos[["Gini index (World Bank estimate) - SI.POV.GINI"]]
)

# Histogram of the transformed Gini index (0.2-unit bins).
datos %>%
  ggplot(aes(x = GINI_sqrt)) +
  geom_histogram(fill = "orange", binwidth = 0.2) +
  theme_minimal() +
  labs(
    x = "Raíz(GINI)",
    y = "Frecuencia",
    title = "Distribución del índice GINI (Transformación Raíz Cuadrada)"
  )
## Warning: Removed 1984 rows containing non-finite outside the scale range
## (`stat_bin()`).

# 3.2 Greenhouse gas emissions (GHE)
# --- GHE: distribution ---
# Histogram of the emissions column with 50-unit bins and readable labels.
datos %>%
  ggplot(aes(x = `Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF`)) +
  geom_histogram(fill = "darkgreen", binwidth = 50) +
  theme_minimal() +
  labs(
    x = "Emisiones (millones de ton)",
    y = "Frecuencia",
    title = "Distribución de emisiones GHE"
  )
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Interpretation: the plot is extremely right-skewed (many countries with low
# emissions and a few with very high emissions).
# A logarithmic transformation is needed to tame the spread and the extremes.
# The result is stored in a new column, GHE_log.
# log1p(x) computes log(1 + x): zero emissions map to 0 instead of -Inf, and
# it stays accurate for values close to zero, unlike a hand-written log(x + 1).
datos <- datos %>%
  mutate(
    GHE_log = log1p(
      `Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF`
    )
  )

# Histogram of the log-transformed emissions (bin width 0.5).
ggplot(data = datos, mapping = aes(x = GHE_log)) +
  geom_histogram(fill = "red", binwidth = 0.5) +
  theme_minimal() +
  labs(
    title = "Distribución de emisiones GHE (Transformación Logarítmica)",
    x = "Log(Emisiones + 1)",
    y = "Frecuencia"
  )
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Top emitting countries in 2018 (a quick way to spot extreme values).
# Keep only the 2018 rows.
datos_2018 <- filter(datos, Year == 2018)

# Keep the country and its emissions, sort from highest to lowest emissions
# and retain the first 10 rows.
top_contaminantes <- datos_2018 %>%
  select(
    Country,
    `Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF`
  ) %>%
  arrange(
    desc(`Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF`)
  ) %>%
  head(10)

# Render the table.
top_contaminantes %>%
  kable(caption = "Top 10 países con mayores Emisiones GHE en 2018") %>%
  kable_styling()
Top 10 países con mayores Emisiones GHE en 2018
Country Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF
China 9956.569
United States 5424.882
India 2591.324
Russian Federation 1691.360
Japan 1135.688
Iran, Islamic Rep.  755.402
Germany 755.362
Korea, Rep.  634.934
Canada 586.505
Saudi Arabia 576.758

4 Correlaciones

# 4.1 Correlation matrix for the poverty indicators
# Step 1: pick the six indicators of interest and give them short labels so
# the corrplot axes stay readable. Rows with a missing value in any of the six
# are dropped so that cor() gets complete data.
datos_cor_limpio <- na.omit(
  select(
    datos,
    Pobreza_Int = `Proportion of population below international poverty line (%) - SI_POV_DAY1`,
    Desnutricion = `Prevalence of undernourishment (%) - SN_ITK_DEFC`, # SDG 2
    Esperanza_Vida = `Life expectancy at birth, total (years) - SP.DYN.LE00.IN`, # SDG 3
    Ninos_Sin_Escuela = `Children out of school (% of primary school age) - SE.PRM.UNER.ZS`, # SDG 4
    GINI = `Gini index (World Bank estimate) - SI.POV.GINI`, # SDG 10
    PIB_Total = `GDP (current US$) - NY.GDP.MKTP.CD` # GDP
  )
)

# Step 2: correlation matrix (cor() defaults to Pearson).
cor_matrix_limpio <- datos_cor_limpio %>% cor()

# Step 3: draw the matrix.
corrplot(
  cor_matrix_limpio,
  title = "Matriz de Correlación de Indicadores de Pobreza y Desarrollo",
  method = "color",      # Colour encodes the strength of each correlation.
  type = "upper",        # Only the upper triangle (less clutter).
  diag = FALSE,          # Hide the diagonal of self-correlations.
  order = "hclust",      # Reorder by hierarchical clustering to group related variables.
  addCoef.col = "black", # Print the coefficients in black.
  tl.cex = 0.8,          # Text label size.
  mar = c(0, 0, 1, 0)    # Margins, leaving room for the title.
)

# 4.2 Correlations with life expectancy
# Correlate every numeric column of `datos` with life expectancy
# (SP.DYN.LE00.IN). use = "complete.obs" restricts each pairwise computation
# to rows where both values are present.
# Note: the numeric columns include life expectancy itself (correlation 1)
# and any derived numeric columns already added to `datos`.
correlaciones <- datos %>%
  select(where(is.numeric)) %>%
  summarise(
    across(
      everything(),
      function(columna) {
        cor(
          columna,
          datos$`Life expectancy at birth, total (years) - SP.DYN.LE00.IN`,
          use = "complete.obs"
        )
      }
    )
  ) %>%
  # Wide (one column per variable) to long (Variable, Correlación).
  pivot_longer(
    everything(),
    names_to = "Variable",
    values_to = "Correlación"
  ) %>%
  # Highest (most positive) correlation first.
  arrange(desc(Correlación))

# Render the styled table.
correlaciones %>%
  kable(caption = "Correlación de todas las variables con Esperanza de Vida") %>%
  kable_styling()
Correlación de todas las variables con Esperanza de Vida
Variable Correlación
Life expectancy at birth, total (years) - SP.DYN.LE00.IN 1.0000000
Access to electricity (% of population) - EG.ELC.ACCS.ZS 0.8462019
School enrollment, secondary (% gross) - SE.SEC.ENRR 0.8062534
Proportion of population using basic drinking water services (%) - SP_ACS_BSRVH2O 0.7659222
Primary completion rate, total (% of relevant age group) - SE.PRM.CMPT.ZS 0.7280693
Individuals using the Internet (% of population) - IT.NET.USER.ZS 0.7179673
School enrollment, preprimary (% gross) - SE.PRE.ENRR 0.6812603
Urban population (% of total population) - SP.URB.TOTL.IN.ZS 0.6443750
Proportion of population covered by at least a 3G mobile network (%) - IT_MOB_3GNTWK 0.6221733
Automated teller machines (ATMs) (per 100,000 adults) - FB.ATM.TOTL.P5 0.6062708
GDP per capita (current US$) - NY.GDP.PCAP.CD 0.5862631
Proportion of population covered by at least a 2G mobile network (%) - IT_MOB_2GNTWK 0.5778520
Broad money (% of GDP) - FM.LBL.BMNY.GD.ZS 0.5550244
GHE_log 0.4632149
Women Business and the Law Index Score (scale 1-100) - SG.LAW.INDX 0.4016865
Compulsory education, duration (years) - SE.COM.DURS 0.3811764
Exports of goods and services (% of GDP) - NE.EXP.GNFS.ZS 0.3241664
Trade (% of GDP) - NE.TRD.GNFS.ZS 0.2637263
Adjusted net savings, excluding particulate emission damage (% of GNI) - NY.ADJ.SVNX.GN.ZS 0.2571140
Proportion of seats held by women in national parliaments (%) - SG.GEN.PARL.ZS 0.2189707
GDP (current US$) - NY.GDP.MKTP.CD 0.2021001
Year 0.1939736
Imports of goods and services (% of GDP) - NE.IMP.GNFS.ZS 0.1704533
General government final consumption expenditure (% of GDP) - NE.CON.GOVT.ZS 0.1428950
Annual production-based emissions of carbon dioxide (CO2) measured in million tonnes - GH.EM.IC.LUF 0.1242841
Gross savings (% of GDP) - NY.GNS.ICTR.ZS 0.1242556
School enrollment, primary (% gross) - SE.PRM.ENRR 0.0600071
Population, total - SP.POP.TOTL 0.0116152
Adjusted net national income per capita (annual % growth) - NY.ADJ.NNTY.PC.KD.ZG -0.0191602
Adjusted savings: carbon dioxide damage (% of GNI) - NY.ADJ.DCO2.GN.ZS -0.0364124
Unemployment rate, male (%) - SL_TLF_UEM -0.0675769
Unemployment rate, women (%) - SL_TLF_UEM -0.1135456
Inflation, consumer prices (annual %) - FP.CPI.TOTL.ZG -0.1911438
Adjusted savings: natural resources depletion (% of GNI) - NY.ADJ.DRES.GN.ZS -0.2593173
Gross national expenditure (% of GDP) - NE.DAB.TOTL.ZS -0.2665952
Renewable electricity output (% of total electricity output) - EG.ELC.RNEW.ZS -0.2789410
Total natural resources rents (% of GDP) - NY.GDP.TOTL.RT.ZS -0.3014496
Final consumption expenditure (% of GDP) - NE.CON.TOTL.ZS -0.3047017
Adjusted savings: net forest depletion (% of GNI) - NY.ADJ.DFOR.GN.ZS -0.3320652
Gini index (World Bank estimate) - SI.POV.GINI -0.3529437
GINI_sqrt -0.3539959
Cost of business start-up procedures, male (% of GNI per capita) - IC.REG.COST.PC.MA.ZS -0.5000100
Cost of business start-up procedures, female (% of GNI per capita) - IC.REG.COST.PC.FE.ZS -0.5000397
Prevalence of undernourishment (%) - SN_ITK_DEFC -0.5736113
Rural population (% of total population) - SP.RUR.TOTL.ZS -0.6443750
Children out of school (% of primary school age) - SE.PRM.UNER.ZS -0.6612297
Renewable energy consumption (% of total final energy consumption) - EG.FEC.RNEW.ZS -0.6950236
Proportion of population below international poverty line (%) - SI_POV_DAY1 -0.7651884
Adjusted savings: particulate emission damage (% of GNI) - NY.ADJ.DPEM.GN.ZS -0.7923531
Pupil-teacher ratio, primary - SE.PRM.ENRL.TC.ZS -0.7934361
# 5 Imputation
# Pick the predictors for the kNN step: the first five variable names in
# `correlaciones`, skipping life expectancy itself.
# NOTE(review): this takes rows in the order `correlaciones` is stored. If
# that table is sorted by signed correlation (descending), only the strongest
# POSITIVE correlations are chosen and strong negative ones are never
# considered - confirm whether ranking by absolute value was intended.
top_5_vars <- correlaciones %>%
  pull(Variable) %>%
  setdiff("Life expectancy at birth, total (years) - SP.DYN.LE00.IN") %>%
  head(5)

# 2. Imputation for year 2000 (special requirement: use the 2001 data).
# Mean life expectancy over all rows of year 2001, ignoring missing values.
# NOTE(review): this is one global mean across every row of 2001, not each
# country's own 2001 value - confirm this is what the requirement intends.
mean_2001 <- mean(
  datos$`Life expectancy at birth, total (years) - SP.DYN.LE00.IN`[datos$Year == 2001],
  na.rm = TRUE
)

# Logical mask: rows of year 2000 whose life expectancy is missing.
na_2000_indices <-
  is.na(datos$`Life expectancy at birth, total (years) - SP.DYN.LE00.IN`) &
  datos$Year == 2000

# Fill those gaps with the 2001 mean.
datos$`Life expectancy at birth, total (years) - SP.DYN.LE00.IN` <- replace(
  datos$`Life expectancy at birth, total (years) - SP.DYN.LE00.IN`,
  na_2000_indices,
  mean_2001
)

# 3. Imputation for the remaining years (k-nearest neighbours).
# Columns handed to kNN: life expectancy (the target, listed first) followed
# by the five variables selected in `top_5_vars`.
imputacion_vars <- c(
  "Life expectancy at birth, total (years) - SP.DYN.LE00.IN",
  top_5_vars
)

# Subset of `datos` restricted to those columns.
datos_imputar <- select(datos, all_of(imputacion_vars))

# Run VIM's kNN with k = 5. `variable` restricts imputation to the life
# expectancy column (the first entry of imputacion_vars).
# `imp_data` holds the imputed data set.
imp_data <- VIM::kNN(
  data = datos_imputar,
  variable = imputacion_vars[[1]],
  k = 5
)
##                          Access to electricity (% of population) - EG.ELC.ACCS.ZS 
##                                                                           1.27018 
##                              School enrollment, secondary (% gross) - SE.SEC.ENRR 
##                                                                           6.19735 
## Proportion of population using basic drinking water services (%) - SP_ACS_BSRVH2O 
##                                                                          26.00000 
##         Primary completion rate, total (% of relevant age group) - SE.PRM.CMPT.ZS 
##                                                                          16.57523 
##                 Individuals using the Internet (% of population) - IT.NET.USER.ZS 
##                                                                           0.00000 
##                          Access to electricity (% of population) - EG.ELC.ACCS.ZS 
##                                                                         100.00000 
##                              School enrollment, secondary (% gross) - SE.SEC.ENRR 
##                                                                         163.93472 
## Proportion of population using basic drinking water services (%) - SP_ACS_BSRVH2O 
##                                                                         100.00000 
##         Primary completion rate, total (% of relevant age group) - SE.PRM.CMPT.ZS 
##                                                                         134.54251 
##                 Individuals using the Internet (% of population) - IT.NET.USER.ZS 
##                                                                          99.65285
# Copy the imputed life expectancy back into the original data set.
# The assignment is positional, so it assumes `imp_data` has the same rows in
# the same order as `datos`.
datos[["Life expectancy at birth, total (years) - SP.DYN.LE00.IN"]] <-
  imp_data[["Life expectancy at birth, total (years) - SP.DYN.LE00.IN"]]

# 4. Report how many missing values remain (expected: 0).
datos[["Life expectancy at birth, total (years) - SP.DYN.LE00.IN"]] %>%
  is.na() %>%
  sum() %>%
  paste("NAs restantes en Esperanza de Vida después de kNN:", .) %>%
  print()
## [1] "NAs restantes en Esperanza de Vida después de kNN: 0"
### 6 Summary
# Keep only the rows of the most recent year present in the data.
# na.rm = TRUE matters: with a single missing Year, max() would return NA,
# every comparison would be NA, and filter() would silently return zero rows.
datos_final <- datos %>%
  filter(Year == max(Year, na.rm = TRUE))

# NOTE: the 'vars_analisis' vector was removed; the columns are referenced
#       directly by name to avoid the error it caused inside summarise().

# 1. Measures of central tendency (mean and median) per region.
# The three indicators get short aliases first so the summary expressions
# stay readable; missing values are ignored in every statistic.
# A region with no observed values for an indicator yields NaN (mean) and
# NA (median).
tendencia <- datos_final %>%
  select(
    Region,
    Pobreza = `Proportion of population below international poverty line (%) - SI_POV_DAY1`,
    GINI = `Gini index (World Bank estimate) - SI.POV.GINI`,
    Esperanza = `Life expectancy at birth, total (years) - SP.DYN.LE00.IN`
  ) %>%
  group_by(Region) %>%
  summarise(
    Media_Pobreza = mean(Pobreza, na.rm = TRUE),
    Mediana_Pobreza = median(Pobreza, na.rm = TRUE),
    Media_GINI = mean(GINI, na.rm = TRUE),
    Mediana_GINI = median(GINI, na.rm = TRUE),
    Media_Esperanza = mean(Esperanza, na.rm = TRUE),
    Mediana_Esperanza = median(Esperanza, na.rm = TRUE)
  )

# 2. Measures of dispersion per region: standard deviation (not robust) and
# median absolute deviation, MAD (robust).
# The three indicators get short aliases first; missing values are ignored in
# every statistic.
dispersion <- datos_final %>%
  select(
    Region,
    Pobreza = `Proportion of population below international poverty line (%) - SI_POV_DAY1`,
    GINI = `Gini index (World Bank estimate) - SI.POV.GINI`,
    Esperanza = `Life expectancy at birth, total (years) - SP.DYN.LE00.IN`
  ) %>%
  group_by(Region) %>%
  summarise(
    SD_Pobreza = sd(Pobreza, na.rm = TRUE),
    MAD_Pobreza = mad(Pobreza, na.rm = TRUE),
    SD_GINI = sd(GINI, na.rm = TRUE),
    MAD_GINI = mad(GINI, na.rm = TRUE),
    SD_Esperanza = sd(Esperanza, na.rm = TRUE),
    MAD_Esperanza = mad(Esperanza, na.rm = TRUE)
  )

# Render the styled central-tendency table.
tendencia %>%
  kable(caption = "Medidas de Tendencia Central por Región (Último Año)") %>%
  kable_styling()
Medidas de Tendencia Central por Región (Último Año)
Region Media_Pobreza Mediana_Pobreza Media_GINI Mediana_GINI Media_Esperanza Mediana_Esperanza
Central and Southern Asia 1.3750000 0.55 32.27500 29.7 72.47567 71.51650
Eastern and South-Eastern Asia 3.1000000 2.25 37.28333 37.1 75.41964 75.51950
Europe and Northern America 0.4483871 0.20 31.00323 30.2 79.14412 80.89268
Latin America and Caribbean 3.2071429 1.70 45.39286 45.4 74.66935 74.74400
Northern Africa and Western Asia 1.1800000 0.00 34.28000 34.4 76.02221 76.45300
Oceania NaN NA NaN NA 74.31789 71.81800
Sub-Saharan Africa 33.1250000 41.05 38.55000 35.4 62.58853 62.97300
# Render the styled dispersion table.
dispersion %>%
  kable(caption = "Medidas de Dispersión por Región (Último Año)") %>%
  kable_styling()
Medidas de Dispersión por Región (Último Año)
Region SD_Pobreza MAD_Pobreza SD_GINI MAD_GINI SD_Esperanza MAD_Esperanza
Central and Southern Asia 2.0336748 0.44478 6.732694 2.89107 3.313894 1.980754
Eastern and South-Eastern Asia 3.6353817 2.29803 3.225782 2.29803 6.164585 8.732514
Europe and Northern America 0.7334262 0.29652 4.609663 4.44780 3.506372 3.609661
Latin America and Caribbean 3.9118094 1.70499 4.352977 4.81845 3.515416 2.840662
Northern Africa and Western Asia 1.9524344 0.00000 5.779879 2.96520 3.130638 3.036365
Oceania NA NA NA NA 6.436534 4.427044
Sub-Saharan Africa 22.2036596 8.00604 8.644652 2.66868 5.027565 4.400357