Limpieza de bases de datos de la CIA

###1. ENERGY

###1.1. Electricity

library(rio)
# Read Electricity.csv into `electricity`, then print its column names and
# structure to see which columns still need cleaning.
electricity <- import("Electricity.csv")

names(electricity)

## [1] "name"                "slug"                "kW"                 
## [4] "date_of_information" "ranking"             "region"

str(electricity)

## 'data.frame':    213 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "India" "Japan" ...
##  $ slug               : chr  "china" "united-states" "india" "japan" ...
##  $ kW                 : chr  "2,217,925,000" "1,143,266,000" "432,768,000" "348,666,000" ...
##  $ date_of_information: int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "South Asia" "East and Southeast Asia" ...

#install.packages("readr")
library(readr)

# kW is read as text with thousands separators (e.g. "2,217,925,000");
# parse_number() converts it to a double.
electricity[["kW"]] <- readr::parse_number(electricity[["kW"]])

# Do not cast to integer: values above R's 32-bit integer limit
# (2,147,483,647), such as China's 2,217,925,000, would become NA.
# electricity$kW <- as.integer(electricity$kW)

str(electricity)

## 'data.frame':    213 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "India" "Japan" ...
##  $ slug               : chr  "china" "united-states" "india" "japan" ...
##  $ kW                 : num  2.22e+09 1.14e+09 4.33e+08 3.49e+08 2.76e+08 ...
##  $ date_of_information: int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "South Asia" "East and Southeast Asia" ...

###1.2. Refined_petroleum_products_production

# Read Refined_petroleum_products_production.csv into `re_petroleum`, then
# print its column names and structure.
re_petroleum <- import("Refined_petroleum_products_production.csv")

names(re_petroleum)

## [1] "name"                "slug"                "bbl/day"            
## [4] "date_of_information" "ranking"             "region"

str(re_petroleum)

## 'data.frame':    216 obs. of  6 variables:
##  $ name               : chr  "United States" "China" "Russia" "India" ...
##  $ slug               : chr  "united-states" "china" "russia" "india" ...
##  $ bbl/day            : chr  "20,300,000" "11,510,000" "6,076,000" "4,897,000" ...
##  $ date_of_information: int  2017 2015 2015 2015 2017 2017 2015 2015 2017 2017 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "East and Southeast Asia" "Central Asia" "South Asia" ...

# "bbl/day" is read as text with thousands separators (e.g. "20,300,000");
# convert it to a double and re-check the structure.
re_petroleum[["bbl/day"]] <- readr::parse_number(re_petroleum[["bbl/day"]])
str(re_petroleum)

## 'data.frame':    216 obs. of  6 variables:
##  $ name               : chr  "United States" "China" "Russia" "India" ...
##  $ slug               : chr  "united-states" "china" "russia" "india" ...
##  $ bbl/day            : num  20300000 11510000 6076000 4897000 3467000 ...
##  $ date_of_information: int  2017 2015 2015 2015 2017 2017 2015 2015 2017 2017 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "East and Southeast Asia" "Central Asia" "South Asia" ...

###1.3. Carbon dioxide emissions

# Read Carbon_dioxide_emissions.csv into `carbon_emissions`, then print its
# column names and structure.
carbon_emissions <- import("Carbon_dioxide_emissions.csv")

names(carbon_emissions)

## [1] "name"                 "slug"                 "metric tonnes of CO2"
## [4] "date_of_information"  "ranking"              "region"

str(carbon_emissions)

## 'data.frame':    218 obs. of  6 variables:
##  $ name                : chr  "China" "United States" "India" "Russia" ...
##  $ slug                : chr  "china" "united-states" "india" "russia" ...
##  $ metric tonnes of CO2: chr  "10,773,248,000" "5,144,361,000" "2,314,738,000" "1,848,070,000" ...
##  $ date_of_information : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region              : chr  "East and Southeast Asia" "North America" "South Asia" "Central Asia" ...

# "metric tonnes of CO2" is read as text with thousands separators
# (e.g. "10,773,248,000"); convert it to a double and re-check the structure.
carbon_emissions[["metric tonnes of CO2"]] <- readr::parse_number(
  carbon_emissions[["metric tonnes of CO2"]]
)
str(carbon_emissions)

## 'data.frame':    218 obs. of  6 variables:
##  $ name                : chr  "China" "United States" "India" "Russia" ...
##  $ slug                : chr  "china" "united-states" "india" "russia" ...
##  $ metric tonnes of CO2: num  1.08e+10 5.14e+09 2.31e+09 1.85e+09 1.10e+09 ...
##  $ date_of_information : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region              : chr  "East and Southeast Asia" "North America" "South Asia" "Central Asia" ...

###1.4. Energy consumption per capita

# Read Energy_consumption_per_capita.csv into `energy_consumption`, then print
# its column names and structure.
energy_consumption <- import("Energy_consumption_per_capita.csv")

names(energy_consumption)

## [1] "name"                "slug"                "Btu/person"         
## [4] "date_of_information" "ranking"             "region"

str(energy_consumption)

## 'data.frame':    212 obs. of  6 variables:
##  $ name               : chr  "Qatar" "Singapore" "Bahrain" "United Arab Emirates" ...
##  $ slug               : chr  "qatar" "singapore" "bahrain" "united-arab-emirates" ...
##  $ Btu/person         : chr  "723,582,000" "639,951,000" "547,976,000" "471,788,000" ...
##  $ date_of_information: int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Middle East" "East and Southeast Asia" "Middle East" "Middle East" ...

# "Btu/person" is read as text with thousands separators (e.g. "723,582,000");
# convert it to a double and re-check the structure.
energy_consumption[["Btu/person"]] <- readr::parse_number(
  energy_consumption[["Btu/person"]]
)
str(energy_consumption)

## 'data.frame':    212 obs. of  6 variables:
##  $ name               : chr  "Qatar" "Singapore" "Bahrain" "United Arab Emirates" ...
##  $ slug               : chr  "qatar" "singapore" "bahrain" "united-arab-emirates" ...
##  $ Btu/person         : num  7.24e+08 6.40e+08 5.48e+08 4.72e+08 4.15e+08 ...
##  $ date_of_information: int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Middle East" "East and Southeast Asia" "Middle East" "Middle East" ...

###2. COMMUNICATIONS

###2.1. Telephones_fixed_lines

# Read Telephones_fixed_lines.csv into `telephones_fixed`, then print its
# column names and structure.
telephones_fixed <- import("Telephones_fixed_lines.csv")

names(telephones_fixed)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(telephones_fixed)

## 'data.frame':    224 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Germany" ...
##  $ slug               : chr  "china" "united-states" "japan" "germany" ...
##  $ value              : chr  "179,414,000" "91,623,000" "60,721,000" "38,580,000" ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "Europe" ...

# `value` is read as text with thousands separators (e.g. "179,414,000");
# convert it to a double and re-check the structure.
telephones_fixed[["value"]] <- readr::parse_number(telephones_fixed[["value"]])
str(telephones_fixed)

## 'data.frame':    224 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Germany" ...
##  $ slug               : chr  "china" "united-states" "japan" "germany" ...
##  $ value              : num  1.79e+08 9.16e+07 6.07e+07 3.86e+07 3.77e+07 ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "Europe" ...

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Give the generic `value` column a table-specific name (several tables in this
# script also call their measure `value`).
telephones_fixed <- dplyr::rename(telephones_fixed, value_telefixed = value)

###2.2. Telephones_mobile_cellular

# Read Telephones_mobile_cellular.csv into `telephones_mobile`, then print its
# column names and structure.
telephones_mobile <- import("Telephones_mobile_cellular.csv")

names(telephones_mobile)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(telephones_mobile)

## 'data.frame':    225 obs. of  6 variables:
##  $ name               : chr  "China" "India" "United States" "Indonesia" ...
##  $ slug               : chr  "china" "india" "united-states" "indonesia" ...
##  $ value              : chr  "1,781,000,000" "1,143,000,000" "372,682,000" "316,553,000" ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "South Asia" "North America" "East and Southeast Asia" ...

# `value` is read as text with thousands separators (e.g. "1,781,000,000");
# convert it to a double and re-check the structure.
telephones_mobile[["value"]] <- readr::parse_number(
  telephones_mobile[["value"]]
)
str(telephones_mobile)

## 'data.frame':    225 obs. of  6 variables:
##  $ name               : chr  "China" "India" "United States" "Indonesia" ...
##  $ slug               : chr  "china" "india" "united-states" "indonesia" ...
##  $ value              : num  1.78e+09 1.14e+09 3.73e+08 3.17e+08 2.45e+08 ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "South Asia" "North America" "East and Southeast Asia" ...

# Give the generic `value` column a table-specific name (several tables in this
# script also call their measure `value`).
telephones_mobile <- dplyr::rename(telephones_mobile, value_telemobi = value)

###2.3. Broadband_fixed_subscriptions.csv

# Read Broadband_fixed_subscriptions.csv into `broadband_fixed`, then print its
# column names and structure.
broadband_fixed <- import("Broadband_fixed_subscriptions.csv")

names(broadband_fixed)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(broadband_fixed)

## 'data.frame':    214 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Brazil" ...
##  $ slug               : chr  "china" "united-states" "japan" "brazil" ...
##  $ value              : chr  "483,549,500" "121,176,000" "44,000,791" "36,344,670" ...
##  $ date_of_information: chr  "2020" "2020" "2020" "2020" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "South America" ...

# `value` is read as text with thousands separators (e.g. "483,549,500");
# convert it to a double and re-check the structure.
# NOTE(review): str() shows date_of_information as character in this table
# (it is integer in e.g. electricity); it is left unchanged here — confirm
# whether it needs converting.
broadband_fixed[["value"]] <- readr::parse_number(broadband_fixed[["value"]])
str(broadband_fixed)

## 'data.frame':    214 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Brazil" ...
##  $ slug               : chr  "china" "united-states" "japan" "brazil" ...
##  $ value              : num  4.84e+08 1.21e+08 4.40e+07 3.63e+07 3.62e+07 ...
##  $ date_of_information: chr  "2020" "2020" "2020" "2020" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "South America" ...

# Give the generic `value` column a table-specific name (several tables in this
# script also call their measure `value`).
broadband_fixed <- dplyr::rename(broadband_fixed, value_broad = value)

###3. ECONOMY

###3.1. Inflation Rate

# Read "Inflation rate (consumer prices).csv" into `inflation_rate`, then print
# its column names and structure.
inflation_rate <- import("Inflation rate (consumer prices).csv")

names(inflation_rate)

## [1] "name"                "slug"                "%"                  
## [4] "date_of_information" "ranking"             "region"

str(inflation_rate)

## 'data.frame':    221 obs. of  6 variables:
##  $ name               : chr  "South Sudan" "Andorra" "Dominica" "American Samoa" ...
##  $ slug               : chr  "south-sudan" "andorra" "dominica" "american-samoa" ...
##  $ %                  : chr  "-6.69" "-0.9" "-0.73" "-0.5" ...
##  $ date_of_information: chr  "2022" "2015" "2020" "2015" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Europe" "Central America and the Caribbean" "Australia and Oceania" ...

# Earlier attempt, kept for reference: parse with an explicit locale.
# inflation_rate$"%" <- parse_number(inflation_rate$"%", locale = locale(decimal_mark = ".", grouping_mark = " "))
# str(inflation_rate)

# "%" is read as text. Before parsing, drop stray punctuation: any comma that
# has no digit immediately before it and no digit immediately after it, and
# any period that is not immediately followed by a digit. Everything else is
# handed to parse_number() unchanged.
inflation_rate[["%"]] <- readr::parse_number(
  gsub(
    "(?<!\\d),(?!\\d)|\\.(?!\\d)",
    "",
    inflation_rate[["%"]],
    perl = TRUE
  )
)
str(inflation_rate)

## 'data.frame':    221 obs. of  6 variables:
##  $ name               : chr  "South Sudan" "Andorra" "Dominica" "American Samoa" ...
##  $ slug               : chr  "south-sudan" "andorra" "dominica" "american-samoa" ...
##  $ %                  : num  -6.69 -0.9 -0.73 -0.5 -0.4 -0.3 0 0 0.3 0.3 ...
##  $ date_of_information: chr  "2022" "2015" "2020" "2015" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Europe" "Central America and the Caribbean" "Australia and Oceania" ...

# Give the "%" column a table-specific name ("%" is also the measure column of
# the youth unemployment table).
inflation_rate <- dplyr::rename(inflation_rate, "%_inflation" = "%")

###3.2. Youth unemployment rate (ages 15-24)

# Read "Youth_unemployment_rate(ages 15-24).csv" into `youth_unemployment`,
# then print its column names and structure.
youth_unemployment <- import("Youth_unemployment_rate(ages 15-24).csv")

names(youth_unemployment)

## [1] "name"                "slug"                "%"                  
## [4] "date_of_information" "ranking"             "region"

str(youth_unemployment)

## 'data.frame':    203 obs. of  6 variables:
##  $ name               : chr  "Djibouti" "South Africa" "Eswatini" "Libya" ...
##  $ slug               : chr  "djibouti" "south-africa" "eswatini" "libya" ...
##  $ %                  : num  79.9 64.2 50.9 50.5 48.8 45.4 42.3 42.2 41.2 41.1 ...
##  $ date_of_information: int  2021 2021 2021 2021 2020 2021 2021 2020 2021 2021 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Africa" "Africa" "Africa" ...

# Give the "%" column a table-specific name ("%" is also the measure column of
# the inflation table). The column is already numeric, so no parsing is needed.
youth_unemployment <- dplyr::rename(youth_unemployment, "%_youth" = "%")

###3.3. Public debt

# Read "Public debt.csv" into `public_debt`, then print its column names and
# structure.
public_debt <- import("Public debt.csv")

names(public_debt)

## [1] "name"                "slug"                "% of GDP"           
## [4] "date_of_information" "ranking"             "region"

str(public_debt)

## 'data.frame':    210 obs. of  6 variables:
##  $ name               : chr  "Greece" "Japan" "United Kingdom" "Singapore" ...
##  $ slug               : chr  "greece" "japan" "united-kingdom" "singapore" ...
##  $ % of GDP           : num  237 216 185 154 147 ...
##  $ date_of_information: chr  "2021" "2021" "2021" "2021" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Europe" "East and Southeast Asia" "Europe" "East and Southeast Asia" ...

###3.4. Debt external

# Read Debt_external.csv into `debt_external`, then print its column names and
# structure.
debt_external <- import("Debt_external.csv")

names(debt_external)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(debt_external)

## 'data.frame':    207 obs. of  6 variables:
##  $ name               : chr  "United States" "United Kingdom" "France" "Germany" ...
##  $ slug               : chr  "united-states" "united-kingdom" "france" "germany" ...
##  $ value              : chr  "$20,275,951,000,000" "$8,722,000,000,000" "$6,356,000,000,000" "$5,671,463,000,000" ...
##  $ date_of_information: chr  "2019" "2019" "2019" "2019" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "Europe" "Europe" "Europe" ...

# `value` is read as text with a currency sign and thousands separators
# (e.g. "$20,275,951,000,000"); convert it to a double and re-check the
# structure.
# NOTE(review): str() shows date_of_information as character in this table
# (it is integer in e.g. electricity); it is left unchanged here — confirm
# whether it needs converting.
debt_external[["value"]] <- readr::parse_number(debt_external[["value"]])
str(debt_external)

## 'data.frame':    207 obs. of  6 variables:
##  $ name               : chr  "United States" "United Kingdom" "France" "Germany" ...
##  $ slug               : chr  "united-states" "united-kingdom" "france" "germany" ...
##  $ value              : num  2.03e+13 8.72e+12 6.36e+12 5.67e+12 4.35e+12 ...
##  $ date_of_information: chr  "2019" "2019" "2019" "2019" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "Europe" "Europe" "Europe" ...

# Give the generic `value` column a table-specific name (several tables in this
# script also call their measure `value`).
debt_external <- dplyr::rename(debt_external, value_debt = value)

###Prueba de unión de bases de datos

# Names present in electricity but absent from debt_external, sorted.
sort(setdiff(electricity[["name"]], debt_external[["name"]]))

##  [1] "American Samoa"                               
##  [2] "Antarctica"                                   
##  [3] "Cayman Islands"                               
##  [4] "French Polynesia"                             
##  [5] "Gaza Strip"                                   
##  [6] "Gibraltar"                                    
##  [7] "Guam"                                         
##  [8] "Saint Helena, Ascension, and Tristan da Cunha"
##  [9] "Saint Pierre and Miquelon"                    
## [10] "South Sudan"                                  
## [11] "Turks and Caicos Islands"                     
## [12] "Virgin Islands"                               
## [13] "Wake Island"

# Names present in debt_external but absent from electricity, sorted.
sort(setdiff(debt_external[["name"]], electricity[["name"]]))

## [1] "Andorra"                         "Anguilla"                       
## [3] "Liechtenstein"                   "Marshall Islands"               
## [5] "Micronesia, Federated States of" "Palau"                          
## [7] "Wallis and Futuna"

# Trial join of electricity and debt_external on "name". merge() defaults to
# all = FALSE, so only names found in both tables are kept. Columns that the
# two tables share besides the key are returned with .x / .y suffixes.
gaaaa <- merge(electricity, debt_external, by = "name")

# Show the joined table as a paged table in the rendered document.
rmarkdown::paged_table(gaaaa)

Limpieza de bases de datos de la CIA

Alfredo Ludmir Aro Terleira

2024-06-12