Limpieza de bases de datos

BROADBAND

library(rio)
broadband = import("broadband.csv")
str(broadband)
## 'data.frame':    214 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Brazil" ...
##  $ slug               : chr  "china" "united-states" "japan" "brazil" ...
##  $ value              : chr  "483,549,500" "121,176,000" "44,000,791" "36,344,670" ...
##  $ date_of_information: chr  "2020" "2020" "2020" "2020" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "South America" ...
library(readr)
broadband$value <- parse_number(broadband$value)
str(broadband)
## 'data.frame':    214 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Brazil" ...
##  $ slug               : chr  "china" "united-states" "japan" "brazil" ...
##  $ value              : num  4.84e+08 1.21e+08 4.40e+07 3.63e+07 3.62e+07 ...
##  $ date_of_information: chr  "2020" "2020" "2020" "2020" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "South America" ...
table(broadband$region)
## 
##                            Africa             Australia and Oceania 
##                                53                                17 
## Central America and the Caribbean                      Central Asia 
##                                27                                 6 
##           East and Southeast Asia                            Europe 
##                                19                                47 
##                       Middle East                     North America 
##                                19                                 5 
##                     South America                        South Asia 
##                                13                                 8
colnames(broadband)[colnames(broadband) == "value"] <- "bband"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
broadband = select(broadband, name, bband) 

CARBON EMISSIONS - carboncito

metric tonnes of co2

carboncito = import("carbon_emissions.csv")
str(carboncito)
## 'data.frame':    218 obs. of  6 variables:
##  $ name                : chr  "China" "United States" "India" "Russia" ...
##  $ slug                : chr  "china" "united-states" "india" "russia" ...
##  $ metric tonnes of CO2: chr  "10,773,248,000" "5,144,361,000" "2,314,738,000" "1,848,070,000" ...
##  $ date_of_information : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region              : chr  "East and Southeast Asia" "North America" "South Asia" "Central Asia" ...
colnames(carboncito)[colnames(carboncito) == "metric tonnes of CO2"] <- "metrics"
carboncito$metrics <- parse_number(carboncito$metrics)
str(carboncito)
## 'data.frame':    218 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "India" "Russia" ...
##  $ slug               : chr  "china" "united-states" "india" "russia" ...
##  $ metrics            : num  1.08e+10 5.14e+09 2.31e+09 1.85e+09 1.10e+09 ...
##  $ date_of_information: int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "South Asia" "Central Asia" ...
colnames(carboncito)[colnames(carboncito) == "metrics"] <- "carbon_emi"
library(dplyr)
carboncito = select(carboncito, name, carbon_emi) 

EXTERNAL DEBT - pafuera

pafuera = import("debt_external.csv")
str(pafuera)
## 'data.frame':    207 obs. of  6 variables:
##  $ name               : chr  "United States" "United Kingdom" "France" "Germany" ...
##  $ slug               : chr  "united-states" "united-kingdom" "france" "germany" ...
##  $ value              : chr  "$20,275,951,000,000" "$8,722,000,000,000" "$6,356,000,000,000" "$5,671,463,000,000" ...
##  $ date_of_information: chr  "2019" "2019" "2019" "2019" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "Europe" "Europe" "Europe" ...
pafuera$value <- parse_number(pafuera$value)
str(pafuera)
## 'data.frame':    207 obs. of  6 variables:
##  $ name               : chr  "United States" "United Kingdom" "France" "Germany" ...
##  $ slug               : chr  "united-states" "united-kingdom" "france" "germany" ...
##  $ value              : num  2.03e+13 8.72e+12 6.36e+12 5.67e+12 4.35e+12 ...
##  $ date_of_information: chr  "2019" "2019" "2019" "2019" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "Europe" "Europe" "Europe" ...
colnames(pafuera)[colnames(pafuera) == "value"] <- "externa_deuda"
library(dplyr)
pafuera = select(pafuera, name, externa_deuda) 

ELECTRICITY - pikachu

Kw

pikachu = import("electricity.csv")
str(pikachu)
## 'data.frame':    213 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "India" "Japan" ...
##  $ slug               : chr  "china" "united-states" "india" "japan" ...
##  $ kW                 : chr  "2,217,925,000" "1,143,266,000" "432,768,000" "348,666,000" ...
##  $ date_of_information: int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "South Asia" "East and Southeast Asia" ...
colnames(pikachu)[colnames(pikachu) == "kW"] <- "kw"
pikachu$kw <- parse_number(pikachu$kw)
str(pikachu)
## 'data.frame':    213 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "India" "Japan" ...
##  $ slug               : chr  "china" "united-states" "india" "japan" ...
##  $ kw                 : num  2.22e+09 1.14e+09 4.33e+08 3.49e+08 2.76e+08 ...
##  $ date_of_information: int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "South Asia" "East and Southeast Asia" ...
colnames(pikachu)[colnames(pikachu) == "kw"] <- "kw_electricity"
library(dplyr)
pikachu = select(pikachu, name, kw_electricity) 

ENERGY - powerade

Btu/person

powerade = import("energy_compercap.csv")
str(powerade)
## 'data.frame':    212 obs. of  6 variables:
##  $ name               : chr  "Qatar" "Singapore" "Bahrain" "United Arab Emirates" ...
##  $ slug               : chr  "qatar" "singapore" "bahrain" "united-arab-emirates" ...
##  $ Btu/person         : chr  "723,582,000" "639,951,000" "547,976,000" "471,788,000" ...
##  $ date_of_information: int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Middle East" "East and Southeast Asia" "Middle East" "Middle East" ...
names(powerade)
## [1] "name"                "slug"                "Btu/person"         
## [4] "date_of_information" "ranking"             "region"
library(dplyr)
powerade <- powerade %>% rename(btu = `Btu/person`)
powerade$btu <- parse_number(powerade$btu)
str(powerade)
## 'data.frame':    212 obs. of  6 variables:
##  $ name               : chr  "Qatar" "Singapore" "Bahrain" "United Arab Emirates" ...
##  $ slug               : chr  "qatar" "singapore" "bahrain" "united-arab-emirates" ...
##  $ btu                : num  7.24e+08 6.40e+08 5.48e+08 4.72e+08 4.15e+08 ...
##  $ date_of_information: int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Middle East" "East and Southeast Asia" "Middle East" "Middle East" ...
colnames(powerade)[colnames(powerade) == "btu"] <- "btuperson"
library(dplyr)
powerade = select(powerade, name, btuperson) 

INFLATION - alangarcia

%

alangarcia = import("inflation.csv")
str(alangarcia)
## 'data.frame':    221 obs. of  6 variables:
##  $ name               : chr  "South Sudan" "Andorra" "Dominica" "American Samoa" ...
##  $ slug               : chr  "south-sudan" "andorra" "dominica" "american-samoa" ...
##  $ %                  : chr  "-6.69" "-0.9" "-0.73" "-0.5" ...
##  $ date_of_information: chr  "2022" "2015" "2020" "2015" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Europe" "Central America and the Caribbean" "Australia and Oceania" ...
alangarcia <- alangarcia %>% rename(percentage = `%`)
alangarcia$percentage <- readr::parse_number(gsub("(?<!\\d),(?!\\d)|\\.(?!\\d)", "", alangarcia$percentage, perl = TRUE))
str(alangarcia)
## 'data.frame':    221 obs. of  6 variables:
##  $ name               : chr  "South Sudan" "Andorra" "Dominica" "American Samoa" ...
##  $ slug               : chr  "south-sudan" "andorra" "dominica" "american-samoa" ...
##  $ percentage         : num  -6.69 -0.9 -0.73 -0.5 -0.4 -0.3 0 0 0.3 0.3 ...
##  $ date_of_information: chr  "2022" "2015" "2020" "2015" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Europe" "Central America and the Caribbean" "Australia and Oceania" ...
colnames(alangarcia)[colnames(alangarcia) == "percentage"] <- "infla_percent"
library(dplyr)
alangarcia = select(alangarcia, name, infla_percent) 

TELEPHONE MOBILE CELULAR - perapad

perapad = import("mobile_celular.csv")
str(perapad)
## 'data.frame':    225 obs. of  6 variables:
##  $ name               : chr  "China" "India" "United States" "Indonesia" ...
##  $ slug               : chr  "china" "india" "united-states" "indonesia" ...
##  $ value              : chr  "1,781,000,000" "1,143,000,000" "372,682,000" "316,553,000" ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "South Asia" "North America" "East and Southeast Asia" ...
perapad$value <- parse_number(perapad$value)
str(perapad)
## 'data.frame':    225 obs. of  6 variables:
##  $ name               : chr  "China" "India" "United States" "Indonesia" ...
##  $ slug               : chr  "china" "india" "united-states" "indonesia" ...
##  $ value              : num  1.78e+09 1.14e+09 3.73e+08 3.17e+08 2.45e+08 ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "South Asia" "North America" "East and Southeast Asia" ...
colnames(perapad)[colnames(perapad) == "value"] <- "mobile_cellular"
library(dplyr)
perapad = select(perapad, name, mobile_cellular) 

PETROLEUM PRODUCTION - derrame

BBL/DAY

derrame = import("petroleum_production.csv")
str(derrame)
## 'data.frame':    216 obs. of  6 variables:
##  $ name               : chr  "United States" "China" "Russia" "India" ...
##  $ slug               : chr  "united-states" "china" "russia" "india" ...
##  $ bbl/day            : chr  "20,300,000" "11,510,000" "6,076,000" "4,897,000" ...
##  $ date_of_information: int  2017 2015 2015 2015 2017 2017 2015 2015 2017 2017 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "East and Southeast Asia" "Central Asia" "South Asia" ...
derrame <- derrame %>% rename(bbl = `bbl/day`)
derrame$bbl <- parse_number(derrame$bbl)
str(derrame)
## 'data.frame':    216 obs. of  6 variables:
##  $ name               : chr  "United States" "China" "Russia" "India" ...
##  $ slug               : chr  "united-states" "china" "russia" "india" ...
##  $ bbl                : num  20300000 11510000 6076000 4897000 3467000 ...
##  $ date_of_information: int  2017 2015 2015 2015 2017 2017 2015 2015 2017 2017 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "East and Southeast Asia" "Central Asia" "South Asia" ...
colnames(derrame)[colnames(derrame) == "bbl"] <- "barrels_pd"
library(dplyr)
derrame = select(derrame, name, barrels_pd) 

PUBLIC DEBT - morosos

% OF GDP

morosos = import("public_debt.csv")
str(morosos)
## 'data.frame':    210 obs. of  6 variables:
##  $ name               : chr  "Greece" "Japan" "United Kingdom" "Singapore" ...
##  $ slug               : chr  "greece" "japan" "united-kingdom" "singapore" ...
##  $ % of GDP           : num  237 216 185 154 147 ...
##  $ date_of_information: chr  "2021" "2021" "2021" "2021" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Europe" "East and Southeast Asia" "Europe" "East and Southeast Asia" ...
morosos <- morosos %>% rename(gdp = `% of GDP`)
colnames(morosos)[colnames(morosos) == "gdp"] <- "publicdebt_percent"
library(dplyr)
morosos = select(morosos, name, publicdebt_percent) 

TELEPHONE FIXED LINES - cablemagico

cablemagico = import("telephone_fixedlines.csv")
str(cablemagico)
## 'data.frame':    224 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Germany" ...
##  $ slug               : chr  "china" "united-states" "japan" "germany" ...
##  $ value              : chr  "179,414,000" "91,623,000" "60,721,000" "38,580,000" ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "Europe" ...
cablemagico$value <- parse_number(cablemagico$value)
str(cablemagico)
## 'data.frame':    224 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Germany" ...
##  $ slug               : chr  "china" "united-states" "japan" "germany" ...
##  $ value              : num  1.79e+08 9.16e+07 6.07e+07 3.86e+07 3.77e+07 ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "Europe" ...
colnames(cablemagico)[colnames(cablemagico) == "value"] <- "fixedlines"
library(dplyr)
cablemagico = select(cablemagico, name, fixedlines) 

YOUTH UNEMPLOYED - otarola

%

otarola = import("youth_unemployed.csv")
otarola <- otarola %>% rename(percent = `%`)
str(otarola)
## 'data.frame':    203 obs. of  6 variables:
##  $ name               : chr  "Djibouti" "South Africa" "Eswatini" "Libya" ...
##  $ slug               : chr  "djibouti" "south-africa" "eswatini" "libya" ...
##  $ percent            : num  79.9 64.2 50.9 50.5 48.8 45.4 42.3 42.2 41.2 41.1 ...
##  $ date_of_information: int  2021 2021 2021 2021 2020 2021 2021 2020 2021 2021 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Africa" "Africa" "Africa" ...
colnames(otarola)[colnames(otarola) == "percent"] <- "unemployed_pc"
library(dplyr)
otarola = select(otarola, name, unemployed_pc) 

COMBINAR DATA FRAMES

library(dplyr)
library(purrr)
dataza <- list(alangarcia, broadband, cablemagico, carboncito, derrame, morosos, otarola, pafuera, perapad, pikachu, powerade) 
dataza <- reduce(dataza, full_join, by = "name")

REGRESIÓN

datareg <- dataza[complete.cases(dataza), ]

DEUDA PÚBLICA

hp1 = formula(publicdebt_percent ~ bband + fixedlines + carbon_emi + barrels_pd  + mobile_cellular + kw_electricity + btuperson)
reg_gauss=lm(hp1, data= datareg)
summary(reg_gauss)
## 
## Call:
## lm(formula = hp1, data = datareg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.591 -21.396  -4.417  14.348 175.262 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.286e+01  3.515e+00  15.040  < 2e-16 ***
## bband           -5.609e-07  7.054e-07  -0.795  0.42760    
## fixedlines       2.126e-06  6.689e-07   3.178  0.00176 ** 
## carbon_emi      -6.824e-08  3.576e-08  -1.908  0.05802 .  
## barrels_pd      -3.192e-06  7.091e-06  -0.450  0.65324    
## mobile_cellular -2.168e-08  4.804e-08  -0.451  0.65243    
## kw_electricity   3.113e-07  1.991e-07   1.564  0.11974    
## btuperson        1.480e-08  2.367e-08   0.625  0.53272    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.17 on 172 degrees of freedom
## Multiple R-squared:  0.1848, Adjusted R-squared:  0.1516 
## F-statistic: 5.568 on 7 and 172 DF,  p-value: 8.502e-06

DEUDA EXTERNA

hp2 = formula(externa_deuda ~ bband + fixedlines + carbon_emi + barrels_pd  + mobile_cellular + kw_electricity + btuperson)
reg_gauss=lm(hp2, data= datareg)
summary(reg_gauss)
## 
## Call:
## lm(formula = hp2, data = datareg)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.089e+12 -1.956e+11 -6.946e+10  3.159e+09  6.394e+12 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.165e+10  9.251e+10   0.450 0.653131    
## bband           -4.343e+04  1.857e+04  -2.339 0.020464 *  
## fixedlines       5.718e+04  1.760e+04   3.248 0.001398 ** 
## carbon_emi      -3.699e+03  9.412e+02  -3.930 0.000123 ***
## barrels_pd       5.459e+05  1.866e+05   2.925 0.003910 ** 
## mobile_cellular -4.335e+03  1.264e+03  -3.429 0.000759 ***
## kw_electricity   2.442e+04  5.240e+03   4.660 6.31e-06 ***
## btuperson        1.068e+03  6.231e+02   1.715 0.088221 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.994e+11 on 172 degrees of freedom
## Multiple R-squared:  0.7772, Adjusted R-squared:  0.7681 
## F-statistic: 85.72 on 7 and 172 DF,  p-value: < 2.2e-16

INFLACION

hp3 = formula(infla_percent ~ bband + fixedlines + carbon_emi + barrels_pd  + mobile_cellular + kw_electricity + btuperson)
reg_gauss=lm(hp3, data= datareg)
summary(reg_gauss)
## 
## Call:
## lm(formula = hp3, data = datareg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6552   -885   -737   -685 144275 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)      7.337e+02  1.139e+03   0.644    0.520
## bband            1.782e-04  2.285e-04   0.780    0.437
## fixedlines      -1.511e-04  2.167e-04  -0.697    0.487
## carbon_emi      -9.631e-06  1.158e-05  -0.831    0.407
## barrels_pd       2.049e-03  2.297e-03   0.892    0.374
## mobile_cellular  8.334e-06  1.556e-05   0.536    0.593
## kw_electricity   1.681e-06  6.450e-05   0.026    0.979
## btuperson        8.727e-08  7.669e-06   0.011    0.991
## 
## Residual standard error: 11070 on 172 degrees of freedom
## Multiple R-squared:  0.006893,   Adjusted R-squared:  -0.03352 
## F-statistic: 0.1705 on 7 and 172 DF,  p-value: 0.9908
hp4 = formula(unemployed_pc ~ bband + fixedlines + carbon_emi + barrels_pd  + mobile_cellular + kw_electricity + btuperson)
reg_gauss=lm(hp4, data= datareg)
summary(reg_gauss)
## 
## Call:
## lm(formula = hp4, data = datareg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.963  -9.271  -2.473   7.929  58.864 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.117e+01  1.292e+00  16.394   <2e-16 ***
## bband           -2.409e-07  2.592e-07  -0.929   0.3540    
## fixedlines       6.035e-11  2.458e-07   0.000   0.9998    
## carbon_emi       9.674e-09  1.314e-08   0.736   0.4627    
## barrels_pd      -2.153e-06  2.606e-06  -0.826   0.4099    
## mobile_cellular -1.286e-08  1.766e-08  -0.728   0.4675    
## kw_electricity   2.285e-08  7.317e-08   0.312   0.7552    
## btuperson       -1.526e-08  8.700e-09  -1.755   0.0811 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.56 on 172 degrees of freedom
## Multiple R-squared:  0.02942,    Adjusted R-squared:  -0.01009 
## F-statistic: 0.7447 on 7 and 172 DF,  p-value: 0.6344

```