Limpieza de bases de datos Cia

###1. ENERGY

###1.1. Electricity

library(rio)
electricity = import("Electricity.csv")

names(electricity)

## [1] "name"                "slug"                "kW"                 
## [4] "date_of_information" "ranking"             "region"

str(electricity)

## 'data.frame':    213 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "India" "Japan" ...
##  $ slug               : chr  "china" "united-states" "india" "japan" ...
##  $ kW                 : chr  "2,217,925,000" "1,143,266,000" "432,768,000" "348,666,000" ...
##  $ date_of_information: int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "South Asia" "East and Southeast Asia" ...

#install.packages("readr")
library(readr)

electricity$kW <- parse_number(electricity$kW)

#electricity$kW <- as.integer(electricity$kW) este lo pasa a entero pero borra el valor de china (?)

str(electricity)

## 'data.frame':    213 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "India" "Japan" ...
##  $ slug               : chr  "china" "united-states" "india" "japan" ...
##  $ kW                 : num  2.22e+09 1.14e+09 4.33e+08 3.49e+08 2.76e+08 ...
##  $ date_of_information: int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "South Asia" "East and Southeast Asia" ...

electricity <- electricity[, !(names(electricity) %in% c("slug", "date_of_information", "ranking", "region"))]

###1.2. Refined_petroleum_products_production

re_petroleum = import("Refined_petroleum_products_production.csv")

names(re_petroleum)

## [1] "name"                "slug"                "bbl/day"            
## [4] "date_of_information" "ranking"             "region"

str(re_petroleum)

## 'data.frame':    216 obs. of  6 variables:
##  $ name               : chr  "United States" "China" "Russia" "India" ...
##  $ slug               : chr  "united-states" "china" "russia" "india" ...
##  $ bbl/day            : chr  "20,300,000" "11,510,000" "6,076,000" "4,897,000" ...
##  $ date_of_information: int  2017 2015 2015 2015 2017 2017 2015 2015 2017 2017 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "East and Southeast Asia" "Central Asia" "South Asia" ...

re_petroleum$"bbl/day" <- parse_number(re_petroleum$"bbl/day")
str(re_petroleum)

## 'data.frame':    216 obs. of  6 variables:
##  $ name               : chr  "United States" "China" "Russia" "India" ...
##  $ slug               : chr  "united-states" "china" "russia" "india" ...
##  $ bbl/day            : num  20300000 11510000 6076000 4897000 3467000 ...
##  $ date_of_information: int  2017 2015 2015 2015 2017 2017 2015 2015 2017 2017 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "East and Southeast Asia" "Central Asia" "South Asia" ...

re_petroleum <- re_petroleum[, !(names(re_petroleum) %in% c("slug", "date_of_information", "ranking", "region"))]

###1.3. Carboon dioxide emissions

carbon_emissions = import("Carbon_dioxide_emissions.csv")

names(carbon_emissions)

## [1] "name"                 "slug"                 "metric tonnes of CO2"
## [4] "date_of_information"  "ranking"              "region"

str(carbon_emissions)

## 'data.frame':    218 obs. of  6 variables:
##  $ name                : chr  "China" "United States" "India" "Russia" ...
##  $ slug                : chr  "china" "united-states" "india" "russia" ...
##  $ metric tonnes of CO2: chr  "10,773,248,000" "5,144,361,000" "2,314,738,000" "1,848,070,000" ...
##  $ date_of_information : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region              : chr  "East and Southeast Asia" "North America" "South Asia" "Central Asia" ...

carbon_emissions$"metric tonnes of CO2" <- parse_number(carbon_emissions$"metric tonnes of CO2")
str(carbon_emissions)

## 'data.frame':    218 obs. of  6 variables:
##  $ name                : chr  "China" "United States" "India" "Russia" ...
##  $ slug                : chr  "china" "united-states" "india" "russia" ...
##  $ metric tonnes of CO2: num  1.08e+10 5.14e+09 2.31e+09 1.85e+09 1.10e+09 ...
##  $ date_of_information : int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region              : chr  "East and Southeast Asia" "North America" "South Asia" "Central Asia" ...

carbon_emissions <- carbon_emissions[, !(names(carbon_emissions) %in% c("slug", "date_of_information", "ranking", "region"))]

###1.4. Energy consumption per capita

energy_consumption = import("Energy_consumption_per_capita.csv")

names(energy_consumption)

## [1] "name"                "slug"                "Btu/person"         
## [4] "date_of_information" "ranking"             "region"

str(energy_consumption)

## 'data.frame':    212 obs. of  6 variables:
##  $ name               : chr  "Qatar" "Singapore" "Bahrain" "United Arab Emirates" ...
##  $ slug               : chr  "qatar" "singapore" "bahrain" "united-arab-emirates" ...
##  $ Btu/person         : chr  "723,582,000" "639,951,000" "547,976,000" "471,788,000" ...
##  $ date_of_information: int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Middle East" "East and Southeast Asia" "Middle East" "Middle East" ...

energy_consumption$"Btu/person" <- parse_number(energy_consumption$"Btu/person")
str(energy_consumption)

## 'data.frame':    212 obs. of  6 variables:
##  $ name               : chr  "Qatar" "Singapore" "Bahrain" "United Arab Emirates" ...
##  $ slug               : chr  "qatar" "singapore" "bahrain" "united-arab-emirates" ...
##  $ Btu/person         : num  7.24e+08 6.40e+08 5.48e+08 4.72e+08 4.15e+08 ...
##  $ date_of_information: int  2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Middle East" "East and Southeast Asia" "Middle East" "Middle East" ...

energy_consumption <- energy_consumption[, !(names(energy_consumption) %in% c("slug", "date_of_information", "ranking", "region"))]

###2. COMMUNICATIONS

###2.1. Telephones_fixed_lines

telephones_fixed = import("Telephones_fixed_lines.csv")

names(telephones_fixed)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(telephones_fixed)

## 'data.frame':    224 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Germany" ...
##  $ slug               : chr  "china" "united-states" "japan" "germany" ...
##  $ value              : chr  "179,414,000" "91,623,000" "60,721,000" "38,580,000" ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "Europe" ...

telephones_fixed$value <- parse_number(telephones_fixed$value)
str(telephones_fixed)

## 'data.frame':    224 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Germany" ...
##  $ slug               : chr  "china" "united-states" "japan" "germany" ...
##  $ value              : num  1.79e+08 9.16e+07 6.07e+07 3.86e+07 3.77e+07 ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "Europe" ...

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

telephones_fixed <- telephones_fixed %>% 
  rename(value_telefixed = value)

telephones_fixed <- telephones_fixed[, !(names(telephones_fixed) %in% c("slug", "date_of_information", "ranking", "region"))]

###2.2. Telephones_mobile_cellular

telephones_mobile = import("Telephones_mobile_cellular.csv")

names(telephones_mobile)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(telephones_mobile)

## 'data.frame':    225 obs. of  6 variables:
##  $ name               : chr  "China" "India" "United States" "Indonesia" ...
##  $ slug               : chr  "china" "india" "united-states" "indonesia" ...
##  $ value              : chr  "1,781,000,000" "1,143,000,000" "372,682,000" "316,553,000" ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "South Asia" "North America" "East and Southeast Asia" ...

telephones_mobile$value <- parse_number(telephones_mobile$value)
str(telephones_mobile)

## 'data.frame':    225 obs. of  6 variables:
##  $ name               : chr  "China" "India" "United States" "Indonesia" ...
##  $ slug               : chr  "china" "india" "united-states" "indonesia" ...
##  $ value              : num  1.78e+09 1.14e+09 3.73e+08 3.17e+08 2.45e+08 ...
##  $ date_of_information: int  2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "South Asia" "North America" "East and Southeast Asia" ...

telephones_mobile <- telephones_mobile %>% 
  rename(value_telemobi = value)

telephones_mobile <- telephones_mobile[, !(names(telephones_mobile) %in% c("slug", "date_of_information", "ranking", "region"))]

###2.3. Broadband_fixed_subscriptions.csv

broadband_fixed = import("Broadband_fixed_subscriptions.csv")

names(broadband_fixed)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(broadband_fixed)

## 'data.frame':    214 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Brazil" ...
##  $ slug               : chr  "china" "united-states" "japan" "brazil" ...
##  $ value              : chr  "483,549,500" "121,176,000" "44,000,791" "36,344,670" ...
##  $ date_of_information: chr  "2020" "2020" "2020" "2020" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "South America" ...

broadband_fixed$value <- parse_number(broadband_fixed$value)
str(broadband_fixed)

## 'data.frame':    214 obs. of  6 variables:
##  $ name               : chr  "China" "United States" "Japan" "Brazil" ...
##  $ slug               : chr  "china" "united-states" "japan" "brazil" ...
##  $ value              : num  4.84e+08 1.21e+08 4.40e+07 3.63e+07 3.62e+07 ...
##  $ date_of_information: chr  "2020" "2020" "2020" "2020" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "East and Southeast Asia" "North America" "East and Southeast Asia" "South America" ...

broadband_fixed <- broadband_fixed %>% 
  rename(value_broad = value)

broadband_fixed <- broadband_fixed[, !(names(broadband_fixed) %in% c("slug", "date_of_information", "ranking", "region"))]

###3. ECONOMY

###3.1. Inflation Rate

inflation_rate = import("Inflation rate (consumer prices).csv")

names(inflation_rate)

## [1] "name"                "slug"                "%"                  
## [4] "date_of_information" "ranking"             "region"

str(inflation_rate)

## 'data.frame':    221 obs. of  6 variables:
##  $ name               : chr  "South Sudan" "Andorra" "Dominica" "American Samoa" ...
##  $ slug               : chr  "south-sudan" "andorra" "dominica" "american-samoa" ...
##  $ %                  : chr  "-6.69" "-0.9" "-0.73" "-0.5" ...
##  $ date_of_information: chr  "2022" "2015" "2020" "2015" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Europe" "Central America and the Caribbean" "Australia and Oceania" ...

#inflation_rate$"%" <- parse_number(inflation_rate$"%", locale = locale(decimal_mark = ".", grouping_mark = " "))
#str(inflation_rate)

inflation_rate$"%" <- readr::parse_number(gsub("(?<!\\d),(?!\\d)|\\.(?!\\d)", "", inflation_rate$"%", perl = TRUE))
str(inflation_rate)

## 'data.frame':    221 obs. of  6 variables:
##  $ name               : chr  "South Sudan" "Andorra" "Dominica" "American Samoa" ...
##  $ slug               : chr  "south-sudan" "andorra" "dominica" "american-samoa" ...
##  $ %                  : num  -6.69 -0.9 -0.73 -0.5 -0.4 -0.3 0 0 0.3 0.3 ...
##  $ date_of_information: chr  "2022" "2015" "2020" "2015" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Europe" "Central America and the Caribbean" "Australia and Oceania" ...

inflation_rate <- inflation_rate %>% 
  rename("%_inflation" = "%")

inflation_rate <- inflation_rate[, !(names(inflation_rate) %in% c("slug", "date_of_information", "ranking", "region"))]

###3.2. Youth unemployment rate (ages 15-24)

youth_unemployment = import("Youth_unemployment_rate(ages 15-24).csv")

names(youth_unemployment)

## [1] "name"                "slug"                "%"                  
## [4] "date_of_information" "ranking"             "region"

str(youth_unemployment)

## 'data.frame':    203 obs. of  6 variables:
##  $ name               : chr  "Djibouti" "South Africa" "Eswatini" "Libya" ...
##  $ slug               : chr  "djibouti" "south-africa" "eswatini" "libya" ...
##  $ %                  : num  79.9 64.2 50.9 50.5 48.8 45.4 42.3 42.2 41.2 41.1 ...
##  $ date_of_information: int  2021 2021 2021 2021 2020 2021 2021 2020 2021 2021 ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Africa" "Africa" "Africa" "Africa" ...

youth_unemployment <- youth_unemployment %>% 
  rename("%_youth" = "%")

youth_unemployment <- youth_unemployment[, !(names(youth_unemployment) %in% c("slug", "date_of_information", "ranking", "region"))]

###3.3. Public debt

public_debt = import("Public debt.csv")

names(public_debt)

## [1] "name"                "slug"                "% of GDP"           
## [4] "date_of_information" "ranking"             "region"

str(public_debt)

## 'data.frame':    210 obs. of  6 variables:
##  $ name               : chr  "Greece" "Japan" "United Kingdom" "Singapore" ...
##  $ slug               : chr  "greece" "japan" "united-kingdom" "singapore" ...
##  $ % of GDP           : num  237 216 185 154 147 ...
##  $ date_of_information: chr  "2021" "2021" "2021" "2021" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "Europe" "East and Southeast Asia" "Europe" "East and Southeast Asia" ...

public_debt <- public_debt[, !(names(public_debt) %in% c("slug", "date_of_information", "ranking", "region"))]

###3.4. Debt external

debt_external = import("Debt_external.csv")

names(debt_external)

## [1] "name"                "slug"                "value"              
## [4] "date_of_information" "ranking"             "region"

str(debt_external)

## 'data.frame':    207 obs. of  6 variables:
##  $ name               : chr  "United States" "United Kingdom" "France" "Germany" ...
##  $ slug               : chr  "united-states" "united-kingdom" "france" "germany" ...
##  $ value              : chr  "$20,275,951,000,000" "$8,722,000,000,000" "$6,356,000,000,000" "$5,671,463,000,000" ...
##  $ date_of_information: chr  "2019" "2019" "2019" "2019" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "Europe" "Europe" "Europe" ...

debt_external$value <- parse_number(debt_external$value)
str(debt_external)

## 'data.frame':    207 obs. of  6 variables:
##  $ name               : chr  "United States" "United Kingdom" "France" "Germany" ...
##  $ slug               : chr  "united-states" "united-kingdom" "france" "germany" ...
##  $ value              : num  2.03e+13 8.72e+12 6.36e+12 5.67e+12 4.35e+12 ...
##  $ date_of_information: chr  "2019" "2019" "2019" "2019" ...
##  $ ranking            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ region             : chr  "North America" "Europe" "Europe" "Europe" ...

debt_external <- debt_external %>% 
  rename("value_debt" = value)

debt_external <- debt_external[, !(names(debt_external) %in% c("slug", "date_of_information", "ranking", "region"))]

###MERGE

gaaaa=merge(electricity,debt_external, by.x = "name", by.y = 'name')
gaaaa%>%
    rmarkdown::paged_table()

bases_datos <- list(electricity, re_petroleum, carbon_emissions, energy_consumption, telephones_fixed, telephones_mobile, broadband_fixed, inflation_rate, youth_unemployment, public_debt, debt_external)  
data_final <- bases_datos[[1]]  # La base de datos combinada comienza con la primera base de datos

for (i in 2:length(bases_datos)) {
  data_final <- merge(data_final, bases_datos[[i]], by = "name")
}

str(data_final)

## 'data.frame':    180 obs. of  12 variables:
##  $ name                : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ kW                  : num  776000 2531000 21694000 7344000 44731000 ...
##  $ bbl/day             : num  0 5638 627900 53480 669800 ...
##  $ metric tonnes of CO2: num  7.89e+06 3.79e+06 1.52e+08 1.94e+07 1.93e+08 ...
##  $ Btu/person          : num  3227000 38442000 61433000 11693000 79083000 ...
##  $ value_telefixed     : num  146000 177000 5576000 94000 7615000 ...
##  $ value_telemobi      : num  22678000 2782000 49019000 23978000 60236000 ...
##  $ value_broad         : num  26570 508937 3790459 230610 9571562 ...
##  $ %_inflation         : num  2.3 6.73 9.27 25.75 25.7 ...
##  $ %_youth             : num  20.2 27.8 31.9 18.5 29.9 36.1 10.8 11.4 16.5 30.8 ...
##  $ % of GDP            : num  7 82.4 27.5 65 57.6 ...
##  $ value_debt          : num  2.84e+08 9.31e+09 5.57e+09 4.21e+10 2.79e+11 ...

###Pregunta 1: Análisis Factorial

###Pasos previos

names(data_final)

##  [1] "name"                 "kW"                   "bbl/day"             
##  [4] "metric tonnes of CO2" "Btu/person"           "value_telefixed"     
##  [7] "value_telemobi"       "value_broad"          "%_inflation"         
## [10] "%_youth"              "% of GDP"             "value_debt"

data_final[data_final == 0] <- NA

dontselect=c("name")
select=setdiff(names(data_final),dontselect) 
theData=data_final[,select]

# usaremos:
library(magrittr)
head(theData,10)%>%
    rmarkdown::paged_table()

###Calculando correlaciones entre todas las variables

library(polycor)
corMatrix=polycor::hetcor(theData)$correlations
round(corMatrix,2)

##                         kW bbl/day metric tonnes of CO2 Btu/person
## kW                    1.00    0.83                 0.99       0.08
## bbl/day               0.83    1.00                 0.81       0.20
## metric tonnes of CO2  0.99    0.81                 1.00       0.09
## Btu/person            0.08    0.20                 0.09       1.00
## value_telefixed       0.96    0.80                 0.94       0.08
## value_telemobi        0.87    0.61                 0.88      -0.06
## value_broad           0.96    0.66                 0.96       0.04
## %_inflation          -0.02    0.00                -0.02      -0.02
## %_youth              -0.09   -0.09                -0.09      -0.22
## % of GDP              0.08    0.13                 0.03       0.06
## value_debt            0.48    0.75                 0.42       0.23
##                      value_telefixed value_telemobi value_broad %_inflation
## kW                              0.96           0.87        0.96       -0.02
## bbl/day                         0.80           0.61        0.66        0.00
## metric tonnes of CO2            0.94           0.88        0.96       -0.02
## Btu/person                      0.08          -0.06        0.04       -0.02
## value_telefixed                 1.00           0.81        0.93       -0.02
## value_telemobi                  0.81           1.00        0.85       -0.03
## value_broad                     0.93           0.85        1.00       -0.02
## %_inflation                    -0.02          -0.03       -0.02        1.00
## %_youth                        -0.11          -0.04       -0.10       -0.03
## % of GDP                        0.19          -0.03        0.04       -0.06
## value_debt                      0.53           0.19        0.30       -0.03
##                      %_youth % of GDP value_debt
## kW                     -0.09     0.08       0.48
## bbl/day                -0.09     0.13       0.75
## metric tonnes of CO2   -0.09     0.03       0.42
## Btu/person             -0.22     0.06       0.23
## value_telefixed        -0.11     0.19       0.53
## value_telemobi         -0.04    -0.03       0.19
## value_broad            -0.10     0.04       0.30
## %_inflation            -0.03    -0.06      -0.03
## %_youth                 1.00     0.20      -0.14
## % of GDP                0.20     1.00       0.36
## value_debt             -0.14     0.36       1.00

###Graficamos las correlaciones

library(ggcorrplot)

## Loading required package: ggplot2

ggcorrplot(corMatrix)

###AF

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

## The following object is masked from 'package:polycor':
## 
##     polyserial

psych::KMO(corMatrix)

## Kaiser-Meyer-Olkin factor adequacy
## Call: psych::KMO(r = corMatrix)
## Overall MSA =  0.75
## MSA for each item = 
##                   kW              bbl/day metric tonnes of CO2 
##                 0.83                 0.68                 0.79 
##           Btu/person      value_telefixed       value_telemobi 
##                 0.43                 0.88                 0.72 
##          value_broad          %_inflation              %_youth 
##                 0.68                 0.25                 0.42 
##             % of GDP           value_debt 
##                 0.48                 0.77

#Verificar si la matriz de correlación es adecuada

cortest.bartlett(corMatrix,n=nrow(theData))$p.value>0.05

## [1] FALSE

library(matrixcalc)

is.singular.matrix(corMatrix)

## [1] FALSE

#Determinar en cuantos factores o variables latentes podríamos redimensionar la data

fa.parallel(theData, fa = 'fa',correct = T,plot = F)

## Warning in cor.smooth(model): Matrix was not positive definite, smoothing was
## done

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

#Redimensionar a número menor de factores

library(GPArotation)

## 
## Attaching package: 'GPArotation'

## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

resfa <- fa(theData,
            nfactors = 3,
            cor = 'mixed',
            rotate = "varimax", 
            fm="minres")

## Warning in cor.smooth(model): Matrix was not positive definite, smoothing was
## done

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.1266e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12583e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12583e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12582e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12582e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.1266e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12589e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12584e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12582e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in pchisq(df = result$dof, ncp = x, q = result$STATISTIC):
## pnchisq(x=6.1266e+09, f=25, theta=6.12581e+09, ..): not converged in 1000000
## iter.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

print(resfa$loadings)

## 
## Loadings:
##                      MR1    MR2    MR3   
## kW                    0.971  0.246       
## bbl/day               0.686  0.589       
## metric tonnes of CO2  0.979  0.206       
## Btu/person                   0.322       
## value_telefixed       0.917  0.301  0.141
## value_telemobi        0.903              
## value_broad           0.962              
## %_inflation                              
## %_youth                     -0.169  0.277
## % of GDP                     0.213  0.472
## value_debt            0.262  0.941  0.191
## 
##                  MR1   MR2   MR3
## SS loadings    5.027 1.608 0.371
## Proportion Var 0.457 0.146 0.034
## Cumulative Var 0.457 0.603 0.637

print(resfa$loadings,cutoff = 0.5)

## 
## Loadings:
##                      MR1    MR2    MR3   
## kW                    0.971              
## bbl/day               0.686  0.589       
## metric tonnes of CO2  0.979              
## Btu/person                               
## value_telefixed       0.917              
## value_telemobi        0.903              
## value_broad           0.962              
## %_inflation                              
## %_youth                                  
## % of GDP                                 
## value_debt                   0.941       
## 
##                  MR1   MR2   MR3
## SS loadings    5.027 1.608 0.371
## Proportion Var 0.457 0.146 0.034
## Cumulative Var 0.457 0.603 0.637

fa.diagram(resfa,main = "Resultados del EFA")

#Evaluando resultados

sort(resfa$communality)

##          %_inflation              %_youth           Btu/person 
##          0.006663116          0.107622134          0.110143157 
##             % of GDP       value_telemobi              bbl/day 
##          0.269284456          0.816208782          0.818299104 
##          value_broad      value_telefixed           value_debt 
##          0.930382235          0.950995839          0.991087793 
## metric tonnes of CO2                   kW 
##          1.001893848          1.003662906

sort(resfa$complexity)

##       value_telemobi          value_broad          %_inflation 
##             1.003324             1.009656             1.025048 
## metric tonnes of CO2           Btu/person                   kW 
##             1.092190             1.126941             1.128750 
##           value_debt      value_telefixed             % of GDP 
##             1.241835             1.264896             1.401224 
##              %_youth              bbl/day 
##             1.724001             1.957138

resfa$TLI

## [1] -4656113

resfa$rms

## [1] 0.01647071

resfa$RMSEA

##      RMSEA      lower      upper confidence 
##   1166.819         NA   1169.999      0.900

resfa$BIC

## [1] 6126595136

###PREGUNTA 2

cor(data_final[,c(2:12)])

##                                kW bbl/day metric tonnes of CO2 Btu/person
## kW                    1.000000000      NA          0.993718658         NA
## bbl/day                        NA       1                   NA         NA
## metric tonnes of CO2  0.993718658      NA          1.000000000         NA
## Btu/person                     NA      NA                   NA          1
## value_telefixed                NA      NA                   NA         NA
## value_telemobi        0.870539984      NA          0.887093225         NA
## value_broad           0.963448718      NA          0.962391896         NA
## %_inflation          -0.003864285      NA         -0.007582493         NA
## %_youth              -0.074977408      NA         -0.071206012         NA
## % of GDP                       NA      NA                   NA         NA
## value_debt                     NA      NA                   NA         NA
##                      value_telefixed value_telemobi  value_broad  %_inflation
## kW                                NA     0.87053998  0.963448718 -0.003864285
## bbl/day                           NA             NA           NA           NA
## metric tonnes of CO2              NA     0.88709323  0.962391896 -0.007582493
## Btu/person                        NA             NA           NA           NA
## value_telefixed                    1             NA           NA           NA
## value_telemobi                    NA     1.00000000  0.849225178 -0.013548883
## value_broad                       NA     0.84922518  1.000000000 -0.008542596
## %_inflation                       NA    -0.01354888 -0.008542596  1.000000000
## %_youth                           NA    -0.05225389 -0.073855128 -0.026774124
## % of GDP                          NA             NA           NA           NA
## value_debt                        NA             NA           NA           NA
##                          %_youth % of GDP value_debt
## kW                   -0.07497741       NA         NA
## bbl/day                       NA       NA         NA
## metric tonnes of CO2 -0.07120601       NA         NA
## Btu/person                    NA       NA         NA
## value_telefixed               NA       NA         NA
## value_telemobi       -0.05225389       NA         NA
## value_broad          -0.07385513       NA         NA
## %_inflation          -0.02677412       NA         NA
## %_youth               1.00000000       NA         NA
## % of GDP                      NA        1         NA
## value_debt                    NA       NA          1

datos_sin_na <- data_final[complete.cases(data_final), ]

cor(datos_sin_na[,c(2:12)])

##                               kW      bbl/day metric tonnes of CO2  Btu/person
## kW                    1.00000000  0.825596828           0.99354664  0.08497303
## bbl/day               0.82559683  1.000000000           0.81267695  0.20415268
## metric tonnes of CO2  0.99354664  0.812676953           1.00000000  0.09206464
## Btu/person            0.08497303  0.204152685           0.09206464  1.00000000
## value_telefixed       0.95980779  0.802058773           0.93905789  0.07880541
## value_telemobi        0.86689704  0.614168876           0.88448189 -0.06377627
## value_broad           0.96298937  0.661855455           0.96173071  0.03776922
## %_inflation          -0.01585127  0.002884461          -0.01916433 -0.02269951
## %_youth              -0.09299223 -0.092420098          -0.08794022 -0.21993919
## % of GDP              0.07917610  0.132983658           0.03171408  0.05968754
## value_debt            0.47508934  0.754404362           0.42336012  0.22704345
##                      value_telefixed value_telemobi value_broad  %_inflation
## kW                        0.95980779     0.86689704  0.96298937 -0.015851267
## bbl/day                   0.80205877     0.61416888  0.66185545  0.002884461
## metric tonnes of CO2      0.93905789     0.88448189  0.96173071 -0.019164328
## Btu/person                0.07880541    -0.06377627  0.03776922 -0.022699512
## value_telefixed           1.00000000     0.81146230  0.92614603 -0.023072723
## value_telemobi            0.81146230     1.00000000  0.84695230 -0.027336627
## value_broad               0.92614603     0.84695230  1.00000000 -0.018362244
## %_inflation              -0.02307272    -0.02733663 -0.01836224  1.000000000
## %_youth                  -0.10535423    -0.03628953 -0.09510730 -0.033131537
## % of GDP                  0.19067439    -0.02579508  0.04280293 -0.060841078
## value_debt                0.53466241     0.18665655  0.30306443 -0.028830948
##                          %_youth    % of GDP  value_debt
## kW                   -0.09299223  0.07917610  0.47508934
## bbl/day              -0.09242010  0.13298366  0.75440436
## metric tonnes of CO2 -0.08794022  0.03171408  0.42336012
## Btu/person           -0.21993919  0.05968754  0.22704345
## value_telefixed      -0.10535423  0.19067439  0.53466241
## value_telemobi       -0.03628953 -0.02579508  0.18665655
## value_broad          -0.09510730  0.04280293  0.30306443
## %_inflation          -0.03313154 -0.06084108 -0.02883095
## %_youth               1.00000000  0.19973133 -0.14432389
## % of GDP              0.19973133  1.00000000  0.35874968
## value_debt           -0.14432389  0.35874968  1.00000000

dataClus=datos_sin_na[,c(2:12)]
row.names(dataClus)=datos_sin_na$name

library(cluster)
g.dist = daisy(dataClus, metric="gower")

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_nbclust(dataClus, hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "agnes")

###Pregunta 3

names(data_final)

##  [1] "name"                 "kW"                   "bbl/day"             
##  [4] "metric tonnes of CO2" "Btu/person"           "value_telefixed"     
##  [7] "value_telemobi"       "value_broad"          "%_inflation"         
## [10] "%_youth"              "% of GDP"             "value_debt"

Limpieza de bases de datos Cia

Alfredo Ludmir Aro Terleira

2024-06-12