Anexo 3.5

En este fichero se mostrarán los pasos seguidos con el fin de cumplir con el objetivo 5:

Confirmar o rechazar la hipótesis planteada: “En los países más industrializados se invierte más en educación y en los países menos industrializados (que son más rurales) se invierte menos”.

1. Lectura de datos.

En primer lugar, hacemos la carga de datos:

library(readr)
# NOTE(review): absolute, machine-specific working directory; as written the
# script only runs on a machine where this exact path exists.
setwd("C:/Users/yassm/OneDrive/Escritorio/2º GCD (cuatri B)/PROYECTO II/TRABAJO PROYECTO II")
# Restore the objects stored in decadas.RData into the workspace.
# verbose = TRUE prints the name of every object as it is loaded.
load("decadas.RData", verbose = TRUE)

## Loading objects:
##   a70
##   a80
##   a90
##   a00
##   a10

1.1. Transformación inicial: cambio de nombres.

Dado que más adelante algunos paquetes requieren unos nombres de país concretos, hacemos un cambio de nombres inicial.

# Lookup of original country labels (names) to the short labels (values) used
# in the rest of the analysis. Per the accompanying text, some packages used
# later need these specific names.
nombres_cortos <- c(
  "United States"      = "USA",
  "United Kingdom"     = "UK",
  "Turkiye"            = "Turkey",
  "Egypt, Arab Rep."   = "Egypt",
  "Russian Federation" = "Russia",
  "Korea, Rep."        = "South Korea",
  "Slovak Republic"    = "Slovakia",
  "Venezuela, RB"      = "Venezuela",
  "Iran, Islamic Rep." = "Iran",
  "Congo, Dem. Rep."   = "Congo"
)

# Replace the country labels of one decade table.
#
# datos:   data frame with a `country` column.
# nombres: named character vector, original label -> replacement label.
# Returns `datos` with every matching `country` value replaced; rows whose
# country is not listed in `nombres` are left untouched.
renombrar_paises <- function(datos, nombres = nombres_cortos) {
  for (original in names(nombres)) {
    datos$country[datos$country == original] <- nombres[[original]]
  }
  datos
}

# Apply the same renaming to every decade so the tables stay comparable.
a70 <- renombrar_paises(a70)
a80 <- renombrar_paises(a80)
a90 <- renombrar_paises(a90)
a00 <- renombrar_paises(a00)
a10 <- renombrar_paises(a10)

2. Selección teniendo en cuenta faltantes.

2.1. Selección de variables o indicadores.

Para cumplir con el objetivo, nos interesan las variables relacionadas con industria, empleabilidad y con educación. Por tanto, en principio nos interesan los indicadores:

- Children in employment, total (% of children ages 7-14)
- Employment in industry (% of total employment) (modeled ILO estimate)
- Employment in services (% of total employment) (modeled ILO estimate)
- Energy use (kg of oil equivalent per capita)
- External balance on goods and services (% of GDP)
- Government expenditure on education, total (% of GDP)
- Industry (including construction), value added (% of GDP)
- Literacy rate, adult total (% of people ages 15 and above)
- Manufactures exports (% of merchandise exports)
- Manufacturing, value added (% of GDP)
- School enrollment, primary (% net)
- School enrollment, secondary (% net)
- School enrollment, tertiary (% gross)
- Self-employed, total (% of total employment) (modeled ILO estimate)
- Unemployment, female (% of female labor force) (modeled ILO estimate)
- Unemployment, total (% of total labor force) (modeled ILO estimate)
- Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)

Para empezar, estudiaremos la cantidad de datos faltantes por década y por indicador para tomar una decisión. Para ello, creamos una matriz de datos con el recuento de faltantes.

# Number of missing values per column (indicator), one single-column data
# frame per decade; the row names are the column names of the decade table.
df_a70 <- as.data.frame(colSums(is.na(a70)))
df_a80 <- as.data.frame(colSums(is.na(a80)))
df_a90 <- as.data.frame(colSums(is.na(a90)))
df_a00 <- as.data.frame(colSums(is.na(a00)))
df_a10 <- as.data.frame(colSums(is.na(a10)))

# Left join of two frames on their row names. merge() returns the key as its
# first column, so it is moved back into the row names and then dropped.
# all.x = TRUE keeps every row of `izq`; rows with no match in `der` get NA.
unir_por_fila <- function(izq, der) {
  unido <- merge(izq, der, by = "row.names", all.x = TRUE)
  rownames(unido) <- as.character(unido[[1]])
  unido[[1]] <- NULL
  unido
}

# Fold the decades from newest to oldest, starting from the a10 indicators.
df_faltantes <- Reduce(
  unir_por_fila,
  list(df_a10, df_a00, df_a90, df_a80, df_a70)
)

colnames(df_faltantes) <- c("a10", "a00", "a90", "a80", "a70")

Solo revisaremos aquellas filas del dataframe con las variables interesantes para el objetivo 5:

# Indicators considered for objective 5 (industry, employment and education).
indicadores_obj5 <- c(
  "Children in employment, total (% of children ages 7-14)",
  "Employment in industry (% of total employment) (modeled ILO estimate)",
  "Employment in services (% of total employment) (modeled ILO estimate)",
  "Energy use (kg of oil equivalent per capita)",
  "External balance on goods and services (% of GDP)",
  "Government expenditure on education, total (% of GDP)",
  "Industry (including construction), value added (% of GDP)",
  "Literacy rate, adult total (% of people ages 15 and above)",
  "Manufactures exports (% of merchandise exports)",
  "Manufacturing, value added (% of GDP)",
  "School enrollment, primary (% net)",
  "School enrollment, secondary (% net)",
  "School enrollment, tertiary (% gross)",
  "Self-employed, total (% of total employment) (modeled ILO estimate)",
  "Unemployment, female (% of female labor force) (modeled ILO estimate)",
  "Unemployment, total (% of total labor force) (modeled ILO estimate)",
  "Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)"
)

# Logical mask over the rows of df_faltantes, then show only those rows.
filas <- rownames(df_faltantes) %in% indicadores_obj5

df_faltantes[filas, ]

##                                                                                      a10
## Children in employment, total (% of children ages 7-14)                              175
## Employment in industry (% of total employment) (modeled ILO estimate)                 30
## Employment in services (% of total employment) (modeled ILO estimate)                 30
## Energy use (kg of oil equivalent per capita)                                          46
## External balance on goods and services (% of GDP)                                     27
## Government expenditure on education, total (% of GDP)                                 28
## Industry (including construction), value added (% of GDP)                             14
## Literacy rate, adult total (% of people ages 15 and above)                            69
## Manufactures exports (% of merchandise exports)                                       34
## Manufacturing, value added (% of GDP)                                                 19
## School enrollment, primary (% net)                                                    40
## School enrollment, secondary (% net)                                                  60
## School enrollment, tertiary (% gross)                                                 43
## Self-employed, total (% of total employment) (modeled ILO estimate)                   30
## Unemployment, female (% of female labor force) (modeled ILO estimate)                 30
## Unemployment, total (% of total labor force) (modeled ILO estimate)                   30
## Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)  30
##                                                                                      a00
## Children in employment, total (% of children ages 7-14)                              200
## Employment in industry (% of total employment) (modeled ILO estimate)                 29
## Employment in services (% of total employment) (modeled ILO estimate)                 29
## Energy use (kg of oil equivalent per capita)                                          47
## External balance on goods and services (% of GDP)                                     39
## Government expenditure on education, total (% of GDP)                                 49
## Industry (including construction), value added (% of GDP)                             31
## Literacy rate, adult total (% of people ages 15 and above)                            83
## Manufactures exports (% of merchandise exports)                                       37
## Manufacturing, value added (% of GDP)                                                 34
## School enrollment, primary (% net)                                                    55
## School enrollment, secondary (% net)                                                  88
## School enrollment, tertiary (% gross)                                                 47
## Self-employed, total (% of total employment) (modeled ILO estimate)                   29
## Unemployment, female (% of female labor force) (modeled ILO estimate)                 29
## Unemployment, total (% of total labor force) (modeled ILO estimate)                   29
## Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)  29
##                                                                                      a90
## Children in employment, total (% of children ages 7-14)                               NA
## Employment in industry (% of total employment) (modeled ILO estimate)                 NA
## Employment in services (% of total employment) (modeled ILO estimate)                 NA
## Energy use (kg of oil equivalent per capita)                                          47
## External balance on goods and services (% of GDP)                                     56
## Government expenditure on education, total (% of GDP)                                122
## Industry (including construction), value added (% of GDP)                             69
## Literacy rate, adult total (% of people ages 15 and above)                           128
## Manufactures exports (% of merchandise exports)                                       97
## Manufacturing, value added (% of GDP)                                                 98
## School enrollment, primary (% net)                                                    94
## School enrollment, secondary (% net)                                                 162
## School enrollment, tertiary (% gross)                                                 51
## Self-employed, total (% of total employment) (modeled ILO estimate)                   NA
## Unemployment, female (% of female labor force) (modeled ILO estimate)                 NA
## Unemployment, total (% of total labor force) (modeled ILO estimate)                   NA
## Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)  NA
##                                                                                      a80
## Children in employment, total (% of children ages 7-14)                               NA
## Employment in industry (% of total employment) (modeled ILO estimate)                 NA
## Employment in services (% of total employment) (modeled ILO estimate)                 NA
## Energy use (kg of oil equivalent per capita)                                         105
## External balance on goods and services (% of GDP)                                     96
## Government expenditure on education, total (% of GDP)                                148
## Industry (including construction), value added (% of GDP)                            120
## Literacy rate, adult total (% of people ages 15 and above)                           154
## Manufactures exports (% of merchandise exports)                                       72
## Manufacturing, value added (% of GDP)                                                143
## School enrollment, primary (% net)                                                   106
## School enrollment, secondary (% net)                                                 180
## School enrollment, tertiary (% gross)                                                 58
## Self-employed, total (% of total employment) (modeled ILO estimate)                   NA
## Unemployment, female (% of female labor force) (modeled ILO estimate)                 NA
## Unemployment, total (% of total labor force) (modeled ILO estimate)                   NA
## Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)  NA
##                                                                                      a70
## Children in employment, total (% of children ages 7-14)                               NA
## Employment in industry (% of total employment) (modeled ILO estimate)                 NA
## Employment in services (% of total employment) (modeled ILO estimate)                 NA
## Energy use (kg of oil equivalent per capita)                                          99
## External balance on goods and services (% of GDP)                                    115
## Government expenditure on education, total (% of GDP)                                170
## Industry (including construction), value added (% of GDP)                            153
## Literacy rate, adult total (% of people ages 15 and above)                           239
## Manufactures exports (% of merchandise exports)                                       96
## Manufacturing, value added (% of GDP)                                                170
## School enrollment, primary (% net)                                                   138
## School enrollment, secondary (% net)                                                 200
## School enrollment, tertiary (% gross)                                                 77
## Self-employed, total (% of total employment) (modeled ILO estimate)                   NA
## Unemployment, female (% of female labor force) (modeled ILO estimate)                 NA
## Unemployment, total (% of total labor force) (modeled ILO estimate)                   NA
## Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)  NA

Decidimos prescindir de “Children in employment, total (% of children ages 7-14)” debido a la casi inexistencia de datos para esta. Y también prescindiremos de los datos de las décadas 90, 80 y 70, porque en los dos últimos hay muchos faltantes (superando el 50% casi en todos los casos) y en el primero hay muchos faltantes o inexistencia de datos en variables altamente importantes (como “Employment in industry (% of total employment) (modeled ILO estimate)”, “Government expenditure on education, total (% of GDP)” y “Literacy rate, adult total (% of people ages 15 and above)”).

Así que nos quedamos con las décadas del siglo XXI (la de 2000 y la de 2010) y con los siguientes indicadores:

- Employment in industry (% of total employment) (modeled ILO estimate)
- Employment in services (% of total employment) (modeled ILO estimate)
- Energy use (kg of oil equivalent per capita)
- External balance on goods and services (% of GDP)
- Government expenditure on education, total (% of GDP)
- Industry (including construction), value added (% of GDP)
- Literacy rate, adult total (% of people ages 15 and above)
- Manufactures exports (% of merchandise exports)
- Manufacturing, value added (% of GDP)
- School enrollment, primary (% net)
- School enrollment, secondary (% net)
- School enrollment, tertiary (% gross)
- Self-employed, total (% of total employment) (modeled ILO estimate)
- Unemployment, female (% of female labor force) (modeled ILO estimate)
- Unemployment, total (% of total labor force) (modeled ILO estimate)
- Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)

2.2. Selección de observaciones o países.

Hacemos un subset solo con las variables que usaremos para el estudio. De esta manera nos quedan 16 variables más el país.

# Columns kept for the study: the country label plus the 16 selected
# indicators. The same list is used for both decades.
columnas_obj5 <- c(
  "country",
  "Employment in industry (% of total employment) (modeled ILO estimate)",
  "Employment in services (% of total employment) (modeled ILO estimate)",
  "Energy use (kg of oil equivalent per capita)",
  "External balance on goods and services (% of GDP)",
  "Government expenditure on education, total (% of GDP)",
  "Industry (including construction), value added (% of GDP)",
  "Literacy rate, adult total (% of people ages 15 and above)",
  "Manufactures exports (% of merchandise exports)",
  "Manufacturing, value added (% of GDP)",
  "School enrollment, primary (% net)",
  "School enrollment, secondary (% net)",
  "School enrollment, tertiary (% gross)",
  "Self-employed, total (% of total employment) (modeled ILO estimate)",
  "Unemployment, female (% of female labor force) (modeled ILO estimate)",
  "Unemployment, total (% of total labor force) (modeled ILO estimate)",
  "Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)"
)

# Column-only subsets of the 2010s and 2000s tables (all rows are kept).
vars_a10 <- a10[, columnas_obj5, drop = FALSE]

vars_a00 <- a00[, columnas_obj5, drop = FALSE]

# Per-country count of missing values among the selected indicators: one
# single-column data frame per decade, with the country name as row name.
# Each decade takes its labels from its OWN subset. Labelling the a00 counts
# with a10's country column misaligns counts and countries whenever the two
# tables differ in row count (cbind() then recycles and emits a warning).
# Building the frame directly also keeps the counts numeric instead of
# letting cbind() coerce them to character next to the country names.
dfp_a10 <- data.frame(
  a10 = rowSums(is.na(vars_a10)),
  row.names = vars_a10[["country"]]
)

dfp_a00 <- data.frame(
  a00 = rowSums(is.na(vars_a00)),
  row.names = vars_a00[["country"]]
)

# Left join on country name: every a10 country is kept, and a country absent
# from a00 gets NA in the a00 column. merge() returns the key in a
# "Row.names" column, which is moved back into the row names and dropped.
dfp_faltantes <- merge(dfp_a10, dfp_a00, by = "row.names", all.x = TRUE)
rownames(dfp_faltantes) <- as.character(dfp_faltantes[["Row.names"]])
dfp_faltantes[["Row.names"]] <- NULL

colnames(dfp_faltantes) <- c("a10", "a00")

A continuación mostramos los países que tienen 8 faltantes o menos en las 2 columnas; los que tengan más de 8 en alguna de ellas quedan fuera de este listado y veremos qué hacer con ellos:

# Accepted number of missing indicators per country: 0 to 8 inclusive.
# %in% yields FALSE for NA, so a country without a count in either decade is
# left out of the selection.
faltantes_admitidos <- 0:8
en_a10 <- dfp_faltantes$a10 %in% faltantes_admitidos
en_a00 <- dfp_faltantes$a00 %in% faltantes_admitidos
filas <- en_a10 & en_a00
dfp_faltantes[filas, ]

##                                                      a10 a00
## Afghanistan                                            3   7
## Africa Eastern and Southern                            2   2
## Africa Western and Central                             2   2
## Albania                                                0   0
## Algeria                                                1   1
## Angola                                                 1   2
## Antigua and Barbuda                                    6   8
## Arab World                                             0   0
## Argentina                                              0   0
## Armenia                                                0   3
## Aruba                                                  7   7
## Australia                                              3   1
## Austria                                                1   1
## Azerbaijan                                             1   1
## Bahamas, The                                           2   2
## Bahrain                                                0   3
## Bangladesh                                             0   1
## Barbados                                               0   0
## Belarus                                                0   1
## Belgium                                                1   1
## Belize                                                 1   0
## Benin                                                  0   0
## Bhutan                                                 0   1
## Bolivia                                                1   2
## Bosnia and Herzegovina                                 3   3
## Botswana                                               1   1
## Brazil                                                 0   1
## Brunei Darussalam                                      0   2
## Bulgaria                                               1   1
## Burkina Faso                                           1   1
## Burundi                                                1   2
## Cabo Verde                                             0   0
## Cambodia                                               0   0
## Cameroon                                               0   2
## Canada                                                 1   2
## Caribbean small states                                 1   2
## Central African Republic                               1   5
## Central Europe and the Baltics                         0   0
## Chad                                                   4   2
## Chile                                                  0   2
## China                                                  2   1
## Colombia                                               0   1
## Comoros                                                1   2
## Congo                                                  3   3
## Congo, Rep.                                            1   4
## Costa Rica                                             0   0
## Cote d'Ivoire                                          0   1
## Croatia                                                0   0
## Cuba                                                   0   0
## Cyprus                                                 0   0
## Czechia                                                1   1
## Denmark                                                1   1
## Djibouti                                               1   5
## Dominican Republic                                     0   0
## Early-demographic dividend                             0   0
## East Asia & Pacific                                    0   0
## East Asia & Pacific (excluding high income)            0   0
## East Asia & Pacific (IDA & IBRD countries)             0   0
## Ecuador                                                0   0
## Egypt                                                  0   1
## El Salvador                                            0   1
## Equatorial Guinea                                      4   4
## Eritrea                                                1   0
## Estonia                                                0   0
## Eswatini                                               0   0
## Ethiopia                                               0   1
## Euro area                                              1   1
## Europe & Central Asia                                  0   0
## Europe & Central Asia (excluding high income)          0   0
## Europe & Central Asia (IDA & IBRD countries)           0   0
## European Union                                         1   1
## Fiji                                                   2   1
## Finland                                                1   1
## Fragile and conflict affected situations               2   3
## France                                                 1   1
## Gabon                                                  3   2
## Gambia, The                                            1   1
## Georgia                                                0   0
## Germany                                                1   1
## Ghana                                                  0   1
## Greece                                                 0   0
## Grenada                                                8   8
## Guatemala                                              0   0
## Guinea                                                 1   1
## Guinea-Bissau                                          2   1
## Guyana                                                 1   1
## Haiti                                                  4   3
## Heavily indebted poor countries (HIPC)                 0   0
## High income                                            1   1
## Honduras                                               0   2
## Hong Kong SAR, China                                   1   1
## Hungary                                                0   1
## IBRD only                                              0   0
## Iceland                                                1   1
## IDA & IBRD total                                       0   0
## IDA blend                                              0   0
## IDA only                                               0   0
## IDA total                                              0   0
## India                                                  0   1
## Indonesia                                              0   0
## Iran                                                   0   1
## Iraq                                                   2   1
## Ireland                                                1   1
## Israel                                                 1   1
## Italy                                                  0   0
## Jamaica                                                1   0
## Japan                                                  3   3
## Jordan                                                 0   0
## Kazakhstan                                             0   0
## Kenya                                                  0   0
## Kuwait                                                 0   2
## Kyrgyz Republic                                        0   0
## Lao PDR                                                1   2
## Late-demographic dividend                              0   0
## Latin America & Caribbean                              0   0
## Latin America & Caribbean (excluding high income)      0   0
## Latin America & the Caribbean (IDA & IBRD countries)   0   0
## Latvia                                                 0   1
## Least developed countries: UN classification           0   1
## Lebanon                                                3   4
## Lesotho                                                0   1
## Liberia                                                3   6
## Libya                                                  5   3
## Lithuania                                              0   0
## Low & middle income                                    0   0
## Low income                                             1   1
## Lower middle income                                    0   0
## Luxembourg                                             1   1
## Macao SAR, China                                       1   1
## Madagascar                                             2   4
## Malawi                                                 2   2
## Malaysia                                               0   0
## Maldives                                               1   1
## Mali                                                   1   3
## Malta                                                  0   1
## Marshall Islands                                       7   8
## Mauritania                                             1   1
## Mauritius                                              1   0
## Mexico                                                 0   0
## Middle East & North Africa                             0   0
## Middle East & North Africa (excluding high income)     0   0
## Middle East & North Africa (IDA & IBRD countries)      0   0
## Middle income                                          0   0
## Moldova                                                0   1
## Mongolia                                               0   0
## Montenegro                                             2   4
## Morocco                                                0   1
## Mozambique                                             0   0
## Myanmar                                                1   3
## Namibia                                                0   0
## Nepal                                                  0   1
## Netherlands                                            1   5
## New Caledonia                                          5   1
## New Zealand                                            1   0
## Nicaragua                                              1   0
## Niger                                                  0   2
## Nigeria                                                1   0
## North America                                          1   1
## Norway                                                 1   1
## OECD members                                           1   0
## Oman                                                   0   1
## Other small states                                     0   1
## Pacific island small states                            1   1
## Panama                                                 0   4
## Papua New Guinea                                       6   1
## Paraguay                                               0   0
## Peru                                                   0   0
## Philippines                                            0   0
## Poland                                                 0   1
## Portugal                                               0   1
## Post-demographic dividend                              1   0
## Pre-demographic dividend                               0   7
## Puerto Rico                                            2   1
## Qatar                                                  0   0
## Romania                                                0   2
## Russia                                                 0   2
## Rwanda                                                 3   1
## Sao Tome and Principe                                  1   2
## Saudi Arabia                                           0   0
## Senegal                                                0   3
## Serbia                                                 0   7
## Seychelles                                             6   3
## Sierra Leone                                           2   3
## Slovakia                                               1   0
## Slovenia                                               0   1
## Small states                                           0   2
## South Africa                                           2   0
## South Asia                                             0   0
## South Korea                                            0   1
## South Sudan                                            2   0
## Spain                                                  0   2
## St. Kitts and Nevis                                    8   4
## St. Vincent and the Grenadines                         2   0
## Sub-Saharan Africa                                     0   0
## Sub-Saharan Africa (excluding high income)             0   0
## Sub-Saharan Africa (IDA & IBRD countries)              0   2
## Sudan                                                  0   4
## Suriname                                               2   1
## Sweden                                                 1   1
## Switzerland                                            1   1
## Syrian Arab Republic                                   1   0
## Tajikistan                                             1   1
## Tanzania                                               1   2
## Thailand                                               0   3
## Timor-Leste                                            0   0
## Togo                                                   1   0
## Tonga                                                  2   2
## Trinidad and Tobago                                    3   1
## Tunisia                                                1   0
## Turkey                                                 0   5
## Uganda                                                 1   0
## UK                                                     1   4
## Ukraine                                                0   3
## United Arab Emirates                                   3   1
## Upper middle income                                    0   1
## Uruguay                                                0   5
## USA                                                    1   0
## Uzbekistan                                             1   0
## Vanuatu                                                2   1
## Venezuela                                              0   3
## West Bank and Gaza                                     1   0
## World                                                  0   1
## Yemen, Rep.                                            1   1
## Zambia                                                 1   3
## Zimbabwe                                               0   7

2.3. Tabla final con la selección.

Seleccionamos aquellos países que tengan el 50% de faltantes o menos y nos quedan 161 países por 16 variables.

# Whitelist of country names. It is intersected with the row names of
# dfp_faltantes further down, so rows that are not listed here (for example the
# aggregates "World" or "OECD members" shown in the table above) are dropped.
# NOTE(review): the intersection matches by exact spelling. The missing-value
# table above lists "St. Kitts and Nevis", "St. Vincent and the Grenadines",
# "Syrian Arab Republic", "West Bank and Gaza" and "Yemen, Rep.", whereas this
# vector contains "Saint Kitts and Nevis", "Saint Vincent and the Grenadines",
# "Syria", "Palestine" and "Yemen". Those rows therefore look like they are
# silently excluded from the selection -- confirm whether that is intended.
countries <- c("Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia", "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Congo, Republic of the", "Costa Rica", "Cote d'Ivoire", "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Eswatini", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Korea", "North Macedonia", "Norway", "Oman", "Pakistan", "Palau", "Palestine", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Romania", "Russia", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia", "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", 
"Somalia", "South Africa", "South Korea", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "UK", "USA", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City (Holy See)", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe")

# The 16 indicators kept for the analysis, in the column order used by the
# selected tables. Both education and industry/labour indicators are listed.
columnas <- c(
  "Employment in industry (% of total employment) (modeled ILO estimate)",
  "Employment in services (% of total employment) (modeled ILO estimate)",
  "Energy use (kg of oil equivalent per capita)",
  "External balance on goods and services (% of GDP)",
  "Government expenditure on education, total (% of GDP)",
  "Industry (including construction), value added (% of GDP)",
  "Literacy rate, adult total (% of people ages 15 and above)",
  "Manufactures exports (% of merchandise exports)",
  "Manufacturing, value added (% of GDP)",
  "School enrollment, primary (% net)",
  "School enrollment, secondary (% net)",
  "School enrollment, tertiary (% gross)",
  "Self-employed, total (% of total employment) (modeled ILO estimate)",
  "Unemployment, female (% of female labor force) (modeled ILO estimate)",
  "Unemployment, total (% of total labor force) (modeled ILO estimate)",
  "Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)"
)
# Index both decade tables by the contents of their first column so that rows
# can be picked by name.
rownames(a00) <- as.matrix(a00[, 1])
rownames(a10) <- as.matrix(a10[, 1])

# Keep the rows whose count in dfp_faltantes is between 0 and 8 (half of the 16
# indicators) for both a00 and a10, restricted to the names in `countries`.
filas <- (dfp_faltantes$a00 %in% 0:8) & (dfp_faltantes$a10 %in% 0:8)
filasA <- intersect(rownames(dfp_faltantes[filas, ]), countries)

# Final selection: the chosen rows and the indicators listed in `columnas`.
dat_00 <- a00[filasA, columnas]
dat_10 <- a10[filasA, columnas]

3. Separación datos en una tabla educación y en una tabla industrialización e imputación.

Ya que estudiarlas en conjunto resulta más complicado porque son muchas variables que analizar, decidimos separar la materia educación por un lado y la materia industrialización por otro.

3.1. Creación de las 4 tablas (dos para el 00 y dos para el 10).

Aparte de separar por materia, también separamos por década con el fin de estudiar la evolución. De esta forma nos quedan 4 tablas.

# Indicators treated as the industry / labour-market side of the analysis.
cols_industria <- c(
  "Employment in industry (% of total employment) (modeled ILO estimate)",
  "Employment in services (% of total employment) (modeled ILO estimate)",
  "Energy use (kg of oil equivalent per capita)",
  "External balance on goods and services (% of GDP)",
  "Industry (including construction), value added (% of GDP)",
  "Manufactures exports (% of merchandise exports)",
  "Manufacturing, value added (% of GDP)",
  "Self-employed, total (% of total employment) (modeled ILO estimate)",
  "Unemployment, female (% of female labor force) (modeled ILO estimate)",
  "Unemployment, total (% of total labor force) (modeled ILO estimate)",
  "Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate)"
)

# Indicators treated as the education side of the analysis.
cols_educacion <- c(
  "Government expenditure on education, total (% of GDP)",
  "Literacy rate, adult total (% of people ages 15 and above)",
  "School enrollment, primary (% net)",
  "School enrollment, secondary (% net)",
  "School enrollment, tertiary (% gross)"
)

# One industry table and one education table per decade; the same column lists
# are applied to both decades.
dat_industria00 <- subset(dat_00, select = cols_industria)
dat_educacion00 <- subset(dat_00, select = cols_educacion)

dat_industria10 <- subset(dat_10, select = cols_industria)
dat_educacion10 <- subset(dat_10, select = cols_educacion)

3.2. Imputación de ambas tablas (necesario para PCA).

Imputamos las tablas porque nos hacen falta imputadas para aplicar métodos como el PCA. En este caso, llevar a cabo la imputación no es perjudicial, ya que nos hemos encargado de filtrar todos aquellos países que puedan suponer un problema en este sentido.

# Turn the column names into syntactically valid R names; the imputed tables
# are later subset using these dotted names.
colnames(dat_00) <- make.names(colnames(dat_00))

# Impute the missing values of the 2000s table with mice, producing a single
# imputed data set (m = 1).
library(mice)
# The seed is fixed so that the imputed values, and every result derived from
# them, are the same on each run.
df_imput00 <- mice(dat_00, m = 1, seed = 123)

## 
##  iter imp variable
##   1   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Industry..including.construction...value.added....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   2   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Industry..including.construction...value.added....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   3   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Industry..including.construction...value.added....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   4   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Industry..including.construction...value.added....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   5   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Industry..including.construction...value.added....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.

# Extract the completed (imputed) data from the mice result. The name
# df_imput00 is reused, so the mice result object itself is no longer kept.
df_imput00 <- mice::complete(df_imput00)

# Turn the column names into syntactically valid R names; the imputed tables
# are later subset using these dotted names.
colnames(dat_10) <- make.names(colnames(dat_10))

# Impute the missing values of the 2010s table with mice, producing a single
# imputed data set (m = 1).
library(mice)
# The seed is fixed so that the imputed values, and every result derived from
# them, are the same on each run.
df_imput10 <- mice(dat_10, m = 1, seed = 123)

## 
##  iter imp variable
##   1   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   2   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   3   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   4   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.
##   5   1  Employment.in.industry....of.total.employment...modeled.ILO.estimate.  Employment.in.services....of.total.employment...modeled.ILO.estimate.  Energy.use..kg.of.oil.equivalent.per.capita.  External.balance.on.goods.and.services....of.GDP.  Government.expenditure.on.education..total....of.GDP.  Literacy.rate..adult.total....of.people.ages.15.and.above.  Manufactures.exports....of.merchandise.exports.  Manufacturing..value.added....of.GDP.  School.enrollment..primary....net.  School.enrollment..secondary....net.  School.enrollment..tertiary....gross.  Self.employed..total....of.total.employment...modeled.ILO.estimate.  Unemployment..female....of.female.labor.force...modeled.ILO.estimate.  Unemployment..total....of.total.labor.force...modeled.ILO.estimate.  Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate.

# Extract the completed (imputed) data from the mice result. The name
# df_imput10 is reused, so the mice result object itself is no longer kept.
df_imput10 <- mice::complete(df_imput10)

3.3. Creación de las 4 tablas (pero imputadas).

Volvemos a separar en educación e industrialización, pero esta vez con las tablas imputadas.

# Industry / labour-market indicators, spelled as they appear in the imputed
# tables (dotted names).
cols_industria_imput <- c(
  "Employment.in.industry....of.total.employment...modeled.ILO.estimate.",
  "Employment.in.services....of.total.employment...modeled.ILO.estimate.",
  "Energy.use..kg.of.oil.equivalent.per.capita.",
  "External.balance.on.goods.and.services....of.GDP.",
  "Industry..including.construction...value.added....of.GDP.",
  "Manufactures.exports....of.merchandise.exports.",
  "Manufacturing..value.added....of.GDP.",
  "Self.employed..total....of.total.employment...modeled.ILO.estimate.",
  "Unemployment..female....of.female.labor.force...modeled.ILO.estimate.",
  "Unemployment..total....of.total.labor.force...modeled.ILO.estimate.",
  "Unemployment..youth.total....of.total.labor.force.ages.15.24...modeled.ILO.estimate."
)

# Education indicators, spelled as they appear in the imputed tables.
cols_educacion_imput <- c(
  "Government.expenditure.on.education..total....of.GDP.",
  "Literacy.rate..adult.total....of.people.ages.15.and.above.",
  "School.enrollment..primary....net.",
  "School.enrollment..secondary....net.",
  "School.enrollment..tertiary....gross."
)

# Same split as before (industry vs. education, per decade), now taken from the
# imputed tables.
df_imputIndus00 <- subset(df_imput00, select = cols_industria_imput)
df_imputEduc00 <- subset(df_imput00, select = cols_educacion_imput)

df_imputIndus10 <- subset(df_imput10, select = cols_industria_imput)
df_imputEduc10 <- subset(df_imput10, select = cols_educacion_imput)

4. Métodos o algoritmos a usar.

Dado que mi objetivo es ver cómo se relaciona la industrialización de un país y la inversión en educación o escolarización, podría llevar a cabo los siguientes procedimientos: - PCA para reducir la dimensión, ver la contribución de las variables y la relación entre países, entre variables o entre países y variables. - Clustering para agrupar los países según la inversión en educación por un lado, agruparlos de nuevo según su industrialización y ver si las agrupaciones hechas coinciden en ambas materias.

5. PCA.

5.1. Estudio para educación.

Estudiaremos las variables de educación primero y sacaremos conclusiones sobre relaciones que podamos encontrar.

# Scree plots for the education PCA, one per decade. The dashed red line is
# drawn at 100 divided by the number of rows of the eigenvalue table.

# 2000s
res.pca001 <- PCA(
  df_imputEduc00,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 10
)
eig.val <- get_eigenvalue(res.pca001)
VPmedio <- 100 * (1 / nrow(eig.val))
fviz_eig(
  res.pca001,
  addlabels = TRUE,
  main = "PCA educación década 2000"
) +
  geom_hline(yintercept = VPmedio, linetype = 2, color = "red")

# 2010s (eig.val and VPmedio are overwritten)
res.pca101 <- PCA(
  df_imputEduc10,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 10
)
eig.val <- get_eigenvalue(res.pca101)
VPmedio <- 100 * (1 / nrow(eig.val))
fviz_eig(
  res.pca101,
  addlabels = TRUE,
  main = "PCA educación década 2010"
) +
  geom_hline(yintercept = VPmedio, linetype = 2, color = "red")

Seleccionamos dos componentes principales en ambas décadas, ya que explican el 84,5% y el 80,6% de la variabilidad total respectivamente.

# Refit both education PCAs keeping only the two selected components.
res.pca001 <- PCA(
  df_imputEduc00,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 2
)
res.pca101 <- PCA(
  df_imputEduc10,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 2
)

Miramos la contribución de las variables a las 2 primeras dimensiones seleccionadas:

# Contribution of each education variable to dimensions 1 and 2, per decade.
fviz_contrib(
  res.pca001,
  choice = "var",
  axes = 1:2,
  title = "Década 2000"
)

fviz_contrib(
  res.pca101,
  choice = "var",
  axes = 1:2,
  title = "Década 2010"
)

Vemos que las dos variables que más contribuyen a explicar el comportamiento de los datos en materia educación son la inversión total del gobierno en educación (porcentaje del PIB) y el porcentaje neto de la matriculación en escuela secundaria. Por tanto, les daremos más relevancia a la hora de sacar conclusiones y analizar gráficos.

A continuación representamos unos gráficos de los 70 países (casi la mitad) con mayor contribución en educación:

# Individuals plot (dimensions 1 and 2) of the education PCA, showing the 70
# individuals with the highest contribution, coloured by contribution.
fviz_pca_ind(
  res.pca001,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.ind = "contrib",
  select.ind = list(contrib = 70),
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2000"
)

fviz_pca_ind(
  res.pca101,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.ind = "contrib",
  select.ind = list(contrib = 70),
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2010"
)

Podemos ver que los países con mayor contribución a la variabilidad total de los datos son los países considerados del primer mundo, casi todos europeos. También podemos destacar que países muy cercanos entre sí, como los nórdicos por un lado y los de habla inglesa por otro lado (Canada y Australia), tienen tendencias muy similares en cuanto a materia educación (quizás una alta inversión total del gobierno en educación y alta matriculación en la escuela secundaria). Por otro lado, una gran mayoría de los países que tienen una contribución alta en las dos primeras componentes, pero de manera negativa a la primera componente principal, son tercermundistas (mayoritariamente países africanos, aunque hay algunos asiáticos). Por tanto, podemos ver una clara separación en cuanto a materia educación en ambas décadas entre países europeos, americanos y oceánicos por un lado y países africanos y asiáticos por otro lado. Finalmente, Marshall Islands y Timor-Leste se podrían considerar datos anómalos debido a su alta contribución a las dos dimensiones en el caso de Marshall Islands y a la segunda dimensión en el caso de Timor-Leste.

Gráfico de variables en materia educación:

# Variables plot (dimensions 1 and 2) of the education PCA, coloured by
# contribution, one per decade.
fviz_pca_var(
  res.pca001,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2000"
)

fviz_pca_var(
  res.pca101,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2010"
)

Podemos ver que también se da el mismo comportamiento que las Marshall Islands en la variable inversión total del gobierno en educación ("Government.expenditure.on.education..total....of.GDP."). Además, se encuentran en posiciones similares, por lo que podremos asumir que la inversión en educación en las Islas Marshall es muy alta. Y eso no varía en la siguiente década. Luego, los demás países europeos, americanos y oceánicos que tenían una alta contribución a la primera dimensión tienen tendencias similares y se agrupan en torno a las variables que también contribuyen muy positivamente a la primera dimensión, que son: matriculación en escuela secundaria, matriculación terciaria, matriculación primaria y la tasa de alfabetización. Y ese comportamiento es similar en ambas décadas, pero para ser más específicos, vemos qué países aportan más a cada una de las variables. En la década de los 2000, países nórdicos (Suecia, Finlandia y Noruega), centroeuropeos (como Francia, Bielorrusia, Lituania y Eslovenia), algunos oceánicos (como Nueva Zelanda) y alguno asiático (como Israel) tienen tendencias similares y una alta matriculación en educación secundaria. Quizás ofrezcan muchas oportunidades en ese ámbito o más presión. Seguiré con las conclusiones en los biplots, donde todo es más claro.

Gráficos de biplots de los 70 países con mayor contribución:

# Biplots (dimensions 1 and 2) of the education PCA. For each decade there is
# one plot labelling the individuals and one labelling the variables.
# NOTE(review): the heading announces the 70 top-contributing countries, but
# the two plots that label the variables select 60 -- confirm which is intended.

# 2000s, individuals labelled
fviz_pca_biplot(
  res.pca001,
  axes = c(1, 2),
  select.ind = list(contrib = 70),
  labelsize = 3,
  label = "ind",
  repel = TRUE,
  title = "Década 2000"
)

# 2000s, variables labelled
fviz_pca_biplot(
  res.pca001,
  axes = c(1, 2),
  select.ind = list(contrib = 60),
  labelsize = 3,
  label = "var",
  repel = TRUE,
  title = "Década 2000"
)

# 2010s, individuals labelled
fviz_pca_biplot(
  res.pca101,
  axes = c(1, 2),
  select.ind = list(contrib = 70),
  labelsize = 3,
  label = "ind",
  repel = TRUE,
  title = "Década 2010"
)

# 2010s, variables labelled
fviz_pca_biplot(
  res.pca101,
  axes = c(1, 2),
  select.ind = list(contrib = 60),
  labelsize = 3,
  label = "var",
  repel = TRUE,
  title = "Década 2010"
)

Vemos que, en general, en ambas décadas se observa que los países africanos y algunos asiáticos (como Nepal) tienen una baja tasa de matriculación ya sea en secundaria, primaria u otro. También se podría confirmar para algunos, como Sudan en la primera década o el Congo en la segunda década, que el gobierno tiene una baja inversión en educación. Luego, los países con alta matriculación y alta alfabetización son en general países más desarrollados europeos, norteamericanos y oceánicos. Entre los únicos que encontramos que no cumplen con ser de estas zonas serían Argentina, Israel, Corea del Sur y Japón. Por tanto, para esta parte podemos concluir que en ambas décadas, las tasas de matriculación y de alfabetización están correlacionadas entre sí, mientras que parece que no lo están con el gasto en educación del gobierno. Además de que los países desarrollados parecen tener altas tasas de matriculación y alfabetización, cosa que no se ve reflejada en los países subdesarrollados, ya que están correlacionados negativamente con esas variables. Un aspecto a destacar sería el aumento de la inversión en educación de Cuba en la década del 2010, esto se puede corroborar con fuentes de datos externas a la nuestra: https://www.indexmundi.com/es/datos/indicadores/SE.XPD.TOTL.GD.ZS/rankings

5.2. Estudio para industrialización.

Aplicamos el estudio anterior, pero para el ámbito de la industria con el fin de averiguar si hay una alta industrialización en los países con más inversión en educación.

# Scree plots for the industry PCA, one per decade. The dashed red line is
# drawn at 100 divided by the number of rows of the eigenvalue table.

# 2000s
res.pca002 <- PCA(
  df_imputIndus00,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 10
)
eig.val <- get_eigenvalue(res.pca002)
VPmedio <- 100 * (1 / nrow(eig.val))
fviz_eig(
  res.pca002,
  addlabels = TRUE,
  main = "PCA industrialización década 2000"
) +
  geom_hline(yintercept = VPmedio, linetype = 2, color = "red")

# 2010s (eig.val and VPmedio are overwritten)
res.pca102 <- PCA(
  df_imputIndus10,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 10
)
eig.val <- get_eigenvalue(res.pca102)
VPmedio <- 100 * (1 / nrow(eig.val))
fviz_eig(
  res.pca102,
  addlabels = TRUE,
  main = "PCA industrialización década 2010"
) +
  geom_hline(yintercept = VPmedio, linetype = 2, color = "red")

Para ambas décadas seleccionamos 4 componentes principales, ya que cumplen la regla de tener un porcentaje de la variabilidad explicada alto, más de un 80%.

# Refit both industry PCAs keeping only the four selected components.
res.pca002 <- PCA(
  df_imputIndus00,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 4
)
res.pca102 <- PCA(
  df_imputIndus10,
  scale.unit = TRUE,
  graph = FALSE,
  ncp = 4
)

Miramos la contribución de las variables a las primeras dimensiones:

# Contribution of each industry variable to dimensions 1 and 2, per decade.
fviz_contrib(
  res.pca002,
  choice = "var",
  axes = 1:2,
  title = "Década 2000"
)

fviz_contrib(
  res.pca102,
  choice = "var",
  axes = 1:2,
  title = "Década 2010"
)

Las tasas de desempleo son las variables con mayor contribución principalmente.

Miramos también la contribución de algunas observaciones a las primeras dimensiones:

# Contribution of the individuals to dimensions 1 and 2 of the industry PCA,
# limited to the top 50, per decade.
fviz_contrib(
  res.pca002,
  choice = "ind",
  top = 50,
  axes = 1:2,
  title = "Década 2000"
)

fviz_contrib(
  res.pca102,
  choice = "ind",
  top = 50,
  axes = 1:2,
  title = "Década 2010"
)

Países como Qatar y Djibouti contribuyen mucho a las dos primeras dimensiones de esta segunda parte del análisis más enfocada a la industrialización de los países. Quizás se encuentren entre los muy industrializados.

Miramos la contribución de 70 países principales en materia industria de manera más clara:

# Individuals plot (dimensions 1 and 2) of the industry PCA, showing the 70
# individuals with the highest contribution, coloured by contribution.
fviz_pca_ind(
  res.pca002,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.ind = "contrib",
  select.ind = list(contrib = 70),
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2000"
)

fviz_pca_ind(
  res.pca102,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.ind = "contrib",
  select.ind = list(contrib = 70),
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2010"
)

Países reconocidos como adinerados o del primer mundo tienen tendencias similares entre sí en cuanto a materia industria, observamos tres agrupaciones principalmente: - Luxemburgo, Suiza, Irlanda, Islandia, USA, Países Bajos, Suecia, Malasia y Japón. - Emiratos Árabes Unidos, Kuwait y Bahrain (Qatar también aproximadamente, pero es más dato atípico). - Canadá, Bélgica, Alemania, Bielorrusia, Finlandia y Trinidad y Tobago. Por otro lado, tenemos ciertos países algo desarrollados, pero no tan adinerados con las siguientes agrupaciones: - España, Bulgaria, Botswana, Polonia, Jordania, Seychelles, Croacia, Sudáfrica, Eslovaquia, Libia, Colombia y Namibia. - Algeria, Eswatini, Montenegro, Bosnia and Herzegovina y Djibouti. Como aprendizaje importante podríamos sacar que el nivel de industrialización de España no está a la misma altura que el de los países más adinerados de Europa. No obstante, en la siguiente década, las anteriores agrupaciones observadas se modifican un poco y España comienza a asemejarse más a países como Irlanda.

Gráficos de variables en materia industria:

# Variables plot (dimensions 1 and 2) of the industry PCA, coloured by
# contribution, one per decade.
fviz_pca_var(
  res.pca002,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2000"
)

fviz_pca_var(
  res.pca102,
  axes = c(1, 2),
  geom = c("point", "text"),
  repel = TRUE,
  labelsize = 2,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  title = "Década 2010"
)

En la década del 2000, vemos una correlación importante entre las distintas tasas de desempleo por un lado y, por otro lado, entre el porcentaje de exportaciones, la contribución del sector de la industria, manufactura y construcción a un país y la tasa de empleo en industria y en el sector de los servicios. Además, variables como el saldo de transacciones comerciales y la cantidad de energía usada están fuertemente correlacionadas entre sí, aunque también lo están con las anteriores.

A continuación, vemos unos gráficos con los indicadores y los países juntos:

# Biplots on dimensions 1-2. For each decade the plot is drawn twice: once
# labelling individuals (select.ind contrib = 70) and once labelling variables
# (select.ind contrib = 50).

# 2000s, individual labels.
fviz_pca_biplot(
  res.pca002,
  axes = c(1, 2),
  label = "ind",
  select.ind = list(contrib = 70),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2000"
)

# 2000s, variable labels.
fviz_pca_biplot(
  res.pca002,
  axes = c(1, 2),
  label = "var",
  select.ind = list(contrib = 50),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2000"
)

# 2010s, individual labels.
fviz_pca_biplot(
  res.pca102,
  axes = c(1, 2),
  label = "ind",
  select.ind = list(contrib = 70),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2010"
)

# 2010s, variable labels.
fviz_pca_biplot(
  res.pca102,
  axes = c(1, 2),
  label = "var",
  select.ind = list(contrib = 50),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2010"
)

Podríamos clasificar como países de industrialización media a aquellos con alta tasa de empleabilidad en servicios e industria, con alto porcentaje de exportaciones y con una muy buena contribución del sector de la industria, manufactura y construcción al país. Es decir: Italia, Finlandia, Trinidad y Tobago y Bielorrusia, entre otros. Luego, los países altamente industrializados serían aquellos con alta contribución a los indicadores saldo de transacciones comerciales y cantidad de energía usada, que serían: Bélgica, Alemania, Japón, Canadá, Suecia, Malasia, Austria, Emiratos Árabes Unidos, Estados Unidos y más. A continuación, tenemos algunos países con alta tasa de desempleo, como España, Jordania, Croacia, Colombia, Montenegro y otros más. Por último, los países con alta autonomía y baja industrialización son en su mayoría africanos, aunque también se incluyen algunos asiáticos. Repetimos los gráficos anteriores, pero para la tercera y cuarta dimensión.

# Biplots on dimensions 3-4. For each decade the plot is drawn twice: once
# labelling individuals (select.ind contrib = 70) and once labelling variables
# (select.ind contrib = 50).

# 2000s, individual labels.
fviz_pca_biplot(
  res.pca002,
  axes = c(3, 4),
  label = "ind",
  select.ind = list(contrib = 70),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2000"
)

# 2000s, variable labels.
fviz_pca_biplot(
  res.pca002,
  axes = c(3, 4),
  label = "var",
  select.ind = list(contrib = 50),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2000"
)

# 2010s, individual labels.
fviz_pca_biplot(
  res.pca102,
  axes = c(3, 4),
  label = "ind",
  select.ind = list(contrib = 70),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2010"
)

# 2010s, variable labels.
fviz_pca_biplot(
  res.pca102,
  axes = c(3, 4),
  label = "var",
  select.ind = list(contrib = 50),
  labelsize = 3,
  repel = TRUE,
  title = "Década 2010"
)

No vemos gran cambio en la claridad de la visualización de los gráficos. No obstante, en este caso podemos destacar la muy alta exportación de bienes de los países asiáticos como China, Tailandia y Corea del Sur.

6. Clustering.

Como en nuestro objetivo nos interesa ver si hay la misma separación entre los países tanto en industria como en educación, optamos por hacer un clustering. Este último nos ayudará a salir de dudas con países con resultados confusos como España, la cual no está muy claro si entra entre los países más industrializados o no.

6.1. Estudio para educación.

Debido a la diferencia de unidades y magnitudes, escalamos y centramos.

# Centre and scale every column of the imputed education data for each decade
# (scale() centres and scales by default).
df_Educ00 <- scale(df_imputEduc00)
df_Educ10 <- scale(df_imputEduc10)

Dado que nos interesa estudiar la similitud de inversión en educación entre los distintos países, usaremos la medida de correlación de Pearson.

# Pearson-correlation-based distance between the rows of the scaled education
# data, shown as a heatmap, for the 2000s...
midistEduc00 <- get_dist(df_Educ00, method = "pearson", stand = FALSE)
fviz_dist(
  midistEduc00,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
  show_labels = TRUE,
  lab_size = 0.3
)

# ...and for the 2010s.
midistEduc10 <- get_dist(df_Educ10, method = "pearson", stand = FALSE)
fviz_dist(
  midistEduc10,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
  show_labels = TRUE,
  lab_size = 0.3
)

Tenemos dudas sobre la presencia de agrupaciones claras en la década 00, estudiaremos el estadístico de Hopkins.

# Hopkins statistic of the scaled 2000s education data, evaluated for several
# sample sizes and seeds so that its spread can be summarised.
set.seed(100)
myN <- c(20, 35, 50, 80, 120)  # sample sizes passed to get_clust_tendency() as `n`
myseed <- sample(1:1000, 10)   # ten seeds, reproducible thanks to set.seed(100)

# One statistic per (n, seed) pair, in the same order as a nested loop with
# `n` outside and `seed` inside. Built functionally with a typed result
# instead of growing a vector with c() on every iteration.
myhopkins <- unlist(lapply(myN, function(n_muestra) {
  vapply(myseed, function(semilla) {
    get_clust_tendency(
      data = df_Educ00, n = n_muestra, graph = FALSE, seed = semilla
    )$hopkins_stat
  }, numeric(1))
}))
summary(myhopkins)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7233  0.7776  0.7856  0.7881  0.8011  0.8293

Como el valor oscila entre 0.72 y 0.83, siendo este muy alto (cercano a 1), asumimos la presencia de agrupaciones. Usaremos k-medoides porque es menos sensible al ruido, a los atípicos y a los faltantes (aunque ya no tenemos tras la imputación).

# Number-of-clusters diagnostics for PAM on the scaled education data:
# silhouette (p1) and within-cluster sum of squares (p2), side by side.

# 2000s
p1 <- fviz_nbclust(
  x = df_Educ00,
  FUNcluster = pam,
  method = "silhouette",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 00")
p2 <- fviz_nbclust(
  x = df_Educ00,
  FUNcluster = pam,
  method = "wss",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 00")
grid.arrange(p1, p2, nrow = 1)

# 2010s
p1 <- fviz_nbclust(
  x = df_Educ10,
  FUNcluster = pam,
  method = "silhouette",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 10")
p2 <- fviz_nbclust(
  x = df_Educ10,
  FUNcluster = pam,
  method = "wss",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 10")
grid.arrange(p1, p2, nrow = 1)

Como en el 2 aún tiene una suma de cuadrados intra-cluster demasiado alta, seleccionaremos el siguiente que nos parezca más óptimo: 5 clusters para ambas décadas, porque en ambos casos pasan sucesos demasiado similares.

# PAM (k-medoids) with 5 clusters on the scaled 2000s education matrix.
# The data matrix itself is passed, not a precomputed distance object.
clust1 <- pam(x = df_Educ00, k = 5)
# Number of countries per cluster.
table(clust1$clustering)

## 
##  1  2  3  4  5 
## 25 48 15 34 39

# PAM (k-medoids) with 5 clusters on the scaled 2010s education matrix.
# The data matrix itself is passed, not a precomputed distance object.
clust2 <- pam(x = df_Educ10, k = 5)
# Number of countries per cluster.
table(clust2$clustering)

## 
##  1  2  3  4  5 
## 30 51 12 49 19

Observamos algunos grupos bastante equilibrados entre sí. Veamos como se agrupan en las dos primeras dimensiones del PCA.

# Education clusters of each decade drawn as convex hulls over the points,
# side by side (2000s left, 2010s right).
# clust1/clust2 are produced by pam() on the scaled data matrix, which uses
# pam's default Euclidean metric (the Pearson distance objects are not passed
# to pam), so the subtitle names the distance actually used.
p1 <- fviz_cluster(
  object = list(data = df_Educ00, cluster = clust1$clustering),
  stand = FALSE,
  ellipse.type = "convex",
  geom = "point",
  show.clust.cent = FALSE,
  labelsize = 8
) +
  labs(
    title = "K-MEDOIDES + Proyeccion PCA 00",
    subtitle = "Dist euclidea, K=5"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
p2 <- fviz_cluster(
  object = list(data = df_Educ10, cluster = clust2$clustering),
  stand = FALSE,
  ellipse.type = "convex",
  geom = "point",
  show.clust.cent = FALSE,
  labelsize = 8
) +
  labs(
    title = "K-MEDOIDES + Proyeccion PCA 10",
    subtitle = "Dist euclidea, K=5"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, nrow = 1)

Podemos ver que con las dos primeras dimensiones y Pearson se separan perfectamente los países.

# Cluster assignments of each decade as factors (used to colour PCA plots).
# The element is spelled out in full: `$cluster` only resolved to
# `$clustering` through partial matching.
misclust1 <- factor(clust1$clustering)
misclust2 <- factor(clust2$clustering)

Los representamos gráficamente en las dos primeras dimensiones principales.

# For each decade: individuals of the education PCA coloured by cluster (p1)
# next to the default variables plot of the same PCA (p2).

# 2000s
p1 <- fviz_pca_ind(
  res.pca001,
  habillage = misclust1,
  palette = rainbow(5),
  geom = "point",
  label = "all",
  addEllipses = FALSE,
  title = "Década 2000"
)
p2 <- fviz_pca_var(res.pca001)
grid.arrange(p1, p2, nrow = 1)

# 2010s
p1 <- fviz_pca_ind(
  res.pca101,
  habillage = misclust2,
  palette = rainbow(5),
  geom = "point",
  addEllipses = FALSE,
  title = "Década 2010"
)
p2 <- fviz_pca_var(res.pca101)
grid.arrange(p1, p2, nrow = 1)

Estudiamos esa ubicación de grupos y añadimos etiquetas adecuadas:

# Replace the numeric cluster ids (1-5) of each decade with descriptive
# labels. The ids index the label vector by position, so entry i is the label
# of cluster i.

# 2000s
df_clust1 <- data.frame(clust1$clustering)
df_clust1$clust1.clustering <- c(
  "Alfabetización y escolarización muy baja",  # cluster 1
  "Alfabetización y escolarización alta",      # cluster 2
  "Alfabetización y escolarización baja",      # cluster 3
  "Alfabetización y escolarización muy alta",  # cluster 4
  "Alto gasto en educación"                    # cluster 5
)[df_clust1$clust1.clustering]

# 2010s (the cluster numbering differs from the 2000s)
df_clust2 <- data.frame(clust2$clustering)
df_clust2$clust2.clustering <- c(
  "Alfabetización y escolarización muy baja",  # cluster 1
  "Alfabetización y escolarización alta",      # cluster 2
  "Alto gasto en educación",                   # cluster 3
  "Alfabetización y escolarización baja",      # cluster 4
  "Alfabetización y escolarización muy alta"   # cluster 5
)[df_clust2$clust2.clustering]

Representamos los distintos niveles en mapas:

library(maps)
library(ggplot2)
library(purrr)
library(dplyr)
# map_data("world") is evaluated once here and its result is discarded.
invisible(map_data("world"))

# Lookup table: country name (row names of df_clust1) -> education cluster
# label for the 2000s, keyed by a column called "region".
df_clusters1 <- data.frame(
  region = rownames(df_clust1),
  cluster = df_clust1$clust1.clustering
)
# Attach the cluster label to the world map polygons. No `by` is given, so
# left_join() uses the shared column and reports it in a message.
map_clusters1 <- left_join(map_data("world"), df_clusters1)

## Joining with `by = join_by(region)`

# World map filled by the 2000s education cluster label of each region.
ggplot() +
  geom_map(
    data = map_clusters1,
    map = map_clusters1,
    aes(x = long, y = lat, map_id = region, fill = factor(cluster)),
    color = "white"
  ) +
  # geom_map() reports x and y as unknown aesthetics (see the warning it
  # emits); the plotting extent is supplied here instead.
  expand_limits(x = map_clusters1$long, y = map_clusters1$lat) +
  theme_void() +
  scale_fill_discrete(name = "Cluster")

## Warning in geom_map(data = map_clusters1, map = map_clusters1, aes(x = long, :
## Ignoring unknown aesthetics: x and y

# Lookup table: country name (row names of df_clust2) -> education cluster
# label for the 2010s, keyed by a column called "region".
df_clusters2 <- data.frame(
  region = rownames(df_clust2),
  cluster = df_clust2$clust2.clustering
)
# Attach the cluster label to the world map polygons. No `by` is given, so
# left_join() uses the shared column and reports it in a message.
map_clusters2 <- left_join(map_data("world"), df_clusters2)

## Joining with `by = join_by(region)`

# World map filled by the 2010s education cluster label of each region.
ggplot() +
  geom_map(
    data = map_clusters2,
    map = map_clusters2,
    aes(x = long, y = lat, map_id = region, fill = factor(cluster)),
    color = "white"
  ) +
  # geom_map() reports x and y as unknown aesthetics (see the warning it
  # emits); the plotting extent is supplied here instead.
  expand_limits(x = map_clusters2$long, y = map_clusters2$lat) +
  theme_void() +
  scale_fill_discrete(name = "Cluster")

## Warning in geom_map(data = map_clusters2, map = map_clusters2, aes(x = long, :
## Ignoring unknown aesthetics: x and y

Así a simple vista vemos que los de escolarización y alfabetización muy alta son los países anteriormente mencionados y clasificados como adinerados o industrializados en un principio. Aunque en este caso se incluye a España, porque a pesar de tener alta tasa de desempleo, en este caso tiene alta escolarización.

6.2. Estudio para industrialización.

Repetimos exactamente el mismo estudio, pero para las variables de industria.

# Centre and scale every column of the imputed industry data for each decade
# (scale() centres and scales by default).
df_Indus00 <- scale(df_imputIndus00)
df_Indus10 <- scale(df_imputIndus10)

Dado que nos interesa estudiar la similitud de tendencias en industrialización entre los distintos países, usaremos la medida de correlación de Pearson.

# Pearson-correlation-based distance between the rows of the scaled industry
# data, shown as a heatmap, for the 2000s...
midistIndus00 <- get_dist(df_Indus00, method = "pearson", stand = FALSE)
fviz_dist(
  midistIndus00,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
  show_labels = TRUE,
  lab_size = 0.3
)

# ...and for the 2010s.
midistIndus10 <- get_dist(df_Indus10, method = "pearson", stand = FALSE)
fviz_dist(
  midistIndus10,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"),
  show_labels = TRUE,
  lab_size = 0.3
)

Usaremos k-medoides porque es menos sensible al ruido, a los atípicos y a los faltantes (aunque ya no tenemos tras la imputación).

# Number-of-clusters diagnostics for PAM on the scaled industry data:
# silhouette (p1) and within-cluster sum of squares (p2), side by side.

# 2000s
p1 <- fviz_nbclust(
  x = df_Indus00,
  FUNcluster = pam,
  method = "silhouette",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 00")
p2 <- fviz_nbclust(
  x = df_Indus00,
  FUNcluster = pam,
  method = "wss",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 00")
grid.arrange(p1, p2, nrow = 1)

# 2010s
p1 <- fviz_nbclust(
  x = df_Indus10,
  FUNcluster = pam,
  method = "silhouette",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 10")
p2 <- fviz_nbclust(
  x = df_Indus10,
  FUNcluster = pam,
  method = "wss",
  k.max = 10,
  verbose = FALSE
) + labs(title = "Numero optimo de clusters 10")
grid.arrange(p1, p2, nrow = 1)

Como con 2 clusters aún se tiene una suma de cuadrados intra-cluster demasiado alta, seleccionaremos el siguiente valor que nos parezca más óptimo: 5 clusters para ambas décadas.

# PAM (k-medoids) with 5 clusters on the scaled 2000s industry matrix.
# The data matrix itself is passed, not a precomputed distance object.
clust3 <- pam(x = df_Indus00, k = 5)
# Number of countries per cluster.
table(clust3$clustering)

## 
##  1  2  3  4  5 
## 37 50 14 34 26

# PAM (k-medoids) with 5 clusters on the scaled 2010s industry matrix.
# The data matrix itself is passed, not a precomputed distance object.
clust4 <- pam(x = df_Indus10, k = 5)
# Number of countries per cluster.
table(clust4$clustering)

## 
##  1  2  3  4  5 
## 39 21 63 30  8

Observamos grupos bastante equilibrados entre sí en la década 2000; en la década 2010 los tamaños son más dispares (de 8 a 63 países). Veamos cómo se agrupan en las dos primeras dimensiones del PCA.

# Industry clusters of each decade drawn as convex hulls over the points,
# side by side (2000s left, 2010s right).
# clust3/clust4 are produced by pam() on the scaled data matrix, which uses
# pam's default Euclidean metric (the Pearson distance objects are not passed
# to pam), so the subtitle names the distance actually used.
p1 <- fviz_cluster(
  object = list(data = df_Indus00, cluster = clust3$clustering),
  stand = FALSE,
  ellipse.type = "convex",
  geom = "point",
  show.clust.cent = FALSE,
  labelsize = 8
) +
  labs(
    title = "K-MEDOIDES + Proyeccion PCA 00",
    subtitle = "Dist euclidea, K=5"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
p2 <- fviz_cluster(
  object = list(data = df_Indus10, cluster = clust4$clustering),
  stand = FALSE,
  ellipse.type = "convex",
  geom = "point",
  show.clust.cent = FALSE,
  labelsize = 8
) +
  labs(
    title = "K-MEDOIDES + Proyeccion PCA 10",
    subtitle = "Dist euclidea, K=5"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, nrow = 1)

Varios de los grupos presentes se solapan en ambas décadas. Veamos si hay separación en las dos siguientes dimensiones principales entre esos grupos concretos que se solapan.

# Same industry cluster plots, but drawn on axes 3 and 4 to check whether the
# groups that overlap on the first two axes separate there.
# clust3/clust4 are produced by pam() on the scaled data matrix, which uses
# pam's default Euclidean metric (the Pearson distance objects are not passed
# to pam), so the subtitle names the distance actually used.
p1 <- fviz_cluster(
  object = list(data = df_Indus00, cluster = clust3$clustering),
  stand = FALSE,
  ellipse.type = "convex",
  geom = "point",
  show.clust.cent = FALSE,
  labelsize = 8,
  axes = 3:4
) +
  labs(
    title = "K-MEDOIDES + Proyeccion PCA 00",
    subtitle = "Dist euclidea, K=5"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
p2 <- fviz_cluster(
  object = list(data = df_Indus10, cluster = clust4$clustering),
  stand = FALSE,
  ellipse.type = "convex",
  geom = "point",
  show.clust.cent = FALSE,
  labelsize = 8,
  axes = 3:4
) +
  labs(
    title = "K-MEDOIDES + Proyeccion PCA 10",
    subtitle = "Dist euclidea, K=5"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
grid.arrange(p1, p2, nrow = 1)

En las componentes principales 3 y 4, esos grupos que antes se solapaban ahora sí que se distinguen y separan notablemente mejor. Guardamos los clusters formados en las variables:

# Cluster assignments of each decade as factors (used to colour PCA plots).
# The element is spelled out in full: `$cluster` only resolved to
# `$clustering` through partial matching.
misclust3 <- factor(clust3$clustering)
misclust4 <- factor(clust4$clustering)

Los representamos gráficamente en las dos primeras dimensiones principales para asignar niveles:

# For each decade: individuals of the industry PCA coloured by cluster (p1)
# next to the default variables plot of the same PCA (p2).

# 2000s
p1 <- fviz_pca_ind(
  res.pca002,
  habillage = misclust3,
  palette = rainbow(5),
  geom = "point",
  addEllipses = FALSE,
  title = "Década 2000"
)
p2 <- fviz_pca_var(res.pca002)
grid.arrange(p1, p2, nrow = 1)

# 2010s
p1 <- fviz_pca_ind(
  res.pca102,
  habillage = misclust4,
  palette = rainbow(5),
  geom = "point",
  addEllipses = FALSE,
  title = "Década 2010"
)
p2 <- fviz_pca_var(res.pca102)
grid.arrange(p1, p2, nrow = 1)

Añadimos etiquetas según la ubicación de los distintos clusters:

# Replace the numeric cluster ids (1-5) of each decade with descriptive
# labels. The ids index the label vector by position, so entry i is the label
# of cluster i.

# 2000s
df_clust3 <- data.frame(clust3$clustering)
df_clust3$clust3.clustering <- c(
  "Ruralizados",                          # cluster 1
  "Industrializados con alto desempleo",  # cluster 2
  "Muy industrializados",                 # cluster 3
  "Poco industrializados",                # cluster 4
  "Industrializados"                      # cluster 5
)[df_clust3$clust3.clustering]

# 2010s (the cluster numbering differs from the 2000s)
df_clust4 <- data.frame(clust4$clustering)
df_clust4$clust4.clustering <- c(
  "Ruralizados",                          # cluster 1
  "Industrializados con alto desempleo",  # cluster 2
  "Poco industrializados",                # cluster 3
  "Industrializados",                     # cluster 4
  "Muy industrializados"                  # cluster 5
)[df_clust4$clust4.clustering]

Por último, vemos la representación en el mapa:

library(maps)
library(ggplot2)
library(purrr)
# map_data("world") is evaluated once here and its result is discarded.
invisible(map_data("world"))

# Lookup table: country name (row names of df_clust3) -> industry cluster
# label for the 2000s, keyed by a column called "region".
df_clusters3 <- data.frame(
  region = rownames(df_clust3),
  cluster = df_clust3$clust3.clustering
)
# Attach the cluster label to the world map polygons. No `by` is given, so
# left_join() uses the shared column and reports it in a message.
map_clusters3 <- left_join(map_data("world"), df_clusters3)

## Joining with `by = join_by(region)`

# World map filled by the 2000s industry cluster label of each region.
ggplot() +
  geom_map(
    data = map_clusters3,
    map = map_clusters3,
    aes(x = long, y = lat, map_id = region, fill = factor(cluster)),
    color = "white"
  ) +
  # geom_map() reports x and y as unknown aesthetics (see the warning it
  # emits); the plotting extent is supplied here instead.
  expand_limits(x = map_clusters3$long, y = map_clusters3$lat) +
  theme_void() +
  scale_fill_discrete(name = "Cluster")

## Warning in geom_map(data = map_clusters3, map = map_clusters3, aes(x = long, :
## Ignoring unknown aesthetics: x and y

# Lookup table: country name (row names of df_clust4) -> industry cluster
# label for the 2010s, keyed by a column called "region".
df_clusters4 <- data.frame(
  region = rownames(df_clust4),
  cluster = df_clust4$clust4.clustering
)
# Attach the cluster label to the world map polygons. No `by` is given, so
# left_join() uses the shared column and reports it in a message.
map_clusters4 <- left_join(map_data("world"), df_clusters4)

## Joining with `by = join_by(region)`

# World map filled by the 2010s industry cluster label of each region.
ggplot() +
  geom_map(
    data = map_clusters4,
    map = map_clusters4,
    aes(x = long, y = lat, map_id = region, fill = factor(cluster)),
    color = "white"
  ) +
  # geom_map() reports x and y as unknown aesthetics (see the warning it
  # emits); the plotting extent is supplied here instead.
  expand_limits(x = map_clusters4$long, y = map_clusters4$lat) +
  theme_void() +
  scale_fill_discrete(name = "Cluster")

## Warning in geom_map(data = map_clusters4, map = map_clusters4, aes(x = long, :
## Ignoring unknown aesthetics: x and y

En este caso observamos algo más de variabilidad con respecto a la educación. Esto se debe a que los países con alta tasa de desempleo se separan en un grupo aparte, pero no por eso dejan de considerarse industrializados.

7. Conclusión.

Finalmente, a pesar de que la hipótesis parece cumplirse y las separaciones parecen claras, algunos países industrializados son clasificados como de muy baja alfabetización y escolarización. Por eso decidimos confirmar la hipótesis solo parcialmente: se confirma para la mayoría de casos, pero hay excepciones, como Gabón, que es un país supuestamente industrializado con muy baja escolarización y tasa de matriculación.