
Descripción de la data set
Se entrega el siguiente set de datos de los países del mundo,IndicadoresMundiales.csv en los cuáles observamos los indicadores más importantes del milenio y de comparación entre países en los cuáles queremos establecer grupos o bloques mundiales. Las estrategias de segmentación de países son muy importantes pues nos permiten tomar acciones por ejemplo del Banco Mundial respecto a asistencia financiera y técnica para los llamados países en desarrollo en vías de desarrollo.
Principal objetivo
El principal objetivo de esta problemática es tratar de encontrar grupos, segmentos o bloques mundiales de países para de cara al desarrollo de las regiones poder tomar acciones e inversiones focalizadas.
Las variables que se disponibilizan son:
# cargamos la data
data = read.csv("IndicadoresMundiales.csv",sep = ",")
#observamos la data set
head(data)
## country region tfr contraception educationMale educationFemale
## 1 Afghanistan Asia 6.90 NA NA NA
## 2 Albania Europe 2.60 NA NA NA
## 3 Algeria Africa 3.81 52 11.1 9.9
## 4 American.Samoa Asia NA NA NA NA
## 5 Andorra Europe NA NA NA NA
## 6 Angola Africa 6.69 NA NA NA
## lifeMale lifeFemale infantMortality GDPperCapita economicActivityMale
## 1 45.0 46.0 154 2848 87.5
## 2 68.0 74.0 32 863 NA
## 3 67.5 70.3 44 1531 76.4
## 4 68.0 73.0 11 NA 58.8
## 5 NA NA NA NA NA
## 6 44.9 48.1 124 355 NA
## economicActivityFemale illiteracyMale illiteracyFemale
## 1 7.2 52.800 85.00
## 2 NA NA NA
## 3 7.8 26.100 51.00
## 4 42.4 0.264 0.36
## 5 NA NA NA
## 6 NA NA NA
# Detección del número de valores perdidos en cada una de las columnas que presenta la tabla
colSums(is.na(data))
## country region tfr
## 0 0 10
## contraception educationMale educationFemale
## 63 131 131
## lifeMale lifeFemale infantMortality
## 11 11 6
## GDPperCapita economicActivityMale economicActivityFemale
## 10 42 42
## illiteracyMale illiteracyFemale
## 47 47
library(visdat) # visualización de datos
# Visualización gráfica de proporción de datos perdidos y donde se producen.
vis_dat(data)
#Determinación del porcentaje de datos perdidos.
vis_miss(data ,sort_miss = TRUE)
#Determinación del porcentaje de datos perdidos.
library(DataExplorer)
plot_missing(data)
#Reemplazamos con la mediana (numérico), moda (no numérico)
library(DMwR2)
data_imp_mtc = centralImputation(data)
dim(data_imp_mtc)
## [1] 207 14
# observamos la data imputada
vis_miss(data_imp_mtc)
## Compracion histogramas originales - imputadas
par(mfrow = c(1, 2))
hist(data$tfr, main = 'original')
hist(data_imp_mtc$tfr, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$contraception, main = 'original')
hist(data_imp_mtc$contraception, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$educationMale, main = 'original')
hist(data_imp_mtc$educationMale, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$educationFemale, main = 'original')
hist(data_imp_mtc$educationFemale, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$lifeMale, main = 'original')
hist(data_imp_mtc$lifeMale, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$lifeFemale, main = 'original')
hist(data_imp_mtc$lifeFemale, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$infantMortality, main = 'original')
hist(data_imp_mtc$infantMortality, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$GDPperCapita, main = 'original')
hist(data_imp_mtc$GDPperCapita, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$economicActivityMale, main = 'original')
hist(data_imp_mtc$economicActivityMale, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$economicActivityFemale, main = 'original')
hist(data_imp_mtc$economicActivityFemale, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$illiteracyMale, main = 'original')
hist(data_imp_mtc$illiteracyMale, main = 'imputada')
par(mfrow = c(1, 2))
hist(data$illiteracyFemale, main = 'original')
hist(data_imp_mtc$illiteracyFemale, main = 'imputada')
# Estandarizamos las variables numericas
data_std = scale(data_imp_mtc[, 3:14])
head(data_std)
## tfr contraception educationMale educationFemale lifeMale
## [1,] 1.9711750 0.1252804 -0.03199859 0.06541694 -1.9675673
## [2,] -0.5266438 0.1252804 -0.03199859 0.06541694 0.4423496
## [3,] 0.1762308 0.3775474 -0.11343762 -0.76606682 0.3899601
## [4,] -0.2536264 0.1252804 -0.03199859 0.06541694 0.4423496
## [5,] -0.2536264 0.1252804 -0.03199859 0.06541694 0.2851811
## [6,] 1.8491885 0.1252804 -0.03199859 0.06541694 -1.9780452
## lifeFemale infantMortality GDPperCapita economicActivityMale
## [1,] -2.0965491 2.89933362 -0.3484326 1.59916549
## [2,] 0.5019174 -0.28981971 -0.5647464 0.03943514
## [3,] 0.1585486 0.02386751 -0.4919516 -0.01887253
## [4,] 0.4091150 -0.83877233 -0.4649260 -2.58441031
## [5,] 0.3302330 -0.34210091 -0.4649260 0.03943514
## [6,] -1.9016641 2.11511559 -0.6201053 0.03943514
## economicActivityFemale illiteracyMale illiteracyFemale
## [1,] -2.65901952 2.3123842 2.496306
## [2,] 0.08523922 -0.3269057 -0.255871
## [3,] -2.61905458 0.6333906 1.054488
## [4,] -0.31441011 -0.9912715 -1.092974
## [5,] 0.08523922 -0.3269057 -0.255871
## [6,] 0.08523922 -0.3269057 -0.255871
# Llamar a la función de combinación
data_final = cbind(data_std,data_imp_mtc[,1:2])
head(data_final)
## tfr contraception educationMale educationFemale lifeMale lifeFemale
## 1 1.9711750 0.1252804 -0.03199859 0.06541694 -1.9675673 -2.0965491
## 2 -0.5266438 0.1252804 -0.03199859 0.06541694 0.4423496 0.5019174
## 3 0.1762308 0.3775474 -0.11343762 -0.76606682 0.3899601 0.1585486
## 4 -0.2536264 0.1252804 -0.03199859 0.06541694 0.4423496 0.4091150
## 5 -0.2536264 0.1252804 -0.03199859 0.06541694 0.2851811 0.3302330
## 6 1.8491885 0.1252804 -0.03199859 0.06541694 -1.9780452 -1.9016641
## infantMortality GDPperCapita economicActivityMale economicActivityFemale
## 1 2.89933362 -0.3484326 1.59916549 -2.65901952
## 2 -0.28981971 -0.5647464 0.03943514 0.08523922
## 3 0.02386751 -0.4919516 -0.01887253 -2.61905458
## 4 -0.83877233 -0.4649260 -2.58441031 -0.31441011
## 5 -0.34210091 -0.4649260 0.03943514 0.08523922
## 6 2.11511559 -0.6201053 0.03943514 0.08523922
## illiteracyMale illiteracyFemale country region
## 1 2.3123842 2.496306 Afghanistan Asia
## 2 -0.3269057 -0.255871 Albania Europe
## 3 0.6333906 1.054488 Algeria Africa
## 4 -0.9912715 -1.092974 American.Samoa Asia
## 5 -0.3269057 -0.255871 Andorra Europe
## 6 -0.3269057 -0.255871 Angola Africa
K-means. agrupa las observaciones en K clusters distintos, donde el número K lo determina el analista. K-means clustering encuentra los K mejores clusters, entendiendo como mejor cluster aquel cuya varianza interna (intra-cluster variation) sea lo más pequeña posible
K-means ++. cada clúster está representado por la media de los puntos de datos que pertenecen al clúster. El método K-means es sensible a los puntos de datos anómalos y a los valores atípicos.
PAM. cada cluster está representado por uno de los objetos del cluster. PAM es menos sensible a los valores atípicos en comparación con k-means.
CLARA (clustering large aplications). es una extensión de PAM adaptada para grandes conjuntos de datos.
library(stats)
library(factoextra)
set.seed(123)
fviz_nbclust(data_std, kmeans, method = "wss")+
labs(subtitle = "wss")
fviz_nbclust(data_std, kmeans, method = "silhouette") +
labs(subtitle = "silhouette")
library(fpc)
kmeansruns(data_std,criterion="ch")
## K-means clustering with 2 clusters of sizes 64, 143
##
## Cluster means:
## tfr contraception educationMale educationFemale lifeMale lifeFemale
## 1 1.1276166 -0.9405476 -0.5219052 -0.5481735 -1.219707 -1.2682879
## 2 -0.5046675 0.4209444 0.2335800 0.2453364 0.545883 0.5676253
## infantMortality GDPperCapita economicActivityMale economicActivityFemale
## 1 1.2296027 -0.5445045 0.3911033 -0.12228702
## 2 -0.5503117 0.2436943 -0.1750392 0.05472986
## illiteracyMale illiteracyFemale
## 1 1.098544 1.1536923
## 2 -0.491656 -0.5163378
##
## Clustering vector:
## [1] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 2 2 1 2 2 2 1 1 1 1 2 2 1 1 2 2
## [38] 2 1 1 2 2 2 2 2 2 1 2 1 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2
## [75] 1 1 1 2 1 2 2 2 2 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2
## [112] 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 1 2 2 1 1 2 2 1 2 2
## [149] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2 2 1 2 1
## [186] 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 1 2 1 1
##
## Within cluster sum of squares by cluster:
## [1] 619.0174 895.1062
## (between_SS / total_SS = 38.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "crit"
## [11] "bestk"
agrup_kmeans = kmeans(data_std,
centers=2, # Número de Cluster
nstart = 100, # Número de puntos iniciales
iter.max = 1000)
agrup_kmeans
## K-means clustering with 2 clusters of sizes 64, 143
##
## Cluster means:
## tfr contraception educationMale educationFemale lifeMale lifeFemale
## 1 1.1276166 -0.9405476 -0.5219052 -0.5481735 -1.219707 -1.2682879
## 2 -0.5046675 0.4209444 0.2335800 0.2453364 0.545883 0.5676253
## infantMortality GDPperCapita economicActivityMale economicActivityFemale
## 1 1.2296027 -0.5445045 0.3911033 -0.12228702
## 2 -0.5503117 0.2436943 -0.1750392 0.05472986
## illiteracyMale illiteracyFemale
## 1 1.098544 1.1536923
## 2 -0.491656 -0.5163378
##
## Clustering vector:
## [1] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 2 2 1 2 2 2 1 1 1 1 2 2 1 1 2 2
## [38] 2 1 1 2 2 2 2 2 2 1 2 1 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2
## [75] 1 1 1 2 1 2 2 2 2 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2
## [112] 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 1 2 2 1 1 2 2 1 2 2
## [149] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2 2 1 2 1
## [186] 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 1 2 1 1
##
## Within cluster sum of squares by cluster:
## [1] 619.0174 895.1062
## (between_SS / total_SS = 38.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(LICORS)
set.seed(123)
agrup_kmeansplus <- kmeanspp(data_std,
k=2,
start="random",
nstart = 25,
iter.max=100)
agrup_kmeansplus
## K-means clustering with 2 clusters of sizes 143, 64
##
## Cluster means:
## tfr contraception educationMale educationFemale lifeMale lifeFemale
## 1 -0.5046675 0.4209444 0.2335800 0.2453364 0.545883 0.5676253
## 2 1.1276166 -0.9405476 -0.5219052 -0.5481735 -1.219707 -1.2682879
## infantMortality GDPperCapita economicActivityMale economicActivityFemale
## 1 -0.5503117 0.2436943 -0.1750392 0.05472986
## 2 1.2296027 -0.5445045 0.3911033 -0.12228702
## illiteracyMale illiteracyFemale
## 1 -0.491656 -0.5163378
## 2 1.098544 1.1536923
##
## Clustering vector:
## [1] 2 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 2 1 1 1 2 2 2 2 1 1 2 2 1 1
## [38] 1 2 2 1 1 1 1 1 1 2 1 2 1 1 2 1 2 1 2 2 1 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1
## [75] 2 2 2 1 2 1 1 1 1 2 1 1 2 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1
## [112] 1 2 2 1 1 2 1 1 1 2 1 1 1 1 1 1 2 2 2 2 2 1 1 1 1 2 2 2 1 1 2 2 1 1 2 1 1
## [149] 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 1 1 2 1 1 1 1 2 1 2 1 1 1 1 2 1 2
## [186] 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2 1 2 2
##
## Within cluster sum of squares by cluster:
## [1] 895.1062 619.0174
## (between_SS / total_SS = 38.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault" "inicial.centers"
library(cluster)
set.seed(123)
agrup_pam <- pam(data_std,
k=2, stand=FALSE)
agrup_pam
## Medoids:
## ID tfr contraception educationMale educationFemale lifeMale
## [1,] 39 1.1637405 -1.1865079 -0.03199859 0.06541694 -0.7102194
## [2,] 104 -0.4395106 0.1252804 -0.03199859 0.06541694 0.4528275
## lifeFemale infantMortality GDPperCapita economicActivityMale
## [1,] -0.9829206 1.0172103 -0.6187976 0.03943514
## [2,] 0.2884719 -0.3682415 -0.3194455 0.03943514
## economicActivityFemale illiteracyMale illiteracyFemale
## [1,] 0.08523922 1.2433621 0.9951188
## [2,] 0.08523922 -0.6745894 -0.6968979
## Clustering vector:
## [1] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 1 2 2 1 2 2 2 1 1 1 1 2 2 1 1 2 2
## [38] 2 1 1 2 2 2 2 2 2 1 2 1 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2
## [75] 1 1 1 2 1 2 2 2 2 1 2 2 1 2 2 2 1 2 2 2 2 1 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2
## [112] 2 1 1 2 2 1 2 2 2 1 2 2 2 2 2 2 1 1 1 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 1 2 2
## [149] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 1 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2 2 1 2 1
## [186] 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 1 2 1 1
## Objective function:
## build swap
## 2.614153 2.577024
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
set.seed(123)
agrup_clara <- clara(data_std,
k=2,
samples = 100,
pamLike = TRUE)
agrup_clara
## Call: clara(x = data_std, k = 2, samples = 100, pamLike = TRUE)
## Medoids:
## tfr contraception educationMale educationFemale lifeMale
## [1,] 1.1637405 -1.1865079 -0.03199859 0.06541694 -0.7102194
## [2,] -0.4395106 0.1252804 -0.03199859 0.06541694 0.4528275
## lifeFemale infantMortality GDPperCapita economicActivityMale
## [1,] -0.9829206 1.0172103 -0.6187976 0.03943514
## [2,] 0.2884719 -0.3682415 -0.3194455 0.03943514
## economicActivityFemale illiteracyMale illiteracyFemale
## [1,] 0.08523922 1.2433621 0.9951188
## [2,] 0.08523922 -0.6745894 -0.6968979
## Objective function: 2.577024
## Clustering vector: int [1:207] 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 ...
## Cluster sizes: 62 145
## Best sample:
## [1] 8 9 17 18 33 38 39 41 44 46 47 49 52 54 59 60 61 63 70
## [20] 71 73 75 78 95 99 104 105 107 116 123 126 136 138 139 144 147 148 159
## [39] 166 203 204 205 206 207
##
## Available components:
## [1] "sample" "medoids" "i.med" "clustering" "objective"
## [6] "clusinfo" "diss" "call" "silinfo" "data"
`
# índice de Validación de Davies-Bouldin (más pequeño)
grupos_1 = as.integer(agrup_kmeans$cluster)
library(clusterSim)
indice_DB1 = index.DB(data_std, grupos_1, centrotypes = "centroids")
indice_DB1$DB
## [1] 1.20567
# índice de Validación de Davies-Bouldin (más pequeño)
grupos_2<-as.integer(agrup_kmeansplus$cluster)
indice_DB2<-index.DB(data_std, grupos_2)
indice_DB2$DB
## [1] 1.20567
# índice de Validación de Davies-Bouldin (más pequeño)
grupos_3<-as.integer(agrup_pam$cluster)
indice_DB3<-index.DB(data_std, grupos_3)
indice_DB3$DB
## [1] 1.194184
# índice de Validación de Davies-Bouldin (más pequeño)
grupos_4<-as.integer(agrup_clara$cluster)
indice_DB4<-index.DB(data_std, grupos_4)
indice_DB4$DB
## [1] 1.194184
# Indice de Dunn (Más grande)
library(clValid)
indice_dunn1<-dunn(Data=data_std, clusters=grupos_1, distance = NULL)
indice_dunn1
## [1] 0.1184207
# Indice de Dunn (Más grande)
indice_dunn2<-dunn(Data=data_std, clusters=grupos_2, distance = NULL)
indice_dunn2
## [1] 0.1184207
# Indice de Dunn (Más grande)
indice_dunn3 = dunn(Data=data_std, clusters=grupos_3, distance = NULL)
indice_dunn3
## [1] 0.1184207
# Indice de Dunn (Más grande)
indice_dunn4 = dunn(Data=data_std, clusters=grupos_4, distance = NULL)
indice_dunn4
## [1] 0.1184207
# Indice de Dunn (Más grande)
indices = data.frame(DB1=indice_DB1$DB,DB2=indice_DB2$DB,DB3=indice_DB3$DB,DB4=indice_DB4$DB,
DUNN1=indice_dunn1,DUNN2=indice_dunn2,DUNN3=indice_dunn3,DUNN4=indice_dunn4)
indices
## DB1 DB2 DB3 DB4 DUNN1 DUNN2 DUNN3 DUNN4
## 1 1.20567 1.20567 1.194184 1.194184 0.1184207 0.1184207 0.1184207 0.1184207
Usa información intrínseca de la data para evaluar la calidad del agrupamiento. Las medidas internas incluyen:
**Según los resultados del modelo (Davies-Bouldin) para evaluar la calidad del agrupamiento válido (mejores indices) es PAM,CLARA
# Importancia de las variables
library(FeatureImpCluster)
library(flexclust)
# Comparando resultados del modelo ganador (Modedelo 3) PAM
agrupamiento_pam = flexclust::as.kcca(agrup_pam, data_std)
barplot(agrupamiento_pam)
library(MASS)
parcoord(data_std, col = grupos_3)
library(factoextra)
fviz_cluster(agrup_pam, data = data_std,
ellipse.type = "convex") +
theme_classic()
# observamos la variable más importante en el cluster
library(FeatureImpCluster)
data_import = as.data.table(data_imp_mtc[, 3:14])
Importancia_data = FeatureImpCluster(agrupamiento_pam,data_import)
plot(Importancia_data)