rm(list = ls()) # limpio memoria
library(rio)
allDFs=import("dataOK_all.xlsx")
## New names:
## • `` -> `...1`
allDFs$Electricidad_pct=allDFs$elec1_SÃ/allDFs$elec3_Total
allDFs$CastilloKeiko_ratio=allDFs$Castillo/allDFs$Keiko
allDFs$fallecido_x10000POS=1000*allDFs$countFallecidos/allDFs$countPositivos
keptForCluster=c('key','Electricidad_pct','CastilloKeiko_ratio','fallecido_x10000POS')
allDFs_small=allDFs[,keptForCluster]
allDFs_small=allDFs_small[!allDFs_small$key=='LIMA+LIMA',]
#ver standarizacion----
boxplot(BBmisc::normalize(allDFs_small[,-1],method='standardize'))

#crear data standarizada----
allDFs_small_norm=allDFs_small
allDFs_small_norm[,-1]=BBmisc::normalize(allDFs_small[,-1],method='standardize')
# ver si correlaciones son positivas----
cor(allDFs_small_norm[,-1],use = "complete.obs")
## Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS
## Electricidad_pct 1.00000000 -0.3118796 0.09436612
## CastilloKeiko_ratio -0.31187963 1.0000000 0.15322775
## fallecido_x10000POS 0.09436612 0.1532277 1.00000000
# preparar data para clustering -----
dataClus=allDFs_small_norm[,-1] # sin nombre de fila
row.names(dataClus)=allDFs_small_norm$key # para no perder nombre
dataClus=dataClus[complete.cases(dataClus),] # solo los no missing
g.dist = cluster::daisy(dataClus, metric="gower")
library(cluster)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
set.seed(123)
res.pam=pam(g.dist,5,cluster.only = F)
#nueva columna
dataClus$pam=res.pam$cluster
# ver
head(dataClus,15)%>%kbl()%>%kable_styling()
|
|
Electricidad_pct
|
CastilloKeiko_ratio
|
fallecido_x10000POS
|
pam
|
|
AMAZONAS+BAGUA
|
-0.8258959
|
-0.3512991
|
-1.1415785
|
1
|
|
AMAZONAS+BONGARA
|
0.3825621
|
-0.5292981
|
-0.5365194
|
2
|
|
AMAZONAS+CHACHAPOYAS
|
0.7947371
|
-0.5549533
|
-0.7999689
|
2
|
|
AMAZONAS+CONDORCANQUI
|
-4.8430495
|
1.1973385
|
-1.2593821
|
1
|
|
AMAZONAS+LUYA
|
0.6185018
|
-0.5292276
|
-0.4992799
|
2
|
|
AMAZONAS+RODRIGUEZ DE MENDOZA
|
-0.0262426
|
-0.5654212
|
1.1638051
|
2
|
|
AMAZONAS+UTCUBAMBA
|
0.2458983
|
-0.4616676
|
-0.9869613
|
2
|
|
ANCASH+AIJA
|
0.0786941
|
-0.5205757
|
0.1430353
|
2
|
|
ANCASH+ANTONIO RAIMONDI
|
0.3128470
|
0.5793327
|
1.2988419
|
3
|
|
ANCASH+ASUNCION
|
0.8583510
|
-0.1026242
|
0.2695732
|
4
|
|
ANCASH+BOLOGNESI
|
0.0879310
|
-0.4560327
|
0.4619025
|
2
|
|
ANCASH+CARHUAZ
|
0.1488582
|
-0.3958612
|
-0.0165583
|
2
|
|
ANCASH+CARLOS FERMIN FITZCARRALD
|
-1.0011285
|
-0.0220210
|
1.4548719
|
1
|
|
ANCASH+CASMA
|
0.1764010
|
-0.7319343
|
0.3638289
|
2
|
|
ANCASH+CORONGO
|
0.9733097
|
-0.5566324
|
1.0130963
|
4
|
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_silhouette(res.pam,print.summary = F)

silPAM=data.frame(res.pam$silinfo$widths)
silPAM$country=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'country']%>%sort()
poorPAM
## [1] "ANCASH+CARLOS FERMIN FITZCARRALD" "ANCASH+HUARI"
## [3] "APURIMAC+COTABAMBAS" "AYACUCHO+LUCANAS"
## [5] "CAJAMARCA+CHOTA" "CAJAMARCA+CONTUMAZA"
## [7] "CAJAMARCA+CUTERVO" "CAJAMARCA+SAN IGNACIO"
## [9] "CAJAMARCA+SAN MIGUEL" "CUSCO+CANCHIS"
## [11] "HUANCAVELICA+HUANCAVELICA" "HUANUCO+AMBO"
## [13] "HUANUCO+YAROWILCA" "LA LIBERTAD+JULCAN"
## [15] "LA LIBERTAD+PATAZ" "LORETO+ALTO AMAZONAS"
## [17] "LORETO+REQUENA" "PIURA+HUANCABAMBA"
## [19] "PUNO+YUNGUYO" "SAN MARTIN+MOYOBAMBA"
# calcular agnes -----
res.agnes<- factoextra::hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")
dataClus$agnes=res.agnes$cluster
fviz_silhouette(res.agnes,print.summary = F)

silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$country=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'country']%>%sort()
poorAGNES
## [1] "ANCASH+MARISCAL LUZURIAGA" "APURIMAC+AYMARAES"
## [3] "AREQUIPA+CONDESUYOS" "AYACUCHO+LUCANAS"
## [5] "CAJAMARCA+CUTERVO" "CAJAMARCA+SAN IGNACIO"
## [7] "CAJAMARCA+SAN PABLO" "CUSCO+CANCHIS"
## [9] "CUSCO+ESPINAR" "HUANUCO+AMBO"
## [11] "HUANUCO+LAURICOCHA" "LA LIBERTAD+JULCAN"
## [13] "LA LIBERTAD+SANCHEZ CARRION" "LA LIBERTAD+SANTIAGO DE CHUCO"
## [15] "LIMA+OYON" "PUNO+LAMPA"
## [17] "PUNO+YUNGUYO" "SAN MARTIN+HUALLAGA"
## [19] "UCAYALI+PADRE ABAD"
aggregate(.~ agnes, data=dataClus,mean)
## agnes Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS pam
## 1 1 -1.29974013 -0.1186030 -0.4572045 1.727273
## 2 2 0.66486117 -0.5314727 -0.1367686 2.969388
## 3 3 -0.06017147 0.5674930 0.1850179 2.914286
## 4 4 0.11452352 -0.4154376 4.4840109 2.250000
## 5 5 -0.45142298 2.7930253 0.6506180 5.000000
original=aggregate(.~ agnes, data=dataClus,mean)
original[order(original$Electricidad_pct),]
## agnes Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS pam
## 1 1 -1.29974013 -0.1186030 -0.4572045 1.727273
## 5 5 -0.45142298 2.7930253 0.6506180 5.000000
## 3 3 -0.06017147 0.5674930 0.1850179 2.914286
## 4 4 0.11452352 -0.4154376 4.4840109 2.250000
## 2 2 0.66486117 -0.5314727 -0.1367686 2.969388
set.seed(123)
res.diana <- hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()
|
|
Electricidad_pct
|
CastilloKeiko_ratio
|
fallecido_x10000POS
|
pam
|
agnes
|
diana
|
|
AMAZONAS+BAGUA
|
-0.8258959
|
-0.3512991
|
-1.1415785
|
1
|
1
|
1
|
|
AMAZONAS+BONGARA
|
0.3825621
|
-0.5292981
|
-0.5365194
|
2
|
2
|
1
|
|
AMAZONAS+CHACHAPOYAS
|
0.7947371
|
-0.5549533
|
-0.7999689
|
2
|
2
|
1
|
|
AMAZONAS+CONDORCANQUI
|
-4.8430495
|
1.1973385
|
-1.2593821
|
1
|
1
|
2
|
|
AMAZONAS+LUYA
|
0.6185018
|
-0.5292276
|
-0.4992799
|
2
|
2
|
1
|
|
AMAZONAS+RODRIGUEZ DE MENDOZA
|
-0.0262426
|
-0.5654212
|
1.1638051
|
2
|
2
|
1
|
|
AMAZONAS+UTCUBAMBA
|
0.2458983
|
-0.4616676
|
-0.9869613
|
2
|
2
|
1
|
|
ANCASH+AIJA
|
0.0786941
|
-0.5205757
|
0.1430353
|
2
|
2
|
1
|
|
ANCASH+ANTONIO RAIMONDI
|
0.3128470
|
0.5793327
|
1.2988419
|
3
|
3
|
1
|
|
ANCASH+ASUNCION
|
0.8583510
|
-0.1026242
|
0.2695732
|
4
|
2
|
1
|
|
ANCASH+BOLOGNESI
|
0.0879310
|
-0.4560327
|
0.4619025
|
2
|
2
|
1
|
|
ANCASH+CARHUAZ
|
0.1488582
|
-0.3958612
|
-0.0165583
|
2
|
2
|
1
|
|
ANCASH+CARLOS FERMIN FITZCARRALD
|
-1.0011285
|
-0.0220210
|
1.4548719
|
1
|
3
|
1
|
|
ANCASH+CASMA
|
0.1764010
|
-0.7319343
|
0.3638289
|
2
|
2
|
1
|
|
ANCASH+CORONGO
|
0.9733097
|
-0.5566324
|
1.0130963
|
4
|
2
|
1
|
fviz_silhouette(res.diana,print.summary = F)
