rm(list = ls()) # limpio memoria
library(rio)
allDFs=import("dataOK_all.xlsx")
## New names:
## • `` -> `...1`
allDFs$Electricidad_pct=allDFs$elec1_Sí/allDFs$elec3_Total
allDFs$CastilloKeiko_ratio=allDFs$Castillo/allDFs$Keiko
allDFs$fallecido_x10000POS=1000*allDFs$countFallecidos/allDFs$countPositivos
keptForCluster=c('key','Electricidad_pct','CastilloKeiko_ratio','fallecido_x10000POS')
allDFs_small=allDFs[,keptForCluster]
allDFs_small=allDFs_small[!allDFs_small$key=='LIMA+LIMA',]
#ver standarizacion---- 
boxplot(BBmisc::normalize(allDFs_small[,-1],method='standardize'))

#crear data standarizada----   
allDFs_small_norm=allDFs_small
allDFs_small_norm[,-1]=BBmisc::normalize(allDFs_small[,-1],method='standardize')
# ver si correlaciones son positivas----
cor(allDFs_small_norm[,-1],use = "complete.obs")
##                     Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS
## Electricidad_pct          1.00000000          -0.3118796          0.09436612
## CastilloKeiko_ratio      -0.31187963           1.0000000          0.15322775
## fallecido_x10000POS       0.09436612           0.1532277          1.00000000
# preparar data para clustering -----
dataClus=allDFs_small_norm[,-1] # sin nombre de fila
row.names(dataClus)=allDFs_small_norm$key # para no perder nombre
dataClus=dataClus[complete.cases(dataClus),] # solo los no missing
g.dist = cluster::daisy(dataClus, metric="gower")
library(cluster)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
set.seed(123)
res.pam=pam(g.dist,5,cluster.only = F)

#nueva columna
dataClus$pam=res.pam$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()
Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS pam
AMAZONAS+BAGUA -0.8258959 -0.3512991 -1.1415785 1
AMAZONAS+BONGARA 0.3825621 -0.5292981 -0.5365194 2
AMAZONAS+CHACHAPOYAS 0.7947371 -0.5549533 -0.7999689 2
AMAZONAS+CONDORCANQUI -4.8430495 1.1973385 -1.2593821 1
AMAZONAS+LUYA 0.6185018 -0.5292276 -0.4992799 2
AMAZONAS+RODRIGUEZ DE MENDOZA -0.0262426 -0.5654212 1.1638051 2
AMAZONAS+UTCUBAMBA 0.2458983 -0.4616676 -0.9869613 2
ANCASH+AIJA 0.0786941 -0.5205757 0.1430353 2
ANCASH+ANTONIO RAIMONDI 0.3128470 0.5793327 1.2988419 3
ANCASH+ASUNCION 0.8583510 -0.1026242 0.2695732 4
ANCASH+BOLOGNESI 0.0879310 -0.4560327 0.4619025 2
ANCASH+CARHUAZ 0.1488582 -0.3958612 -0.0165583 2
ANCASH+CARLOS FERMIN FITZCARRALD -1.0011285 -0.0220210 1.4548719 1
ANCASH+CASMA 0.1764010 -0.7319343 0.3638289 2
ANCASH+CORONGO 0.9733097 -0.5566324 1.0130963 4
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_silhouette(res.pam,print.summary = F)

silPAM=data.frame(res.pam$silinfo$widths)
silPAM$country=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'country']%>%sort()
poorPAM
##  [1] "ANCASH+CARLOS FERMIN FITZCARRALD" "ANCASH+HUARI"                    
##  [3] "APURIMAC+COTABAMBAS"              "AYACUCHO+LUCANAS"                
##  [5] "CAJAMARCA+CHOTA"                  "CAJAMARCA+CONTUMAZA"             
##  [7] "CAJAMARCA+CUTERVO"                "CAJAMARCA+SAN IGNACIO"           
##  [9] "CAJAMARCA+SAN MIGUEL"             "CUSCO+CANCHIS"                   
## [11] "HUANCAVELICA+HUANCAVELICA"        "HUANUCO+AMBO"                    
## [13] "HUANUCO+YAROWILCA"                "LA LIBERTAD+JULCAN"              
## [15] "LA LIBERTAD+PATAZ"                "LORETO+ALTO AMAZONAS"            
## [17] "LORETO+REQUENA"                   "PIURA+HUANCABAMBA"               
## [19] "PUNO+YUNGUYO"                     "SAN MARTIN+MOYOBAMBA"
# calcular agnes -----
res.agnes<- factoextra::hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")
dataClus$agnes=res.agnes$cluster
fviz_silhouette(res.agnes,print.summary = F)

silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$country=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'country']%>%sort()
poorAGNES
##  [1] "ANCASH+MARISCAL LUZURIAGA"     "APURIMAC+AYMARAES"            
##  [3] "AREQUIPA+CONDESUYOS"           "AYACUCHO+LUCANAS"             
##  [5] "CAJAMARCA+CUTERVO"             "CAJAMARCA+SAN IGNACIO"        
##  [7] "CAJAMARCA+SAN PABLO"           "CUSCO+CANCHIS"                
##  [9] "CUSCO+ESPINAR"                 "HUANUCO+AMBO"                 
## [11] "HUANUCO+LAURICOCHA"            "LA LIBERTAD+JULCAN"           
## [13] "LA LIBERTAD+SANCHEZ CARRION"   "LA LIBERTAD+SANTIAGO DE CHUCO"
## [15] "LIMA+OYON"                     "PUNO+LAMPA"                   
## [17] "PUNO+YUNGUYO"                  "SAN MARTIN+HUALLAGA"          
## [19] "UCAYALI+PADRE ABAD"
aggregate(.~ agnes, data=dataClus,mean)
##   agnes Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS      pam
## 1     1      -1.29974013          -0.1186030          -0.4572045 1.727273
## 2     2       0.66486117          -0.5314727          -0.1367686 2.969388
## 3     3      -0.06017147           0.5674930           0.1850179 2.914286
## 4     4       0.11452352          -0.4154376           4.4840109 2.250000
## 5     5      -0.45142298           2.7930253           0.6506180 5.000000
original=aggregate(.~ agnes, data=dataClus,mean)
original[order(original$Electricidad_pct),]
##   agnes Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS      pam
## 1     1      -1.29974013          -0.1186030          -0.4572045 1.727273
## 5     5      -0.45142298           2.7930253           0.6506180 5.000000
## 3     3      -0.06017147           0.5674930           0.1850179 2.914286
## 4     4       0.11452352          -0.4154376           4.4840109 2.250000
## 2     2       0.66486117          -0.5314727          -0.1367686 2.969388
set.seed(123)
res.diana <- hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()
Electricidad_pct CastilloKeiko_ratio fallecido_x10000POS pam agnes diana
AMAZONAS+BAGUA -0.8258959 -0.3512991 -1.1415785 1 1 1
AMAZONAS+BONGARA 0.3825621 -0.5292981 -0.5365194 2 2 1
AMAZONAS+CHACHAPOYAS 0.7947371 -0.5549533 -0.7999689 2 2 1
AMAZONAS+CONDORCANQUI -4.8430495 1.1973385 -1.2593821 1 1 2
AMAZONAS+LUYA 0.6185018 -0.5292276 -0.4992799 2 2 1
AMAZONAS+RODRIGUEZ DE MENDOZA -0.0262426 -0.5654212 1.1638051 2 2 1
AMAZONAS+UTCUBAMBA 0.2458983 -0.4616676 -0.9869613 2 2 1
ANCASH+AIJA 0.0786941 -0.5205757 0.1430353 2 2 1
ANCASH+ANTONIO RAIMONDI 0.3128470 0.5793327 1.2988419 3 3 1
ANCASH+ASUNCION 0.8583510 -0.1026242 0.2695732 4 2 1
ANCASH+BOLOGNESI 0.0879310 -0.4560327 0.4619025 2 2 1
ANCASH+CARHUAZ 0.1488582 -0.3958612 -0.0165583 2 2 1
ANCASH+CARLOS FERMIN FITZCARRALD -1.0011285 -0.0220210 1.4548719 1 3 1
ANCASH+CASMA 0.1764010 -0.7319343 0.3638289 2 2 1
ANCASH+CORONGO 0.9733097 -0.5566324 1.0130963 4 2 1
fviz_silhouette(res.diana,print.summary = F)