rm(list = ls()) # limpio memoria
linkGoogle="https://docs.google.com/spreadsheets/d/e/2PACX-1vQOKTNeLrYhFklfcGWO9-xzVuYFuEGv-LlomX5ou3dXxNye6XmSYj6pOoYZgfThB2p2vUEn29jLFe1F/pub?gid=1366853308&single=true&output=csv"
data2=read.csv(linkGoogle)

creo porcentaje de viviendas que tiene agua de red publica dentro de la vivienda

data2$agua_pct=data2$agua1_Red/data2$agua10_Total

creo la razón de votacion de keiko entre castillo

data2$Keikocastillo_ratio=data2$Keiko/data2$Castillo

creo la tasa fallecidos por cada 1000 contagiados

data2$fallecido_x10000POS=1000*data2$countFallecidos/data2$countPositivos

excluir la provincia de lima

notwant=c ("LIMA+LIMA")
data2=data2 [! data2$key%in%notwant,]

crear data para clusterizar

datacluster=c('key','agua_pct','Keikocastillo_ratio','fallecido_x10000POS')
datacluster=data2[,datacluster]

ver estandarización

library(BBmisc)
## Warning: package 'BBmisc' was built under R version 4.3.3
## 
## Attaching package: 'BBmisc'
## The following object is masked from 'package:base':
## 
##     isFALSE
boxplot(BBmisc::normalize(datacluster[,-1],method='standardize'))

estandarizar

dataclusterN=datacluster
dataclusterN[,-1]=BBmisc::normalize(datacluster[,-1],method='standardize')

data para clusterizar

dataClus=dataclusterN[,-1] 
row.names(dataClus)=dataclusterN$key # para no perder nombre
dataClus=dataClus[complete.cases(dataClus),] # solo los no missing

matriz de distancias

library(cluster)
## Warning: package 'cluster' was built under R version 4.3.3
g.dist = daisy(dataClus, metric="gower")

#estrategia de partición ##clusterizar via PAM

library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
set.seed(123)
res.pam=pam(g.dist,5,cluster.only = F)

#nueva columna
dataClus$pam=res.pam$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()
agua_pct Keikocastillo_ratio fallecido_x10000POS pam
AMAZONAS+BAGUA -0.3519838 -0.3739358 -1.1415785 1
AMAZONAS+BONGARA 0.3924045 -0.0198897 -0.5365194 1
AMAZONAS+CHACHAPOYAS 1.1011178 0.0612639 -0.7999689 2
AMAZONAS+CONDORCANQUI -2.1769143 -0.9182894 -1.2593821 3
AMAZONAS+LUYA 0.0625030 -0.0200974 -0.4992799 1
AMAZONAS+RODRIGUEZ DE MENDOZA 0.2096156 0.0979522 1.1638051 1
AMAZONAS+UTCUBAMBA -0.3307202 -0.1882854 -0.9869613 1
ANCASH+AIJA 1.0330394 -0.0449805 0.1430353 2
ANCASH+ANTONIO RAIMONDI 1.5845474 -0.8377221 1.2988419 2
ANCASH+ASUNCION 0.8536475 -0.6038640 0.2695732 2
ANCASH+BOLOGNESI 0.9049263 -0.2000002 0.4619025 2
ANCASH+CARHUAZ 0.9450800 -0.3087639 -0.0165583 2
ANCASH+CARLOS FERMIN FITZCARRALD 0.5876363 -0.6504380 1.4548719 2
ANCASH+CASMA 0.2832239 1.2963666 0.3638289 4
ANCASH+CORONGO 1.0542571 0.0669980 1.0130963 2
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_silhouette(res.pam,print.summary = F)

silPAM=data.frame(res.pam$silinfo$widths)
silPAM$key=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'key']%>%sort()
poorPAM
##  [1] "ANCASH+OCROS"                "APURIMAC+ANDAHUAYLAS"       
##  [3] "APURIMAC+AYMARAES"           "AREQUIPA+CARAVELI"          
##  [5] "AREQUIPA+CAYLLOMA"           "AYACUCHO+CANGALLO"          
##  [7] "AYACUCHO+HUANTA"             "AYACUCHO+VILCAS HUAMAN"     
##  [9] "CAJAMARCA+SAN MIGUEL"        "CAJAMARCA+SANTA CRUZ"       
## [11] "HUANCAVELICA+ACOBAMBA"       "HUANCAVELICA+CASTROVIRREYNA"
## [13] "HUANCAVELICA+HUANCAVELICA"   "HUANUCO+AMBO"               
## [15] "HUANUCO+HUAMALIES"           "HUANUCO+PUERTO INCA"        
## [17] "JUNIN+JAUJA"                 "JUNIN+TARMA"                
## [19] "LIMA+YAUYOS"                 "LORETO+DATEM DEL MARANON"   
## [21] "LORETO+UCAYALI"              "PASCO+OXAPAMPA"             
## [23] "UCAYALI+CORONEL PORTILLO"

guardamos pam

data2$pamdata2poor=data2$key%in%poorPAM
data2$pamdata2=as.ordered(dataClus$pam)
dataClus$pam=NULL

#estrategia jerarquica ##Agnes

set.seed(123)
library(factoextra)

res.agnes<- hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")

dataClus$agnes=res.agnes$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()
agua_pct Keikocastillo_ratio fallecido_x10000POS agnes
AMAZONAS+BAGUA -0.3519838 -0.3739358 -1.1415785 1
AMAZONAS+BONGARA 0.3924045 -0.0198897 -0.5365194 2
AMAZONAS+CHACHAPOYAS 1.1011178 0.0612639 -0.7999689 2
AMAZONAS+CONDORCANQUI -2.1769143 -0.9182894 -1.2593821 3
AMAZONAS+LUYA 0.0625030 -0.0200974 -0.4992799 2
AMAZONAS+RODRIGUEZ DE MENDOZA 0.2096156 0.0979522 1.1638051 2
AMAZONAS+UTCUBAMBA -0.3307202 -0.1882854 -0.9869613 1
ANCASH+AIJA 1.0330394 -0.0449805 0.1430353 2
ANCASH+ANTONIO RAIMONDI 1.5845474 -0.8377221 1.2988419 2
ANCASH+ASUNCION 0.8536475 -0.6038640 0.2695732 2
ANCASH+BOLOGNESI 0.9049263 -0.2000002 0.4619025 2
ANCASH+CARHUAZ 0.9450800 -0.3087639 -0.0165583 2
ANCASH+CARLOS FERMIN FITZCARRALD 0.5876363 -0.6504380 1.4548719 2
ANCASH+CASMA 0.2832239 1.2963666 0.3638289 4
ANCASH+CORONGO 1.0542571 0.0669980 1.0130963 2
fviz_silhouette(res.agnes,print.summary = F)

silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$key=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'key']%>%sort()
poorAGNES
##  [1] "AMAZONAS+LUYA"                 "ANCASH+OCROS"                 
##  [3] "APURIMAC+CHINCHEROS"           "AREQUIPA+CARAVELI"            
##  [5] "AREQUIPA+LA UNION"             "AYACUCHO+SUCRE"               
##  [7] "CAJAMARCA+CAJABAMBA"           "CAJAMARCA+JAEN"               
##  [9] "HUANCAVELICA+HUAYTARA"         "HUANUCO+HUANUCO"              
## [11] "HUANUCO+LEONCIO PRADO"         "HUANUCO+PACHITEA"             
## [13] "ICA+NAZCA"                     "JUNIN+JAUJA"                  
## [15] "JUNIN+TARMA"                   "LA LIBERTAD+OTUZCO"           
## [17] "LA LIBERTAD+SANCHEZ CARRION"   "LA LIBERTAD+SANTIAGO DE CHUCO"
## [19] "LIMA+CAJATAMBO"                "LORETO+LORETO"                
## [21] "MADRE DE DIOS+MANU"            "PASCO+OXAPAMPA"               
## [23] "TACNA+TARATA"                  "UCAYALI+CORONEL PORTILLO"
data2$agnesdata2poor=data2$key%in%poorAGNES
data2$agnesdata2=as.ordered(dataClus$agnes)
dataClus$agnes=NULL

##Diana

set.seed(123)
res.diana <- hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()
agua_pct Keikocastillo_ratio fallecido_x10000POS diana
AMAZONAS+BAGUA -0.3519838 -0.3739358 -1.1415785 1
AMAZONAS+BONGARA 0.3924045 -0.0198897 -0.5365194 2
AMAZONAS+CHACHAPOYAS 1.1011178 0.0612639 -0.7999689 2
AMAZONAS+CONDORCANQUI -2.1769143 -0.9182894 -1.2593821 1
AMAZONAS+LUYA 0.0625030 -0.0200974 -0.4992799 2
AMAZONAS+RODRIGUEZ DE MENDOZA 0.2096156 0.0979522 1.1638051 2
AMAZONAS+UTCUBAMBA -0.3307202 -0.1882854 -0.9869613 1
ANCASH+AIJA 1.0330394 -0.0449805 0.1430353 2
ANCASH+ANTONIO RAIMONDI 1.5845474 -0.8377221 1.2988419 2
ANCASH+ASUNCION 0.8536475 -0.6038640 0.2695732 2
ANCASH+BOLOGNESI 0.9049263 -0.2000002 0.4619025 2
ANCASH+CARHUAZ 0.9450800 -0.3087639 -0.0165583 2
ANCASH+CARLOS FERMIN FITZCARRALD 0.5876363 -0.6504380 1.4548719 2
ANCASH+CASMA 0.2832239 1.2963666 0.3638289 2
ANCASH+CORONGO 1.0542571 0.0669980 1.0130963 2
fviz_silhouette(res.diana,print.summary = F)

silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$key=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'key']%>%sort()
poorDIANA
## [1] "AYACUCHO+LA MAR"       "HUANUCO+HUACAYBAMBA"   "HUANUCO+LEONCIO PRADO"
## [4] "LA LIBERTAD+ASCOPE"    "LAMBAYEQUE+CHICLAYO"   "PASCO+PASCO"          
## [7] "PIURA+PAITA"           "PIURA+SECHURA"
data2$dianadata2poor=data2$key%in%poorDIANA
data2$dianadata2=as.ordered(dataClus$diana)
dataClus$diana=NULL