rm(list = ls()) # limpio memoria
linkGoogle="https://docs.google.com/spreadsheets/d/e/2PACX-1vQOKTNeLrYhFklfcGWO9-xzVuYFuEGv-LlomX5ou3dXxNye6XmSYj6pOoYZgfThB2p2vUEn29jLFe1F/pub?gid=1366853308&single=true&output=csv"
data2=read.csv(linkGoogle)
creo porcentaje de viviendas que tiene agua de red publica dentro de la vivienda
data2$agua_pct=data2$agua1_Red/data2$agua10_Total
creo la razón de votacion de keiko entre castillo
data2$Keikocastillo_ratio=data2$Keiko/data2$Castillo
creo la tasa fallecidos por cada 1000 contagiados
data2$fallecido_x10000POS=1000*data2$countFallecidos/data2$countPositivos
excluir la provincia de lima
notwant=c ("LIMA+LIMA")
data2=data2 [! data2$key%in%notwant,]
crear data para clusterizar
datacluster=c('key','agua_pct','Keikocastillo_ratio','fallecido_x10000POS')
datacluster=data2[,datacluster]
ver estandarización
library(BBmisc)
## Warning: package 'BBmisc' was built under R version 4.3.3
##
## Attaching package: 'BBmisc'
## The following object is masked from 'package:base':
##
## isFALSE
boxplot(BBmisc::normalize(datacluster[,-1],method='standardize'))
estandarizar
dataclusterN=datacluster
dataclusterN[,-1]=BBmisc::normalize(datacluster[,-1],method='standardize')
data para clusterizar
dataClus=dataclusterN[,-1]
row.names(dataClus)=dataclusterN$key # para no perder nombre
dataClus=dataClus[complete.cases(dataClus),] # solo los no missing
matriz de distancias
library(cluster)
## Warning: package 'cluster' was built under R version 4.3.3
g.dist = daisy(dataClus, metric="gower")
#estrategia de partición ##clusterizar via PAM
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
set.seed(123)
res.pam=pam(g.dist,5,cluster.only = F)
#nueva columna
dataClus$pam=res.pam$cluster
# ver
head(dataClus,15)%>%kbl()%>%kable_styling()
| agua_pct | Keikocastillo_ratio | fallecido_x10000POS | pam | |
|---|---|---|---|---|
| AMAZONAS+BAGUA | -0.3519838 | -0.3739358 | -1.1415785 | 1 |
| AMAZONAS+BONGARA | 0.3924045 | -0.0198897 | -0.5365194 | 1 |
| AMAZONAS+CHACHAPOYAS | 1.1011178 | 0.0612639 | -0.7999689 | 2 |
| AMAZONAS+CONDORCANQUI | -2.1769143 | -0.9182894 | -1.2593821 | 3 |
| AMAZONAS+LUYA | 0.0625030 | -0.0200974 | -0.4992799 | 1 |
| AMAZONAS+RODRIGUEZ DE MENDOZA | 0.2096156 | 0.0979522 | 1.1638051 | 1 |
| AMAZONAS+UTCUBAMBA | -0.3307202 | -0.1882854 | -0.9869613 | 1 |
| ANCASH+AIJA | 1.0330394 | -0.0449805 | 0.1430353 | 2 |
| ANCASH+ANTONIO RAIMONDI | 1.5845474 | -0.8377221 | 1.2988419 | 2 |
| ANCASH+ASUNCION | 0.8536475 | -0.6038640 | 0.2695732 | 2 |
| ANCASH+BOLOGNESI | 0.9049263 | -0.2000002 | 0.4619025 | 2 |
| ANCASH+CARHUAZ | 0.9450800 | -0.3087639 | -0.0165583 | 2 |
| ANCASH+CARLOS FERMIN FITZCARRALD | 0.5876363 | -0.6504380 | 1.4548719 | 2 |
| ANCASH+CASMA | 0.2832239 | 1.2963666 | 0.3638289 | 4 |
| ANCASH+CORONGO | 1.0542571 | 0.0669980 | 1.0130963 | 2 |
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_silhouette(res.pam,print.summary = F)
silPAM=data.frame(res.pam$silinfo$widths)
silPAM$key=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'key']%>%sort()
poorPAM
## [1] "ANCASH+OCROS" "APURIMAC+ANDAHUAYLAS"
## [3] "APURIMAC+AYMARAES" "AREQUIPA+CARAVELI"
## [5] "AREQUIPA+CAYLLOMA" "AYACUCHO+CANGALLO"
## [7] "AYACUCHO+HUANTA" "AYACUCHO+VILCAS HUAMAN"
## [9] "CAJAMARCA+SAN MIGUEL" "CAJAMARCA+SANTA CRUZ"
## [11] "HUANCAVELICA+ACOBAMBA" "HUANCAVELICA+CASTROVIRREYNA"
## [13] "HUANCAVELICA+HUANCAVELICA" "HUANUCO+AMBO"
## [15] "HUANUCO+HUAMALIES" "HUANUCO+PUERTO INCA"
## [17] "JUNIN+JAUJA" "JUNIN+TARMA"
## [19] "LIMA+YAUYOS" "LORETO+DATEM DEL MARANON"
## [21] "LORETO+UCAYALI" "PASCO+OXAPAMPA"
## [23] "UCAYALI+CORONEL PORTILLO"
guardamos pam
data2$pamdata2poor=data2$key%in%poorPAM
data2$pamdata2=as.ordered(dataClus$pam)
dataClus$pam=NULL
#estrategia jerarquica ##Agnes
set.seed(123)
library(factoextra)
res.agnes<- hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")
dataClus$agnes=res.agnes$cluster
# ver
head(dataClus,15)%>%kbl()%>%kable_styling()
| agua_pct | Keikocastillo_ratio | fallecido_x10000POS | agnes | |
|---|---|---|---|---|
| AMAZONAS+BAGUA | -0.3519838 | -0.3739358 | -1.1415785 | 1 |
| AMAZONAS+BONGARA | 0.3924045 | -0.0198897 | -0.5365194 | 2 |
| AMAZONAS+CHACHAPOYAS | 1.1011178 | 0.0612639 | -0.7999689 | 2 |
| AMAZONAS+CONDORCANQUI | -2.1769143 | -0.9182894 | -1.2593821 | 3 |
| AMAZONAS+LUYA | 0.0625030 | -0.0200974 | -0.4992799 | 2 |
| AMAZONAS+RODRIGUEZ DE MENDOZA | 0.2096156 | 0.0979522 | 1.1638051 | 2 |
| AMAZONAS+UTCUBAMBA | -0.3307202 | -0.1882854 | -0.9869613 | 1 |
| ANCASH+AIJA | 1.0330394 | -0.0449805 | 0.1430353 | 2 |
| ANCASH+ANTONIO RAIMONDI | 1.5845474 | -0.8377221 | 1.2988419 | 2 |
| ANCASH+ASUNCION | 0.8536475 | -0.6038640 | 0.2695732 | 2 |
| ANCASH+BOLOGNESI | 0.9049263 | -0.2000002 | 0.4619025 | 2 |
| ANCASH+CARHUAZ | 0.9450800 | -0.3087639 | -0.0165583 | 2 |
| ANCASH+CARLOS FERMIN FITZCARRALD | 0.5876363 | -0.6504380 | 1.4548719 | 2 |
| ANCASH+CASMA | 0.2832239 | 1.2963666 | 0.3638289 | 4 |
| ANCASH+CORONGO | 1.0542571 | 0.0669980 | 1.0130963 | 2 |
fviz_silhouette(res.agnes,print.summary = F)
silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$key=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'key']%>%sort()
poorAGNES
## [1] "AMAZONAS+LUYA" "ANCASH+OCROS"
## [3] "APURIMAC+CHINCHEROS" "AREQUIPA+CARAVELI"
## [5] "AREQUIPA+LA UNION" "AYACUCHO+SUCRE"
## [7] "CAJAMARCA+CAJABAMBA" "CAJAMARCA+JAEN"
## [9] "HUANCAVELICA+HUAYTARA" "HUANUCO+HUANUCO"
## [11] "HUANUCO+LEONCIO PRADO" "HUANUCO+PACHITEA"
## [13] "ICA+NAZCA" "JUNIN+JAUJA"
## [15] "JUNIN+TARMA" "LA LIBERTAD+OTUZCO"
## [17] "LA LIBERTAD+SANCHEZ CARRION" "LA LIBERTAD+SANTIAGO DE CHUCO"
## [19] "LIMA+CAJATAMBO" "LORETO+LORETO"
## [21] "MADRE DE DIOS+MANU" "PASCO+OXAPAMPA"
## [23] "TACNA+TARATA" "UCAYALI+CORONEL PORTILLO"
data2$agnesdata2poor=data2$key%in%poorAGNES
data2$agnesdata2=as.ordered(dataClus$agnes)
dataClus$agnes=NULL
##Diana
set.seed(123)
res.diana <- hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()
| agua_pct | Keikocastillo_ratio | fallecido_x10000POS | diana | |
|---|---|---|---|---|
| AMAZONAS+BAGUA | -0.3519838 | -0.3739358 | -1.1415785 | 1 |
| AMAZONAS+BONGARA | 0.3924045 | -0.0198897 | -0.5365194 | 2 |
| AMAZONAS+CHACHAPOYAS | 1.1011178 | 0.0612639 | -0.7999689 | 2 |
| AMAZONAS+CONDORCANQUI | -2.1769143 | -0.9182894 | -1.2593821 | 1 |
| AMAZONAS+LUYA | 0.0625030 | -0.0200974 | -0.4992799 | 2 |
| AMAZONAS+RODRIGUEZ DE MENDOZA | 0.2096156 | 0.0979522 | 1.1638051 | 2 |
| AMAZONAS+UTCUBAMBA | -0.3307202 | -0.1882854 | -0.9869613 | 1 |
| ANCASH+AIJA | 1.0330394 | -0.0449805 | 0.1430353 | 2 |
| ANCASH+ANTONIO RAIMONDI | 1.5845474 | -0.8377221 | 1.2988419 | 2 |
| ANCASH+ASUNCION | 0.8536475 | -0.6038640 | 0.2695732 | 2 |
| ANCASH+BOLOGNESI | 0.9049263 | -0.2000002 | 0.4619025 | 2 |
| ANCASH+CARHUAZ | 0.9450800 | -0.3087639 | -0.0165583 | 2 |
| ANCASH+CARLOS FERMIN FITZCARRALD | 0.5876363 | -0.6504380 | 1.4548719 | 2 |
| ANCASH+CASMA | 0.2832239 | 1.2963666 | 0.3638289 | 2 |
| ANCASH+CORONGO | 1.0542571 | 0.0669980 | 1.0130963 | 2 |
fviz_silhouette(res.diana,print.summary = F)
silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$key=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'key']%>%sort()
poorDIANA
## [1] "AYACUCHO+LA MAR" "HUANUCO+HUACAYBAMBA" "HUANUCO+LEONCIO PRADO"
## [4] "LA LIBERTAD+ASCOPE" "LAMBAYEQUE+CHICLAYO" "PASCO+PASCO"
## [7] "PIURA+PAITA" "PIURA+SECHURA"
data2$dianadata2poor=data2$key%in%poorDIANA
data2$dianadata2=as.ordered(dataClus$diana)
dataClus$diana=NULL