library(pacman)
## Warning: package 'pacman' was built under R version 4.3.3
p_load(psych, car, haven, tidyverse, lsr, kableExtra, Rmisc, dplyr, rio, DescTools, taylor, ggplot2)
data <- import("data_test_est.csv")
#Utilizando el porcentaje de viviendas que tiene agua de red publica dentro de la vivienda, #la razón de votacion de keiko entre castillo, #y la tasa fallecidos por cada 1000 contagiados, #Ud se propone agrupar a las provincias del Peru (sin la provincia de Lima) siguiendo diversas estrategias (no corrija correlacion negativa si la hubiera, pero siempre normalice); y en ese proceso Ud. encuentra…
data$agua_red_public_porct=data$agua1_Red/data$agua10_Total
colnames(data)
## [1] "X" "key"
## [3] "Código" "pared1_Ladrillo"
## [5] "pared2_Piedra" "pared3_Adobe"
## [7] "pared4_Tapia" "pared5_Quincha"
## [9] "pared6_Piedra" "pared7_Madera"
## [11] "pared8_Triplay" "pared9_Otro"
## [13] "pared10_Total" "techo1_Concreto"
## [15] "techo2_Madera" "techo3_Tejas"
## [17] "techo4_Planchas" "techo5_Caña"
## [19] "techo6_Triplay" "techo7_Paja"
## [21] "techo8_Otro" "techo9_Total"
## [23] "piso1_Parquet" "piso2_Láminas"
## [25] "piso3_Losetas" "piso4_Madera"
## [27] "piso5_Cemento" "piso6_Tierra"
## [29] "piso7_Otro" "piso8_Total"
## [31] "agua1_Red" "agua2_Red_fueraVivienda"
## [33] "agua3_Pilón" "agua4_Camión"
## [35] "agua5_Pozo" "agua6_Manantial"
## [37] "agua7_RÃo" "agua8_Otro"
## [39] "agua9_Vecino" "agua10_Total"
## [41] "elec1_SÃ" "elec2_No"
## [43] "elec3_Total" "departamento"
## [45] "provincia" "Castillo"
## [47] "Keiko" "ganaCastillo"
## [49] "countPositivos" "countFallecidos"
## [51] "agua_red_public_porct"
data$KeikoCastillo_ratio=data$Keiko/data$Castillo
data$fallecido_x1000POS=1000*data$countFallecidos/data$countPositivos
keptForCluster=c('key','agua_red_public_porct','KeikoCastillo_ratio','fallecido_x1000POS')
data_small=data[,keptForCluster]
data_small=data_small[!data_small$key=='LIMA+LIMA',]
#ver_estandarizacion
boxplot(BBmisc::normalize(data_small[,-1],method='standardize'))
#crear data estandarizada
data_small_norm=data_small
data_small_norm[,-1]=BBmisc::normalize(data_small[,-1],method='standardize')
#Correlación negativa, pero se opta no corregir
cor(data_small_norm[,-1],use = "complete.obs")
## agua_red_public_porct KeikoCastillo_ratio
## agua_red_public_porct 1.0000000 0.11958032
## KeikoCastillo_ratio 0.1195803 1.00000000
## fallecido_x1000POS 0.1035734 -0.09694139
## fallecido_x1000POS
## agua_red_public_porct 0.10357342
## KeikoCastillo_ratio -0.09694139
## fallecido_x1000POS 1.00000000
dataClus=data_small_norm[,-1] # sin nombre de fila
row.names(dataClus)=data_small_norm$key # para no perder nombre
#Jerarquico aglomerativo agnes
dataClus=dataClus[complete.cases(dataClus),]
g.dist = cluster::daisy(dataClus, metric="gower")
factoextra::fviz_nbclust(dataClus, factoextra::hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "agnes")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
res.agnes<- factoextra::hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")
dataClus$agnes=res.agnes$cluster
fviz_dend(res.agnes, cex = 0.7, horiz = T,main = "")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fviz_silhouette(res.agnes,print.summary = F)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$key=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'key']%>%sort()
poorAGNES
## [1] "AMAZONAS+LUYA" "ANCASH+OCROS"
## [3] "APURIMAC+CHINCHEROS" "AREQUIPA+CARAVELI"
## [5] "AREQUIPA+LA UNION" "AYACUCHO+SUCRE"
## [7] "CAJAMARCA+CAJABAMBA" "CAJAMARCA+JAEN"
## [9] "HUANCAVELICA+HUAYTARA" "HUANUCO+HUANUCO"
## [11] "HUANUCO+LEONCIO PRADO" "HUANUCO+PACHITEA"
## [13] "ICA+NAZCA" "JUNIN+JAUJA"
## [15] "JUNIN+TARMA" "LA LIBERTAD+OTUZCO"
## [17] "LA LIBERTAD+SANCHEZ CARRION" "LA LIBERTAD+SANTIAGO DE CHUCO"
## [19] "LIMA+CAJATAMBO" "LORETO+LORETO"
## [21] "MADRE DE DIOS+MANU" "PASCO+OXAPAMPA"
## [23] "TACNA+TARATA" "UCAYALI+CORONEL PORTILLO"
#no hay valores mal clusterizados
data_small_norm$agnesIDHpoor=data_small_norm$key%in%poorAGNES
data_small_norm$agnesIDH=as.ordered(dataClus$agnes)
dataClus$agnes=NULL
#jerarquico divisica DIANA
factoextra::fviz_nbclust(dataClus, factoextra::hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "diana")
res.diana <- factoextra::hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
fviz_dend(res.diana, cex = 0.7, horiz = T, main = "")
fviz_silhouette(res.diana,print.summary = F)
silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$key=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'key']%>%sort()
poorDIANA
## [1] "AYACUCHO+LA MAR" "HUANUCO+HUACAYBAMBA" "HUANUCO+LEONCIO PRADO"
## [4] "LA LIBERTAD+ASCOPE" "LAMBAYEQUE+CHICLAYO" "PASCO+PASCO"
## [7] "PIURA+PAITA" "PIURA+SECHURA"
data_small_norm$dianaIDHpoor=data_small_norm$key%in%poorDIANA
data_small_norm$dianaIDH=as.ordered(dataClus$diana)
dataClus$diana=NULL