library(pacman)
## Warning: package 'pacman' was built under R version 4.3.3
p_load(psych, car, haven, tidyverse, lsr, kableExtra, Rmisc, dplyr, rio, DescTools, taylor, ggplot2)
data <- import("data_test_est.csv")

#Utilizando el porcentaje de viviendas que tiene agua de red publica dentro de la vivienda, #la razón de votacion de keiko entre castillo, #y la tasa fallecidos por cada 1000 contagiados, #Ud se propone agrupar a las provincias del Peru (sin la provincia de Lima) siguiendo diversas estrategias (no corrija correlacion negativa si la hubiera, pero siempre normalice); y en ese proceso Ud. encuentra…

data$agua_red_public_porct=data$agua1_Red/data$agua10_Total
colnames(data)
##  [1] "X"                       "key"                    
##  [3] "Código"                  "pared1_Ladrillo"        
##  [5] "pared2_Piedra"           "pared3_Adobe"           
##  [7] "pared4_Tapia"            "pared5_Quincha"         
##  [9] "pared6_Piedra"           "pared7_Madera"          
## [11] "pared8_Triplay"          "pared9_Otro"            
## [13] "pared10_Total"           "techo1_Concreto"        
## [15] "techo2_Madera"           "techo3_Tejas"           
## [17] "techo4_Planchas"         "techo5_Caña"            
## [19] "techo6_Triplay"          "techo7_Paja"            
## [21] "techo8_Otro"             "techo9_Total"           
## [23] "piso1_Parquet"           "piso2_Láminas"          
## [25] "piso3_Losetas"           "piso4_Madera"           
## [27] "piso5_Cemento"           "piso6_Tierra"           
## [29] "piso7_Otro"              "piso8_Total"            
## [31] "agua1_Red"               "agua2_Red_fueraVivienda"
## [33] "agua3_Pilón"             "agua4_Camión"           
## [35] "agua5_Pozo"              "agua6_Manantial"        
## [37] "agua7_Río"               "agua8_Otro"             
## [39] "agua9_Vecino"            "agua10_Total"           
## [41] "elec1_Sí"                "elec2_No"               
## [43] "elec3_Total"             "departamento"           
## [45] "provincia"               "Castillo"               
## [47] "Keiko"                   "ganaCastillo"           
## [49] "countPositivos"          "countFallecidos"        
## [51] "agua_red_public_porct"
data$KeikoCastillo_ratio=data$Keiko/data$Castillo
data$fallecido_x1000POS=1000*data$countFallecidos/data$countPositivos
keptForCluster=c('key','agua_red_public_porct','KeikoCastillo_ratio','fallecido_x1000POS')
data_small=data[,keptForCluster]
data_small=data_small[!data_small$key=='LIMA+LIMA',]

#ver_estandarizacion

boxplot(BBmisc::normalize(data_small[,-1],method='standardize'))

#crear data estandarizada

data_small_norm=data_small
data_small_norm[,-1]=BBmisc::normalize(data_small[,-1],method='standardize')

#Correlación negativa, pero se opta no corregir

cor(data_small_norm[,-1],use = "complete.obs")
##                       agua_red_public_porct KeikoCastillo_ratio
## agua_red_public_porct             1.0000000          0.11958032
## KeikoCastillo_ratio               0.1195803          1.00000000
## fallecido_x1000POS                0.1035734         -0.09694139
##                       fallecido_x1000POS
## agua_red_public_porct         0.10357342
## KeikoCastillo_ratio          -0.09694139
## fallecido_x1000POS            1.00000000
dataClus=data_small_norm[,-1] # sin nombre de fila
row.names(dataClus)=data_small_norm$key # para no perder nombre

#Jerarquico aglomerativo agnes

dataClus=dataClus[complete.cases(dataClus),]
g.dist = cluster::daisy(dataClus, metric="gower")
factoextra::fviz_nbclust(dataClus, factoextra::hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "agnes")

library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
res.agnes<- factoextra::hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")
dataClus$agnes=res.agnes$cluster
fviz_dend(res.agnes, cex = 0.7, horiz = T,main = "")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_silhouette(res.agnes,print.summary = F)

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$key=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'key']%>%sort()
poorAGNES
##  [1] "AMAZONAS+LUYA"                 "ANCASH+OCROS"                 
##  [3] "APURIMAC+CHINCHEROS"           "AREQUIPA+CARAVELI"            
##  [5] "AREQUIPA+LA UNION"             "AYACUCHO+SUCRE"               
##  [7] "CAJAMARCA+CAJABAMBA"           "CAJAMARCA+JAEN"               
##  [9] "HUANCAVELICA+HUAYTARA"         "HUANUCO+HUANUCO"              
## [11] "HUANUCO+LEONCIO PRADO"         "HUANUCO+PACHITEA"             
## [13] "ICA+NAZCA"                     "JUNIN+JAUJA"                  
## [15] "JUNIN+TARMA"                   "LA LIBERTAD+OTUZCO"           
## [17] "LA LIBERTAD+SANCHEZ CARRION"   "LA LIBERTAD+SANTIAGO DE CHUCO"
## [19] "LIMA+CAJATAMBO"                "LORETO+LORETO"                
## [21] "MADRE DE DIOS+MANU"            "PASCO+OXAPAMPA"               
## [23] "TACNA+TARATA"                  "UCAYALI+CORONEL PORTILLO"

#no hay valores mal clusterizados

data_small_norm$agnesIDHpoor=data_small_norm$key%in%poorAGNES
data_small_norm$agnesIDH=as.ordered(dataClus$agnes)
dataClus$agnes=NULL

#jerarquico divisica DIANA

factoextra::fviz_nbclust(dataClus, factoextra::hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "diana")

res.diana <- factoextra::hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
fviz_dend(res.diana, cex = 0.7, horiz = T, main = "")

fviz_silhouette(res.diana,print.summary = F)

silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$key=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'key']%>%sort()
poorDIANA
## [1] "AYACUCHO+LA MAR"       "HUANUCO+HUACAYBAMBA"   "HUANUCO+LEONCIO PRADO"
## [4] "LA LIBERTAD+ASCOPE"    "LAMBAYEQUE+CHICLAYO"   "PASCO+PASCO"          
## [7] "PIURA+PAITA"           "PIURA+SECHURA"
data_small_norm$dianaIDHpoor=data_small_norm$key%in%poorDIANA
data_small_norm$dianaIDH=as.ordered(dataClus$diana)
dataClus$diana=NULL