rm(list = ls())
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rio)
library(magrittr)
library(polycor)
library(psych)
## 
## Adjuntando el paquete: 'psych'
## The following object is masked from 'package:polycor':
## 
##     polyserial
library(matrixcalc)
library(GPArotation)
## 
## Adjuntando el paquete: 'GPArotation'
## The following objects are masked from 'package:psych':
## 
##     equamax, varimin
library(BBmisc)
## 
## Adjuntando el paquete: 'BBmisc'
## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse, symdiff
## The following object is masked from 'package:base':
## 
##     isFALSE
library(ggcorrplot)
## Cargando paquete requerido: ggplot2
## 
## Adjuntando el paquete: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(knitr)
library(modelsummary)
## 
## Adjuntando el paquete: 'modelsummary'
## The following object is masked from 'package:psych':
## 
##     SD
library(kableExtra)
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(car)
## Cargando paquete requerido: carData
## 
## Adjuntando el paquete: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.1
## 
## Adjuntando el paquete: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(readxl)
dataok <- read_excel("C:/Users/JHOJAN/Downloads/dataOK_all (2).xlsx")
dataok <- dataok %>%
  mutate(Razon_votacion = Keiko / Castillo)
dataok <- dataok %>%
  mutate(Porcentaje_agua_red = (agua1_Red / agua10_Total) * 100)
dataok <- dataok %>%
  mutate(Tasa_fallecidos_por_1000 = (countFallecidos / countPositivos) * 1000)
boxplot(dataok[,c(47:49)],horizontal = F,las=2,cex.axis = 0.5)

boxplot(normalize(dataok[,c(47:49)],method='range',range=c(0,10)))

boxplot(normalize(dataok[,c(47:49)],method='standardize'))

dataok[,c(47:49)]=normalize(dataok[,c(47:49)],method='standardize')
cor(dataok[,c(47:49)])
##                          Razon_votacion Porcentaje_agua_red
## Razon_votacion               1.00000000           0.1195803
## Porcentaje_agua_red          0.11958032           1.0000000
## Tasa_fallecidos_por_1000    -0.09694139           0.1035734
##                          Tasa_fallecidos_por_1000
## Razon_votacion                        -0.09694139
## Porcentaje_agua_red                    0.10357342
## Tasa_fallecidos_por_1000               1.00000000
dataClus= dataok[,c(47:49)]
row.names(dataClus)=dataok$provincia
## Warning: Setting row names on a tibble is deprecated.
library(cluster)
g.dist = daisy(dataClus, metric="gower")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(dataClus, pam,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F)

library(kableExtra)
set.seed(123)
res.pam=pam(g.dist,5,cluster.only = F)

#nueva columna
dataClus$pam=res.pam$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()
Razon_votacion Porcentaje_agua_red Tasa_fallecidos_por_1000 pam
-0.3739358 -0.3519838 -1.1415785 1
-0.0198897 0.3924045 -0.5365194 1
0.0612639 1.1011178 -0.7999689 2
-0.9182894 -2.1769143 -1.2593821 3
-0.0200974 0.0625030 -0.4992799 1
0.0979522 0.2096156 1.1638051 1
-0.1882854 -0.3307202 -0.9869613 1
-0.0449805 1.0330394 0.1430353 2
-0.8377221 1.5845474 1.2988419 2
-0.6038640 0.8536475 0.2695732 2
-0.2000002 0.9049263 0.4619025 2
-0.3087639 0.9450800 -0.0165583 2
-0.6504380 0.5876363 1.4548719 2
1.2963666 0.2832239 0.3638289 4
0.0669980 1.0542571 1.0130963 2
fviz_silhouette(res.pam,print.summary = F)

silPAM=data.frame(res.pam$silinfo$widths)
silPAM$country=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'country']%>%sort()
poorPAM
##  [1] "ACOBAMBA"          "AMBO"              "ANDAHUAYLAS"      
##  [4] "AYMARAES"          "CANGALLO"          "CARAVELƍ"         
##  [7] "CASTROVIRREYNA"    "CAYLLOMA"          "CORONEL PORTILLO" 
## [10] "DATEM DEL MARAƑƓN" "HUAMALƍES"         "HUANCAVELICA"     
## [13] "HUANTA"            "JAUJA"             "OCROS"            
## [16] "OXAPAMPA"          "PUERTO INCA"       "SAN MIGUEL"       
## [19] "SANTA CRUZ"        "TARMA"             "UCAYALI"          
## [22] "VILCAS HUAMƁN"     "YAUYOS"
aggregate(.~ pam, data=dataClus,mean)
##   pam Razon_votacion Porcentaje_agua_red Tasa_fallecidos_por_1000
## 1   1    -0.06658091         -0.03680525             -0.234018075
## 2   2    -0.40093916          0.91601596              0.004838563
## 3   3    -0.75415394         -0.93152842              0.166831221
## 4   4     1.35969919          0.77707759              0.228045629
## 5   5     1.92230620         -1.91647793             -0.895573982
dataClus2= dataok[,c(47:49)]
row.names(dataClus2)=dataok$provincia
## Warning: Setting row names on a tibble is deprecated.
res.agnes<- hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")

dataClus$agnes=res.agnes$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()
Razon_votacion Porcentaje_agua_red Tasa_fallecidos_por_1000 pam agnes
-0.3739358 -0.3519838 -1.1415785 1 1
-0.0198897 0.3924045 -0.5365194 1 2
0.0612639 1.1011178 -0.7999689 2 2
-0.9182894 -2.1769143 -1.2593821 3 3
-0.0200974 0.0625030 -0.4992799 1 2
0.0979522 0.2096156 1.1638051 1 2
-0.1882854 -0.3307202 -0.9869613 1 1
-0.0449805 1.0330394 0.1430353 2 2
-0.8377221 1.5845474 1.2988419 2 2
-0.6038640 0.8536475 0.2695732 2 2
-0.2000002 0.9049263 0.4619025 2 2
-0.3087639 0.9450800 -0.0165583 2 2
-0.6504380 0.5876363 1.4548719 2 2
1.2963666 0.2832239 0.3638289 4 4
0.0669980 1.0542571 1.0130963 2 2
fviz_dend(res.agnes, cex = 0.7, horiz = T,main = "")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_silhouette(res.agnes,print.summary = F)

silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$country=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'country']%>%sort()
poorAGNES
##  [1] "CAJABAMBA"         "CAJATAMBO"         "CARAVELƍ"         
##  [4] "CHINCHEROS"        "CORONEL PORTILLO"  "HUƁNUCO"          
##  [7] "HUAYTARƁ"          "JAƉN"              "JAUJA"            
## [10] "LA UNIƓN"          "LEONCIO PRADO"     "LORETO"           
## [13] "LUYA"              "MANU"              "NAZCA"            
## [16] "OCROS"             "OTUZCO"            "OXAPAMPA"         
## [19] "PACHITEA"          "SƁNCHEZ CARRIƓN"   "SANTIAGO DE CHUCO"
## [22] "SUCRE"             "TARATA"            "TARMA"
aggregate(.~ agnes, data=dataClus,mean)
##   agnes Razon_votacion Porcentaje_agua_red Tasa_fallecidos_por_1000      pam
## 1     1    -0.67877141          -0.3433202             -0.140302752 2.355932
## 2     2    -0.09602983           0.6193371              0.182623721 1.855263
## 3     3    -0.43942801          -1.6757364             -0.015876912 3.160000
## 4     4     1.47394814           0.8505111             -0.005452738 4.000000
## 5     5     2.82260736          -2.0871298             -1.008207099 5.000000
fviz_nbclust(dataClus2, hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "agnes")

set.seed(123)
res.diana <- hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()
Razon_votacion Porcentaje_agua_red Tasa_fallecidos_por_1000 pam agnes diana
-0.3739358 -0.3519838 -1.1415785 1 1 1
-0.0198897 0.3924045 -0.5365194 1 2 2
0.0612639 1.1011178 -0.7999689 2 2 2
-0.9182894 -2.1769143 -1.2593821 3 3 1
-0.0200974 0.0625030 -0.4992799 1 2 2
0.0979522 0.2096156 1.1638051 1 2 2
-0.1882854 -0.3307202 -0.9869613 1 1 1
-0.0449805 1.0330394 0.1430353 2 2 2
-0.8377221 1.5845474 1.2988419 2 2 2
-0.6038640 0.8536475 0.2695732 2 2 2
-0.2000002 0.9049263 0.4619025 2 2 2
-0.3087639 0.9450800 -0.0165583 2 2 2
-0.6504380 0.5876363 1.4548719 2 2 2
1.2963666 0.2832239 0.3638289 4 4 2
0.0669980 1.0542571 1.0130963 2 2 2
fviz_dend(res.diana, cex = 0.7, horiz = T, main = "")

fviz_silhouette(res.diana,print.summary = F)

silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$country=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'country']%>%sort()
poorDIANA
## [1] "ASCOPE"        "CHICLAYO"      "HUACAYBAMBA"   "LA MAR"       
## [5] "LEONCIO PRADO" "PAITA"         "PASCO"         "SECHURA"