pregunta 2

rm(list = ls())
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rio)
library(magrittr)
library(polycor)
library(psych)

## 
## Adjuntando el paquete: 'psych'

## The following object is masked from 'package:polycor':
## 
##     polyserial

library(matrixcalc)
library(GPArotation)

## 
## Adjuntando el paquete: 'GPArotation'

## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

library(BBmisc)

## 
## Adjuntando el paquete: 'BBmisc'

## The following objects are masked from 'package:dplyr':
## 
##     coalesce, collapse, symdiff

## The following object is masked from 'package:base':
## 
##     isFALSE

library(ggcorrplot)

## Cargando paquete requerido: ggplot2

## 
## Adjuntando el paquete: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(knitr)
library(modelsummary)

## 
## Adjuntando el paquete: 'modelsummary'

## The following object is masked from 'package:psych':
## 
##     SD

library(kableExtra)

## 
## Adjuntando el paquete: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(car)

## Cargando paquete requerido: carData

## 
## Adjuntando el paquete: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

library(MASS)

## Warning: package 'MASS' was built under R version 4.4.1

## 
## Adjuntando el paquete: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(readxl)
dataok <- read_excel("C:/Users/JHOJAN/Downloads/dataOK_all (2).xlsx")

dataok <- dataok %>%
  mutate(Razon_votacion = Keiko / Castillo)

dataok <- dataok %>%
  mutate(Porcentaje_agua_red = (agua1_Red / agua10_Total) * 100)

dataok <- dataok %>%
  mutate(Tasa_fallecidos_por_1000 = (countFallecidos / countPositivos) * 1000)

boxplot(dataok[,c(47:49)],horizontal = F,las=2,cex.axis = 0.5)

boxplot(normalize(dataok[,c(47:49)],method='range',range=c(0,10)))

boxplot(normalize(dataok[,c(47:49)],method='standardize'))

dataok[,c(47:49)]=normalize(dataok[,c(47:49)],method='standardize')

cor(dataok[,c(47:49)])

##                          Razon_votacion Porcentaje_agua_red
## Razon_votacion               1.00000000           0.1195803
## Porcentaje_agua_red          0.11958032           1.0000000
## Tasa_fallecidos_por_1000    -0.09694139           0.1035734
##                          Tasa_fallecidos_por_1000
## Razon_votacion                        -0.09694139
## Porcentaje_agua_red                    0.10357342
## Tasa_fallecidos_por_1000               1.00000000

dataClus= dataok[,c(47:49)]
row.names(dataClus)=dataok$provincia

## Warning: Setting row names on a tibble is deprecated.

library(cluster)
g.dist = daisy(dataClus, metric="gower")

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

fviz_nbclust(dataClus, pam,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F)

library(kableExtra)
set.seed(123)
res.pam=pam(g.dist,5,cluster.only = F)

#nueva columna
dataClus$pam=res.pam$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()

Razon_votacion	Porcentaje_agua_red	Tasa_fallecidos_por_1000	pam
-0.3739358	-0.3519838	-1.1415785	1
-0.0198897	0.3924045	-0.5365194	1
0.0612639	1.1011178	-0.7999689	2
-0.9182894	-2.1769143	-1.2593821	3
-0.0200974	0.0625030	-0.4992799	1
0.0979522	0.2096156	1.1638051	1
-0.1882854	-0.3307202	-0.9869613	1
-0.0449805	1.0330394	0.1430353	2
-0.8377221	1.5845474	1.2988419	2
-0.6038640	0.8536475	0.2695732	2
-0.2000002	0.9049263	0.4619025	2
-0.3087639	0.9450800	-0.0165583	2
-0.6504380	0.5876363	1.4548719	2
1.2963666	0.2832239	0.3638289	4
0.0669980	1.0542571	1.0130963	2

fviz_silhouette(res.pam,print.summary = F)

silPAM=data.frame(res.pam$silinfo$widths)
silPAM$country=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'country']%>%sort()
poorPAM

##  [1] "ACOBAMBA"          "AMBO"              "ANDAHUAYLAS"      
##  [4] "AYMARAES"          "CANGALLO"          "CARAVELÍ"         
##  [7] "CASTROVIRREYNA"    "CAYLLOMA"          "CORONEL PORTILLO" 
## [10] "DATEM DEL MARAÑÓN" "HUAMALÍES"         "HUANCAVELICA"     
## [13] "HUANTA"            "JAUJA"             "OCROS"            
## [16] "OXAPAMPA"          "PUERTO INCA"       "SAN MIGUEL"       
## [19] "SANTA CRUZ"        "TARMA"             "UCAYALI"          
## [22] "VILCAS HUAMÁN"     "YAUYOS"

aggregate(.~ pam, data=dataClus,mean)

##   pam Razon_votacion Porcentaje_agua_red Tasa_fallecidos_por_1000
## 1   1    -0.06658091         -0.03680525             -0.234018075
## 2   2    -0.40093916          0.91601596              0.004838563
## 3   3    -0.75415394         -0.93152842              0.166831221
## 4   4     1.35969919          0.77707759              0.228045629
## 5   5     1.92230620         -1.91647793             -0.895573982

dataClus2= dataok[,c(47:49)]
row.names(dataClus2)=dataok$provincia

## Warning: Setting row names on a tibble is deprecated.

res.agnes<- hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")

dataClus$agnes=res.agnes$cluster

# ver

head(dataClus,15)%>%kbl()%>%kable_styling()

Razon_votacion	Porcentaje_agua_red	Tasa_fallecidos_por_1000	pam	agnes
-0.3739358	-0.3519838	-1.1415785	1	1
-0.0198897	0.3924045	-0.5365194	1	2
0.0612639	1.1011178	-0.7999689	2	2
-0.9182894	-2.1769143	-1.2593821	3	3
-0.0200974	0.0625030	-0.4992799	1	2
0.0979522	0.2096156	1.1638051	1	2
-0.1882854	-0.3307202	-0.9869613	1	1
-0.0449805	1.0330394	0.1430353	2	2
-0.8377221	1.5845474	1.2988419	2	2
-0.6038640	0.8536475	0.2695732	2	2
-0.2000002	0.9049263	0.4619025	2	2
-0.3087639	0.9450800	-0.0165583	2	2
-0.6504380	0.5876363	1.4548719	2	2
1.2963666	0.2832239	0.3638289	4	4
0.0669980	1.0542571	1.0130963	2	2

fviz_dend(res.agnes, cex = 0.7, horiz = T,main = "")

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_silhouette(res.agnes,print.summary = F)

silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$country=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'country']%>%sort()
poorAGNES

##  [1] "CAJABAMBA"         "CAJATAMBO"         "CARAVELÍ"         
##  [4] "CHINCHEROS"        "CORONEL PORTILLO"  "HUÁNUCO"          
##  [7] "HUAYTARÁ"          "JAÉN"              "JAUJA"            
## [10] "LA UNIÓN"          "LEONCIO PRADO"     "LORETO"           
## [13] "LUYA"              "MANU"              "NAZCA"            
## [16] "OCROS"             "OTUZCO"            "OXAPAMPA"         
## [19] "PACHITEA"          "SÁNCHEZ CARRIÓN"   "SANTIAGO DE CHUCO"
## [22] "SUCRE"             "TARATA"            "TARMA"

aggregate(.~ agnes, data=dataClus,mean)

##   agnes Razon_votacion Porcentaje_agua_red Tasa_fallecidos_por_1000      pam
## 1     1    -0.67877141          -0.3433202             -0.140302752 2.355932
## 2     2    -0.09602983           0.6193371              0.182623721 1.855263
## 3     3    -0.43942801          -1.6757364             -0.015876912 3.160000
## 4     4     1.47394814           0.8505111             -0.005452738 4.000000
## 5     5     2.82260736          -2.0871298             -1.008207099 5.000000

fviz_nbclust(dataClus2, hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "agnes")

set.seed(123)
res.diana <- hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()

Razon_votacion	Porcentaje_agua_red	Tasa_fallecidos_por_1000	pam	agnes	diana
-0.3739358	-0.3519838	-1.1415785	1	1	1
-0.0198897	0.3924045	-0.5365194	1	2	2
0.0612639	1.1011178	-0.7999689	2	2	2
-0.9182894	-2.1769143	-1.2593821	3	3	1
-0.0200974	0.0625030	-0.4992799	1	2	2
0.0979522	0.2096156	1.1638051	1	2	2
-0.1882854	-0.3307202	-0.9869613	1	1	1
-0.0449805	1.0330394	0.1430353	2	2	2
-0.8377221	1.5845474	1.2988419	2	2	2
-0.6038640	0.8536475	0.2695732	2	2	2
-0.2000002	0.9049263	0.4619025	2	2	2
-0.3087639	0.9450800	-0.0165583	2	2	2
-0.6504380	0.5876363	1.4548719	2	2	2
1.2963666	0.2832239	0.3638289	4	4	2
0.0669980	1.0542571	1.0130963	2	2	2

fviz_dend(res.diana, cex = 0.7, horiz = T, main = "")

fviz_silhouette(res.diana,print.summary = F)

silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$country=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'country']%>%sort()
poorDIANA

## [1] "ASCOPE"        "CHICLAYO"      "HUACAYBAMBA"   "LA MAR"       
## [5] "LEONCIO PRADO" "PAITA"         "PASCO"         "SECHURA"

pregunta 2

Jhojan Danny Carreño Flores

2024-06-26