library(rio)
## Warning: package 'rio' was built under R version 4.3.3
data2 = import("dataOK_all.xlsx")
## New names:
## • `` -> `...1`
str(data2)
## 'data.frame': 196 obs. of 50 variables:
## $ ...1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ key : chr "AMAZONAS+BAGUA" "AMAZONAS+BONGARA" "AMAZONAS+CHACHAPOYAS" "AMAZONAS+CONDORCANQUI" ...
## $ Código : num 102 103 101 104 105 106 107 202 203 204 ...
## $ pared1_Ladrillo : num 4633 1602 3782 291 430 ...
## $ pared2_Piedra : num 46 9 22 7 7 7 35 1 0 3 ...
## $ pared3_Adobe : num 6639 2729 5881 672 5217 ...
## $ pared4_Tapia : num 222 240 2476 8 6052 ...
## $ pared5_Quincha : num 2518 157 309 386 346 ...
## $ pared6_Piedra : num 127 36 168 7 54 28 518 65 7 6 ...
## $ pared7_Madera : num 4484 2505 1270 8145 606 ...
## $ pared8_Triplay : num 851 30 91 200 45 24 210 18 0 1 ...
## $ pared9_Otro : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pared10_Total : num 19520 7308 13999 9716 12757 ...
## $ techo1_Concreto : num 2187 692 2262 56 187 ...
## $ techo2_Madera : num 294 75 160 188 43 48 340 57 12 8 ...
## $ techo3_Tejas : num 179 382 3393 177 3071 ...
## $ techo4_Planchas : num 13186 6084 8005 2036 9343 ...
## $ techo5_Caña : num 160 38 50 15 26 15 196 10 8 5 ...
## $ techo6_Triplay : num 106 5 14 10 12 5 62 17 4 3 ...
## $ techo7_Paja : num 3408 32 115 7234 75 ...
## $ techo8_Otro : num 0 0 0 0 0 0 0 0 0 0 ...
## $ techo9_Total : num 19520 7308 13999 9716 12757 ...
## $ piso1_Parquet : num 6 5 23 2 4 3 20 0 0 5 ...
## $ piso2_Láminas : num 19 2 36 0 0 4 32 0 0 1 ...
## $ piso3_Losetas : num 647 165 1077 20 46 ...
## $ piso4_Madera : num 157 132 240 1523 295 ...
## $ piso5_Cemento : num 7121 2917 6189 943 1911 ...
## $ piso6_Tierra : num 11569 4087 6434 7228 10501 ...
## $ piso7_Otro : num 1 0 0 0 0 0 0 0 0 0 ...
## $ piso8_Total : num 19520 7308 13999 9716 12757 ...
## $ agua1_Red : num 9429 4569 10647 1307 7172 ...
## $ agua2_Red_fueraVivienda: num 4392 1497 1619 867 3097 ...
## $ agua3_Pilón : num 793 215 184 1003 1112 ...
## $ agua4_Camión : num 59 0 49 2 0 0 117 0 0 0 ...
## $ agua5_Pozo : num 1792 474 876 2564 819 ...
## $ agua6_Manantial : num 270 67 92 431 132 211 471 121 61 27 ...
## $ agua7_Río : num 2648 388 488 3428 369 ...
## $ agua8_Otro : num 56 61 24 80 9 29 104 2 1 6 ...
## $ agua9_Vecino : num 81 37 20 34 47 8 177 9 4 6 ...
## $ agua10_Total : num 19520 7308 13999 9716 12757 ...
## $ elec1_Sí : num 13204 6025 12248 1792 10886 ...
## $ elec2_No : num 6316 1283 1751 7924 1871 ...
## $ elec3_Total : num 19520 7308 13999 9716 12757 ...
## $ departamento : chr "AMAZONAS" "AMAZONAS" "AMAZONAS" "AMAZONAS" ...
## $ provincia : chr "BAGUA" "BONGARA" "CHACHAPOYAS" "CONDORCANQUI" ...
## $ Castillo : num 25629 8374 15671 13154 12606 ...
## $ Keiko : num 10770 5209 10473 1446 7840 ...
## $ ganaCastillo : num 1 1 1 1 1 1 1 1 1 1 ...
## $ countPositivos : num 8126 389 2174 3481 456 ...
## $ countFallecidos : num 462 72 281 111 88 60 336 26 31 21 ...
#pregunta
Utilizando el porcentaje de viviendas que tiene agua de red publica dentro de la vivienda, la razón de votacion de keiko entre castillo, y la tasa fallecidos por cada 1000 contagiados, Ud se propone agrupar a las provincias del Peru (sin la provincia de Lima) siguiendo diversas estrategias (no corrija correlacion negativa si la hubiera, pero siempre normalice); y en ese proceso Ud. encuentra…
# Cargar el paquete dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Crear la nueva variable
data2 <- data2 %>%
mutate(porcentaje_aguaRed = (agua1_Red / agua10_Total) * 100)
data2$razon_votacion <- data2$Castillo / data2$Keiko
data2$tasa_fallecidos_por_1000 <- (data2$countFallecidos / data2$countPositivos) * 1000
excluir a lima
# Excluir la provincia de Lima
dataC <- data2[data2$provincia != "LIMA", ]
#conglomerado
#creamos dataClus, idehdemo es nuestra data original y las columnas que QUEREMOS clusterizar, EN country ponemos la columna de los nombres de nuestra data original.
dataClus=dataC[,c("porcentaje_aguaRed", "razon_votacion", "tasa_fallecidos_por_1000")]
row.names(dataClus)=dataC$provincia
library(cluster)
g.dist = daisy(dataClus, metric="gower")
#pam
## para PAM
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(dataClus, pam,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F)
#COLOCAMOS el numero que nos salio antes DONDE esta el 3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.3.3
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
set.seed(123)
res.pam=pam(g.dist,5,cluster.only = F)
#nueva columna
dataClus$pam=res.pam$cluster
# ver
head(dataClus,15)%>%kbl()%>%kable_styling()
| porcentaje_aguaRed | razon_votacion | tasa_fallecidos_por_1000 | pam | |
|---|---|---|---|---|
| BAGUA | 48.30430 | 2.3796657 | 56.85454 | 1 |
| BONGARA | 62.52053 | 1.6076022 | 185.08997 | 1 |
| CHACHAPOYAS | 76.05543 | 1.4963239 | 129.25483 | 2 |
| CONDORCANQUI | 13.45204 | 9.0968188 | 31.88739 | 3 |
| LUYA | 56.22011 | 1.6079082 | 192.98246 | 1 |
| RODRÍGUEZ DE MENDOZA | 59.02965 | 1.4509197 | 545.45455 | 1 |
| UTCUBAMBA | 48.71039 | 1.9009468 | 89.62390 | 1 |
| AIJA | 74.75528 | 1.6454352 | 329.11392 | 2 |
| ANTONIO RAYMONDI | 85.28790 | 6.4162437 | 574.07407 | 2 |
| ASUNCIÓN | 71.32928 | 3.4582830 | 355.93220 | 2 |
| BOLOGNESI | 72.30859 | 1.9253881 | 396.69421 | 2 |
| CARHUAZ | 73.07544 | 2.1863795 | 295.28986 | 2 |
| CARLOS FERMÍN FITZCARRALD | 66.24904 | 3.8078963 | 607.14286 | 2 |
| CASMA | 60.43541 | 0.7286762 | 375.90862 | 1 |
| CORONGO | 75.16049 | 1.4890411 | 513.51351 | 2 |
fviz_silhouette(res.pam,print.summary = F)
silPAM=data.frame(res.pam$silinfo$widths)
silPAM$country=row.names(silPAM)
poorPAM=silPAM[silPAM$sil_width<0,'country']%>%sort()
poorPAM
## [1] "ACOMAYO" "AMBO" "ANDAHUAYLAS" "ANTABAMBA"
## [5] "AYMARAES" "BELLAVISTA" "CAJATAMBO" "CASTROVIRREYNA"
## [9] "CONDORCANQUI" "CUTERVO" "HUACAYBAMBA" "HUALGAYOC"
## [13] "LAURICOCHA" "SAN MIGUEL" "SANTA CRUZ" "SIHUAS"
## [17] "SUCRE" "SULLANA" "TARATA" "YAUYOS"
#agnes
## PARA JERARQUICO
fviz_nbclust(dataClus, hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "agnes")
# en la K ponemos el numero que nos salio antes
set.seed(123)
library(factoextra)
res.agnes<- hcut(g.dist, k = 5,hc_func='agnes',hc_method = "ward.D")
dataClus$agnes=res.agnes$cluster
# ver
head(dataClus,15)%>%kbl()%>%kable_styling()
| porcentaje_aguaRed | razon_votacion | tasa_fallecidos_por_1000 | pam | agnes | |
|---|---|---|---|---|---|
| BAGUA | 48.30430 | 2.3796657 | 56.85454 | 1 | 1 |
| BONGARA | 62.52053 | 1.6076022 | 185.08997 | 1 | 1 |
| CHACHAPOYAS | 76.05543 | 1.4963239 | 129.25483 | 2 | 2 |
| CONDORCANQUI | 13.45204 | 9.0968188 | 31.88739 | 3 | 3 |
| LUYA | 56.22011 | 1.6079082 | 192.98246 | 1 | 1 |
| RODRÍGUEZ DE MENDOZA | 59.02965 | 1.4509197 | 545.45455 | 1 | 1 |
| UTCUBAMBA | 48.71039 | 1.9009468 | 89.62390 | 1 | 1 |
| AIJA | 74.75528 | 1.6454352 | 329.11392 | 2 | 2 |
| ANTONIO RAYMONDI | 85.28790 | 6.4162437 | 574.07407 | 2 | 4 |
| ASUNCIÓN | 71.32928 | 3.4582830 | 355.93220 | 2 | 2 |
| BOLOGNESI | 72.30859 | 1.9253881 | 396.69421 | 2 | 2 |
| CARHUAZ | 73.07544 | 2.1863795 | 295.28986 | 2 | 2 |
| CARLOS FERMÍN FITZCARRALD | 66.24904 | 3.8078963 | 607.14286 | 2 | 4 |
| CASMA | 60.43541 | 0.7286762 | 375.90862 | 1 | 1 |
| CORONGO | 75.16049 | 1.4890411 | 513.51351 | 2 | 2 |
fviz_silhouette(res.agnes,print.summary = F)
silAGNES=data.frame(res.agnes$silinfo$widths)
silAGNES$country=row.names(silAGNES)
poorAGNES=silAGNES[silAGNES$sil_width<0,'country']%>%sort()
poorAGNES
## [1] "ANTONIO RAYMONDI" "AYMARAES"
## [3] "BONGARA" "CANCHIS"
## [5] "CARLOS FERMÍN FITZCARRALD" "CHINCHEROS"
## [7] "CONTRALMIRANTE VILLAR" "FERREÑAFE"
## [9] "HUANCA SANCOS" "HUARI"
## [11] "HUARMEY" "LAMPA"
## [13] "MARISCAL LUZURIAGA" "MOHO"
## [15] "OXAPAMPA" "PALLASCA"
## [17] "SECHURA" "TARATA"
## [19] "URUBAMBA" "YAUYOS"
#diana
## PARA JERARQUICO
fviz_nbclust(dataClus, hcut,diss=g.dist,method = "gap_stat",k.max = 10,verbose = F,hc_func = "diana")
# Poner el numero anterior en K
set.seed(123)
res.diana <- hcut(g.dist, k = 5,hc_func='diana')
dataClus$diana=res.diana$cluster
# veamos
head(dataClus,15)%>%kbl%>%kable_styling()
| porcentaje_aguaRed | razon_votacion | tasa_fallecidos_por_1000 | pam | agnes | diana | |
|---|---|---|---|---|---|---|
| BAGUA | 48.30430 | 2.3796657 | 56.85454 | 1 | 1 | 1 |
| BONGARA | 62.52053 | 1.6076022 | 185.08997 | 1 | 1 | 1 |
| CHACHAPOYAS | 76.05543 | 1.4963239 | 129.25483 | 2 | 2 | 1 |
| CONDORCANQUI | 13.45204 | 9.0968188 | 31.88739 | 3 | 3 | 2 |
| LUYA | 56.22011 | 1.6079082 | 192.98246 | 1 | 1 | 1 |
| RODRÍGUEZ DE MENDOZA | 59.02965 | 1.4509197 | 545.45455 | 1 | 1 | 1 |
| UTCUBAMBA | 48.71039 | 1.9009468 | 89.62390 | 1 | 1 | 1 |
| AIJA | 74.75528 | 1.6454352 | 329.11392 | 2 | 2 | 1 |
| ANTONIO RAYMONDI | 85.28790 | 6.4162437 | 574.07407 | 2 | 4 | 1 |
| ASUNCIÓN | 71.32928 | 3.4582830 | 355.93220 | 2 | 2 | 1 |
| BOLOGNESI | 72.30859 | 1.9253881 | 396.69421 | 2 | 2 | 1 |
| CARHUAZ | 73.07544 | 2.1863795 | 295.28986 | 2 | 2 | 1 |
| CARLOS FERMÍN FITZCARRALD | 66.24904 | 3.8078963 | 607.14286 | 2 | 4 | 1 |
| CASMA | 60.43541 | 0.7286762 | 375.90862 | 1 | 1 | 1 |
| CORONGO | 75.16049 | 1.4890411 | 513.51351 | 2 | 2 | 1 |
fviz_silhouette(res.diana,print.summary = F)
silDIANA=data.frame(res.diana$silinfo$widths)
silDIANA$country=row.names(silDIANA)
poorDIANA=silDIANA[silDIANA$sil_width<0,'country']%>%sort()
poorDIANA
## [1] "ACOMAYO" "BELLAVISTA" "CUTERVO" "GRAN CHIMÚ" "OXAPAMPA"
## [6] "SUCRE" "YAROWILCA"