ruta <- "C:/Users/yakim/Desktop/EXÁMEN FINAL/"

library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
## Warning: package 'readr' was built under R version 4.4.2
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.4.2
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.2
library(rio)
library(tidyr)
library(polycor)
## Warning: package 'polycor' was built under R version 4.4.2
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.2
## Cargando paquete requerido: ggplot2
library(psych)
## Warning: package 'psych' was built under R version 4.4.2
## 
## Adjuntando el paquete: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:polycor':
## 
##     polyserial
## The following objects are masked from 'package:DescTools':
## 
##     AUC, ICC, SD
library(matrixcalc)
library(GPArotation)
## 
## Adjuntando el paquete: 'GPArotation'
## The following objects are masked from 'package:psych':
## 
##     equamax, varimin
library(stringr)
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.1
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

datos1<- read_excel(paste0(ruta,"reporte.xlsx"))

PREGUNTA 2

Utilizando el porcentaje de viviendas que tiene agua de red publica dentro de la vivienda, la razón de votacion de keiko  entre castillo,  y la tasa fallecidos por cada 1000 contagiados,
Ud se propone agrupar a las provincias del Peru (sin la provincia de Lima) siguiendo diversas  estrategias (no corrija correlacion negativa si la hubiera, pero siempre normalice)

datos2<- read_excel(paste0(ruta,"dataOK_all.xlsx"))
## New names:
## • `` -> `...1`
str(datos2)
## tibble [196 × 50] (S3: tbl_df/tbl/data.frame)
##  $ ...1                   : num [1:196] 1 2 3 4 5 6 7 8 9 10 ...
##  $ key                    : chr [1:196] "AMAZONAS+BAGUA" "AMAZONAS+BONGARA" "AMAZONAS+CHACHAPOYAS" "AMAZONAS+CONDORCANQUI" ...
##  $ Código                 : num [1:196] 102 103 101 104 105 106 107 202 203 204 ...
##  $ pared1_Ladrillo        : num [1:196] 4633 1602 3782 291 430 ...
##  $ pared2_Piedra          : num [1:196] 46 9 22 7 7 7 35 1 0 3 ...
##  $ pared3_Adobe           : num [1:196] 6639 2729 5881 672 5217 ...
##  $ pared4_Tapia           : num [1:196] 222 240 2476 8 6052 ...
##  $ pared5_Quincha         : num [1:196] 2518 157 309 386 346 ...
##  $ pared6_Piedra          : num [1:196] 127 36 168 7 54 28 518 65 7 6 ...
##  $ pared7_Madera          : num [1:196] 4484 2505 1270 8145 606 ...
##  $ pared8_Triplay         : num [1:196] 851 30 91 200 45 24 210 18 0 1 ...
##  $ pared9_Otro            : num [1:196] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pared10_Total          : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ techo1_Concreto        : num [1:196] 2187 692 2262 56 187 ...
##  $ techo2_Madera          : num [1:196] 294 75 160 188 43 48 340 57 12 8 ...
##  $ techo3_Tejas           : num [1:196] 179 382 3393 177 3071 ...
##  $ techo4_Planchas        : num [1:196] 13186 6084 8005 2036 9343 ...
##  $ techo5_Caña            : num [1:196] 160 38 50 15 26 15 196 10 8 5 ...
##  $ techo6_Triplay         : num [1:196] 106 5 14 10 12 5 62 17 4 3 ...
##  $ techo7_Paja            : num [1:196] 3408 32 115 7234 75 ...
##  $ techo8_Otro            : num [1:196] 0 0 0 0 0 0 0 0 0 0 ...
##  $ techo9_Total           : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ piso1_Parquet          : num [1:196] 6 5 23 2 4 3 20 0 0 5 ...
##  $ piso2_Láminas          : num [1:196] 19 2 36 0 0 4 32 0 0 1 ...
##  $ piso3_Losetas          : num [1:196] 647 165 1077 20 46 ...
##  $ piso4_Madera           : num [1:196] 157 132 240 1523 295 ...
##  $ piso5_Cemento          : num [1:196] 7121 2917 6189 943 1911 ...
##  $ piso6_Tierra           : num [1:196] 11569 4087 6434 7228 10501 ...
##  $ piso7_Otro             : num [1:196] 1 0 0 0 0 0 0 0 0 0 ...
##  $ piso8_Total            : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ agua1_Red              : num [1:196] 9429 4569 10647 1307 7172 ...
##  $ agua2_Red_fueraVivienda: num [1:196] 4392 1497 1619 867 3097 ...
##  $ agua3_Pilón            : num [1:196] 793 215 184 1003 1112 ...
##  $ agua4_Camión           : num [1:196] 59 0 49 2 0 0 117 0 0 0 ...
##  $ agua5_Pozo             : num [1:196] 1792 474 876 2564 819 ...
##  $ agua6_Manantial        : num [1:196] 270 67 92 431 132 211 471 121 61 27 ...
##  $ agua7_Río              : num [1:196] 2648 388 488 3428 369 ...
##  $ agua8_Otro             : num [1:196] 56 61 24 80 9 29 104 2 1 6 ...
##  $ agua9_Vecino           : num [1:196] 81 37 20 34 47 8 177 9 4 6 ...
##  $ agua10_Total           : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ elec1_Sí               : num [1:196] 13204 6025 12248 1792 10886 ...
##  $ elec2_No               : num [1:196] 6316 1283 1751 7924 1871 ...
##  $ elec3_Total            : num [1:196] 19520 7308 13999 9716 12757 ...
##  $ departamento           : chr [1:196] "AMAZONAS" "AMAZONAS" "AMAZONAS" "AMAZONAS" ...
##  $ provincia              : chr [1:196] "BAGUA" "BONGARA" "CHACHAPOYAS" "CONDORCANQUI" ...
##  $ Castillo               : num [1:196] 25629 8374 15671 13154 12606 ...
##  $ Keiko                  : num [1:196] 10770 5209 10473 1446 7840 ...
##  $ ganaCastillo           : num [1:196] 1 1 1 1 1 1 1 1 1 1 ...
##  $ covidPositivos         : num [1:196] 8126 389 2174 3481 456 ...
##  $ covidFallecidos        : num [1:196] 462 72 281 111 88 60 336 26 31 21 ...
names(datos2)
##  [1] "...1"                    "key"                    
##  [3] "Código"                  "pared1_Ladrillo"        
##  [5] "pared2_Piedra"           "pared3_Adobe"           
##  [7] "pared4_Tapia"            "pared5_Quincha"         
##  [9] "pared6_Piedra"           "pared7_Madera"          
## [11] "pared8_Triplay"          "pared9_Otro"            
## [13] "pared10_Total"           "techo1_Concreto"        
## [15] "techo2_Madera"           "techo3_Tejas"           
## [17] "techo4_Planchas"         "techo5_Caña"            
## [19] "techo6_Triplay"          "techo7_Paja"            
## [21] "techo8_Otro"             "techo9_Total"           
## [23] "piso1_Parquet"           "piso2_Láminas"          
## [25] "piso3_Losetas"           "piso4_Madera"           
## [27] "piso5_Cemento"           "piso6_Tierra"           
## [29] "piso7_Otro"              "piso8_Total"            
## [31] "agua1_Red"               "agua2_Red_fueraVivienda"
## [33] "agua3_Pilón"             "agua4_Camión"           
## [35] "agua5_Pozo"              "agua6_Manantial"        
## [37] "agua7_Río"               "agua8_Otro"             
## [39] "agua9_Vecino"            "agua10_Total"           
## [41] "elec1_Sí"                "elec2_No"               
## [43] "elec3_Total"             "departamento"           
## [45] "provincia"               "Castillo"               
## [47] "Keiko"                   "ganaCastillo"           
## [49] "covidPositivos"          "covidFallecidos"
censo_provincias <- datos2 %>%
  filter(provincia != "Lima") %>%  # Excluir la provincia de Lima
  select(provincia, agua1_Red, Keiko, Castillo, covidPositivos, covidFallecidos)
censo_provincias <- censo_provincias %>%
  mutate( agua_red_publica= rowMeans(select(., starts_with("agua")), na.rm = TRUE))
censo_provincias <- censo_provincias %>%
  mutate(
    razon_votacion = Keiko / Castillo,
    tasa_fallecidos = (covidFallecidos / covidPositivos) * 1000
  )
censo_normalizado <- censo_provincias %>%
  mutate(
    agua_red_publica = scale(agua_red_publica),
    razon_votacion = scale(razon_votacion),
    tasa_fallecidos = scale(tasa_fallecidos)
  )
set.seed(123)
kmeans_resultado <- kmeans(censo_normalizado[, -1], centers = 4)
censo_normalizado$grupo <- kmeans_resultado$cluster
set.seed(123)
wss <- sapply(1:10, function(k) {
  kmeans(censo_normalizado[, -1], centers = k)$tot.withinss
})

# Graficar el resultado del codo
plot(1:10, wss, type = "b", xlab = "Número de clusters", ylab = "Suma de errores cuadrados dentro del cluster")

distancia <- dist(censo_normalizado[, -1])  # Excluimos la columna de provincia

# Aplicar clustering jerárquico
hclust_resultado <- hclust(distancia, method = "ward.D2")

# Cortar el dendrograma en 4 grupos
grupos_hclust <- cutree(hclust_resultado, k = 4)

# Añadir los resultados al dataframe
censo_normalizado$grupo_hclust <- grupos_hclust
ggplot(censo_normalizado, aes(x = agua_red_publica, y = razon_votacion, color = factor(grupo))) +
  geom_point() +
  labs(title = "Agrupamiento de Provincias", x = "Agua de Red Pública", y = "Razón de Votación", color = "Grupo") +
  theme_minimal()

plot(hclust_resultado, main = "Dendrograma del Clustering Jerárquico")