setwd("~/FINALESTADISTICA")
library(rio)
## Warning: package 'rio' was built under R version 4.4.2
data2<-import("dataOK_all.xlsx")
## New names:
## • `` -> `...1`
# Cargar las librerías necesarias
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Cargando paquete requerido: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Filtrar las provincias, excluyendo la provincia de Lima
data2 <- dplyr::filter(data2, provincia != "LIMA")
# Calcular las variables requeridas
# 1. Porcentaje de viviendas con agua de red publica dentro de la vivienda
data2 <- dplyr::mutate(data2, pct_agua_red_publica = (agua1_Red / agua10_Total) * 100)
# 2. Razón de votación de Keiko entre Castillo
data2 <- dplyr::mutate(data2, razon_keiko_castillo = Keiko / Castillo)
# 3. Tasa de fallecidos por cada 1000 contagiados
data2 <- dplyr::mutate(data2, tasa_fallecidos_por_1000 = (covidFallecidos / covidPositivos) * 1000)
# Seleccionar las variables de interés y normalizarlas
variables <- data2 %>%
dplyr::select(pct_agua_red_publica, razon_keiko_castillo, tasa_fallecidos_por_1000) %>%
dplyr::mutate(across(everything(), scale))
# Realizar la agrupación jerárquica divisiva
divisiva_cluster <- diana(variables)
# Visualizar el dendrograma
fviz_dend(divisiva_cluster, rect = TRUE, show_labels = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Realizar la agrupación jerárquica aglomerativa
aglomerativa_cluster <- hclust(dist(variables), method = "ward.D2")
# Visualizar el dendrograma
fviz_dend(aglomerativa_cluster, rect = TRUE, show_labels = FALSE)

# Realizar la técnica de partición (k-means)
set.seed(123)
kmeans_cluster <- kmeans(variables, centers = 3, nstart = 25)
# Visualizar los clusters formados
fviz_cluster(kmeans_cluster, data = variables)

# Comparar las técnicas
silueta_divisiva <- silhouette(cutree(divisiva_cluster, k = 3), dist(variables))
summary(silueta_divisiva)
## Silhouette of 195 units in 3 clusters from silhouette.default(x = cutree(divisiva_cluster, k = 3), dist = dist(variables)) :
## Cluster sizes and average silhouette widths:
## 169 13 13
## 0.4516393 0.2401505 0.3081906
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.06912 0.32074 0.48538 0.42798 0.55728 0.63334
silueta_aglomerativa <- silhouette(cutree(aglomerativa_cluster, k = 3), dist(variables))
summary(silueta_aglomerativa)
## Silhouette of 195 units in 3 clusters from silhouette.default(x = cutree(aglomerativa_cluster, k = 3), dist = dist(variables)) :
## Cluster sizes and average silhouette widths:
## 90 98 7
## 0.2290844 0.3243515 0.3452700
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.2447 0.1842 0.3248 0.2811 0.4090 0.5572
silueta_kmeans <- silhouette(kmeans_cluster$cluster, dist(variables))
summary(silueta_kmeans)
## Silhouette of 195 units in 3 clusters from silhouette.default(x = kmeans_cluster$cluster, dist = dist(variables)) :
## Cluster sizes and average silhouette widths:
## 109 12 74
## 0.3219756 0.2553264 0.3005849
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.04528 0.19724 0.33864 0.30976 0.43364 0.51137