library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(cluster)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(stringr)

# Published Google Sheets export; the `output=csv` query parameter requests CSV.
url <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vTwrwOXL5DeiJYEZ1X6HVcZm9D9ZLiEaMfJ3DM7WgcZhiyYHLwN5daebw_38pnEsw/pub?output=csv"

# State the delimiter explicitly instead of letting readr guess it from the
# first lines of the download; "," is the delimiter readr reports for this
# file.
data <- read_delim(url, delim = ",")
## New names:
## • `` -> `...1`
## Rows: 196 Columns: 50
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): key, departamento, provincia
## dbl (47): ...1, Código, pared1_Ladrillo, pared2_Piedra, pared3_Adobe, pared4...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rebuild `departamento` and `provincia` from `key`, whose parts are separated
# by "+". The split is computed once and reused for both columns.
key_parts <- str_split(string = data$key, pattern = "\\+", simplify = TRUE)
data$departamento <- key_parts[, 1]
data$provincia <- key_parts[, 2]

# Reorder the columns: column 1 first, then departamento and provincia, then
# every other column in its existing order.
departamento_col <- which(names(data) == "departamento")
provincia_col <- which(names(data) == "provincia")
remaining_cols <- setdiff(2:ncol(data), c(departamento_col, provincia_col))
data <- data[, c(1, departamento_col, provincia_col, remaining_cols)]

# Drop two columns by position: the leading column and the first of the
# "remaining" columns, which sits in position 4 after the reorder.
# NOTE(review): this is a positional drop; it silently removes a different
# column if the sheet layout changes -- confirm which columns are intended.
data <- data[, -c(1, 4)]

# Keep only rows whose provincia is not "Lima". subset() also discards rows
# where the comparison is NA.
data <- subset(data, provincia != "Lima")
# Add the three indicators that feed the clustering:
#   AguaRed        - copy of agua1_Red
#   RazonVoto      - Keiko divided by Castillo
#   TasaFallecidos - covidFallecidos divided by covidPositivos, times 1000
data <- mutate(
  data,
  AguaRed = agua1_Red,
  RazonVoto = Keiko / Castillo,
  TasaFallecidos = (covidFallecidos / covidPositivos) * 1000
)

# Centre and scale the three indicators; scale() returns a numeric matrix.
data_cluster <- scale(select(data, AguaRed, RazonVoto, TasaFallecidos))
# Compare three clustering methods on the scaled indicators by their mean
# silhouette width, all with the same number of clusters.
library(cluster)
set.seed(123)

# Single source of truth for the cluster count used by every method below.
n_clusters <- 2

# Pairwise distances between observations, computed once and shared by the
# silhouette calls of both hierarchical methods.
cluster_dist <- dist(data_cluster)

# Partitioning around medoids; silhouette() reads the widths from the fit.
pam_result <- pam(data_cluster, k = n_clusters)
pam_silhouette <- silhouette(pam_result)
mean_pam_sil <- mean(pam_silhouette[, "sil_width"])

# Agglomerative hierarchical clustering (Ward), cut into n_clusters groups.
agnes_result <- agnes(data_cluster, method = "ward")
agnes_clusters <- cutree(as.hclust(agnes_result), k = n_clusters)
agnes_silhouette <- silhouette(agnes_clusters, cluster_dist)
mean_agnes_sil <- mean(agnes_silhouette[, "sil_width"])

# Divisive hierarchical clustering, cut into n_clusters groups.
diana_result <- diana(data_cluster)
diana_clusters <- cutree(as.hclust(diana_result), k = n_clusters)
diana_silhouette <- silhouette(diana_clusters, cluster_dist)
mean_diana_sil <- mean(diana_silhouette[, "sil_width"])
# Report the mean silhouette width obtained with each method. The `## [1]`
# lines appear to be the console output recorded when the script was rendered.
# Mean silhouette width for PAM.
print(mean_pam_sil)
## [1] 0.4148512
# Mean silhouette width for AGNES (Ward).
print(mean_agnes_sil)
## [1] 0.8763441
# Mean silhouette width for DIANA.
print(mean_diana_sil)
## [1] 0.8763441