Se propone trabajar con datos continuos normalizados. La selección de variables se sustenta en los principios teóricos de crecimiento de las ciudades, y a la disponbilidad de información.
Para el piloto se han seleccionado: - Volumen poblacional - Tamaño económico - Densidad estudiantil - Cantidad de establecimientos de salud
df <- read_excel("/Users/dalineportocarrero/Documents/Daline/4.PROYECTOS/PCM_PNUD/R_CUF/data_puno.xlsx")
# Vista rápida
glimpse(df)## Rows: 1,964
## Columns: 22
## $ fid <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ DN <dbl> 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196, 196…
## $ POBcount <dbl> 1, 1, 1, 3, 19, 2, 1, 1, 1, 2, 2, 1, 4, 1, 2, 1, 7, 5, 2, …
## $ POBsum <dbl> 2.051268, 2.502553, 2.006236, 11.129877, 90.794351, 5.1720…
## $ POBmean <dbl> 2.051268, 2.502553, 2.006236, 3.709959, 4.778650, 2.586023…
## $ AREA <dbl> 0.0082, 0.0082, 0.0082, 0.0246, 0.1559, 0.0164, 0.0082, 0.…
## $ PERIMETRO <dbl> 0.3622, 0.3622, 0.3622, 0.7245, 2.1741, 0.5405, 0.3625, 0.…
## $ DENSIDAD_p <dbl> 250.2910, 305.3453, 244.7650, 452.4987, 582.5068, 315.0284…
## $ DIM_FRAC <dbl> 0.9999, 0.9999, 0.9999, 0.9223, 0.6560, 0.9741, 0.9999, 0.…
## $ PBI_cp_sum <dbl> NA, NA, 10.06867, NA, NA, NA, 64.99388, NA, NA, 194.98163,…
## $ PBI_cp_mea <dbl> NA, NA, 10.06867, NA, NA, NA, 64.99388, NA, NA, 194.98163,…
## $ DENSIDAD_e <dbl> 1.017547e+00, 4.638769e-01, 0.000000e+00, 0.000000e+00, 2.…
## $ SALUD_sum <dbl> NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ norm_pop <dbl> 0.0017, 0.0126, 0.0006, 0.0416, 0.0673, 0.0145, 0.0188, 0.…
## $ norm_PBI <dbl> NA, NA, 0.0000, NA, NA, NA, 0.0002, NA, NA, 0.0005, 0.0002…
## $ norm_educ <dbl> 0.1503, 0.0685, 0.0000, 0.0000, 0.3875, 0.0000, 0.0000, 0.…
## $ norm_salud <dbl> NA, NA, NA, NA, 0.0286, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ INDICE <dbl> 0.0306, 0.0175, 0.0002, 0.0125, 0.1048, 0.0044, 0.0057, 0.…
## $ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ COD <chr> "A-1", "A-2", "A-3", "A-4", "A-5", "A-6", "A-7", "A-8", "A…
## $ C_Cluster <chr> "C5", "C7", "C1", "C7", "C2", "C1", "C1", "C1", "C1", "C1"…
## $ J_Jerarquia <chr> "5", "6", "7", "6", "3", "7", "7", "7", "7", "7", "7", "7"…
## fid DN POBcount POBsum
## Min. :1 Min. :196 Min. : 1.000 Min. :1.991e+00
## 1st Qu.:1 1st Qu.:196 1st Qu.: 1.000 1st Qu.:2.354e+00
## Median :1 Median :196 Median : 2.000 Median :4.710e+00
## Mean :1 Mean :196 Mean : 7.657 Mean :9.935e+01
## 3rd Qu.:1 3rd Qu.:196 3rd Qu.: 4.000 3rd Qu.:1.162e+01
## Max. :1 Max. :196 Max. :3234.000 Max. :1.414e+05
##
## POBmean AREA PERIMETRO DENSIDAD_p
## Min. : 1.991 Min. : 0.00820 Min. : 0.3620 Min. : 241.7
## 1st Qu.: 2.261 1st Qu.: 0.00820 1st Qu.: 0.3630 1st Qu.: 274.7
## Median : 2.558 Median : 0.01650 Median : 0.5417 Median : 311.0
## Mean : 2.765 Mean : 0.06302 Mean : 0.9110 Mean : 336.0
## 3rd Qu.: 2.932 3rd Qu.: 0.03290 3rd Qu.: 0.9042 3rd Qu.: 356.1
## Max. :43.731 Max. :26.64430 Max. :75.5179 Max. :5307.9
##
## DIM_FRAC PBI_cp_sum PBI_cp_mea DENSIDAD_e
## Min. :-8.8673 Min. : 0.0 Min. : 0.0 Min. :0.0000
## 1st Qu.: 0.9222 1st Qu.: 126.2 1st Qu.: 117.1 1st Qu.:0.0000
## Median : 0.9740 Median : 345.2 Median : 326.0 Median :0.0000
## Mean : 0.9285 Mean : 2737.2 Mean : 1048.9 Mean :0.5020
## 3rd Qu.: 0.9999 3rd Qu.: 1082.3 3rd Qu.: 1016.3 3rd Qu.:0.9107
## Max. :15.2195 Max. :428949.8 Max. :35745.8 Max. :6.7708
## NA's :1615 NA's :1615
## SALUD_sum norm_pop norm_PBI norm_educ
## Min. : 1.000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.: 1.000 1st Qu.:0.00650 1st Qu.:0.00030 1st Qu.:0.00000
## Median : 2.000 Median :0.01370 Median :0.00080 Median :0.00000
## Mean : 2.227 Mean :0.01861 Mean :0.00638 Mean :0.07415
## 3rd Qu.: 2.000 3rd Qu.:0.02260 3rd Qu.:0.00250 3rd Qu.:0.13453
## Max. :36.000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## NA's :1898 NA's :1615
## norm_salud INDICE ID COD
## Min. :0.00000 Min. :0.00000 Min. : 1.0 Length:1964
## 1st Qu.:0.00000 1st Qu.:0.00330 1st Qu.: 491.8 Class :character
## Median :0.02860 Median :0.00800 Median : 982.5 Mode :character
## Mean :0.03507 Mean :0.02099 Mean : 982.5
## 3rd Qu.:0.02860 3rd Qu.:0.03165 3rd Qu.:1473.2
## Max. :1.00000 Max. :1.00000 Max. :1964.0
## NA's :1898
## C_Cluster J_Jerarquia
## Length:1964 Length:1964
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
vars_analisis <- c("POBsum", "PBI_cp_sum", "SALUD_sum",
"DENSIDAD_p", "DENSIDAD_e", "DIM_FRAC")
df |> select(all_of(vars_analisis)) |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_NA")## # A tibble: 6 × 2
## variable n_NA
## <chr> <int>
## 1 POBsum 0
## 2 PBI_cp_sum 1615
## 3 SALUD_sum 1898
## 4 DENSIDAD_p 0
## 5 DENSIDAD_e 0
## 6 DIM_FRAC 0
#df_clean |> select(all_of(vars_analisis)) |> summary()
df_clean |>
select(all_of(vars_analisis)) |>
pivot_longer(everything()) |>
ggplot(aes(x = name, y = value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free") +
theme_minimal()## POBsum PBI_cp_sum SALUD_sum DENSIDAD_p
## Min. :1.991e+00 Min. : 0.0 Min. : 0.00000 Min. : 241.7
## 1st Qu.:2.354e+00 1st Qu.: 0.0 1st Qu.: 0.00000 1st Qu.: 274.7
## Median :4.710e+00 Median : 0.0 Median : 0.00000 Median : 311.0
## Mean :9.935e+01 Mean : 486.4 Mean : 0.07485 Mean : 336.0
## 3rd Qu.:1.162e+01 3rd Qu.: 0.0 3rd Qu.: 0.00000 3rd Qu.: 356.1
## Max. :1.414e+05 Max. :428949.8 Max. :36.00000 Max. :5307.9
## DENSIDAD_e DIM_FRAC
## Min. :0.0000 Min. :-8.8673
## 1st Qu.:0.0000 1st Qu.: 0.9222
## Median :0.0000 Median : 0.9740
## Mean :0.5020 Mean : 0.9285
## 3rd Qu.:0.9107 3rd Qu.: 0.9999
## Max. :6.7708 Max. :15.2195
# Winsorización de outliers no aplica porque son parte del análisis de jerarquías. Evaluar aplicación de log
#Conversión a logaritmos
#df_clean <- df_clean |>
# mutate(
# POBsum_log = log1p(POBsum),
# PBI_cp_log = log1p(PBI_cp_sum),
# SALUD_log = log1p(SALUD_sum),
# DENSIDAD_p_log = log1p(DENSIDAD_p)
# )
#df_clean |> select(POBsum, POBsum_log, PBI_cp_sum, PBI_cp_log) |> head(5)# Normalizacion de variables seleccionadas
vars_modelo <- c("POBsum", "PBI_cp_sum", "SALUD_sum", "DENSIDAD_e")
# Función Min-Max → [0, 1]
minmax <- function(x) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
# Aplicar y guardar en nuevas columnas
df_clean <- df_clean |>
mutate(across(all_of(vars_modelo), minmax, .names = "{.col}_mm"))
df_clean <- df_clean |>
mutate(across(ends_with("_mm"), ~ round(., 3)))Se evaluan dos métodos, cada uno con sus métodos de agrupación.
# Variables normalizadas para el ACP
vars_mm <- c("POBsum_mm", "PBI_cp_sum_mm", "SALUD_sum_mm", "DENSIDAD_e_mm")
# Matriz para el ACP
mat <- df_clean |> select(all_of(vars_mm)) |> as.matrix()
# 1. Matriz de correlación
print(round(cor(mat, use = "complete.obs"), 3))## POBsum_mm PBI_cp_sum_mm SALUD_sum_mm DENSIDAD_e_mm
## POBsum_mm 1.000 0.971 0.932 0.199
## PBI_cp_sum_mm 0.971 1.000 0.918 0.222
## SALUD_sum_mm 0.932 0.918 1.000 0.311
## DENSIDAD_e_mm 0.199 0.222 0.311 1.000
# 2. ACP
res_pca <- PCA(mat, ncp = 4, graph = FALSE)
# 3. Varianza explicada — cuánto captura cada componente
print(round(res_pca$eig, 3))## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.971 74.276 74.276
## comp 2 0.918 22.957 97.233
## comp 3 0.083 2.087 99.320
## comp 4 0.027 0.680 100.000
## Dim.1 Dim.2 Dim.3 Dim.4
## POBsum_mm 0.975 -0.167 0.077 0.125
## PBI_cp_sum_mm 0.973 -0.142 0.147 -0.104
## SALUD_sum_mm 0.971 -0.036 -0.235 -0.024
## DENSIDAD_e_mm 0.361 0.932 0.027 0.006
# 5. Biplot — variables e individuos
fviz_pca_biplot(res_pca,
repel = TRUE,
col.var = "#E46726",
col.ind = "#69b3a2",
alpha.ind = 0.3,
label = "var")El análisis identifica cuatro componentes, de los cuales solo el CP1 cuenta con un eigenvalue superior a 1, el mismo que cuenta con una varianza explicada del 74.27%, por lo que se utiliza como resultado para el método de agrupación.
mat_cp1 <- res_pca$ind$coord[, 1:2]
# Elbow method — prueba k de 1 a 10
fviz_nbclust(mat_cp1,
FUNcluster = kmeans,
method = "wss",
k.max = 10) +
labs(title = "Elbow method — k-means")# Silhouette - para validar k
fviz_nbclust(mat_cp1,
FUNcluster = kmeans,
method = "silhouette",
k.max = 10) +
labs(title = "Silhouette — validación k óptimo")# K-means con k = 6
set.seed(123) # para reproducibilidad
km <- kmeans(mat_cp1, centers = 6, nstart = 25)
# Asignar clusters al dataframe
df_clean$cluster_km <- as.factor(km$cluster)
# Visualizar clusters en el plano CP1
fviz_cluster(km, data = mat_cp1,
geom = "point",
alpha = 0.4,
palette = "Set2",
ggtheme = theme_minimal(),
main = "K-means k=6 — CP1")df_clean |>
group_by(cluster_km) |>
summarise(
n = n(),
pob_max = max(POBsum),
pob_med = median(POBsum),
pob_min = min(POBsum)
) |>
arrange(desc(pob_max))## # A tibble: 6 × 5
## cluster_km n pob_max pob_med pob_min
## <fct> <int> <dbl> <dbl> <dbl>
## 1 6 1 141425. 141425. 141425.
## 2 5 8 4520. 1684. 366.
## 3 4 426 1661. 5.24 1.99
## 4 2 47 1250. 22.8 2.03
## 5 1 206 745. 7.15 1.99
## 6 3 1276 184. 4.36 1.99
# Cargar y reproyectar a WGS84 geográfico
cp_puntos <- st_read("/Users/dalineportocarrero/Documents/Daline/4.PROYECTOS/PCM_PNUD/2.TRABAJO/resultados_Puno.shp") |> st_transform(4326)## Reading layer `resultados_Puno' from data source
## `/Users/dalineportocarrero/Documents/Daline/4.PROYECTOS/PCM_PNUD/2.TRABAJO/resultados_Puno.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1964 features and 21 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 963321.9 ymin: 8127329 xmax: 1076865 ymax: 8280653
## Projected CRS: WGS 84 / UTM zone 18S
## [1] "fid" "DN" "POBcount" "POBsum" "POBmean"
## [6] "AREA" "PERIMETRO" "DENSIDAD_p" "DIM_FRAC" "PBI_cp_sum"
## [11] "PBI_cp_mea" "DENSIDAD_e" "SALUD_sum" "norm_pop" "norm_PBI"
## [16] "norm_educ" "norm_salud" "INDICE" "ID" "COD"
## [21] "C_Cluster" "geometry"
## [1] "A-1" "A-2" "A-3" "A-4" "A-5" "A-6"
## [1] "A-1" "A-2" "A-3" "A-4" "A-5" "A-6"
# Unir clusters al shapefile de puntos
cp_mapa <- cp_puntos |>
left_join(df_clean |>
select(COD, cluster_km),
by = "COD")
# Paleta por cluster
pal <- colorFactor(palette = brewer.pal(6, "Set2"),
domain = cp_mapa$cluster_km)
# Mapa
leaflet() |>
addProviderTiles("CartoDB.Positron") |>
# Puntos centros poblados
addCircleMarkers(data = cp_mapa,
radius = ~case_when(
cluster_km == 6 ~ 25,
cluster_km == 5 ~ 16,
cluster_km == 2 ~ 8,
cluster_km == 4 ~ 4,
cluster_km == 1 ~ 2,
cluster_km == 3 ~ 1,
),
fillColor = "#039ccf",
fillOpacity = 0.35,
color = "white",
weight = 0.2,
label = ~paste0("Cluster: ", cluster_km,
" | Pob: ", round(POBsum, 0)),
popup = ~paste0("<b>COD: ", COD, "</b><br>",
"Cluster: <b>", cluster_km, "</b><br>",
"Población: ", round(POBsum, 0))) |>
addLegend(
position = "bottomright",
title = "Jerarquía Kmeans",
colors = rep("#039ccf", 6),
labels = c("6 - Mayor", "5", "2", "4", "1", "3 - Menor"),
opacity = 0.35
)# Ward con k = 7 fijo — CP1
hc_cp1 <- hclust(dist(mat_cp1), method = "ward.D2")
# Cortar en k=5
df_clean$cluster_ward <- cutree(hc_cp1, k = 7) |> as.factor()
# Perfil para comparar con k-means
df_clean |>
group_by(cluster_ward) |>
summarise(
n = n(),
pob_max = round(max(POBsum), 0),
pob_med = round(median(POBsum), 0),
pob_min = round(min(POBsum), 0)
) |>
arrange(desc(pob_max))## # A tibble: 7 × 5
## cluster_ward n pob_max pob_med pob_min
## <fct> <int> <dbl> <dbl> <dbl>
## 1 7 1 141425 141425 141425
## 2 5 5 4520 2204 366
## 3 3 50 1762 31 2
## 4 6 28 1661 248 11
## 5 4 212 572 6 2
## 6 1 351 456 5 2
## 7 2 1317 193 4 2
#Dendograma
# Muestra aleatoria para dendrograma
set.seed(123)
idx <- sample(nrow(mat_cp1), 200)
mat_muestra <- mat_cp1[idx, , drop = FALSE]
# Ward sobre la muestra
hc_muestra <- hclust(dist(mat_muestra), method = "ward.D2")
# Dendrograma con dendextend
dend <- as.dendrogram(hc_muestra)
dend <- color_branches(dend, k = 7, col = brewer.pal(7, "Set2"))
par(mar = c(4, 1, 2, 1))
plot(dend,
horiz = TRUE,
leaflab = "none",
lwd = 0.3,
main = "Dendrograma — muestra 200 núcleos",
xlab = "Height")# Cargar y reproyectar a WGS84 geográfico
cp_puntos <- st_read("/Users/dalineportocarrero/Documents/Daline/4.PROYECTOS/PCM_PNUD/2.TRABAJO/resultados_Puno.shp") |> st_transform(4326)## Reading layer `resultados_Puno' from data source
## `/Users/dalineportocarrero/Documents/Daline/4.PROYECTOS/PCM_PNUD/2.TRABAJO/resultados_Puno.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1964 features and 21 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 963321.9 ymin: 8127329 xmax: 1076865 ymax: 8280653
## Projected CRS: WGS 84 / UTM zone 18S
## [1] "fid" "DN" "POBcount" "POBsum" "POBmean"
## [6] "AREA" "PERIMETRO" "DENSIDAD_p" "DIM_FRAC" "PBI_cp_sum"
## [11] "PBI_cp_mea" "DENSIDAD_e" "SALUD_sum" "norm_pop" "norm_PBI"
## [16] "norm_educ" "norm_salud" "INDICE" "ID" "COD"
## [21] "C_Cluster" "geometry"
## [1] "A-1" "A-2" "A-3" "A-4" "A-5" "A-6"
## [1] "A-1" "A-2" "A-3" "A-4" "A-5" "A-6"
# Unir clusters al shapefile de puntos
cp_mapa <- cp_puntos |>
left_join(df_clean |>
select(COD, cluster_ward),
by = "COD")
# Paleta por cluster
pal <- colorFactor(palette = brewer.pal(7, "Set2"),
domain = cp_mapa$cluster_ward)
# Mapa
leaflet() |>
addProviderTiles("CartoDB.Positron") |>
# Puntos centros poblados
addCircleMarkers(data = cp_mapa,
radius = ~case_when(
cluster_ward == 7 ~ 25,
cluster_ward == 5 ~ 16,
cluster_ward == 6 ~ 8,
cluster_ward == 3 ~ 4,
cluster_ward == 4 ~ 3,
cluster_ward == 1 ~ 2,
cluster_ward == 2 ~ 1,
),
fillColor = "#039ccf",
fillOpacity = 0.35,
color = "white",
weight = 0.2,
label = ~paste0("Cluster: ", cluster_ward,
" | Pob: ", round(POBsum, 0)),
popup = ~paste0("<b>COD: ", COD, "</b><br>",
"Cluster: <b>", cluster_ward, "</b><br>",
"Población: ", round(POBsum, 0))) |>
addLegend(
position = "bottomright",
title = "Jerarquía ward",
colors = rep("#039ccf", 7),
labels = c("7 - Mayor", "5", "6", "3", "4", "1", "2 - Menor"),
opacity = 0.35
)