1 Objetivo de este documento

En este cuaderno documento los procedimeintos realizados para la clasificación de cuencas. Primero hago análisis de componentes principales y después clasificación no supervisada con Kmeans.

Paso 1: cargar las librerias y configuraciones que necesito

rm(list=ls())

library(corrplot)
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering algorithms & visualization
library(rgl) # Multi 3D plot
library(gridExtra)
library(ggplot2)
library(rgdal)
library(sp)

windowsFonts(
  AS = windowsFont("Ancizar Sans")
)

Paso 2: Cargo la matriz multivariada (esta matriz es la entrada para la clasificación)

path <- "G:/Mi unidad/UNIVERSIDAD PC/MAESTRIA/ADICIONALES/48510 DWB DIARIO/PRODUCTOS/ARTÍCULO/SHH 2021/CAMBIO COBERTURA/TRABAJO/"
setwd(path)

file_ <- read.csv(paste0("./",list.files(".", pattern = ".csv")))
rownames(file_) <- file_$COD_CAT
# file_ <- file_[3:dim(file_)[2]]
file_ <- file_[6:22] #aun no tomo las variables de poblacion porque deseo calcular una pendiente de cambio de población entre 2005 y 2018 por cuenca

head(file_)

##               A_km2    PER_km    S_por ELEV_MIN_m ELEV_MAX_m ELEV_MEDIANA_m
## 21027010   3569.855  353.5112 27.29945        825       4618           2088
## 21137050  22291.972 1063.7114 28.56843        333       5375           1656
## 22017010    655.437  180.5820 50.92921        733       4564           3172
## 23037010  56506.130 1769.2750 27.01210        146       5375           1623
## 23187280  65619.558 1869.5925 23.35583          4       5283           1178
## 25027360 164978.326 3671.8017 22.70542          0       5448            958
##          ELEV_MEDIA_m FF_HORTON AH RN P_mm_ano ETP_mm_ano cambio_NDVI
## 21027010     2156.080 0.5227336  2  2 1613.079   1113.875 0.000103627
## 21137050     1730.116 0.4885001  2  2 1594.386   1228.194 0.001089856
## 22017010     2928.199 0.3992014  2  2 1879.293   1027.042 0.001213334
## 23037010     1736.075 0.2712644  2  2 1636.306   1232.111 0.001494592
## 23187280     1371.669 0.5252963  2  2 2418.809   1307.323 0.001536290
## 25027360     1246.362 0.2394912  2  2 2002.669   1333.809 0.001443443
##            cambio_EVI   qmed_m3s  qmed_q50 tasa_pobl
## 21027010 -0.000070000  155.85146 1.2609341  5.810332
## 21137050  0.000643585  622.35087 1.1173265  1.653051
## 22017010  0.000806541   29.26579 1.3242440 -7.509468
## 23037010  0.000903595 1570.91979 1.1101907  3.438639
## 23187280  0.000829664 3590.46944 1.0597608  0.902713
## 25027360  0.000744647 2779.41437 0.9898199  3.251411

Paso 3: Explorar los datos para evaluar las variables que son independientes.

cmat <- cor(file_, method = "spearman") 


#Esta función evalua si la correlación es significativa
cor.mtest <- function(mat, ...) {
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat<- matrix(NA, n, n)
  diag(p.mat) <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      tmp <- cor.test(mat[, i], mat[, j], ...)
      p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
    }
  }
  colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
  p.mat
}
# matriz de los p-value de cada correlación
p.mat <- cor.mtest(cmat)
# head(p.mat[, 1:5])

#grafico nuevo de correlaciones
#ruta <- "./CORRELACIONES/"
#jpeg(filename = paste(ruta,"COR_MULTI.jpeg", sep=""), width =15, height = 10, units = "cm", res = 300) #mejorar el nombre
par(family="AS", oma=c(0.5,0,0,0), mar=c(0,0,0,0), xpd=F )
corrplot(cmat, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45,
          cl.cex=0.8, number.cex=0.8 , tl.cex=0.65 , method = "square", 
         p.mat = p.mat, sig.level = 0.05, insig = "blank" )

# mtext(side=1, text = "Diagrama de correlación - Pearson", line = 0.9)

#dev.off()

La grafica anterior muestra con vacio las correlaciones que no son significativas y ayuda a definir las variables que son dependientes. En este ejercicio el enfoque se hará en analizar todas las variables con respecto a cambio NDVI y cambio EVI.

Lo primero que se puede evidenciar es que cambio NDVI y EVI son dependientes, en este caso se selecciona EVI por presentar correlación siginificativa con más variables.

Las tres elevaciones son dependientes entre ellas, por tanto, se utilizará la elevación mediana.

Las variables que se consideran independientes y con correlación significativa con cambio NDVI son: área, perímetro, elevación mediana, precipitación total anual, evapotranspiración total anual y caudal medio.

Paso 4: Generar la matriz de variables para realizar la clasificación

# variables_remover <- c("S_por", "ELEV_MIN_m", "ELEV_MAX_m", "ELEV_MEDIA_m","FF_HORTON", "AH",
#                "RN" , "cambio_EVI", "qmed_q50")
variables_remover <- c( "ELEV_MIN_m", "ELEV_MAX_m", "ELEV_MEDIA_m","FF_HORTON", "AH",
               "RN" , "cambio_NDVI", "qmed_q50")
print(paste("Variables a remover de la matriz inicial:", variables_remover))

## [1] "Variables a remover de la matriz inicial: ELEV_MIN_m"  
## [2] "Variables a remover de la matriz inicial: ELEV_MAX_m"  
## [3] "Variables a remover de la matriz inicial: ELEV_MEDIA_m"
## [4] "Variables a remover de la matriz inicial: FF_HORTON"   
## [5] "Variables a remover de la matriz inicial: AH"          
## [6] "Variables a remover de la matriz inicial: RN"          
## [7] "Variables a remover de la matriz inicial: cambio_NDVI" 
## [8] "Variables a remover de la matriz inicial: qmed_q50"

variables_remover_id <- unlist(lapply(variables_remover, function(x){
  which(x==colnames(file_))
}))


file_ <- file_[, -variables_remover_id]
head(file_)

##               A_km2    PER_km    S_por ELEV_MEDIANA_m P_mm_ano ETP_mm_ano
## 21027010   3569.855  353.5112 27.29945           2088 1613.079   1113.875
## 21137050  22291.972 1063.7114 28.56843           1656 1594.386   1228.194
## 22017010    655.437  180.5820 50.92921           3172 1879.293   1027.042
## 23037010  56506.130 1769.2750 27.01210           1623 1636.306   1232.111
## 23187280  65619.558 1869.5925 23.35583           1178 2418.809   1307.323
## 25027360 164978.326 3671.8017 22.70542            958 2002.669   1333.809
##            cambio_EVI   qmed_m3s tasa_pobl
## 21027010 -0.000070000  155.85146  5.810332
## 21137050  0.000643585  622.35087  1.653051
## 22017010  0.000806541   29.26579 -7.509468
## 23037010  0.000903595 1570.91979  3.438639
## 23187280  0.000829664 3590.46944  0.902713
## 25027360  0.000744647 2779.41437  3.251411

plot(as.data.frame(file_))

Para realizar estos análisis es recomendable trabajar todas las variables en un mismo rango, así que se escalan restandoles la media y dividiéndolos por la desviación estandar. Adiconalmente se eliminan las cuencas que tienen alguna variable con valor NA.

df <- na.omit(file_)
df <- scale(df)

2 PCA

En el análisis de componentes principales se busca agrupar de tal forma que los nuevos componentes representen al menos el 85% de la varianza.

# pc <- princomp(df)
pc_spearman <- princomp(covmat=cmat)
pc_pearson <- princomp(covmat=cor(file_, method = "pearson") )

par(mfrow=c(1, 2))
# plot(pc_spearman, xaxt="n", main = "n")
# axTicks(1)
plot(pc_spearman, type='l')

# plot(pc_pearson, xaxt="n", main = "n")
# axTicks(1)
plot(pc_pearson, type='l')

Con los primeros 5 componentes se logra el 87% de la varianza. Efecto codo

summary(pc_spearman) # 4 components is both 'elbow' and explains >85% variance

## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
## Standard deviation     2.4423856 1.7631106 1.3052924 1.2259568 1.00216542
## Proportion of Variance 0.3508969 0.1828564 0.1002228 0.0884100 0.05907856
## Cumulative Proportion  0.3508969 0.5337533 0.6339762 0.7223862 0.78146472
##                            Comp.6     Comp.7     Comp.8     Comp.9   Comp.10
## Standard deviation     0.96056386 0.89145958 0.80137083 0.64911820 0.5668888
## Proportion of Variance 0.05427547 0.04674707 0.03777619 0.02478556 0.0189037
## Cumulative Proportion  0.83574019 0.88248726 0.92026344 0.94504900 0.9639527
##                           Comp.11     Comp.12     Comp.13     Comp.14
## Standard deviation     0.49394008 0.385921744 0.281195856 0.244643445
## Proportion of Variance 0.01435158 0.008760917 0.004651242 0.003520613
## Cumulative Proportion  0.97830428 0.987065194 0.991716435 0.995237048
##                            Comp.15     Comp.16     Comp.17
## Standard deviation     0.237520763 0.143904838 0.062011841
## Proportion of Variance 0.003318595 0.001218153 0.000226204
## Cumulative Proportion  0.998555643 0.999773796 1.000000000

summary(pc_pearson) # 4 components is both 'elbow' and explains >85% variance

## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
## Standard deviation     1.8815842 1.3983728 0.9909372 0.88791723 0.85213919
## Proportion of Variance 0.3933732 0.2172718 0.1091063 0.08759967 0.08068236
## Cumulative Proportion  0.3933732 0.6106450 0.7197513 0.80735100 0.88803336
##                            Comp.6     Comp.7     Comp.8      Comp.9
## Standard deviation     0.76441275 0.49628673 0.34598298 0.239516613
## Proportion of Variance 0.06492521 0.02736672 0.01330047 0.006374245
## Cumulative Proportion  0.95295856 0.98032529 0.99362575 1.000000000

# # Get principal component vectors using prcomp instead of princomp
# pc <- prcomp(df)
# # First for principal components
# comp <- data.frame(pc$x[,1:5])
# # Plot
# plot(comp, pch=16, col=rgb(0,0,0,0.5))

# library(ggbiplot)
# # wine.pca = prcomp(wine, scale. = TRUE)
# g = ggbiplot(pc, obs.scale = 1, var.scale = 1, ellipse = TRUE, circle = TRUE)
# g = g + scale_color_discrete(name = "1")
# # g = g + opts(legend.direction = "horizontal", legend.position = "top")
# print(g)

3 Clasificación no supervisada Kmeans

##Definir el número de centros (k)

Elbow method

Objetivo: encontrar nucleos donde la variación total dentro del nucleo se minimice.

\[ Minimizar\left(\sum_{k=1}^{k} W(C_{k})\right)\] Donde, Ck es el cluster k-esimo y W(Ck) es la variación dentro de dicho cluster.

# #haciendo la función a mano
# set.seed(123)
# 
# # function to compute total within-cluster sum of square 
# wss <- function(k) {
#   kmeans(df, k, nstart = 100 )$tot.withinss
# }
# 
# # Compute and plot wss for k = 1 to k = 15
# k.values <- 1:10
# 
# # extract wss for 2-15 clusters
# wss_values <- map_dbl(k.values, wss)
# 
# plot(k.values, wss_values,
#        type="b", pch = 19, frame = FALSE, 
#        xlab="Number of clusters K",
#        ylab="Total within-clusters sum of squares")

#aplicando la función de R
fviz_nbclust(df, kmeans, method = "wss") +
    geom_vline(xintercept = 4, linetype = 2)+
  labs(subtitle = "Elbow method")

Average Silhouette Method

Este método se basa en la calidad del agrupamiento, en otras palabras determina que tan buen agrupado se encuentra cada uno de los elementos que pertenecen a un grupo.

Este método calcula la silueta promedio de observaciones para diferentes núcleos (k). Aquí el número óptimo de núcleos es el que maximiza la silueta promedio

# # Función a mano
# avg_sil <- function(k) {
#   km.res <- kmeans(df, centers = k, nstart = 100)
#   ss <- silhouette(km.res$cluster, dist(df))
#   mean(ss[, 3])
# }
# 
# # Compute and plot wss for k = 2 to k = 15
# k.values <- 2:10
# 
# # extract avg silhouette for 2-15 clusters
# avg_sil_values <- map_dbl(k.values, avg_sil)
# 
# plot(k.values, avg_sil_values,
#        type = "b", pch = 19, frame = FALSE, 
#        xlab = "Number of clusters K",
#        ylab = "Average Silhouettes")

#con la función de r

# Silhouette method
fviz_nbclust(df, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")

##Gap Statistic Method Se basa en simulaciones montecarlo y compara la variación intragrupo para diferentes valores de núcleo. Para los datos observados y los datos de referencia, la variación intragrupo total se calcula utilizando diferentes valores de k.

\[ Gap_{n}(k)= E_{n}^{*}log(W_{k})-log(W_{k}) \] revisar a profundidad

3.1 Paquete de r que evalua 30 índices para determinar el número de clusters óptimo

library(NbClust)

nb <- NbClust(data = df, diss = NULL, distance = "euclidean",
        min.nc = 2, max.nc = 15, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 11 proposed 3 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 2 proposed 13 as the best number of clusters 
## * 4 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

unique_cluster <- lapply(unique(as.numeric(nb$Best.nc[1,])), function(x){
  count_cluster <- length(which(x==as.numeric(nb$Best.nc[1,])))
  count_cluster <- c(x, count_cluster)
})

unique_cluster <- do.call(rbind, unique_cluster)
unique_cluster <- as.data.frame(unique_cluster)
colnames(unique_cluster) <- c("Clusters", "Frecuencia")


p<-ggplot(data=unique_cluster, aes(x=Clusters, y=Frecuencia)) +
    geom_bar(stat="identity") 
p

3.2 Extraer resultados

Según el análisis el grupo óptimo es de 3

set.seed(151)
km_clusters <- kmeans(df, centers = 3, nstart = 500)
km_clusters$centers

##       A_km2     PER_km      S_por ELEV_MEDIANA_m   P_mm_ano ETP_mm_ano
## 1 -0.247178 -0.1902834 -0.5850892     -1.0882730  0.8114355  0.9919802
## 2 -0.269722 -0.2877605  0.3995336      0.6413619 -0.4186340 -0.5870515
## 3  2.918457  2.8485899 -0.8893452     -0.8896356  0.2157761  0.8292388
##    cambio_EVI   qmed_m3s  tasa_pobl
## 1 -0.65459436 -0.1330628  0.3829428
## 2  0.31166777 -0.2917003 -0.2298985
## 3  0.02161688  2.6716340  0.3447126

fviz_cluster(km_clusters, geom = "point",  data = df) + ggtitle("k = 4")

3.3 Comparación con diferentes núcleos

k2 <- kmeans(df, centers = 2, nstart = 100)
k4 <- kmeans(df, centers = 4, nstart = 100)
k5 <- kmeans(df, centers = 5, nstart = 100)
k6 <- kmeans(df, centers = 6, nstart = 100)

# plots to compare
p2 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p4 <- fviz_cluster(k4, geom = "point",  data = df) + ggtitle("k = 4")
p5 <- fviz_cluster(k5, geom = "point",  data = df) + ggtitle("k = 5")
p6 <- fviz_cluster(k6, geom = "point",  data = df) + ggtitle("k = 6")

library(gridExtra)
grid.arrange(p2, p4, p5, p6, nrow = 2)

3.4 Estadísticas descriptivas de los clusters

coor <- read.csv("G:/Mi unidad/UNIVERSIDAD PC/MAESTRIA/ADICIONALES/48510 DWB DIARIO/PRODUCTOS/ARTÍCULO/SHH 2021/CAMBIO COBERTURA/TRABAJO/Matriz_Multivariada.csv")[,1:5]

clusters_cuencas <- cbind(coor,km_clusters[["cluster"]] )
colnames(clusters_cuencas)[6] <- "CLUSTER"
coordinates(clusters_cuencas) <- ~LON+LAT


plot(clusters_cuencas, col=ifelse(clusters_cuencas$CLUSTER==1,"blue",
                           ifelse(clusters_cuencas$CLUSTER==2, "red",
                           ifelse(clusters_cuencas$CLUSTER==3, "green", "black"))))

Clasificación no supervisada de cuencas

Camila García-Echeverri

12.04.2021