# Variables
# Build the clustering input from `merged`: drop any existing row names,
# exclude the row(s) with Codi 12, move Nom_Barri into the row names and keep
# only the columns listed below.
merged2 <- merged %>%
  remove_rownames() %>%
  filter(Codi != 12) %>%
  column_to_rownames(var = "Nom_Barri") %>%
  select(
    "n.tot", "pc.esp", "pc.ue27-esp", "pc.20.34", "2019-2014", "n.esp.M1419",
    "hotel2019", "rest1614",
    "RFD.2017",
    "tot_ann", "pmedio", "pmedio.M1519", "pm_ent.M1519", "pm_priv.M1519",
    "alq.num", "alq.pm", "alq.pm.M1519", "alq.num.M1519",
    "tot.comp", "tot.eur", "perc.nou.comp", "perc.usat.comp", "tot.comp.M1419",
    "nou.eur.M1419", "usat.eur.M1419",
    "percbar.filt.A", "percbar.filt"
  )
# Estandarizar los datos (standardize the data)
# Centre each column on its mean and divide by its standard deviation
# (the scale() defaults, spelled out), returning a numeric matrix.
df <- scale(merged2, center = TRUE, scale = TRUE)
# Optimal K Elbow - elbow (1 option)
## Optimal K - elbow method (option 1)
set.seed(123)

# Total within-cluster sum of squares of a k-means fit on `df` with k clusters.
wss <- function(k) {
  fit <- kmeans(df, centers = k, nstart = 100)
  fit$tot.withinss
}

# Evaluate k = 1..15 and plot the curve to look for the "elbow".
k.values <- 1:15
wss_values <- vapply(k.values, wss, numeric(1))

plot(
  k.values, wss_values,
  type = "b", pch = 19, frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
## Optimal K - elbow method (option 2): the same curve drawn by fviz_nbclust()
set.seed(123)
fviz_nbclust(x = df, FUNcluster = kmeans, method = "wss")
## Optimal K - average silhouette method 1

# The pairwise distances do not depend on k, so compute them once instead of
# once per candidate k inside avg_sil().
df_dist <- dist(df)

# Mean silhouette width of a k-means fit on `df` with k clusters.
#   k: number of clusters (must be >= 2 for a silhouette to exist).
#   returns: a single number, the average of the per-observation widths.
avg_sil <- function(k) {
  km.res <- kmeans(df, centers = k, nstart = 100)
  ss <- silhouette(km.res$cluster, df_dist)
  # fviz_silhouette(ss)
  mean(ss[, "sil_width"])
}

# Evaluate k = 2..15 and plot the average width against k.
k.values <- 2:15
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(
  k.values, avg_sil_values,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Average Silhouettes"
)
####### HERE: ave.sil.width for k = 5
# Fit k-means with 5 clusters and draw its silhouette plot; the per-cluster
# summary recorded from a previous run is kept below.
km.res2 <- kmeans(df, centers = 5, nstart = 100)
sil <- silhouette(km.res2$cluster, dist = dist(df))
fviz_silhouette(sil.obj = sil)
## cluster size ave.sil.width
## 1 1 11 0.04
## 2 2 32 0.16
## 3 3 4 0.17
## 4 4 12 0.21
## 5 5 13 0.16
####### HERE: ave.sil.width for k = 3
# Fit k-means with 3 clusters and draw its silhouette plot; the per-cluster
# summary recorded from a previous run is kept below.
km.res3 <- kmeans(df, centers = 3, nstart = 100)
sil2 <- silhouette(km.res3$cluster, dist = dist(df))
fviz_silhouette(sil.obj = sil2)
## cluster size ave.sil.width
## 1 1 10 0.19
## 2 2 17 0.05
## 3 3 45 0.29
## Optimal K - average silhouette method 2 (fviz_nbclust)
xxx <- fviz_nbclust(x = df, FUNcluster = kmeans, method = "silhouette")
print(xxx)
# y values behind the plot (output recorded from a previous run below)
xxx[["data"]]$y
## [1] 0.00000000 0.24479080 0.24824740 0.11011399 0.24806313 0.12275387
## [7] 0.13507505 0.08726083 0.10168022 0.12652978
## Optimal K - gap statistic method
# Gap statistic for k = 1..10 using 50 reference data sets; each k-means fit
# uses 100 random starts. Output recorded from a previous run is kept below.
set.seed(123)
gap_stat <- clusGap(
  x = df,
  FUNcluster = kmeans,
  K.max = 10,
  B = 50,
  nstart = 100
)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 100)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 2
## logW E.logW gap SE.sim
## [1,] 4.802555 5.366106 0.5635517 0.01834681
## [2,] 4.669697 5.252933 0.5832367 0.01412971
## [3,] 4.600445 5.180312 0.5798671 0.01522972
## [4,] 4.550710 5.127672 0.5769621 0.01539857
## [5,] 4.489787 5.083694 0.5939061 0.01589063
## [6,] 4.440868 5.043535 0.6026666 0.01636431
## [7,] 4.398696 5.006333 0.6076368 0.01651342
## [8,] 4.355195 4.971500 0.6163045 0.01678180
## [9,] 4.311352 4.938033 0.6266811 0.01644381
## [10,] 4.273728 4.905779 0.6320503 0.01673182
## Optimal K - gap statistic method 2: plot the clusGap result
fviz_gap_stat(gap.stat = gap_stat)
## Plot 3-9
# Fit one k-means solution per candidate k (3 to 9). Each fit keeps its own
# name because later code refers to them individually (e.g. final3).
final3 <- kmeans(df, centers = 3, nstart = 100)
final4 <- kmeans(df, centers = 4, nstart = 100)
final5 <- kmeans(df, centers = 5, nstart = 100)
final6 <- kmeans(df, centers = 6, nstart = 100)
final7 <- kmeans(df, centers = 7, nstart = 100)
final8 <- kmeans(df, centers = 8, nstart = 100)
final9 <- kmeans(df, centers = 9, nstart = 100)

# Draw the cluster plot of every fit, in order of increasing k.
invisible(lapply(
  list(final3, final4, final5, final6, final7, final8, final9),
  function(fit) print(fviz_cluster(fit, data = df))
))
##FIN
# Examine the cluster solutions for k = 3 ... 9.
# (These statements had been collapsed into a single comment line, so k3..k9
# and p3..p9 were never defined and the grid.arrange() calls below failed.)
k3 <- kmeans(df, centers = 3, nstart = 100)
k4 <- kmeans(df, centers = 4, nstart = 100)
k5 <- kmeans(df, centers = 5, nstart = 100)
k6 <- kmeans(df, centers = 6, nstart = 100)
k7 <- kmeans(df, centers = 7, nstart = 100)
k8 <- kmeans(df, centers = 8, nstart = 100)
k9 <- kmeans(df, centers = 9, nstart = 100)

# Plots to compare: one point-only cluster plot per k, titled with its k.
p3 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p4 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p5 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
p6 <- fviz_cluster(k6, geom = "point", data = df) + ggtitle("k = 6")
p7 <- fviz_cluster(k7, geom = "point", data = df) + ggtitle("k = 7")
p8 <- fviz_cluster(k8, geom = "point", data = df) + ggtitle("k = 8")
p9 <- fviz_cluster(k9, geom = "point", data = df) + ggtitle("k = 9")

# Lay the plots out side by side, at most two per row.
grid.arrange(p3, nrow = 1)
grid.arrange(p4, p5, nrow = 1)
grid.arrange(p6, p7, nrow = 1)
grid.arrange(p8, p9, nrow = 1)
# K-means
# Fit k-means for k = 3, 4 and 5 and print each fit. The loop body is split
# over several lines so the inline comment no longer swallows print() and the
# closing brace. finalK holds the last fit (k = 5) when the loop ends.
for (i in 3:5) {
  finalK <- kmeans(df, centers = i, nstart = 100)
  # print
  print(finalK)
}
# Mean of every variable within each cluster of the k = 3 solution.
# (The curly quotes around "mean" were a parse error; straight quotes restored.)
kkk <- merged2 %>%
  mutate(Cluster = final3$cluster) %>%
  group_by(Cluster) %>%
  summarise_all("mean")
# Cluster plot of the most recent fit stored in finalK. The call had been
# truncated to `z_cluster`, which is not defined; every other call of this
# shape in the file is fviz_cluster().
fviz_cluster(finalK, data = df)
# Fit k-means for k = 3..9 and draw the cluster plot of each fit. The loop
# body is split over several lines so the inline comment no longer swallows
# the code and the closing brace. finalK holds the last fit (k = 9) afterwards.
for (i in 3:9) {
  finalK <- kmeans(df, centers = i, nstart = 100)
  # plot
  x <- fviz_cluster(finalK, data = df)
  # Values are not auto-printed inside a for loop, so print explicitly.
  print(x)
}
# Descriptive Statistics
# Attach the cluster label of each Nom_Barri to the rows of `merged`.
# The clustering input excluded Codi 12, so the same filter is applied here;
# otherwise the label vector and the table can differ in length. Labels are
# matched by name (kmeans() names the cluster vector after the row names of
# its input, which were set from Nom_Barri) rather than by row position.
# NOTE(review): finalK is whatever fit was assigned last, yet only clusters
# 1-3 are listed below -- confirm whether final3 was intended.
kkk <- merged %>%
  filter(Codi != 12) %>%
  mutate(Cluster = unname(finalK$cluster[as.character(Nom_Barri)])) %>%
  group_by(Nom_Barri) # %>%
  # summarise_all("mean")

# Names of the members of clusters 1, 2 and 3.
k.1 <- kkk %>% filter(Cluster == 1)
k.1$Nom_Barri
k.2 <- kkk %>% filter(Cluster == 2)
k.2$Nom_Barri
k.3 <- kkk %>% filter(Cluster == 3)
k.3$Nom_Barri