Везде использовался метод “Hartigan-Wong”. Решение - усредненный кластерный по 1000 проб (запусков)
actions_sc <- actions[, lapply(.SD, scale), .SDcols = c(4:24)]
fit_k <- kmeans(actions_sc, centers = 4, nstart = 1000)
test_k <- cbind(actions_sc, group = fit_k$cluster)
test_k[, .N, by = group]
## group N
## 1: 3 2941
## 2: 1 1608
## 3: 4 1409
## 4: 2 362
test_k_stat <- test_k[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]
test_k_stat_r <- data.table(test_k_stat[, t(.SD), .SDcols = c(2:22)])
test_k_stat_r <- test_k_stat_r[, title := names(test_k_stat)[-1]]
test_k_stat_m <- melt(test_k_stat_r)
## Warning in melt.data.table(test_k_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.
plot_ly(test_k_stat_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>%
layout(title = "средние значения по кластеру",
margin = list(l = 150))
## group N
## 1: 5 1943
## 2: 4 1530
## 3: 1 2258
## 4: 2 248
## 5: 3 341
## Warning in melt.data.table(test_k_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.
# manhattan 6
ds <- dist(actions_sc, method = "manhattan")
fit <- hclust(ds, method="ward.D")
plot(fit, labels = F, hang = 0)
rect.hclust(fit, k = 6, border="blue")
test <- cbind(actions_sc, group = cutree(fit, k = 6))
test[, .N, by = group]
## group N
## 1: 1 939
## 2: 2 3031
## 3: 3 649
## 4: 4 768
## 5: 5 446
## 6: 6 487
test_stat <- test[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]
test_stat_r <- data.table(test_stat[, t(.SD), .SDcols = c(2:22)])
test_stat_r <- test_stat_r[, title := names(test_stat)[-1]]
test_stat_m <- melt(test_stat_r)
## Warning in melt.data.table(test_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.
plot_ly(test_stat_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>%
layout(title = "средние взвешенные на количество урлов значения по кластеру",
margin = list(l = 150))
# canberra 6
ds <- dist(actions_sc, method = "canberra")
fit <- hclust(ds, method="ward.D")
plot(fit, labels = F, hang = 0)
rect.hclust(fit, k = 6, border="blue")
test <- cbind(actions_sc, group = cutree(fit, k = 6))
test[, .N, by = group]
## group N
## 1: 1 68
## 2: 2 1847
## 3: 3 1780
## 4: 4 465
## 5: 5 347
## 6: 6 1813
test_stat <- test[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]
test_stat_r <- data.table(test_stat[, t(.SD), .SDcols = c(2:22)])
test_stat_r <- test_stat_r[, title := names(test_stat)[-1]]
test_stat_m <- melt(test_stat_r)
## Warning in melt.data.table(test_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.
plot_ly(test_stat_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>%
layout(margin = list(l = 150))
actions_bi <- actions[, lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = c(4:24)]
ds_bi <- dist(actions_bi, method = "binary")
fit_bi <- hclust(ds_bi, method="ward.D")
plot(fit_bi, labels = F, hang = 0)
rect.hclust(fit_bi, k = 4, border="green")
rect.hclust(fit_bi, k = 5, border="blue")
#rect.hclust(fit_bi, k = 7, border="red")
test_bi <- cbind(actions_bi, group = cutree(fit_bi, k = 5))
test_bi[, .N, by = group]
## group N
## 1: 1 3798
## 2: 2 1508
## 3: 3 604
## 4: 4 271
## 5: 5 139
test_stat_bi <- test_bi[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]
test_stat_bi_r <- data.table(test_stat_bi[, t(.SD), .SDcols = c(2:22)])
test_stat_bi_r <- test_stat_bi_r[, title := names(test_stat_bi)[-1]]
test_stat_bi_m <- melt(test_stat_bi_r)
## Warning in melt.data.table(test_stat_bi_r): To be consistent with
## reshape2's melt, id.vars and measure.vars are internally guessed when both
## are 'NULL'. All non-numeric/integer/logical type columns are conisdered
## id.vars, which in this case are columns [title]. Consider providing at
## least one of 'id' or 'measure' vars in future.
plot_ly(test_stat_bi_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>%
layout(title = 'среднее количество использований инструмента',
margin = list(l = 150))