kmeans

Везде использовался метод “Hartigan-Wong”. Решение - усредненный кластерный по 1000 проб (запусков)

4 кластера

actions_sc <- actions[, lapply(.SD, scale), .SDcols = c(4:24)]
fit_k <- kmeans(actions_sc, centers = 4, nstart = 1000) 

test_k <- cbind(actions_sc, group = fit_k$cluster)
test_k[, .N, by = group]

##    group    N
## 1:     3 2941
## 2:     1 1608
## 3:     4 1409
## 4:     2  362

test_k_stat <- test_k[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]
test_k_stat_r <- data.table(test_k_stat[, t(.SD), .SDcols = c(2:22)])
test_k_stat_r <- test_k_stat_r[, title := names(test_k_stat)[-1]]

test_k_stat_m <- melt(test_k_stat_r)

## Warning in melt.data.table(test_k_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.

plot_ly(test_k_stat_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>% 
  layout(title = "средние значения по кластеру",
         margin = list(l = 150))

5 кластеров

##    group    N
## 1:     5 1943
## 2:     4 1530
## 3:     1 2258
## 4:     2  248
## 5:     3  341

## Warning in melt.data.table(test_k_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.

Иерархический кластерный

manhattan, 6 кластеров

# manhattan 6
ds <- dist(actions_sc, method = "manhattan")

fit <- hclust(ds, method="ward.D")
plot(fit, labels = F, hang = 0)

rect.hclust(fit, k = 6, border="blue")

test <- cbind(actions_sc, group = cutree(fit, k = 6))
test[, .N, by = group]

##    group    N
## 1:     1  939
## 2:     2 3031
## 3:     3  649
## 4:     4  768
## 5:     5  446
## 6:     6  487

test_stat <- test[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]
test_stat_r <- data.table(test_stat[, t(.SD), .SDcols = c(2:22)])
test_stat_r <- test_stat_r[, title := names(test_stat)[-1]]

test_stat_m <- melt(test_stat_r)

## Warning in melt.data.table(test_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.

plot_ly(test_stat_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>% 
  layout(title = "средние взвешенные на количество урлов значения по кластеру",
         margin = list(l = 150))

canberra, 6 кластеров

# canberra 6
ds <- dist(actions_sc, method = "canberra")

fit <- hclust(ds, method="ward.D")
plot(fit, labels = F, hang = 0)

rect.hclust(fit, k = 6, border="blue")

test <- cbind(actions_sc, group = cutree(fit, k = 6))
test[, .N, by = group]

##    group    N
## 1:     1   68
## 2:     2 1847
## 3:     3 1780
## 4:     4  465
## 5:     5  347
## 6:     6 1813

test_stat <- test[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]
test_stat_r <- data.table(test_stat[, t(.SD), .SDcols = c(2:22)])
test_stat_r <- test_stat_r[, title := names(test_stat)[-1]]

test_stat_m <- melt(test_stat_r)

## Warning in melt.data.table(test_stat_r): To be consistent with reshape2's
## melt, id.vars and measure.vars are internally guessed when both are 'NULL'.
## All non-numeric/integer/logical type columns are conisdered id.vars, which
## in this case are columns [title]. Consider providing at least one of 'id'
## or 'measure' vars in future.

plot_ly(test_stat_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>% 
  layout(margin = list(l = 150))

Иерархический на бинарных данных

actions_bi <- actions[, lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = c(4:24)]
ds_bi <- dist(actions_bi, method = "binary")

fit_bi <- hclust(ds_bi, method="ward.D")

plot(fit_bi, labels = F, hang = 0)
rect.hclust(fit_bi, k = 4, border="green") 
rect.hclust(fit_bi, k = 5, border="blue")

#rect.hclust(fit_bi, k = 7, border="red")

test_bi <- cbind(actions_bi, group = cutree(fit_bi, k = 5))
test_bi[, .N, by = group]

##    group    N
## 1:     1 3798
## 2:     2 1508
## 3:     3  604
## 4:     4  271
## 5:     5  139

test_stat_bi <- test_bi[, lapply(.SD, mean), by = group, .SDcols = c(1:21)]

test_stat_bi_r <- data.table(test_stat_bi[, t(.SD), .SDcols = c(2:22)])
test_stat_bi_r <- test_stat_bi_r[, title := names(test_stat_bi)[-1]]

test_stat_bi_m <- melt(test_stat_bi_r)

## Warning in melt.data.table(test_stat_bi_r): To be consistent with
## reshape2's melt, id.vars and measure.vars are internally guessed when both
## are 'NULL'. All non-numeric/integer/logical type columns are conisdered
## id.vars, which in this case are columns [title]. Consider providing at
## least one of 'id' or 'measure' vars in future.

plot_ly(test_stat_bi_m, x = ~variable, y = ~title, z = ~value, type = 'heatmap') %>%  
  layout(title = 'среднее количество использований инструмента',
         margin = list(l = 150))

Untitled

Ph.A.Upravitelev

12 февраля 2017 г

kmeans

4 кластера

5 кластеров

Иерархический кластерный

manhattan, 6 кластеров

canberra, 6 кластеров

Иерархический на бинарных данных