require(GGally, quietly = TRUE)
require(reshape2, quietly = TRUE)
require(tidyverse, quietly = TRUE, warn.conflicts = FALSE)
library(ggfortify)
library(cluster)
library(ggdendro)
library(broom)
theme_set(theme_bw())
source("github-lib.R")
dw <- load_github_wide()
summary(dw)
repository_language ForkEvent IssuesEvent PushEvent
ActionScript: 1 Min. : 1.000 Min. : 1.000 Min. : 1.000
Ada : 1 1st Qu.: 1.509 1st Qu.: 3.437 1st Qu.: 7.052
Agda : 1 Median : 2.083 Median : 4.750 Median : 9.314
ANTLR : 1 Mean : 2.454 Mean : 7.311 Mean : 10.921
Apex : 1 3rd Qu.: 2.913 3rd Qu.: 7.269 3rd Qu.: 10.602
AppleScript : 1 Max. :18.000 Max. :63.000 Max. :154.250
(Other) :121
WatchEvent
Min. : 1.000
1st Qu.: 2.000
Median : 3.007
Mean : 3.725
3rd Qu.: 4.636
Max. :13.471
ggpairs(select(dw, -repository_language))
# XML e Bluespec têm mais de 50 pushes / repositório e
# outras linguagens têm também números estranhos. Filtraremos.
dw <- dw %>%
filter(PushEvent < 50, IssuesEvent < 50, ForkEvent < 18)
As variáveis são bastante assimétricas e concentradas em pequenos valores. Transformá-las para log ajuda na visualização.
# Escala de log
dw2 <- log(dw[,2:5])
dw2$repository_language <- dw$repository_language
ggpairs(select(dw2, -repository_language))
summary(select(dw2, -repository_language))
ForkEvent IssuesEvent PushEvent WatchEvent
Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
1st Qu.:0.4055 1st Qu.:1.232 1st Qu.:1.949 1st Qu.:0.6931
Median :0.7340 Median :1.541 Median :2.218 Median :1.1009
Mean :0.7409 Mean :1.558 Mean :2.142 Mean :1.1268
3rd Qu.:1.0512 3rd Qu.:1.946 3rd Qu.:2.348 3rd Qu.:1.5361
Max. :1.7918 Max. :3.497 Max. :3.281 Max. :2.5320
#dw2.scaled = scale(select(dw2, -repository_language))
dw2.scaled = select(dw2, -repository_language) %>%
mutate_each(funs(scale))
summary(dw2.scaled)
ForkEvent.V1 IssuesEvent.V1 PushEvent.V1
Min. :-1.8062975 Min. :-2.2241452 Min. :-5.194659
1st Qu.:-0.8177696 1st Qu.:-0.4652675 1st Qu.:-0.468106
Median :-0.0168734 Median :-0.0247915 Median : 0.182991
Mean : 0.0000000 Mean : 0.0000000 Mean : 0.000000
3rd Qu.: 0.7564441 3rd Qu.: 0.5536299 3rd Qu.: 0.499225
Max. : 2.5620297 Max. : 2.7670985 Max. : 2.760632
WatchEvent.V1
Min. :-1.8861505
1st Qu.:-0.7258998
Median :-0.0433924
Mean : 0.0000000
3rd Qu.: 0.6850340
Max. : 2.3521652
row.names(dw2.scaled) = dw2$repository_language
dists = dist(dw2.scaled, method = "euclidean")
hc = hclust(dists, method = "ward.D")
plot(hc, cex = .6)
plot(hc, hang = -1)
n_clusters = 4
rect.hclust(hc, k=n_clusters)
dw2$cluster = factor(cutree(hc, k=n_clusters))
dw2.scaled$repository_language = dw2$repository_language
dw2.scaled$cluster = factor(cutree(hc, k=n_clusters))
dw2.long = melt(dw2.scaled, id.vars = c("repository_language", "cluster"))
attributes are not identical across measure variables; they will be dropped
plot(silhouette(cutree(hc, k = n_clusters), dists), col = RColorBrewer::brewer.pal(n_clusters, "Set2"))
ggplot(dw2.long, aes(x = variable, y = value, colour = cluster)) +
geom_boxplot() +
geom_point(alpha = 0.2) +
geom_line() +
facet_wrap(~ cluster)
toclust = dw2.scaled %>%
rownames_to_column(var = "language") %>%
select(1:5)
dists = toclust %>%
select(-language) %>%
dist() # só para plotar silhouetas depois
km = toclust %>%
select(-language) %>%
kmeans(centers = n_clusters, nstart = 20)
km %>%
augment(toclust) %>%
gather(key = "variável", value = "valor", -language, -.cluster) %>%
ggplot(aes(x = `variável`, y = valor, group = language, colour = .cluster)) +
geom_point(alpha = 0.2) +
geom_line(alpha = .5) +
facet_wrap(~ .cluster)
attributes are not identical across measure variables; they will be dropped
#autoplot(km, data = dw2.scaled, size = 3)
autoplot(km, data = dw2.scaled, label = TRUE)
plot(silhouette(km$cluster, dists), col = RColorBrewer::brewer.pal(n_clusters, "Set2"))
dw2.scaled$kmcluster = km$cluster
dw2.long = melt(dw2.scaled, id.vars = c("repository_language", "cluster", "kmcluster"))
attributes are not identical across measure variables; they will be dropped
table(km$cluster)
1 2 3 4
40 37 27 17
km %>%
augment(toclust) %>%
select(language, .cluster) %>%
filter(.cluster == 1)
filmes = readr::read_csv("dados/filmes-scarlett-johanssson.csv")
Parsed with column specification:
cols(
RATING = col_double(),
TITLE = col_character(),
CREDIT = col_character(),
`BOX OFFICE` = col_double(),
YEAR = col_integer()
)
filmes_t = filmes %>%
mutate(`BOX OFFICE` = scale(log10(`BOX OFFICE`)),
RATING = scale(RATING))
atribuicoes = tibble(k = 1:6) %>%
group_by(k) %>%
do(kmeans(select(filmes_t, RATING, `BOX OFFICE`),
centers = .$k,
nstart = 10) %>% augment(filmes)) # alterne entre filmes e filmes_t no augment
Unequal factor levels: coercing to character
atribuicoes_long = atribuicoes %>%
gather(key = "variavel", value = "valor", -TITLE, -k, -.cluster, -CREDIT)
atribuicoes %>%
ggplot(aes(x = RATING, y = `BOX.OFFICE`, label = TITLE, colour = .cluster)) +
geom_point() +
#geom_text() +
facet_wrap(~ k)
#+ scale_y_log10()
# A silhoueta
dists = select(filmes_t, RATING, `BOX OFFICE`) %>% dist()
km = kmeans(select(filmes_t, RATING, `BOX OFFICE`),
centers = 4,
nstart = 10)
silhouette(km$cluster, dists) %>%
plot(col = RColorBrewer::brewer.pal(4, "Set2"))
O dataset ruspini é clássico para ilustrar agrupamento.
str(ruspini)
'data.frame': 75 obs. of 2 variables:
$ x: int 4 5 10 9 13 13 12 15 18 19 ...
$ y: int 53 63 59 77 49 69 88 75 61 65 ...
ggplot(ruspini, aes(x = x, y = y)) +
geom_point(size = 3)
summary(ruspini)
x y
Min. : 4.00 Min. : 4.00
1st Qu.: 31.50 1st Qu.: 56.50
Median : 52.00 Median : 96.00
Mean : 54.88 Mean : 92.03
3rd Qu.: 76.50 3rd Qu.:141.50
Max. :117.00 Max. :156.00
rs <- data.frame((ruspini))
rs <- data.frame(scale(ruspini))
colMeans(rs)
x y
-7.184068e-17 -8.854029e-17
ggplot(rs, aes(x = x, y = y)) +
geom_point(size = 3)
dists = dist(rs, method = "euclidean")
hc = hclust(dists, method = "ward.D")
plot(hc, hang = -1, cex = 0.8)
rect.hclust(hc, k=4)
rs$cluster = factor(cutree(hc, k=4))
ggplot(rs, aes(x = x, y = y, colour = cluster)) +
geom_point(size = 3)
rs$cluster = factor(cutree(hc, k=8))
ggplot(rs, aes(x = x, y = y, colour = cluster, label = cluster)) +
geom_point(size = 2) +
geom_text(hjust = -.1, vjust = 1) +
xlim(0, 150)
plot(silhouette(cutree(hc, k = 4), dists))
plot(silhouette(cutree(hc, k = 6), dists))
#heatmap(as.matrix(dw2[,1:4]), Colv=F, scale='none')
#hc.data <- dendro_data(hc)
#ggdendrogram(hc.data, rotate = TRUE) +
#labs(title = "Agrupamento de Rustini")
km <- kmeans(rs, centers=4, nstart=10)
km
K-means clustering with 4 clusters of sizes 17, 20, 15, 23
Cluster means:
x y cluster
1 1.4194387 0.4692907 5.882353
2 -1.1385941 -0.5559591 1.500000
3 0.4607268 -1.4912271 8.000000
4 -0.3595425 1.1091151 3.565217
Clustering vector:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
Within cluster sum of squares by cluster:
[1] 17.405982 7.705477 1.082373 8.310853
(between_SS / total_SS = 94.1 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
autoplot(km, data = rs)
Error in colMeans(x, na.rm = TRUE) : 'x' deve ser numérico