library(tidyverse) # ggplot2, tidyr, dplyr, etc
library(broom) # facilita lidar com modelos e trata o resultado do kmeans como modelo
library(ggfortify, quietly = TRUE) # plots para modelos
# http://rpubs.com/sinhrks/basics
require(GGally, quietly = TRUE)
library(knitr, quietly = TRUE)
library(cluster)
theme_set(theme_bw())
source("github-lib.R")
Tal como vínhamos fazendo antes:
library(ggplot2movies)
glimpse(movies)
## Observations: 58,788
## Variables: 24
## $ title <chr> "$", "$1000 a Touchdown", "$21 a Day Once a Month"...
## $ year <int> 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19...
## $ length <int> 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10...
## $ budget <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rating <dbl> 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, ...
## $ votes <int> 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44...
## $ r1 <dbl> 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2 <dbl> 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,...
## $ r3 <dbl> 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4 <dbl> 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4....
## $ r5 <dbl> 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, ...
## $ r6 <dbl> 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,...
## $ r7 <dbl> 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, ...
## $ r8 <dbl> 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4...
## $ r9 <dbl> 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4....
## $ r10 <dbl> 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24....
## $ mpaa <chr> "", "", "", "", "", "", "R", "", "", "", "", "", "...
## $ Action <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ Animation <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,...
## $ Drama <int> 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Short <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...
# Working set: action movies from 1970 on, 90 to 180 minutes long, with a
# budget above 1e5. Only the columns used in the analysis are kept.
filmes <- movies %>%
  filter(
    Action == 1,
    year >= 1970,
    length >= 90,
    length <= 180,
    budget > 1e5
  ) %>%
  transmute(
    title = paste0(title, " (", year, ")"), # append the release year to the title
    age = (2017 - year),                    # age relative to 2017
    budget,
    love = r10 / (r10 + r1),                # r10 as a fraction of r1 + r10
    votes
  ) %>%
  filter(complete.cases(.)) # complete.cases takes a data frame, not vectors
Descritivo express:
# One histogram per variable (budget shown in millions), each panel with its
# own x scale.
filmes %>%
  mutate(budget = budget / 1e6) %>%
  gather(key = "variavel", value = "valor", -title) %>%
  ggplot(aes(x = valor)) +
  geom_histogram(bins = 20, color = "black", fill = "white") +
  facet_grid(. ~ variavel, scales = "free_x")
# Pairwise plots of the numeric variables.
# `size` is not an argument of ggpairs(): passing it at the top level only
# produced an "Extra arguments: 'size' are being ignored" warning. The point
# size for the lower panels is set through wrap() instead.
filmes %>%
  select(-title) %>%
  ggpairs(lower = list(continuous = wrap("points", size = .5, alpha = 0.3)))
## Warning in ggpairs(., size = 0.5, lower = list(continuous =
## wrap("points", : Extra arguments: 'size' are being ignored. If these are
## meant to be aesthetics, submit them using the 'mapping' variable within
## ggpairs with ggplot2::aes or ggplot2::aes_string.
# Put budget and votes on a log10 scale, then standardize (z-score) every
# column from age through votes. as.numeric() drops the matrix returned by
# scale() so the columns stay plain numeric vectors.
filmes_transformado <- filmes %>%
  # select(-title) %>%
  mutate(
    votes = log10(votes),
    budget = log10(budget)
  ) %>%
  mutate_at(vars(age:votes), funs(as.numeric(scale(.))))
# Same per-variable histograms as before, now on the transformed data.
filmes_transformado %>%
  gather(key = "variavel", value = "valor", -title) %>%
  ggplot(aes(x = valor)) +
  geom_histogram(bins = 20, color = "black", fill = "lightyellow") +
  facet_grid(. ~ variavel, scales = "free_x")
Agrupando com kmeans
# Fit k-means for k = 3..15 (title is excluded from the features) and keep
# the one-row glance() summary of each fit.
set.seed(123)
explorando_k <- tibble(k = 3:15) %>%
  group_by(k) %>%
  do(glance(
    kmeans(select(filmes_transformado, -title), centers = .$k, nstart = 20)
  ))

# Total within-cluster sum of squares as a function of k.
explorando_k %>%
  ggplot(aes(x = k, y = tot.withinss)) +
  geom_line() +
  geom_point()
Usaremos k = 6
# Final clustering with k = 6 on the transformed variables (title excluded).
filmes_km <- kmeans(select(filmes_transformado, -title), centers = 6, nstart = 20)

# Transformed data plus the cluster assigned to each movie.
filmes_transformado_agrupado <- augment(filmes_km, filmes_transformado)
Visualizando com coordenadas paralelas
# Parallel coordinates: one line per movie across the variables, one panel
# per cluster.
augment(filmes_km, filmes_transformado) %>%
  gather(key = "variável", value = "valor", -title, -.cluster) %>%
  ggplot(aes(x = `variável`, y = valor, group = title, colour = .cluster)) +
  geom_line(alpha = .2) +
  facet_wrap(~ .cluster)
Não deixe de dar uma olhada nessa explicação visual sobre PCA.
Encontrando os componentes:
# PCA on the variables that were already standardized above, hence
# scale. = FALSE (argument name spelled out instead of relying on partial
# matching of `scale`).
# The titles become row names so they label the samples. The data is
# converted to a plain data frame first, because setting row names on a
# tibble is deprecated and raised a warning.
filmes_pca <- filmes_transformado %>%
  as.data.frame() %>%
  column_to_rownames("title") %>%
  prcomp(scale. = FALSE)
## Warning: Setting row names on a tibble is deprecated.
Os componentes e sua relação com as variáveis originais
print(as.data.frame(filmes_pca$rotation))
## PC1 PC2 PC3 PC4
## age 0.37189816 0.4498170 -0.7652395 -0.2715969
## budget -0.69785923 -0.0293650 -0.1051809 -0.7078610
## love 0.07118599 0.7969047 0.5697121 -0.1878925
## votes -0.60796119 0.4021757 -0.2806662 0.6243912
Usando o pacote broom para acessar os resultados de prcomp via data frames (em lugar de listas com atributos que você nem sempre sabe quais são).
A mesma coisa que a matriz acima, mas em formato long:
tidy(filmes_pca, "variables")
## column PC value
## 1 age 1 0.37189816
## 2 budget 1 -0.69785923
## 3 love 1 0.07118599
## 4 votes 1 -0.60796119
## 5 age 2 0.44981702
## 6 budget 2 -0.02936500
## 7 love 2 0.79690468
## 8 votes 2 0.40217568
## 9 age 3 -0.76523954
## 10 budget 3 -0.10518095
## 11 love 3 0.56971212
## 12 votes 3 -0.28066620
## 13 age 4 -0.27159686
## 14 budget 4 -0.70786097
## 15 love 4 -0.18789250
## 16 votes 4 0.62439122
Analisando quanta variância cada PC captura:
tidy(filmes_pca, "pcs")
## PC std.dev percent cumulative
## 1 1 1.3592139 0.46187 0.46187
## 2 2 1.0522660 0.27682 0.73868
## 3 3 0.9306917 0.21655 0.95523
## 4 4 0.4231865 0.04477 1.00000
# Cumulative proportion of variance per principal component, with the value
# printed next to each point.
filmes_pca %>%
  tidy("pcs") %>%
  ggplot(aes(x = PC, y = cumulative, label = cumulative)) +
  geom_line() +
  geom_point() +
  geom_text(hjust = -.1, vjust = 1)
O valor dos PCs para cada ponto dos dados originais.
tidy(filmes_pca, "samples") %>%
head()
## row PC value
## 1 13th Warrior, The (1999) 1 -1.3152153
## 2 2 Fast 2 Furious (2003) 1 -1.3984194
## 3 3000 Miles to Graceland (2001) 1 -1.0445748
## 4 51st State, The (2001) 1 -0.6572037
## 5 6th Day, The (2000) 1 -1.2870050
## 6 72 metra (2004) 1 1.8108625
Como sempre, visualizar é necessário.
# augment() is handy for combining a model with data you already have: here
# it adds the PC scores to the clustered, transformed data.
au <- filmes_pca %>%
  augment(data = filmes_transformado_agrupado)
glimpse(au)
## Observations: 600
## Variables: 11
## $ .rownames <fctr> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ title <chr> "13th Warrior, The (1999)", "2 Fast 2 Furious (2003...
## $ age <dbl> -0.4899323, -0.9446259, -0.7172791, -0.7172791, -0....
## $ budget <dbl> 0.98043149, 0.90718177, 0.77392727, 0.25364963, 0.9...
## $ love <dbl> -0.3458892, -1.6396288, -0.3458892, -0.3458892, -0....
## $ votes <dbl> 0.69771681, 0.48902954, 0.35052440, 0.31057032, 0.6...
## $ .cluster <fctr> 2, 2, 2, 2, 2, 1, 2, 4, 5, 1, 2, 2, 2, 4, 6, 2, 1,...
## $ .fittedPC1 <dbl> -1.3152153, -1.3984194, -1.0445748, -0.6572037, -1....
## $ .fittedPC2 <dbl> -0.24420625, -1.56150026, -0.48003905, -0.48082966,...
## $ .fittedPC3 <dbl> -0.121089894, -0.443923565, 0.172050348, 0.23798740...
## $ .fittedPC4 <dbl> -0.060306872, 0.227818569, -0.069167808, 0.27416945...
# Movies projected onto the first two principal components.
au %>%
  ggplot(aes(x = .fittedPC1, y = .fittedPC2)) +
  geom_point(alpha = .5, size = .7)
Tentando ver a estrutura de grupos. Não conseguimos diferenciar muito os grupos.
# Same projection, coloured by k-means cluster.
au %>%
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, color = .cluster)) +
  geom_point(alpha = .7, size = .8)
Para entender, sempre é uma boa ver exemplos
# Label a random sample so individual movies can be read on the plot.
set.seed(122)
au %>%
  sample_n(50) %>% # plot 50 random movies
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, color = .cluster)) +
  geom_point(size = .2) +
  geom_text(aes(label = title), size = 3, hjust = 0, vjust = 1)
Como nossos labels são grandes, interação ajuda
# No meu RStudio, o gráfico não aparece interativo quando rodo dentro do notebook,
# mas aparece se rodo no console ou quando faço knit html.
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.3.2
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
# Interactive scatter of PC1 vs PC2, one series per cluster. The tooltip shows
# the title and the (transformed) variables of the hovered movie.
p <- hchart(au, "scatter", hcaes(x = .fittedPC1, y = .fittedPC2, group = .cluster)) %>%
  hc_tooltip(pointFormat = "<b>{point.title}</b><br>
$$$: {point.budget:,.2f}<br>
votes: {point.votes:,.2f}<br>
love: {point.love:,.2f}<br>
age: {point.age:,.2f}")
p
O biplot ajuda muito também. Existe um do R base (stats) e um do ggfortify, que é mais configurável:
# Base R biplot of the PCA.
biplot(filmes_pca, scale = 0)
# autoplot(filmes_pca, label = FALSE, label.size = 3, shape = TRUE)

# ggfortify biplot coloured by k-means cluster.
# `colour` must be a single colour or the name of a column in `data`. Passing
# the whole cluster vector made ggfortify evaluate an `if` on a length > 1
# condition (a warning on old R, an error on R >= 4.2). The clustered data is
# supplied instead and the colour is mapped to its `.cluster` column.
autoplot(filmes_pca,
  data = as.data.frame(filmes_transformado_agrupado), colour = ".cluster",
  label = FALSE, label.size = 3, shape = TRUE,
  loadings = TRUE, loadings.colour = "darkorange",
  loadings.label = TRUE, loadings.label.size = 3
)
## Warning in if (value %in% columns) {: a condição tem comprimento > 1 e
## somente o primeiro elemento será usado
E com um método não linear?
require(Rtsne)
## Loading required package: Rtsne
# t-SNE embedding of the transformed movie variables (title excluded).
set.seed(111)
tsne.out <- Rtsne(
  select(filmes_transformado, -title),
  perplexity = 40, # you can vary this parameter; the author recommends 5-50
  verbose = TRUE
)
## Read the 600 x 4 data matrix successfully!
## Using no_dims = 2, perplexity = 40.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 600
## Done in 0.07 seconds (sparsity = 0.271611)!
## Learning embedding...
## Iteration 50: error is 59.000959 (50 iterations in 0.30 seconds)
## Iteration 100: error is 55.307531 (50 iterations in 0.28 seconds)
## Iteration 150: error is 55.294459 (50 iterations in 0.27 seconds)
## Iteration 200: error is 55.291596 (50 iterations in 0.28 seconds)
## Iteration 250: error is 55.290610 (50 iterations in 0.29 seconds)
## Iteration 300: error is 0.535510 (50 iterations in 0.26 seconds)
## Iteration 350: error is 0.468716 (50 iterations in 0.24 seconds)
## Iteration 400: error is 0.453866 (50 iterations in 0.25 seconds)
## Iteration 450: error is 0.448520 (50 iterations in 0.25 seconds)
## Iteration 500: error is 0.443614 (50 iterations in 0.24 seconds)
## Iteration 550: error is 0.442158 (50 iterations in 0.25 seconds)
## Iteration 600: error is 0.440338 (50 iterations in 0.24 seconds)
## Iteration 650: error is 0.439648 (50 iterations in 0.23 seconds)
## Iteration 700: error is 0.438779 (50 iterations in 0.24 seconds)
## Iteration 750: error is 0.438330 (50 iterations in 0.24 seconds)
## Iteration 800: error is 0.437489 (50 iterations in 0.24 seconds)
## Iteration 850: error is 0.437496 (50 iterations in 0.24 seconds)
## Iteration 900: error is 0.437153 (50 iterations in 0.24 seconds)
## Iteration 950: error is 0.436610 (50 iterations in 0.24 seconds)
## Iteration 1000: error is 0.436558 (50 iterations in 0.24 seconds)
## Fitting performed in 5.06 seconds.
# Append the two t-SNE coordinates (V1, V2) to the augmented PCA data.
df <- as.data.frame(tsne.out$Y)
filmes_tsne <- cbind(au, df)
glimpse(filmes_tsne)
## Observations: 600
## Variables: 13
## $ .rownames <fctr> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ title <chr> "13th Warrior, The (1999)", "2 Fast 2 Furious (2003...
## $ age <dbl> -0.4899323, -0.9446259, -0.7172791, -0.7172791, -0....
## $ budget <dbl> 0.98043149, 0.90718177, 0.77392727, 0.25364963, 0.9...
## $ love <dbl> -0.3458892, -1.6396288, -0.3458892, -0.3458892, -0....
## $ votes <dbl> 0.69771681, 0.48902954, 0.35052440, 0.31057032, 0.6...
## $ .cluster <fctr> 2, 2, 2, 2, 2, 1, 2, 4, 5, 1, 2, 2, 2, 4, 6, 2, 1,...
## $ .fittedPC1 <dbl> -1.3152153, -1.3984194, -1.0445748, -0.6572037, -1....
## $ .fittedPC2 <dbl> -0.24420625, -1.56150026, -0.48003905, -0.48082966,...
## $ .fittedPC3 <dbl> -0.121089894, -0.443923565, 0.172050348, 0.23798740...
## $ .fittedPC4 <dbl> -0.060306872, 0.227818569, -0.069167808, 0.27416945...
## $ V1 <dbl> 18.3588308, 6.6289799, 17.5394092, 13.3644499, 18.3...
## $ V2 <dbl> 1.1869690, -17.2347673, 5.6956695, 6.2541034, 2.992...
# Static view of the t-SNE embedding, coloured by k-means cluster.
filmes_tsne %>%
  ggplot(aes(x = V1, y = V2, color = .cluster)) +
  geom_point(size = 1, alpha = 0.8)

# Interactive view of the same embedding with per-movie tooltips.
hchart(filmes_tsne, "scatter", hcaes(x = V1, y = V2, group = .cluster)) %>%
  hc_tooltip(pointFormat = "<b>{point.title}</b><br>
$$$: {point.budget:,.2f}<br>
votes: {point.votes:,.2f}<br>
love: {point.love:,.2f}<br>
age: {point.age:,.2f}")
#plotly::ggplotly(p)
# Load the GitHub event data in wide format: a repository_language column
# plus one numeric column per event type.
# NOTE(review): load_github_wide() is presumably defined in the sourced
# github-lib.R — confirm.
dw <- load_github_wide()
#write.csv(dw, "github-20141.csv", row.names = FALSE)
summary(dw)
## repository_language ForkEvent IssuesEvent PushEvent
## ActionScript: 1 Min. : 1.000 Min. : 1.000 Min. : 1.000
## Ada : 1 1st Qu.: 1.509 1st Qu.: 3.437 1st Qu.: 7.052
## Agda : 1 Median : 2.083 Median : 4.750 Median : 9.314
## ANTLR : 1 Mean : 2.454 Mean : 7.311 Mean : 10.921
## Apex : 1 3rd Qu.: 2.913 3rd Qu.: 7.269 3rd Qu.: 10.602
## AppleScript : 1 Max. :18.000 Max. :63.000 Max. :154.250
## (Other) :121
## WatchEvent
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.007
## Mean : 3.725
## 3rd Qu.: 4.636
## Max. :13.471
##
# Keep only rows below the given thresholds on each event count, then look at
# the pairwise relationships between the numeric columns.
dw <- filter(dw, PushEvent < 50, IssuesEvent < 50, ForkEvent < 18)
dw %>%
  select(-repository_language) %>%
  ggpairs()
As variáveis são bastante assimétricas e concentradas em pequenos valores. Transformá-las para log ajuda na visualização.
# Log scale for the four event columns (columns 2 to 5 of dw).
dw2 <- log(dw[, 2:5])
dw2$repository_language <- dw$repository_language
dw2 %>%
  select(-repository_language) %>%
  ggpairs()
# Use the language as row name.
row.names(dw2) <- dw2$repository_language
# PCA on the log variables, standardized by prcomp itself (scale. = TRUE).
pr.out <- prcomp(select(dw2, -repository_language), scale. = TRUE)
kable(pr.out$rotation)
| | PC1 | PC2 | PC3 | PC4 |
|---|---|---|---|---|
| ForkEvent | -0.5925553 | 0.3508754 | -0.0117162 | -0.7250017 |
| IssuesEvent | -0.4199860 | -0.5634247 | -0.7067105 | 0.0820040 |
| PushEvent | -0.4165046 | -0.5687685 | 0.7072059 | 0.0537231 |
| WatchEvent | -0.5468217 | 0.4857380 | 0.0168181 | 0.6817344 |
# Base R biplot, then the ggfortify versions: labels only, and labels plus
# variable loadings.
biplot(pr.out, scale = 0)
autoplot(pr.out, shape = FALSE, label = TRUE, label.size = 3)
autoplot(pr.out,
  shape = FALSE, label = TRUE, label.size = 3,
  loadings = TRUE, loadings.colour = "blue",
  loadings.label = TRUE, loadings.label.size = 3
)
# Porcentagem da variância explicada:
#' Plot the cumulative proportion of variance explained by a PCA.
#'
#' @param prout A fitted PCA object with an `sdev` element holding the standard
#'   deviation of each principal component (e.g. the result of `prcomp()`).
#' @return A ggplot object with the component index on x and the cumulative
#'   proportion of variance explained on y.
plot_pve <- function(prout) {
  # Read from the argument: the previous version used the global `pr.out`,
  # so it silently plotted whatever PCA happened to be stored there.
  pr_var <- prout$sdev^2
  pve <- pr_var / sum(pr_var)
  df <- data.frame(x = seq_along(pve), y = cumsum(pve))
  ggplot(df, aes(x = x, y = y)) +
    geom_point(size = 3) +
    geom_line() +
    labs(x = "Principal Component", y = "Cumulative Proportion of Variance Explained")
}
plot_pve(pr.out)
# library() fails loudly if Rtsne is missing; require() would only return FALSE.
library(Rtsne)
# t-SNE is stochastic: fix the seed so the embedding is reproducible, as is
# done for the movies embedding earlier in this file.
set.seed(111)
tsne.out <- Rtsne(select(dw, -repository_language), verbose = TRUE)
## Read the 121 x 4 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 121
## Done in 0.01 seconds (sparsity = 0.894065)!
## Learning embedding...
## Iteration 50: error is 52.736671 (50 iterations in 0.04 seconds)
## Iteration 100: error is 50.353310 (50 iterations in 0.04 seconds)
## Iteration 150: error is 49.675916 (50 iterations in 0.05 seconds)
## Iteration 200: error is 52.466622 (50 iterations in 0.04 seconds)
## Iteration 250: error is 49.123574 (50 iterations in 0.04 seconds)
## Iteration 300: error is 1.164690 (50 iterations in 0.04 seconds)
## Iteration 350: error is 0.549814 (50 iterations in 0.04 seconds)
## Iteration 400: error is 0.285701 (50 iterations in 0.04 seconds)
## Iteration 450: error is 0.280724 (50 iterations in 0.04 seconds)
## Iteration 500: error is 0.282282 (50 iterations in 0.04 seconds)
## Iteration 550: error is 0.282861 (50 iterations in 0.04 seconds)
## Iteration 600: error is 0.282053 (50 iterations in 0.04 seconds)
## Iteration 650: error is 0.281390 (50 iterations in 0.05 seconds)
## Iteration 700: error is 0.283031 (50 iterations in 0.04 seconds)
## Iteration 750: error is 0.282517 (50 iterations in 0.04 seconds)
## Iteration 800: error is 0.283731 (50 iterations in 0.04 seconds)
## Iteration 850: error is 0.280733 (50 iterations in 0.04 seconds)
## Iteration 900: error is 0.281272 (50 iterations in 0.04 seconds)
## Iteration 950: error is 0.280861 (50 iterations in 0.04 seconds)
## Iteration 1000: error is 0.281710 (50 iterations in 0.04 seconds)
## Fitting performed in 0.82 seconds.
# t-SNE coordinates (V1, V2) together with the language of each row.
df <- as.data.frame(tsne.out$Y)
df$repository_language <- dw$repository_language

# Points only.
df %>%
  ggplot(aes(x = V1, y = V2, label = repository_language)) +
  geom_point(size = 3, alpha = 0.8, color = "tomato")

# Fainter points with the language names next to them.
df %>%
  ggplot(aes(x = V1, y = V2, label = repository_language)) +
  geom_point(size = 3, alpha = 0.2, color = "tomato") +
  geom_text(size = 4, alpha = .7, hjust = -.2)
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
row.names(USArrests)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
# PCA of USArrests with standardized variables, followed by the base and
# ggfortify biplots and the cumulative variance plot.
pr.out <- prcomp(USArrests, scale. = TRUE)
biplot(pr.out)
autoplot(pr.out, label = TRUE)
autoplot(pr.out,
  shape = FALSE, label = TRUE, label.size = 3,
  loadings = TRUE, loadings.colour = "blue",
  loadings.label = TRUE, loadings.label.size = 3
)
plot_pve(pr.out)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
glimpse(iris)
## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species <fctr> setosa, setosa, setosa, setosa, setosa, setosa, ...
# Pairwise plots and PCA of the four iris measurements (Species excluded).
iris %>%
  select(-Species) %>%
  ggpairs()
pr.out <- iris %>%
  select(-Species) %>%
  prcomp(scale. = TRUE)
plot_pve(pr.out)
# Biplot with points coloured by the Species column of the original data.
autoplot(pr.out,
  data = iris, colour = "Species", size = 3,
  loadings = TRUE, loadings.colour = "blue",
  loadings.label = TRUE, loadings.label.size = 3
)
# library() fails loudly if Rtsne is missing; require() would only return FALSE.
library(Rtsne)
# Drop duplicated rows of the piped data (not of the global `iris`), then
# z-score the measurements. as.numeric() keeps the columns as plain vectors
# instead of the one-column matrices scale() returns, matching how the movie
# data is standardized earlier in this file.
iris2 <- iris %>%
  filter(!duplicated(.)) %>%
  mutate_at(vars(-Species), funs(as.numeric(scale(.))))
# t-SNE is stochastic: fix the seed so the embedding is reproducible.
set.seed(111)
tsne.out <- Rtsne(
  select(iris2, -Species),
  verbose = FALSE,
  perplexity = 20 # sometimes it is worth playing with this parameter
)
# t-SNE coordinates (V1, V2) together with the species of each row.
df <- as.data.frame(tsne.out$Y)
df$Species <- iris2$Species

# Embedding coloured by species.
df %>%
  ggplot(aes(x = V1, y = V2, label = Species, color = Species)) +
  geom_point(size = 3, alpha = 0.8)

# Same embedding with fainter points.
df %>%
  ggplot(aes(x = V1, y = V2, label = Species)) +
  geom_point(aes(color = Species), size = 3, alpha = 0.2)