library(tidyverse) # ggplot2, tidyr, dplyr, etc
library(broom) # facilita lidar com modelos e trata o resultado do kmeans como modelo
library(ggfortify, quietly = TRUE) # plots para modelos
# http://rpubs.com/sinhrks/basics
require(GGally, quietly = TRUE)
library(knitr, quietly = TRUE)
library(cluster)

theme_set(theme_bw())
source("github-lib.R")

Intro e filmes

Tal como vínhamos fazendo antes:

library(ggplot2movies)
glimpse(movies)

## Observations: 58,788
## Variables: 24
## $ title       <chr> "$", "$1000 a Touchdown", "$21 a Day Once a Month"...
## $ year        <int> 1971, 1939, 1941, 1996, 1975, 2000, 2002, 2002, 19...
## $ length      <int> 121, 71, 7, 70, 71, 91, 93, 25, 97, 61, 99, 96, 10...
## $ budget      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ rating      <dbl> 6.4, 6.0, 8.2, 8.2, 3.4, 4.3, 5.3, 6.7, 6.6, 6.0, ...
## $ votes       <int> 348, 20, 5, 6, 17, 45, 200, 24, 18, 51, 23, 53, 44...
## $ r1          <dbl> 4.5, 0.0, 0.0, 14.5, 24.5, 4.5, 4.5, 4.5, 4.5, 4.5...
## $ r2          <dbl> 4.5, 14.5, 0.0, 0.0, 4.5, 4.5, 0.0, 4.5, 4.5, 0.0,...
## $ r3          <dbl> 4.5, 4.5, 0.0, 0.0, 0.0, 4.5, 4.5, 4.5, 4.5, 4.5, ...
## $ r4          <dbl> 4.5, 24.5, 0.0, 0.0, 14.5, 14.5, 4.5, 4.5, 0.0, 4....
## $ r5          <dbl> 14.5, 14.5, 0.0, 0.0, 14.5, 14.5, 24.5, 4.5, 0.0, ...
## $ r6          <dbl> 24.5, 14.5, 24.5, 0.0, 4.5, 14.5, 24.5, 14.5, 0.0,...
## $ r7          <dbl> 24.5, 14.5, 0.0, 0.0, 0.0, 4.5, 14.5, 14.5, 34.5, ...
## $ r8          <dbl> 14.5, 4.5, 44.5, 0.0, 0.0, 4.5, 4.5, 14.5, 14.5, 4...
## $ r9          <dbl> 4.5, 4.5, 24.5, 34.5, 0.0, 14.5, 4.5, 4.5, 4.5, 4....
## $ r10         <dbl> 4.5, 14.5, 24.5, 45.5, 24.5, 14.5, 14.5, 14.5, 24....
## $ mpaa        <chr> "", "", "", "", "", "", "R", "", "", "", "", "", "...
## $ Action      <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,...
## $ Animation   <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Comedy      <int> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,...
## $ Drama       <int> 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ Documentary <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Romance     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Short       <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,...

# Action movies from 1970 onwards, 90 to 180 minutes long, with a budget
# above 100k. Keeps the title (with the year appended), the age relative to
# 2017, budget, votes and `love` (r10 relative to r1 + r10).
filmes <- movies %>%
  filter(year >= 1970) %>%
  filter(length >= 90, length <= 180) %>%
  filter(Action == 1, budget > 1e5) %>%
  mutate(
    age = 2017 - year,
    title = paste0(title, " (", year, ")"),
    love = r10 / (r10 + r1)
  ) %>%
  select(title, age, budget, love, votes) %>%
  filter(complete.cases(.)) # complete.cases takes a data frame, not vectors

Descritivo express:

# One histogram per variable (budget shown in millions), each facet with its
# own x scale.
filmes %>%
  mutate(budget = budget / 1e6) %>%
  gather("variavel", "valor", -title) %>%
  ggplot(aes(valor)) +
  geom_histogram(bins = 20, colour = "black", fill = "white") +
  facet_grid(. ~ variavel, scales = "free_x")

# Pairwise plots of the numeric variables.
# Point size is set only through wrap(): a top-level `size` argument is not an
# aesthetic, so ggpairs() ignores it and emits an "Extra arguments" warning.
filmes %>%
  select(-title) %>%
  ggpairs(lower = list(continuous = wrap("points", size = .5, alpha = 0.3)))

## Warning in ggpairs(., size = 0.5, lower = list(continuous =
## wrap("points", : Extra arguments: 'size' are being ignored. If these are
## meant to be aesthetics, submit them using the 'mapping' variable within
## ggpairs with ggplot2::aes or ggplot2::aes_string.

# Put budget and votes on a log10 scale, then standardise every numeric
# column from age to votes (centred and divided by its standard deviation).
filmes_transformado <- filmes %>%
  mutate(
    budget = log10(budget),
    votes = log10(votes)
  ) %>%
  # scale() returns a one-column matrix; as.numeric() turns it back into a
  # plain vector. A regular function is used instead of the deprecated funs().
  mutate_at(vars(age:votes), function(x) as.numeric(scale(x)))

# Same histograms as before, now for the transformed (log + standardised) data
filmes_transformado %>%
  gather("variavel", "valor", -title) %>%
  ggplot(aes(valor)) +
  geom_histogram(bins = 20, colour = "black", fill = "lightyellow") +
  facet_grid(. ~ variavel, scales = "free_x")

Agrupando com kmeans

# Fit k-means for k = 3..15 and keep the one-row model summary (glance) of
# each fit, so the fits can be compared by tot.withinss.
set.seed(123)
explorando_k <- tibble(k = 3:15) %>%
  group_by(k) %>%
  do(glance(kmeans(
    select(filmes_transformado, -title),
    centers = .$k, # `.` is the current one-row group inside do()
    nstart = 20
  )))

# Total within-cluster sum of squares as a function of k
ggplot(explorando_k, aes(k, tot.withinss)) +
  geom_line() +
  geom_point()

Usaremos k = 6

# Final clustering with k = 6, fitted on the numeric columns only
filmes_km <- kmeans(
  select(filmes_transformado, -title),
  centers = 6,
  nstart = 20
)

# Attach each movie's cluster assignment (.cluster) to the transformed data
filmes_transformado_agrupado <- augment(filmes_km, filmes_transformado)

Visualizando com coordenadas paralelas

# Parallel coordinates: one line per movie across the variables, one panel
# per cluster. Uses the augmented data computed above, which is exactly
# augment(filmes_km, filmes_transformado).
filmes_transformado_agrupado %>%
  gather("variável", "valor", -title, -.cluster) %>%
  ggplot(aes(`variável`, valor, group = title, colour = .cluster)) +
  geom_line(alpha = .2) +
  facet_wrap(~ .cluster)

Visualizando 4d em 2d com PCA

Não deixe de dar uma olhada nessa explicação visual sobre PCA.

Encontrando os componentes:

# PCA on the transformed variables. Titles become row names so they identify
# the observations in the PCA result. The tibble is converted to a plain
# data frame first because setting row names on a tibble is deprecated.
filmes_pca <- filmes_transformado %>%
  as.data.frame() %>%
  column_to_rownames("title") %>%
  prcomp(scale. = FALSE) # columns were already standardised with scale()

## Warning: Setting row names on a tibble is deprecated.

Os componentes e sua relação com as variáveis originais

print(as.data.frame(filmes_pca$rotation))

##                PC1        PC2        PC3        PC4
## age     0.37189816  0.4498170 -0.7652395 -0.2715969
## budget -0.69785923 -0.0293650 -0.1051809 -0.7078610
## love    0.07118599  0.7969047  0.5697121 -0.1878925
## votes  -0.60796119  0.4021757 -0.2806662  0.6243912

Usando o pacote broom para acessar os resultados de prcomp via data frames (em lugar de listas com atributos que você nem sempre sabe quais são).

A mesma coisa que a matriz acima, mas em formato long:

tidy(filmes_pca, "variables")

##    column PC       value
## 1     age  1  0.37189816
## 2  budget  1 -0.69785923
## 3    love  1  0.07118599
## 4   votes  1 -0.60796119
## 5     age  2  0.44981702
## 6  budget  2 -0.02936500
## 7    love  2  0.79690468
## 8   votes  2  0.40217568
## 9     age  3 -0.76523954
## 10 budget  3 -0.10518095
## 11   love  3  0.56971212
## 12  votes  3 -0.28066620
## 13    age  4 -0.27159686
## 14 budget  4 -0.70786097
## 15   love  4 -0.18789250
## 16  votes  4  0.62439122

Analisando quanta variância cada PC captura:

tidy(filmes_pca, "pcs")

##   PC   std.dev percent cumulative
## 1  1 1.3592139 0.46187    0.46187
## 2  2 1.0522660 0.27682    0.73868
## 3  3 0.9306917 0.21655    0.95523
## 4  4 0.4231865 0.04477    1.00000

# Cumulative share of variance by number of components, with value labels
filmes_pca %>%
  tidy("pcs") %>%
  ggplot(aes(PC, cumulative, label = cumulative)) +
  geom_line() +
  geom_point() +
  geom_text(hjust = -.1, vjust = 1)

O valor dos PCs para cada ponto dos dados originais.

tidy(filmes_pca, "samples") %>% 
    head()

##                              row PC      value
## 1       13th Warrior, The (1999)  1 -1.3152153
## 2        2 Fast 2 Furious (2003)  1 -1.3984194
## 3 3000 Miles to Graceland (2001)  1 -1.0445748
## 4         51st State, The (2001)  1 -0.6572037
## 5            6th Day, The (2000)  1 -1.2870050
## 6                72 metra (2004)  1  1.8108625

Como sempre, visualizar é necessário.

# augment() is handy for combining a model with data you already have:
# here it adds the PCA scores (.fittedPC*) to the clustered data.
au <- filmes_pca %>%
  augment(data = filmes_transformado_agrupado)
glimpse(au)

## Observations: 600
## Variables: 11
## $ .rownames  <fctr> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ title      <chr> "13th Warrior, The (1999)", "2 Fast 2 Furious (2003...
## $ age        <dbl> -0.4899323, -0.9446259, -0.7172791, -0.7172791, -0....
## $ budget     <dbl> 0.98043149, 0.90718177, 0.77392727, 0.25364963, 0.9...
## $ love       <dbl> -0.3458892, -1.6396288, -0.3458892, -0.3458892, -0....
## $ votes      <dbl> 0.69771681, 0.48902954, 0.35052440, 0.31057032, 0.6...
## $ .cluster   <fctr> 2, 2, 2, 2, 2, 1, 2, 4, 5, 1, 2, 2, 2, 4, 6, 2, 1,...
## $ .fittedPC1 <dbl> -1.3152153, -1.3984194, -1.0445748, -0.6572037, -1....
## $ .fittedPC2 <dbl> -0.24420625, -1.56150026, -0.48003905, -0.48082966,...
## $ .fittedPC3 <dbl> -0.121089894, -0.443923565, 0.172050348, 0.23798740...
## $ .fittedPC4 <dbl> -0.060306872, 0.227818569, -0.069167808, 0.27416945...

au %>% 
    ggplot(aes(.fittedPC1, .fittedPC2)) +
    geom_point(size = .7, alpha = .5)

Tentando ver a estrutura de grupos. Não conseguimos diferenciar muito os grupos nessa visualização.

au %>% 
    ggplot(aes(.fittedPC1, .fittedPC2, color = .cluster)) +
    geom_point(size = .8, alpha = .7)

Para entender, sempre é uma boa ver exemplos

set.seed(122)
au %>%
  sample_n(50) %>% # plot 50 random movies
  ggplot(aes(.fittedPC1, .fittedPC2, colour = .cluster)) +
  geom_point(size = .2) +
  geom_text(aes(label = title), size = 3, hjust = 0, vjust = 1)

Como nossos labels são grandes, interação ajuda

# No meu RStudio, o gráfico não aparece interativo quando rodo dentro do notebook, 
# mas aparece se rodo no console ou quando faço knit html.
library(highcharter)

## Warning: package 'highcharter' was built under R version 3.3.2

## Highcharts (www.highcharts.com) is a Highsoft software product which is

## not free for commercial and Governmental use

# Interactive scatter of the first two PCs, one series per cluster; the
# tooltip shows the title plus the budget, votes, love and age columns.
p <- hchart(
  au, "scatter",
  hcaes(x = .fittedPC1, y = .fittedPC2, group = .cluster)
) %>%
  hc_tooltip(pointFormat = "<b>{point.title}</b><br>
             $$$: {point.budget:,.2f}<br>
             votes: {point.votes:,.2f}<br>
             love: {point.love:,.2f}<br>
             age: {point.age:,.2f}")
p

O biplot ajuda muito também. Existe um do R base (stats) e um do ggfortify, que é mais configurável:

biplot(filmes_pca, scale = 0)

# autoplot(filmes_pca, label = F, label.size = 3, shape = T)
# ggfortify biplot: observations coloured by k-means cluster, loadings drawn
# as labelled arrows. The cluster is supplied as a column of `data` and
# referenced by name; passing the raw cluster vector to `colour` makes
# ggfortify evaluate a length > 1 condition (a warning in old R, an error in
# R >= 4.2).
autoplot(filmes_pca,
         data = filmes_transformado_agrupado, colour = ".cluster",
         label = FALSE, label.size = 3, shape = TRUE,
         loadings = TRUE, loadings.colour = "darkorange",
         loadings.label = TRUE, loadings.label.size = 3)

## Warning in if (value %in% columns) {: a condição tem comprimento > 1 e
## somente o primeiro elemento será usado

E com um método não linear?

require(Rtsne)

## Loading required package: Rtsne

set.seed(111)
tsne.out <- Rtsne(
  select(filmes_transformado, -title),
  perplexity = 40, # worth varying; the author recommends values between 5 and 50
  verbose = TRUE
)

## Read the 600 x 4 data matrix successfully!
## Using no_dims = 2, perplexity = 40.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 600
## Done in 0.07 seconds (sparsity = 0.271611)!
## Learning embedding...
## Iteration 50: error is 59.000959 (50 iterations in 0.30 seconds)
## Iteration 100: error is 55.307531 (50 iterations in 0.28 seconds)
## Iteration 150: error is 55.294459 (50 iterations in 0.27 seconds)
## Iteration 200: error is 55.291596 (50 iterations in 0.28 seconds)
## Iteration 250: error is 55.290610 (50 iterations in 0.29 seconds)
## Iteration 300: error is 0.535510 (50 iterations in 0.26 seconds)
## Iteration 350: error is 0.468716 (50 iterations in 0.24 seconds)
## Iteration 400: error is 0.453866 (50 iterations in 0.25 seconds)
## Iteration 450: error is 0.448520 (50 iterations in 0.25 seconds)
## Iteration 500: error is 0.443614 (50 iterations in 0.24 seconds)
## Iteration 550: error is 0.442158 (50 iterations in 0.25 seconds)
## Iteration 600: error is 0.440338 (50 iterations in 0.24 seconds)
## Iteration 650: error is 0.439648 (50 iterations in 0.23 seconds)
## Iteration 700: error is 0.438779 (50 iterations in 0.24 seconds)
## Iteration 750: error is 0.438330 (50 iterations in 0.24 seconds)
## Iteration 800: error is 0.437489 (50 iterations in 0.24 seconds)
## Iteration 850: error is 0.437496 (50 iterations in 0.24 seconds)
## Iteration 900: error is 0.437153 (50 iterations in 0.24 seconds)
## Iteration 950: error is 0.436610 (50 iterations in 0.24 seconds)
## Iteration 1000: error is 0.436558 (50 iterations in 0.24 seconds)
## Fitting performed in 5.06 seconds.

# Put the two t-SNE coordinates (V1, V2) next to the PCA/cluster data
df <- as.data.frame(tsne.out[["Y"]])
filmes_tsne <- cbind(au, df)
glimpse(filmes_tsne)

## Observations: 600
## Variables: 13
## $ .rownames  <fctr> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ title      <chr> "13th Warrior, The (1999)", "2 Fast 2 Furious (2003...
## $ age        <dbl> -0.4899323, -0.9446259, -0.7172791, -0.7172791, -0....
## $ budget     <dbl> 0.98043149, 0.90718177, 0.77392727, 0.25364963, 0.9...
## $ love       <dbl> -0.3458892, -1.6396288, -0.3458892, -0.3458892, -0....
## $ votes      <dbl> 0.69771681, 0.48902954, 0.35052440, 0.31057032, 0.6...
## $ .cluster   <fctr> 2, 2, 2, 2, 2, 1, 2, 4, 5, 1, 2, 2, 2, 4, 6, 2, 1,...
## $ .fittedPC1 <dbl> -1.3152153, -1.3984194, -1.0445748, -0.6572037, -1....
## $ .fittedPC2 <dbl> -0.24420625, -1.56150026, -0.48003905, -0.48082966,...
## $ .fittedPC3 <dbl> -0.121089894, -0.443923565, 0.172050348, 0.23798740...
## $ .fittedPC4 <dbl> -0.060306872, 0.227818569, -0.069167808, 0.27416945...
## $ V1         <dbl> 18.3588308, 6.6289799, 17.5394092, 13.3644499, 18.3...
## $ V2         <dbl> 1.1869690, -17.2347673, 5.6956695, 6.2541034, 2.992...

ggplot(filmes_tsne, aes(x = V1, 
                   y = V2,  
                   color = .cluster)) + 
  geom_point(alpha = 0.8, size = 1)

# Interactive version of the t-SNE scatter, one series per cluster
hchart(
  filmes_tsne, "scatter",
  hcaes(x = V1, y = V2, group = .cluster)
) %>%
  hc_tooltip(pointFormat = "<b>{point.title}</b><br>
             $$$: {point.budget:,.2f}<br>
             votes: {point.votes:,.2f}<br>
             love: {point.love:,.2f}<br>
             age: {point.age:,.2f}")

#plotly::ggplotly(p)

Github data

dw <- load_github_wide()
#write.csv(dw, "github-20141.csv", row.names = FALSE)
summary(dw)

##    repository_language   ForkEvent       IssuesEvent       PushEvent      
##  ActionScript:  1      Min.   : 1.000   Min.   : 1.000   Min.   :  1.000  
##  Ada         :  1      1st Qu.: 1.509   1st Qu.: 3.437   1st Qu.:  7.052  
##  Agda        :  1      Median : 2.083   Median : 4.750   Median :  9.314  
##  ANTLR       :  1      Mean   : 2.454   Mean   : 7.311   Mean   : 10.921  
##  Apex        :  1      3rd Qu.: 2.913   3rd Qu.: 7.269   3rd Qu.: 10.602  
##  AppleScript :  1      Max.   :18.000   Max.   :63.000   Max.   :154.250  
##  (Other)     :121                                                         
##    WatchEvent    
##  Min.   : 1.000  
##  1st Qu.: 2.000  
##  Median : 3.007  
##  Mean   : 3.725  
##  3rd Qu.: 4.636  
##  Max.   :13.471  
##

# Keep only rows below the given thresholds for each event type
dw <- filter(dw, PushEvent < 50, IssuesEvent < 50, ForkEvent < 18)

dw %>%
  select(-repository_language) %>%
  ggpairs()

As variáveis são bastante assimétricas e concentradas em pequenos valores. Transformá-las para log ajuda na visualização.

# Log scale: take the natural log of columns 2 to 5 (the event columns in
# the summary above) and carry the language column over.
dw2 <- dw[, 2:5] %>% log()
dw2[["repository_language"]] <- dw[["repository_language"]]
dw2 %>%
  select(-repository_language) %>%
  ggpairs()

PCA

# Use the language as row name (presumably what biplot/autoplot use as labels)
row.names(dw2) <- dw2$repository_language
# scale. = TRUE makes prcomp() scale each variable to unit variance
pr.out <- dw2 %>%
  select(-repository_language) %>%
  prcomp(scale. = TRUE)

kable(pr.out$rotation)

	PC1	PC2	PC3	PC4
ForkEvent	-0.5925553	0.3508754	-0.0117162	-0.7250017
IssuesEvent	-0.4199860	-0.5634247	-0.7067105	0.0820040
PushEvent	-0.4165046	-0.5687685	0.7072059	0.0537231
WatchEvent	-0.5468217	0.4857380	0.0168181	0.6817344

biplot(pr.out, scale = 0)

autoplot(pr.out, label = TRUE, label.size = 3, shape = FALSE)

autoplot(pr.out, label = TRUE, label.size = 3, shape = FALSE, 
         loadings = TRUE, loadings.colour = 'blue',
         loadings.label = TRUE, loadings.label.size = 3)

# Cumulative proportion of variance explained by the principal components.
#
# prout: a fitted PCA object exposing the component standard deviations in
#        `$sdev` (e.g. the result of prcomp()).
# Returns a ggplot with the component index on x and the cumulative
# proportion of variance on y.
plot_pve <- function(prout) {
  # Read from the argument, not from the global `pr.out`, so the plot always
  # reflects the model that was passed in.
  pr_var <- prout$sdev^2
  pve <- pr_var / sum(pr_var)
  df <- data.frame(x = seq_along(pve), y = cumsum(pve))
  ggplot(df, aes(x = x, y = y)) +
    geom_point(size = 3) +
    geom_line() +
    labs(
      x = "Principal Component",
      y = "Cumulative Proportion of Variance Explained"
    )
}

plot_pve(pr.out)

t-SNE

# library() fails loudly if Rtsne is missing; require() would only return FALSE
library(Rtsne)
# t-SNE is stochastic: seed the RNG so the embedding is reproducible
set.seed(111)
tsne.out <- Rtsne(select(dw, -repository_language),
                  verbose = TRUE)

## Read the 121 x 4 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 121
## Done in 0.01 seconds (sparsity = 0.894065)!
## Learning embedding...
## Iteration 50: error is 52.736671 (50 iterations in 0.04 seconds)
## Iteration 100: error is 50.353310 (50 iterations in 0.04 seconds)
## Iteration 150: error is 49.675916 (50 iterations in 0.05 seconds)
## Iteration 200: error is 52.466622 (50 iterations in 0.04 seconds)
## Iteration 250: error is 49.123574 (50 iterations in 0.04 seconds)
## Iteration 300: error is 1.164690 (50 iterations in 0.04 seconds)
## Iteration 350: error is 0.549814 (50 iterations in 0.04 seconds)
## Iteration 400: error is 0.285701 (50 iterations in 0.04 seconds)
## Iteration 450: error is 0.280724 (50 iterations in 0.04 seconds)
## Iteration 500: error is 0.282282 (50 iterations in 0.04 seconds)
## Iteration 550: error is 0.282861 (50 iterations in 0.04 seconds)
## Iteration 600: error is 0.282053 (50 iterations in 0.04 seconds)
## Iteration 650: error is 0.281390 (50 iterations in 0.05 seconds)
## Iteration 700: error is 0.283031 (50 iterations in 0.04 seconds)
## Iteration 750: error is 0.282517 (50 iterations in 0.04 seconds)
## Iteration 800: error is 0.283731 (50 iterations in 0.04 seconds)
## Iteration 850: error is 0.280733 (50 iterations in 0.04 seconds)
## Iteration 900: error is 0.281272 (50 iterations in 0.04 seconds)
## Iteration 950: error is 0.280861 (50 iterations in 0.04 seconds)
## Iteration 1000: error is 0.281710 (50 iterations in 0.04 seconds)
## Fitting performed in 0.82 seconds.

# t-SNE coordinates (V1, V2) plus the language of each row
df <- as.data.frame(tsne.out[["Y"]])
df[["repository_language"]] <- dw[["repository_language"]]

# Points only
df %>%
  ggplot(aes(V1, V2, label = repository_language)) +
  geom_point(colour = "tomato", size = 3, alpha = 0.8)

# Fainter points with the language written next to each one
df %>%
  ggplot(aes(V1, V2, label = repository_language)) +
  geom_point(colour = "tomato", size = 3, alpha = 0.2) +
  geom_text(size = 4, alpha = .7, hjust = -.2)

Prisões nos EUA

str(USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

row.names(USArrests)

##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

# PCA of USArrests with variables scaled to unit variance, then the base-R biplot
pr.out <- USArrests %>%
  prcomp(scale. = TRUE)
biplot(pr.out)

autoplot(pr.out, label = TRUE)

autoplot(pr.out, label = TRUE, label.size = 3, shape = FALSE, 
         loadings = TRUE, loadings.colour = 'blue',
         loadings.label = TRUE, loadings.label.size = 3)

plot_pve(pr.out)

Iris

names(iris)

## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"

glimpse(iris)

## Observations: 150
## Variables: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,...
## $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,...
## $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,...
## $ Species      <fctr> setosa, setosa, setosa, setosa, setosa, setosa, ...

ggpairs(select(iris, -Species))

pr.out <- prcomp(select(iris, -Species), scale=TRUE)

plot_pve(pr.out)

autoplot(pr.out, data = iris, colour = 'Species', size = 3,  
         loadings = TRUE, loadings.colour = 'blue',
         loadings.label = TRUE, loadings.label.size = 3)

# library() fails loudly if Rtsne is missing; require() would only return FALSE
library(Rtsne)
# Drop repeated rows (presumably because Rtsne() checks for duplicates by
# default; confirm against its documentation), then standardise the
# measurements. as.numeric() keeps each column a plain vector, since scale()
# returns a matrix; a regular function replaces the deprecated funs().
iris2 <- iris %>%
  distinct() %>%
  mutate_at(vars(-Species), function(x) as.numeric(scale(x)))

# t-SNE is stochastic: seed the RNG so the embedding is reproducible
set.seed(111)
tsne.out <- Rtsne(select(iris2, -Species),
                  verbose = FALSE,
                  perplexity = 20) # sometimes worth playing with this parameter

# t-SNE coordinates (V1, V2) plus the species of each row
df <- as.data.frame(tsne.out[["Y"]])
df[["Species"]] <- iris2[["Species"]]

# Embedding coloured by species
df %>%
  ggplot(aes(V1, V2, label = Species, colour = Species)) +
  geom_point(size = 3, alpha = 0.8)

# Same embedding with fainter points
df %>%
  ggplot(aes(V1, V2, label = Species)) +
  geom_point(aes(colour = Species), size = 3, alpha = 0.2)

Redução de dimensionalidade

Nazareno Andrade