Projeto K-Means aplicando o algoritmo de machine learning

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the raw Pokémon table from disk; `data` keeps the full set of columns.
data <- readr::read_csv(file = "pokemon_data.csv")
## Rows: 800 Columns: 12
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Name, Type 1, Type 2
## dbl (8): #, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed, Generation
## lgl (1): Legendary
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reuse the table already loaded into `data` instead of parsing the same CSV
# a second time; both objects start out identical.
pokemon <- data
# Preview the first rows of the table (head() returns 6 rows by default).
pokemon %>% head()
## # A tibble: 6 x 12
##     `#` Name    `Type 1` `Type 2`    HP Attack Defense `Sp. Atk` `Sp. Def` Speed
##   <dbl> <chr>   <chr>    <chr>    <dbl>  <dbl>   <dbl>     <dbl>     <dbl> <dbl>
## 1     1 Bulbas~ Grass    Poison      45     49      49        65        65    45
## 2     2 Ivysaur Grass    Poison      60     62      63        80        80    60
## 3     3 Venusa~ Grass    Poison      80     82      83       100       100    80
## 4     3 Venusa~ Grass    Poison      80    100     123       122       120    80
## 5     4 Charma~ Fire     <NA>        39     52      43        60        50    65
## 6     5 Charme~ Fire     <NA>        58     64      58        80        65    80
## # ... with 2 more variables: Generation <dbl>, Legendary <lgl>

Visualizando os 6 primeiros dados

# First six rows of the table.
head(pokemon, n = 6)
## # A tibble: 6 x 12
##     `#` Name    `Type 1` `Type 2`    HP Attack Defense `Sp. Atk` `Sp. Def` Speed
##   <dbl> <chr>   <chr>    <chr>    <dbl>  <dbl>   <dbl>     <dbl>     <dbl> <dbl>
## 1     1 Bulbas~ Grass    Poison      45     49      49        65        65    45
## 2     2 Ivysaur Grass    Poison      60     62      63        80        80    60
## 3     3 Venusa~ Grass    Poison      80     82      83       100       100    80
## 4     3 Venusa~ Grass    Poison      80    100     123       122       120    80
## 5     4 Charma~ Fire     <NA>        39     52      43        60        50    65
## 6     5 Charme~ Fire     <NA>        58     64      58        80        65    80
## # ... with 2 more variables: Generation <dbl>, Legendary <lgl>

Visualizando os 6 últimos dados

# Last six rows of the table.
tail(pokemon, n = 6)
## # A tibble: 6 x 12
##     `#` Name    `Type 1` `Type 2`    HP Attack Defense `Sp. Atk` `Sp. Def` Speed
##   <dbl> <chr>   <chr>    <chr>    <dbl>  <dbl>   <dbl>     <dbl>     <dbl> <dbl>
## 1   718 Zygard~ Dragon   Ground     108    100     121        81        95    95
## 2   719 Diancie Rock     Fairy       50    100     150       100       150    50
## 3   719 Dianci~ Rock     Fairy       50    160     110       160       110   110
## 4   720 HoopaH~ Psychic  Ghost       80    110      60       150       130    70
## 5   720 HoopaH~ Psychic  Dark        80    160      60       170       130    80
## 6   721 Volcan~ Fire     Water       80    110     120       130        90    70
## # ... with 2 more variables: Generation <dbl>, Legendary <lgl>

Visualizando total de linhas e colunas

# Number of rows and number of columns, in that order.
c(nrow(pokemon), ncol(pokemon))
## [1] 800  12
# Keep only the numeric columns used for clustering, selected by name rather
# than by position (columns 6 to 11 of the raw table, Attack to Generation).
# NOTE(review): this range leaves out HP and keeps Generation - confirm intent.
pokemon <- select(pokemon, Attack:Generation)
pokemon %>% head()
## # A tibble: 6 x 6
##   Attack Defense `Sp. Atk` `Sp. Def` Speed Generation
##    <dbl>   <dbl>     <dbl>     <dbl> <dbl>      <dbl>
## 1     49      49        65        65    45          1
## 2     62      63        80        80    60          1
## 3     82      83       100       100    80          1
## 4    100     123       122       120    80          1
## 5     52      43        60        50    65          1
## 6     64      58        80        65    80          1

Análise de dados

Gráfico dos pokemon ataque

# Histogram of the Attack stat.
# `main` is the argument that sets the plot title; `title` is not a graphical
# parameter, so it only raised warnings and no heading was drawn.
hist(pokemon$Attack,
     main = "Atack dos pokemon",
     xlab = "Atack",
     ylab = "Total")

# Gráfico dos pokemon defesa

# Histogram of the Defense stat.
# `main` is the argument that sets the plot title; `title` is not a graphical
# parameter, so it only raised warnings and no heading was drawn.
hist(pokemon$Defense,
     main = "Defesa dos pokemon",
     xlab = "Defesa",
     ylab = "Total")

# Gráfico boxplot dos pokemon

# Boxplot of the Attack stat.
# The heading is passed through `main`; `title` is not a plotting argument,
# so the intended heading was never displayed.
boxplot(pokemon$Attack,
        main = "Atack dos pokemon",
        xlab = "Atack",
        ylab = "Total")

# Boxplot of the Defense stat.
# The heading is passed through `main`; `title` is not a plotting argument,
# so the intended heading was never displayed.
boxplot(pokemon$Defense,
        main = "Defesa dos pokemon",
        xlab = "Defesa",
        ylab = "Total")

# Plot the Attack values against their row position (single-vector plot).
plot(pokemon$Attack)

# Total within-cluster sum of squares (WSS) for each candidate number of
# clusters, used to draw the scree ("elbow") plot.
max.k <- 15

# Pre-size the result vector instead of growing it inside the loop.
wss <- numeric(max.k)

# kmeans() picks random starting centres; fixing the seed makes the curve
# reproducible from one run to the next.
set.seed(1)

# Try 1 to max.k clusters
for (i in seq_len(max.k)) {
  # Fit the model for i clusters: km.out
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)

  # Store the total within-cluster sum of squares
  wss[i] <- km.out$tot.withinss
}

# Scree plot: WSS against the number of clusters
plot(seq_len(max.k), wss, type = "b",
     xlab = "Número de clusters",
     ylab = "Dentro de grupos soma de quadrados")

Selecione o número de clusters

# Number of clusters used for the final model
k <- 10

# kmeans() uses random starting centres, so cluster numbers change between
# runs; fixing the seed keeps the printed model and any later comparison that
# relies on the cluster labels reproducible.
set.seed(1)

# Build the model with k clusters: km.pokemon
km.pokemon <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)

# Print the fitted model
km.pokemon
## K-means clustering with 10 clusters of sizes 98, 54, 23, 34, 94, 91, 71, 106, 124, 105
## 
## Cluster means:
##       Attack   Defense   Sp. Atk   Sp. Def    Speed Generation
## 1   70.10204  74.78571  45.15306  54.15306 38.26531   3.520408
## 2  140.77778  93.96296 131.12963  92.92593 98.62963   3.685185
## 3   68.04348 137.21739  67.60870 138.69565 50.69565   3.869565
## 4  106.14706 145.02941  62.35294  68.88235 46.14706   3.147059
## 5  116.43617  87.14894  61.56383  77.94681 82.28723   3.297872
## 6   37.19780  41.76923  40.42857  47.31868 40.84615   3.109890
## 7   80.91549  85.92958 120.64789 105.09859 95.76056   3.211268
## 8   79.42453  64.03774  83.29245  69.52830 99.99057   3.339623
## 9   62.20161  45.15323  56.26613  48.78226 66.56452   3.225806
## 10  69.98095  77.17143  87.97143  87.15238 54.35238   3.276190
## 
## Clustering vector:
##   [1]  9 10  7  7  9  8  7  2  7  1 10 10  7  6  6 10  6  6  9  5  6  9  8  7  9
##  [26]  8  9  8  9  8  9  8  1  5  6  1  5  6  9  5  6 10  9  8  6  9  6  8  1 10
##  [51] 10  1  1  6  8  9  8  9  8  9  8  9  5  9  8  9  9  5  9  8  7  7  1  1  5
##  [76]  9  9  8  9  7  1  4  4  8  8  1 10  4 10 10  9  9  5  6 10  1  5  1  4  9
## [101]  8  7  7  4  6 10  1  5  9  8  1 10  1  1  5  5  1  1  4  1  4  6 10  5  5
## [126]  9  8  9  5  9  8  7  5  8  8  8  5  5  5  6  5  5 10  6  9 10  7  5 10 10
## [151] 10  1  5  8  5 10  7  7  7  9  8  2  2  2  2  7  1 10 10  9  8  7  1  1  5
## [176]  6  9  6 10  6 10  6  1  8  9 10  6  6  6  6 10  9  8  6  9 10  7 10  6 10
## [201]  4 10  6  9  8  9  6 10  9  6  1  7  3  8 10  8  9  6  8  1  4  1  5  4  4
## [226]  1  5  5  5  4  3  5  5  8  1  5  6 10  6  1 10  9 10  9 10  4  9  8  7  7
## [251]  1  4 10  8  6  6  5  9  9  9  5 10  7  5  7  1  1  5  2  3  2  7  9  8  8
## [276]  2  9 10  2  2  1  1  5  2  6  5  6  8  6  6  9  6 10  6  9 10  6  9  8  9
## [301]  8  9 10  6  6  7  7  6 10  6  5  1  8  2  1  8  1  6  9 10  6  5  6  3  6
## [326]  9  1  3  1  4  1  4  4  4  6  8  5  9  8  7  8  8  9  8 10  6 10  9  8  2
## [351]  9  9  9 10  2  4  6 10  9  1  9  8  9 10  6 10  7  5  8 10  5  9 10  1  5
## [376]  6  3  1 10  9  5  6  7 10 10  9  5  2  1  3 10 10  5  2  6  6  8  2  6 10
## [401] 10  1 10 10  4  9  1  1  2  2  1  1  4  2  4  3  3  7  7  7  2  7  2  2  2
## [426]  2  2  7  2  2  3  8  1  1  5  9  8  8  6 10 10  6  9  5  6  9  6  9  6  9
## [451]  5  6  7  9  5  1  3  6 10 10 10  8  6 10  8  9  8  6  8  6 10  8  9  8  9
## [476]  8  5  7  8  9  8  6  9  8  1  3  1  6  6  8 10  9  9  5  2  1  9  8  2  1
## [501]  4  1  5  9  8 10  9  8 10  9 10  2  5  7 10  4  4  5  7  7  8  5  7  5  5
## [526]  7  5  5  3  3  8  8  7  7  7  7  7  3  7  2  2  2  7  2  7  2  3  8  7  7
## [551]  7  7  2  7  9  8  8  9  9  5  9 10 10  6  8  6  1  5  9  8  9  8  9  8  9
## [576]  8  6 10  6  9  5  9  8  1  1  4  9  8  9  5 10  3  1  1  5  9  9  8  1  5
## [601]  1  1  5  6  1  5  6  8  6  8  8  9  9  5  1  5  7 10  1  4  1  3  8  1  3
## [626]  1  4  9  2  9  5  9  8  9  8  6 10 10  6 10 10  6  8  9 10  7  9  5  8  9
## [651]  4  6 10  6 10  1  9  8  1  3  1 10  5  9 10 10  1 10  6 10  7  1  5  5  9
## [676]  5  7  1  8 10  9  8  5  1  5  1  5  5  9  5  1 10 10  5  1 10  7  9  7  5
## [701]  5  7  2  7  2  7  2  2  2  2  2  2  2  7  7  7  5  2  1  1  4  9 10  7  9
## [726]  8  8  6  9  9  9  8  6  6  8  9  8  6 10  7  9 10  1  5  8  9  8  8  1  4
## [751]  2  3  1 10  1 10  6  5  1  5  1 10  1 10  9  8  1  5  9 10 10  8  8  3  6
## [776] 10  7 10  1  5  1  1  1  1  5  5  4  4  1  4  6  8  2  2  5  3  2  7  2  2
## 
## Within cluster sum of squares by cluster:
##  [1] 117284.45 155819.30  71276.70  85419.06 167899.79 102515.38 129665.41
##  [8] 145248.86 120115.55 160028.30
##  (between_SS / total_SS =  66.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Gráfico de Defesa vs. Velocidade por cluster

# Defense against Speed, one colour per k-means cluster.
plot(
  x = pokemon$Defense,
  y = pokemon$Speed,
  col = km.pokemon$cluster,
  main = paste("Agrupamento de Pokémon com", k, "Clusters"),
  xlab = "Defesa",
  ylab = "Velocidade"
)

# Mean of every clustering column
pokemon %>% colMeans()
##     Attack    Defense    Sp. Atk    Sp. Def      Speed Generation 
##   79.00125   73.84250   72.82000   71.90250   68.27750    3.32375
# Standard deviation of every clustering column.
# vapply() walks the data frame column by column and guarantees a numeric
# result, instead of apply() first coercing the whole table to a matrix.
vapply(pokemon, sd, numeric(1))
##     Attack    Defense    Sp. Atk    Sp. Def      Speed Generation 
##   32.45737   31.18350   32.72229   27.82892   29.06047    1.66129
# Standardise every column (centre, then divide by its standard deviation)
pokemon.scaled <- scale(pokemon, center = TRUE, scale = TRUE)

# Hierarchical clustering with complete linkage on Euclidean distances:
# hclust.pokemon
hclust.pokemon <- hclust(
  d = dist(pokemon.scaled, method = "euclidean"),
  method = "complete"
)

Comparando kmeans() e hclust()

# Cut the dendrogram into three groups: cut.pokemon
cut.pokemon <- cutree(tree = hclust.pokemon, k = 3)

# Cross-tabulate k-means clusters (rows) against hierarchical groups (columns)
table(km.pokemon[["cluster"]], cut.pokemon)
##     cut.pokemon
##        1   2   3
##   1   98   0   0
##   2   47   7   0
##   3   22   0   1
##   4   34   0   0
##   5   94   0   0
##   6   91   0   0
##   7   61  10   0
##   8  103   3   0
##   9  124   0   0
##   10 105   0   0
# Show the clustering table
print(pokemon)
## # A tibble: 800 x 6
##    Attack Defense `Sp. Atk` `Sp. Def` Speed Generation
##     <dbl>   <dbl>     <dbl>     <dbl> <dbl>      <dbl>
##  1     49      49        65        65    45          1
##  2     62      63        80        80    60          1
##  3     82      83       100       100    80          1
##  4    100     123       122       120    80          1
##  5     52      43        60        50    65          1
##  6     64      58        80        65    80          1
##  7     84      78       109        85   100          1
##  8    130     111       130        85   100          1
##  9    104      78       159       115   100          1
## 10     48      65        50        64    43          1
## # ... with 790 more rows

PCA

# Columns of the raw table used for the PCA: HP, Attack, Defense and Speed
pokemon_pr <- select(data, HP, Attack, Defense, Speed)

# Compact overview of the selected columns
glimpse(pokemon_pr)
## Rows: 800
## Columns: 4
## $ HP      <dbl> 45, 60, 80, 80, 39, 58, 78, 78, 78, 44, 59, 79, 79, 45, 50, 60~
## $ Attack  <dbl> 49, 62, 82, 100, 52, 64, 84, 130, 104, 48, 63, 83, 103, 30, 20~
## $ Defense <dbl> 49, 63, 83, 123, 43, 58, 78, 111, 78, 65, 80, 100, 120, 35, 55~
## $ Speed   <dbl> 45, 60, 80, 80, 65, 80, 100, 100, 100, 43, 58, 78, 78, 45, 30,~
# Per-column summary statistics of the PCA input
pokemon_pr %>% summary()
##        HP             Attack       Defense           Speed       
##  Min.   :  1.00   Min.   :  5   Min.   :  5.00   Min.   :  5.00  
##  1st Qu.: 50.00   1st Qu.: 55   1st Qu.: 50.00   1st Qu.: 45.00  
##  Median : 65.00   Median : 75   Median : 70.00   Median : 65.00  
##  Mean   : 69.26   Mean   : 79   Mean   : 73.84   Mean   : 68.28  
##  3rd Qu.: 80.00   3rd Qu.:100   3rd Qu.: 90.00   3rd Qu.: 90.00  
##  Max.   :255.00   Max.   :190   Max.   :230.00   Max.   :180.00
# Fit the PCA on centred, unit-variance columns.
# `scale.` (with the trailing dot) is the actual argument name, and TRUE is
# spelled out because `T` is an ordinary variable that can be reassigned.
pr.pokemon <- prcomp(x = pokemon_pr, center = TRUE, scale. = TRUE)

# Summarise the fitted model; a bare `summary` only prints the function itself.
summary(pr.pokemon)
# Show the standard deviations and the rotation matrix of the model
print(pr.pokemon)
## Standard deviations (1, .., p=4):
## [1] 1.3721424 0.9932783 0.8526020 0.6353685
## 
## Rotation (n x k) = (4 x 4):
##               PC1         PC2        PC3        PC4
## HP      0.5009303 -0.06463396  0.8300858 -0.2363236
## Attack  0.6301797  0.02703796 -0.1621455  0.7588487
## Defense 0.4556878 -0.61865282 -0.4521283 -0.4529871
## Speed   0.3798566  0.78253440 -0.2832778 -0.4038596

Resultados do PCA

Os modelos PCA produzem componentes adicionais de diagnóstico e saída:

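- center - as médias das colunas usadas para centralizar os dados
- scale - os desvios padrão das colunas usados para escalar os dados
- rotation - as direções dos componentes principais em termos das variáveis originais
- x - o valor de cada observação no conjunto de dados original projetado para os componentes principais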

# Column means used to centre the data
pr.pokemon[["center"]]
##       HP   Attack  Defense    Speed 
## 69.25875 79.00125 73.84250 68.27750
# Column standard deviations used to scale the data
pr.pokemon[["scale"]]
##       HP   Attack  Defense    Speed 
## 25.53467 32.45737 31.18350 29.06047
# Loadings of each original variable on each principal component
pr.pokemon[["rotation"]]
##               PC1         PC2        PC3        PC4
## HP      0.5009303 -0.06463396  0.8300858 -0.2363236
## Attack  0.6301797  0.02703796 -0.1621455  0.7588487
## Defense 0.4556878 -0.61865282 -0.4521283 -0.4529871
## Speed   0.3798566  0.78253440 -0.2832778 -0.4038596
# First ten observations projected onto the principal components
head(pr.pokemon[["x"]], n = 10)
##              PC1          PC2         PC3          PC4
##  [1,] -1.7256845 -0.097546271 -0.05163582  0.207456766
##  [2,] -0.7783645  0.001484139  0.02184005 -0.039259320
##  [3,]  0.5559879  0.109293957  0.08715421 -0.325236471
##  [4,]  1.4899932 -0.669275804 -0.58272583 -0.485458941
##  [5,] -1.6113972  0.577730653 -0.36963560  0.142341152
##  [6,] -0.5904092  0.645964030 -0.17563029 -0.179301322
##  [7,]  0.7439432  0.753773847 -0.11031614 -0.465278473
##  [8,]  2.1192939  0.137402684 -0.81858156  0.130820647
##  [9,]  1.1322555  0.770434451 -0.21022907  0.002318739
## [10,] -1.5570505 -0.467129383 -0.29163598 -0.011297649

Interpretando biplots (1)

# Biplot of the first two principal components
biplot(pr.pokemon, choices = 1:2)

Variação explicada

# Variance of each principal component: pr.var
pr.var <- pr.pokemon[["sdev"]]^2

# Share of the total variance explained by each component: pve
pve <- prop.table(pr.var)
print(pve)
## [1] 0.4706937 0.2466505 0.1817326 0.1009233

Visualizar variação explicada

# Proportion of variance explained by each principal component
plot(
  x = seq_along(pve),
  y = pve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)

Plotar proporção cumulativa de variância explicada

# Running total of the variance explained, component by component
plot(
  x = seq_along(pve),
  y = cumsum(pve),
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)

# Mean of each PCA input variable
pokemon_pr %>% colMeans()
##       HP   Attack  Defense    Speed 
## 69.25875 79.00125 73.84250 68.27750
# Standard deviation of each PCA input variable.
# vapply() walks the data frame column by column and guarantees a numeric
# result, instead of apply() first coercing the whole table to a matrix.
vapply(pokemon_pr, sd, numeric(1))
##       HP   Attack  Defense    Speed 
## 25.53467 32.45737 31.18350 29.06047
# PCA with scaling: pr.with.scaling
# `scale.` is the full argument name and TRUE/FALSE are spelled out because
# `T` and `F` are ordinary variables that can be reassigned.
pr.with.scaling <- prcomp(pokemon_pr, center = TRUE, scale. = TRUE)

# PCA without scaling: pr.without.scaling
# NOTE(review): centring is switched off here as well, so this model differs
# from the scaled one in two ways - confirm that this is intended.
pr.without.scaling <- prcomp(pokemon_pr, center = FALSE, scale. = FALSE)

# Biplots of both models for comparison
biplot(pr.without.scaling)

biplot(pr.with.scaling)