library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the Pokémon dataset once. `data` keeps the full 12-column table (it is
# used again further down to build the PCA subset), while `pokemon` is the
# working copy that is later narrowed to the clustering columns.
data <- read_csv("pokemon_data.csv")
## Rows: 800 Columns: 12
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (3): Name, Type 1, Type 2
## dbl (8): #, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed, Generation
## lgl (1): Legendary
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Reuse the parsed table instead of reading and parsing the same file twice.
pokemon <- data
# First rows of the full table (the duplicated head() call was removed).
head(pokemon)
## # A tibble: 6 x 12
## `#` Name `Type 1` `Type 2` HP Attack Defense `Sp. Atk` `Sp. Def` Speed
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Bulbas~ Grass Poison 45 49 49 65 65 45
## 2 2 Ivysaur Grass Poison 60 62 63 80 80 60
## 3 3 Venusa~ Grass Poison 80 82 83 100 100 80
## 4 3 Venusa~ Grass Poison 80 100 123 122 120 80
## 5 4 Charma~ Fire <NA> 39 52 43 60 50 65
## 6 5 Charme~ Fire <NA> 58 64 58 80 65 80
## # ... with 2 more variables: Generation <dbl>, Legendary <lgl>
# Last rows of the full table.
tail(pokemon)
## # A tibble: 6 x 12
## `#` Name `Type 1` `Type 2` HP Attack Defense `Sp. Atk` `Sp. Def` Speed
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 718 Zygard~ Dragon Ground 108 100 121 81 95 95
## 2 719 Diancie Rock Fairy 50 100 150 100 150 50
## 3 719 Dianci~ Rock Fairy 50 160 110 160 110 110
## 4 720 HoopaH~ Psychic Ghost 80 110 60 150 130 70
## 5 720 HoopaH~ Psychic Dark 80 160 60 170 130 80
## 6 721 Volcan~ Fire Water 80 110 120 130 90 70
## # ... with 2 more variables: Generation <dbl>, Legendary <lgl>
# Number of rows and columns.
dim(pokemon)
## [1] 800 12
# Keep only the six numeric columns used for clustering. The columns are named
# explicitly (they are the ones at positions 6:11 of the CSV) so the selection
# stays correct even if the column order of the file changes.
pokemon <- pokemon %>%
  select(Attack, Defense, `Sp. Atk`, `Sp. Def`, Speed, Generation)
# Preview the reduced table.
head(pokemon)
## # A tibble: 6 x 6
## Attack Defense `Sp. Atk` `Sp. Def` Speed Generation
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 49 49 65 65 45 1
## 2 62 63 80 80 60 1
## 3 82 83 100 100 80 1
## 4 100 123 122 120 80 1
## 5 52 43 60 50 65 1
## 6 64 58 80 65 80 1
# Histogram of the Attack stat.
# The plot title is passed through `main`; `title` is not a graphical
# parameter, so it only triggers warnings and the title is never applied.
hist(pokemon$Attack,
     xlab = "Atack",
     ylab = "Total",
     main = "Atack dos pokemon")
# Histogram of the Defense stat.
# The plot title is passed through `main`; `title` is not a graphical
# parameter, so it only triggers warnings and the title is never applied.
hist(pokemon$Defense,
     xlab = "Defesa",
     ylab = "Total",
     main = "Defesa dos pokemon")
# Boxplots of the Attack and Defense stats.
# The title is given as `main`; `title` is not a parameter boxplot() uses for
# annotation, so with `title =` the plots carry no title.
boxplot(pokemon$Attack,
        xlab = "Atack",
        ylab = "Total",
        main = "Atack dos pokemon")
boxplot(pokemon$Defense,
        xlab = "Defesa",
        ylab = "Total",
        main = "Defesa dos pokemon")
# Index plot of Attack (value against row position in the table).
plot(pokemon$Attack)
# Elbow analysis: total within-cluster sum of squares for 1 to 15 clusters.
# The result vector is preallocated so it is not grown inside the loop.
# NOTE(review): no seed is set in the visible script, so the k-means fits
# (random starts) can differ slightly between runs.
wss <- numeric(15)
for (i in seq_along(wss)) {
  # Fit k-means with i clusters: 20 random starts, at most 50 iterations each
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
  # Store the total within-cluster sum of squares for this number of clusters
  wss[i] <- km.out$tot.withinss
}
# Scree plot: within-cluster sum of squares against the number of clusters
plot(seq_along(wss), wss, type = "b",
     xlab = "Número de clusters",
     ylab = "Dentro de grupos soma de quadrados")
# Number of clusters used for the final model
k <- 10
# Fit the final k-means model with k clusters: km.pokemon
km.pokemon <- kmeans(
  x = pokemon,
  centers = k,
  iter.max = 50,
  nstart = 20
)
# Print the fitted model (sizes, centres, assignments, sums of squares)
print(km.pokemon)
## K-means clustering with 10 clusters of sizes 98, 54, 23, 34, 94, 91, 71, 106, 124, 105
##
## Cluster means:
## Attack Defense Sp. Atk Sp. Def Speed Generation
## 1 70.10204 74.78571 45.15306 54.15306 38.26531 3.520408
## 2 140.77778 93.96296 131.12963 92.92593 98.62963 3.685185
## 3 68.04348 137.21739 67.60870 138.69565 50.69565 3.869565
## 4 106.14706 145.02941 62.35294 68.88235 46.14706 3.147059
## 5 116.43617 87.14894 61.56383 77.94681 82.28723 3.297872
## 6 37.19780 41.76923 40.42857 47.31868 40.84615 3.109890
## 7 80.91549 85.92958 120.64789 105.09859 95.76056 3.211268
## 8 79.42453 64.03774 83.29245 69.52830 99.99057 3.339623
## 9 62.20161 45.15323 56.26613 48.78226 66.56452 3.225806
## 10 69.98095 77.17143 87.97143 87.15238 54.35238 3.276190
##
## Clustering vector:
## [1] 9 10 7 7 9 8 7 2 7 1 10 10 7 6 6 10 6 6 9 5 6 9 8 7 9
## [26] 8 9 8 9 8 9 8 1 5 6 1 5 6 9 5 6 10 9 8 6 9 6 8 1 10
## [51] 10 1 1 6 8 9 8 9 8 9 8 9 5 9 8 9 9 5 9 8 7 7 1 1 5
## [76] 9 9 8 9 7 1 4 4 8 8 1 10 4 10 10 9 9 5 6 10 1 5 1 4 9
## [101] 8 7 7 4 6 10 1 5 9 8 1 10 1 1 5 5 1 1 4 1 4 6 10 5 5
## [126] 9 8 9 5 9 8 7 5 8 8 8 5 5 5 6 5 5 10 6 9 10 7 5 10 10
## [151] 10 1 5 8 5 10 7 7 7 9 8 2 2 2 2 7 1 10 10 9 8 7 1 1 5
## [176] 6 9 6 10 6 10 6 1 8 9 10 6 6 6 6 10 9 8 6 9 10 7 10 6 10
## [201] 4 10 6 9 8 9 6 10 9 6 1 7 3 8 10 8 9 6 8 1 4 1 5 4 4
## [226] 1 5 5 5 4 3 5 5 8 1 5 6 10 6 1 10 9 10 9 10 4 9 8 7 7
## [251] 1 4 10 8 6 6 5 9 9 9 5 10 7 5 7 1 1 5 2 3 2 7 9 8 8
## [276] 2 9 10 2 2 1 1 5 2 6 5 6 8 6 6 9 6 10 6 9 10 6 9 8 9
## [301] 8 9 10 6 6 7 7 6 10 6 5 1 8 2 1 8 1 6 9 10 6 5 6 3 6
## [326] 9 1 3 1 4 1 4 4 4 6 8 5 9 8 7 8 8 9 8 10 6 10 9 8 2
## [351] 9 9 9 10 2 4 6 10 9 1 9 8 9 10 6 10 7 5 8 10 5 9 10 1 5
## [376] 6 3 1 10 9 5 6 7 10 10 9 5 2 1 3 10 10 5 2 6 6 8 2 6 10
## [401] 10 1 10 10 4 9 1 1 2 2 1 1 4 2 4 3 3 7 7 7 2 7 2 2 2
## [426] 2 2 7 2 2 3 8 1 1 5 9 8 8 6 10 10 6 9 5 6 9 6 9 6 9
## [451] 5 6 7 9 5 1 3 6 10 10 10 8 6 10 8 9 8 6 8 6 10 8 9 8 9
## [476] 8 5 7 8 9 8 6 9 8 1 3 1 6 6 8 10 9 9 5 2 1 9 8 2 1
## [501] 4 1 5 9 8 10 9 8 10 9 10 2 5 7 10 4 4 5 7 7 8 5 7 5 5
## [526] 7 5 5 3 3 8 8 7 7 7 7 7 3 7 2 2 2 7 2 7 2 3 8 7 7
## [551] 7 7 2 7 9 8 8 9 9 5 9 10 10 6 8 6 1 5 9 8 9 8 9 8 9
## [576] 8 6 10 6 9 5 9 8 1 1 4 9 8 9 5 10 3 1 1 5 9 9 8 1 5
## [601] 1 1 5 6 1 5 6 8 6 8 8 9 9 5 1 5 7 10 1 4 1 3 8 1 3
## [626] 1 4 9 2 9 5 9 8 9 8 6 10 10 6 10 10 6 8 9 10 7 9 5 8 9
## [651] 4 6 10 6 10 1 9 8 1 3 1 10 5 9 10 10 1 10 6 10 7 1 5 5 9
## [676] 5 7 1 8 10 9 8 5 1 5 1 5 5 9 5 1 10 10 5 1 10 7 9 7 5
## [701] 5 7 2 7 2 7 2 2 2 2 2 2 2 7 7 7 5 2 1 1 4 9 10 7 9
## [726] 8 8 6 9 9 9 8 6 6 8 9 8 6 10 7 9 10 1 5 8 9 8 8 1 4
## [751] 2 3 1 10 1 10 6 5 1 5 1 10 1 10 9 8 1 5 9 10 10 8 8 3 6
## [776] 10 7 10 1 5 1 1 1 1 5 5 4 4 1 4 6 8 2 2 5 3 2 7 2 2
##
## Within cluster sum of squares by cluster:
## [1] 117284.45 155819.30 71276.70 85419.06 167899.79 102515.38 129665.41
## [8] 145248.86 120115.55 160028.30
## (between_SS / total_SS = 66.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Scatter plot of Defense (x) against Speed (y), one colour per k-means cluster
plot(
  x = pokemon$Defense,
  y = pokemon$Speed,
  xlab = "Defesa",
  ylab = "Velocidade",
  main = paste("Agrupamento de Pokémon com", k, "Clusters"),
  col = km.pokemon$cluster
)
# Mean of each column
colMeans(pokemon)
## Attack Defense Sp. Atk Sp. Def Speed Generation
## 79.00125 73.84250 72.82000 71.90250 68.27750 3.32375
# Standard deviation of each column. vapply() iterates over the data frame's
# columns directly (no matrix coercion as with apply()) and enforces exactly
# one numeric value per column.
vapply(pokemon, sd, numeric(1))
## Attack Defense Sp. Atk Sp. Def Speed Generation
## 32.45737 31.18350 32.72229 27.82892 29.06047 1.66129
# Standardise the data with scale() before computing distances
pokemon.scaled <- scale(pokemon)
# Build a hierarchical clustering model with complete linkage: hclust.pokemon
hclust.pokemon <- hclust(dist(pokemon.scaled), method = "complete")
# Cut the dendrogram into 3 clusters with cutree(): cut.pokemon
cut.pokemon <- cutree(hclust.pokemon, k = 3)
# Compare methods: cross-tabulate k-means assignments (rows, fitted on the
# unscaled data) against the hierarchical assignments (columns)
table(km.pokemon$cluster, cut.pokemon)
## cut.pokemon
## 1 2 3
## 1 98 0 0
## 2 47 7 0
## 3 22 0 1
## 4 34 0 0
## 5 94 0 0
## 6 91 0 0
## 7 61 10 0
## 8 103 3 0
## 9 124 0 0
## 10 105 0 0
# Show the table of clustering columns
print(pokemon)
## # A tibble: 800 x 6
## Attack Defense `Sp. Atk` `Sp. Def` Speed Generation
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 49 49 65 65 45 1
## 2 62 63 80 80 60 1
## 3 82 83 100 100 80 1
## 4 100 123 122 120 80 1
## 5 52 43 60 50 65 1
## 6 64 58 80 65 80 1
## 7 84 78 109 85 100 1
## 8 130 111 130 85 100 1
## 9 104 78 159 115 100 1
## 10 48 65 50 64 43 1
## # ... with 790 more rows
# Subset of the full table used for PCA: HP, Attack, Defense and Speed
pokemon_pr <- select(data, HP, Attack, Defense, Speed)
# Compact column-wise overview of the subset
glimpse(pokemon_pr)
## Rows: 800
## Columns: 4
## $ HP <dbl> 45, 60, 80, 80, 39, 58, 78, 78, 78, 44, 59, 79, 79, 45, 50, 60~
## $ Attack <dbl> 49, 62, 82, 100, 52, 64, 84, 130, 104, 48, 63, 83, 103, 30, 20~
## $ Defense <dbl> 49, 63, 83, 123, 43, 58, 78, 111, 78, 65, 80, 100, 120, 35, 55~
## $ Speed <dbl> 45, 60, 80, 80, 65, 80, 100, 100, 100, 43, 58, 78, 78, 45, 30,~
# Five-number summary and mean of each PCA input variable
pokemon_pr %>% summary()
## HP Attack Defense Speed
## Min. : 1.00 Min. : 5 Min. : 5.00 Min. : 5.00
## 1st Qu.: 50.00 1st Qu.: 55 1st Qu.: 50.00 1st Qu.: 45.00
## Median : 65.00 Median : 75 Median : 70.00 Median : 65.00
## Mean : 69.26 Mean : 79 Mean : 73.84 Mean : 68.28
## 3rd Qu.: 80.00 3rd Qu.:100 3rd Qu.: 90.00 3rd Qu.: 90.00
## Max. :255.00 Max. :190 Max. :230.00 Max. :180.00
# PCA on the four stats, centring each variable and scaling it to unit
# variance. prcomp()'s argument is `scale.`, so it is spelled out instead of
# relying on partial matching, and TRUE is used instead of the reassignable T.
pr.pokemon <- prcomp(x = pokemon_pr, scale. = TRUE, center = TRUE)
# Importance of components. summary() has to be called on the model; a bare
# `summary` only prints the source of the generic function.
summary(pr.pokemon)
# Standard deviations and rotation (loadings) of the components
pr.pokemon
## Standard deviations (1, .., p=4):
## [1] 1.3721424 0.9932783 0.8526020 0.6353685
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## HP 0.5009303 -0.06463396 0.8300858 -0.2363236
## Attack 0.6301797 0.02703796 -0.1621455 0.7588487
## Defense 0.4556878 -0.61865282 -0.4521283 -0.4529871
## Speed 0.3798566 0.78253440 -0.2832778 -0.4038596
Os modelos PCA produzem componentes adicionais de diagnóstico e saída:
Dados em termos dos componentes principais originais. - x - o valor de cada observação no conjunto de dados original projetado para os componentes principais
# Column means that were subtracted from each variable
pr.pokemon[["center"]]
## HP Attack Defense Speed
## 69.25875 79.00125 73.84250 68.27750
# Column standard deviations each variable was divided by
pr.pokemon[["scale"]]
## HP Attack Defense Speed
## 25.53467 32.45737 31.18350 29.06047
# Loadings of the original variables on each principal component
pr.pokemon[["rotation"]]
## PC1 PC2 PC3 PC4
## HP 0.5009303 -0.06463396 0.8300858 -0.2363236
## Attack 0.6301797 0.02703796 -0.1621455 0.7588487
## Defense 0.4556878 -0.61865282 -0.4521283 -0.4529871
## Speed 0.3798566 0.78253440 -0.2832778 -0.4038596
# Scores of the first 10 observations on the principal components
head(pr.pokemon[["x"]], n = 10)
## PC1 PC2 PC3 PC4
## [1,] -1.7256845 -0.097546271 -0.05163582 0.207456766
## [2,] -0.7783645 0.001484139 0.02184005 -0.039259320
## [3,] 0.5559879 0.109293957 0.08715421 -0.325236471
## [4,] 1.4899932 -0.669275804 -0.58272583 -0.485458941
## [5,] -1.6113972 0.577730653 -0.36963560 0.142341152
## [6,] -0.5904092 0.645964030 -0.17563029 -0.179301322
## [7,] 0.7439432 0.753773847 -0.11031614 -0.465278473
## [8,] 2.1192939 0.137402684 -0.81858156 0.130820647
## [9,] 1.1322555 0.770434451 -0.21022907 0.002318739
## [10,] -1.5570505 -0.467129383 -0.29163598 -0.011297649
Quais duas variáveis originais têm aproximadamente as mesmas cargas nos dois primeiros componentes principais?
Pontos de Ataque e Vida (HP)
Quais dois pokémons são os menos semelhantes em termos do segundo componente principal?
# Biplot of the fitted PCA model
biplot(pr.pokemon)
# Variance of each principal component: pr.var
pr.var <- pr.pokemon[["sdev"]]^2
# Proportion of variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
print(pve)
## [1] 0.4706937 0.2466505 0.1817326 0.1009233
# Proportion of variance explained, per component
plot(
  x = pve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)
# Cumulative proportion of variance explained
plot(
  x = cumsum(pve),
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)
# Mean of each variable
colMeans(pokemon_pr)
## HP Attack Defense Speed
## 69.25875 79.00125 73.84250 68.27750
# Standard deviation of each variable. vapply() iterates over the data frame's
# columns directly (no matrix coercion as with apply()) and enforces exactly
# one numeric value per column.
vapply(pokemon_pr, sd, numeric(1))
## HP Attack Defense Speed
## 25.53467 32.45737 31.18350 29.06047
# PCA model with scaling: pr.with.scaling
# (`scale.` is prcomp()'s actual argument name; TRUE/FALSE instead of T/F.)
pr.with.scaling <- prcomp(pokemon_pr, scale. = TRUE, center = TRUE)
# PCA model without scaling: pr.without.scaling
# Centring stays enabled so the two models differ only in scaling; turning
# centring off as well would confound the comparison.
pr.without.scaling <- prcomp(pokemon_pr, scale. = FALSE, center = TRUE)
# Biplots of both models for comparison
biplot(pr.without.scaling)
biplot(pr.with.scaling)