library(Stat2Data)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
# Load the Pokemon trade records from the working directory.
pokemon <- read.csv(file = "pokemon.csv", header = TRUE)
# Preview the first six rows to check that the columns were parsed as expected.
head(pokemon, n = 6L)
## Date Time Pokemon Trainer.Region Trainer.Subregion Pokemon.Region
## 1 12/13/2016 17:28 Oricorio South Korea
## 2 12/13/2016 17:30 Zubat United States Texas GER
## 3 12/13/2016 17:31 Carbink United States Oklahoma
## 4 12/13/2016 17:33 Klefki United States Connecticut
## 5 12/13/2016 17:34 Luvdisc United States
## 6 12/13/2016 17:35 Roggenrola United Kingdom SPA
## Level Level.Met Gender Type1 Type2 Nature Pokeball Held.Item Perfect.IVs
## 1 13 10 F Electric Flying Sassy Poke FALSE 0
## 2 8 8 M Poison Flying Hardy Poke FALSE 1
## 3 10 10 N Rock Fairy Relaxed Poke FALSE 0
## 4 29 29 M Steel Fairy Jolly Quick FALSE 0
## 5 16 16 M Water Naughty Quick FALSE 0
## 6 10 10 M Rock Modest Poke FALSE 1
1. Introduction
The dataset I selected is the Pokemon dataset, which contains 15 variables and 500 observations. The dataset focuses on Pokemon trades and records details about each individual trade/card. The observations come from the owner's own personal trades. The purpose of performing PCA on this dataset is to reorganize the data into a new perspective that highlights patterns, making it easier to understand and analyze the key aspects of the data that I am interested in.
2. Data Preparation
# Inspect the structure of the data frame: column names, types and a few leading values.
str(pokemon)
## 'data.frame': 500 obs. of 15 variables:
## $ Date : chr "12/13/2016" "12/13/2016" "12/13/2016" "12/13/2016" ...
## $ Time : chr "17:28" "17:30" "17:31" "17:33" ...
## $ Pokemon : chr "Oricorio" "Zubat" "Carbink" "Klefki" ...
## $ Trainer.Region : chr "South Korea" "United States" "United States" "United States" ...
## $ Trainer.Subregion: chr "" "Texas" "Oklahoma" "Connecticut" ...
## $ Pokemon.Region : chr "" "GER" "" "" ...
## $ Level : int 13 8 10 29 16 10 19 7 1 1 ...
## $ Level.Met : int 10 8 10 29 16 10 19 7 1 1 ...
## $ Gender : chr "F" "M" "N" "M" ...
## $ Type1 : chr "Electric" "Poison" "Rock" "Steel" ...
## $ Type2 : chr "Flying" "Flying" "Fairy" "Fairy" ...
## $ Nature : chr "Sassy" "Hardy" "Relaxed" "Jolly" ...
## $ Pokeball : chr "Poke" "Poke" "Poke" "Quick" ...
## $ Held.Item : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Perfect.IVs : int 0 1 0 0 0 1 0 0 3 5 ...
# Per-column summary: five-number summaries for numeric columns, counts for logical ones.
summary(pokemon)
## Date Time Pokemon Trainer.Region
## Length:500 Length:500 Length:500 Length:500
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Trainer.Subregion Pokemon.Region Level Level.Met
## Length:500 Length:500 Min. : 1.00 Min. : 1.00
## Class :character Class :character 1st Qu.: 1.00 1st Qu.: 1.00
## Mode :character Mode :character Median : 6.00 Median : 5.00
## Mean :10.91 Mean :10.12
## 3rd Qu.:16.00 3rd Qu.:13.00
## Max. :57.00 Max. :57.00
## Gender Type1 Type2 Nature
## Length:500 Length:500 Length:500 Length:500
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Pokeball Held.Item Perfect.IVs
## Length:500 Mode :logical Min. :0.00
## Class :character FALSE:490 1st Qu.:0.00
## Mode :character TRUE :10 Median :0.00
## Mean :0.93
## 3rd Qu.:1.00
## Max. :6.00
# Keep the three numeric variables, plus Gender, which is used only to colour the panels.
poke_subset <- pokemon[c("Level", "Level.Met", "Perfect.IVs", "Gender")]

# Pairwise plot matrix of the selected variables, coloured by Gender.
ggpairs(
  poke_subset,
  mapping = aes(color = Gender, alpha = 0.7)
) +
  theme_bw(base_size = 8) # Smaller font size
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
3. Principal Component Analysis (PCA) Setup
# Fit the PCA on the numeric columns only. The columns are standardised with
# scale() first so that every variable enters the analysis with unit variance.
pca_fit <- prcomp(
  scale(
    select(pokemon, where(is.numeric))
  )
)

# Print the standard deviations and the rotation (loadings) matrix.
pca_fit
## Standard deviations (1, .., p=3):
## [1] 1.4622077 0.9093526 0.1871537
##
## Rotation (n x k) = (3 x 3):
## PC1 PC2 PC3
## Level -0.6612829 -0.2392569 -0.71095788
## Level.Met -0.6559865 -0.2752668 0.70278723
## Perfect.IVs 0.3638498 -0.9311199 -0.02507988
4. Visualize PCA Results
# Scatter plot of the observations in PC1/PC2 space, coloured by Gender.
# augment() appends the fitted PC coordinates (.fittedPC1, .fittedPC2, ...)
# to the original data. Points outside the axis limits are dropped by ggplot2.
ggplot(
  augment(pca_fit, pokemon),
  aes(x = .fittedPC1, y = .fittedPC2)
) +
  geom_point(aes(color = Gender)) +
  xlim(-5, 5) +
  ylim(-4, 4) +
  labs(x = "PC1", y = "PC2") +
  guides(color = guide_legend(title = NULL)) +
  theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# PC1/PC2 score plot with arrows marking the two principal axes.
#
# The axis arrows and their labels are fixed decorations, not data, so they are
# added with annotate(). A geom_*() layer with constant aesthetics would inherit
# the full data set, draw each decoration once per row, and make ggplot2 warn
# that "All aesthetics have length 1".
pca_fit |>
  augment(pokemon) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = Gender)) +
  # PC1 (horizontal) and PC2 (vertical) axis arrows
  annotate(
    "segment",
    x = -4.9, y = 0, xend = 5, yend = 0,
    arrow = arrow(type = "closed", length = unit(0.1, "inches")),
    color = "black"
  ) +
  annotate(
    "segment",
    x = 0, y = -2.5, xend = 0, yend = 4,
    arrow = arrow(type = "closed", length = unit(0.1, "inches")),
    color = "black"
  ) +
  # Text labels placed at the arrow heads
  annotate("text", x = 5, y = 0, label = "PC1", vjust = -0.5, color = "black") +
  annotate("text", x = 0, y = 4, label = "PC2", hjust = -0.5, color = "black") +
  # Points outside these limits are dropped by ggplot2
  xlim(-5, 5) +
  ylim(-4, 4) +
  xlab("PC1") +
  ylab("PC2") +
  guides(color = guide_legend(title = NULL)) +
  scale_color_manual(
    values = c("F" = "darkorange", "M" = "blue", "N" = "purple"),
    labels = c("F" = "Female pokemon", "M" = "Male pokemon", "N" = "Unknown")
  ) +
  theme_minimal()
## Warning in geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0), arrow = arrow(type = "closed", : All aesthetics have length 1, but the data has 500 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning in geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4), arrow = arrow(type = "closed", : All aesthetics have length 1, but the data has 500 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning in geom_text(aes(x = 5, y = 0, label = "PC1"), vjust = -0.5, color = "black"): All aesthetics have length 1, but the data has 500 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning in geom_text(aes(x = 0, y = 4, label = "PC2"), hjust = -0.5, color = "black"): All aesthetics have length 1, but the data has 500 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
Note: "Unknown" in Gender is not an NA observation; it actually means that the gender of the Pokemon is unknown.
# Rotation matrix ----
# Arrow style shared by the loading vectors; ends = "first" puts the arrow head
# at each variable's (PC1, PC2) position, with the tail at the origin.
arrow_style <- arrow(
  angle = 20,
  length = grid::unit(8, "pt"),
  ends = "first",
  type = "closed"
)

# Reshape the tidy rotation matrix to one row per original variable with one
# column per component (PC1, PC2, ...), then draw each loading as an arrow.
ggplot(
  pivot_wider(
    tidy(pca_fit, matrix = "rotation"),
    names_from = "PC",
    values_from = "value",
    names_prefix = "PC"
  ),
  aes(x = PC1, y = PC2)
) +
  geom_segment(xend = 0, yend = 0, arrow = arrow_style) +
  geom_text(aes(label = column), hjust = 1) +
  xlim(-1.5, 0.5) +
  ylim(-1, 1) +
  coord_fixed() +
  theme_minimal()
5. PCA Analysis Summary
What does PC1 represent?
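The genders separate somewhat along PC1 in the PCA plot. Since PCA centers the data, this separation happens around zero. Pokemon with positive PC1 values tend to be male, and those with negative values are usually female; the male points still overlap heavily with the others, but the trend is clear that there is some type of separation. In terms of the original variables, the rotation matrix shows that PC1 loads negatively on Level and Level.Met (about -0.66 each) and positively on Perfect.IVs (about 0.36), so PC1 mainly contrasts low-level Pokemon (positive PC1) with high-level Pokemon (negative PC1).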
What does PC2 represent?
Along PC2, there is no clear separation between male, female, and unknown-gender Pokemon, as they appear anywhere along the axis. However, we do see that the points are heavily clustered at positive values of PC2.
6. Variance Explained Plot
# Bar graph of the share of total variance explained by each principal component.
# tidy(matrix = "eigenvalues") returns one row per component with its
# std.dev, percent (a proportion in [0, 1]) and cumulative proportion.
pca_fit |>
  tidy(matrix = "eigenvalues") |>
  ggplot(aes(PC, percent)) +
  geom_col(fill = "lightblue") +
  scale_x_continuous(breaks = 1:3) +
  scale_y_continuous(
    name = "variance explained",
    breaks = seq(0, 1, by = 0.1),
    # Use the full argument name `labels` rather than relying on partial matching.
    labels = scales::label_percent(accuracy = 1)
  ) +
  xlab("Principal Component (PC)") +
  theme_minimal()
# Tabulate the standard deviation, proportion of variance and cumulative
# proportion for every principal component.
tidy(pca_fit, matrix = "eigenvalues")
## # A tibble: 3 × 4
## PC std.dev percent cumulative
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1.46 0.713 0.713
## 2 2 0.909 0.276 0.988
## 3 3 0.187 0.0117 1
Together, PC1 and PC2 explain more than 98% of the total variation in the dataset (about 71% and 28%, respectively).