library(Stat2Data)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
# Read the trade records from the working directory and preview the first rows.
pokemon <- read.csv(file = "pokemon.csv")
head(pokemon, n = 6L)
##         Date  Time    Pokemon Trainer.Region Trainer.Subregion Pokemon.Region
## 1 12/13/2016 17:28   Oricorio    South Korea                                 
## 2 12/13/2016 17:30      Zubat  United States             Texas            GER
## 3 12/13/2016 17:31    Carbink  United States          Oklahoma               
## 4 12/13/2016 17:33     Klefki  United States       Connecticut               
## 5 12/13/2016 17:34    Luvdisc  United States                                 
## 6 12/13/2016 17:35 Roggenrola United Kingdom                              SPA
##   Level Level.Met Gender    Type1  Type2  Nature Pokeball Held.Item Perfect.IVs
## 1    13        10      F Electric Flying   Sassy     Poke     FALSE           0
## 2     8         8      M   Poison Flying   Hardy     Poke     FALSE           1
## 3    10        10      N     Rock  Fairy Relaxed     Poke     FALSE           0
## 4    29        29      M    Steel  Fairy   Jolly    Quick     FALSE           0
## 5    16        16      M    Water        Naughty    Quick     FALSE           0
## 6    10        10      M     Rock         Modest     Poke     FALSE           1

1. Introduction

The dataset I have selected is the Pokemon dataset, which contains 15 variables and 500 observations. It focuses on Pokemon trades, with each row describing an individual trade/card. The observations come from the dataset owner's own personal trades. The purpose of performing PCA on this dataset is to reorganize the data into a new perspective that highlights patterns, making it easier to understand and analyze the key aspects of the data that I am interested in.

2. Data Preparation

# Inspect the type and first few values of every column.
utils::str(pokemon)
## 'data.frame':    500 obs. of  15 variables:
##  $ Date             : chr  "12/13/2016" "12/13/2016" "12/13/2016" "12/13/2016" ...
##  $ Time             : chr  "17:28" "17:30" "17:31" "17:33" ...
##  $ Pokemon          : chr  "Oricorio" "Zubat" "Carbink" "Klefki" ...
##  $ Trainer.Region   : chr  "South Korea" "United States" "United States" "United States" ...
##  $ Trainer.Subregion: chr  "" "Texas" "Oklahoma" "Connecticut" ...
##  $ Pokemon.Region   : chr  "" "GER" "" "" ...
##  $ Level            : int  13 8 10 29 16 10 19 7 1 1 ...
##  $ Level.Met        : int  10 8 10 29 16 10 19 7 1 1 ...
##  $ Gender           : chr  "F" "M" "N" "M" ...
##  $ Type1            : chr  "Electric" "Poison" "Rock" "Steel" ...
##  $ Type2            : chr  "Flying" "Flying" "Fairy" "Fairy" ...
##  $ Nature           : chr  "Sassy" "Hardy" "Relaxed" "Jolly" ...
##  $ Pokeball         : chr  "Poke" "Poke" "Poke" "Quick" ...
##  $ Held.Item        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Perfect.IVs      : int  0 1 0 0 0 1 0 0 3 5 ...
# Per-column summaries: quartiles and mean for the numeric columns,
# TRUE/FALSE counts for the logical column.
summary(object = pokemon)
##      Date               Time             Pokemon          Trainer.Region    
##  Length:500         Length:500         Length:500         Length:500        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Trainer.Subregion  Pokemon.Region         Level         Level.Met    
##  Length:500         Length:500         Min.   : 1.00   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.: 1.00   1st Qu.: 1.00  
##  Mode  :character   Mode  :character   Median : 6.00   Median : 5.00  
##                                        Mean   :10.91   Mean   :10.12  
##                                        3rd Qu.:16.00   3rd Qu.:13.00  
##                                        Max.   :57.00   Max.   :57.00  
##     Gender             Type1              Type2              Nature         
##  Length:500         Length:500         Length:500         Length:500        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Pokeball         Held.Item        Perfect.IVs  
##  Length:500         Mode :logical   Min.   :0.00  
##  Class :character   FALSE:490       1st Qu.:0.00  
##  Mode  :character   TRUE :10        Median :0.00  
##                                     Mean   :0.93  
##                                     3rd Qu.:1.00  
##                                     Max.   :6.00
# Keep the three numeric variables plus the categorical Gender column, which
# is used only to colour the panels.
poke_subset <- pokemon |>
  select(Level, Level.Met, Perfect.IVs, Gender)

# Pairwise plot matrix of the subset, coloured by Gender.
# NOTE(review): alpha sits inside aes(), so 0.7 is treated as a mapped value
# rather than a fixed transparency -- confirm this is the intended look.
ggpairs(poke_subset, mapping = aes(colour = Gender, alpha = 0.7)) +
  theme_bw(base_size = 8) # smaller font size
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3. Principal Component Analysis (PCA) Setup

# PCA on the numeric columns only (Level, Level.Met, Perfect.IVs).
# Every column is centred and scaled to unit variance before prcomp() so that
# variables measured on different ranges contribute comparably.
pca_fit <- pokemon |>
  select(where(is.numeric)) |>
  scale(center = TRUE, scale = TRUE) |>
  prcomp()

# Show the component standard deviations and the rotation (loading) matrix.
print(pca_fit)
## Standard deviations (1, .., p=3):
## [1] 1.4622077 0.9093526 0.1871537
## 
## Rotation (n x k) = (3 x 3):
##                    PC1        PC2         PC3
## Level       -0.6612829 -0.2392569 -0.71095788
## Level.Met   -0.6559865 -0.2752668  0.70278723
## Perfect.IVs  0.3638498 -0.9311199 -0.02507988

4. Visualize PCA Results

# Score plot: every observation placed on the first two principal components
# and coloured by Gender.
pca_fit |>
  augment(pokemon) |> # adds the .fittedPC* score columns to the original data
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = Gender)) +
  # Guarantee at least the -5..5 by -4..4 window but let the axes grow to fit
  # the data: hard xlim()/ylim() silently discarded one observation.
  expand_limits(x = c(-5, 5), y = c(-4, 4)) +
  xlab("PC1") +
  ylab("PC2") +
  guides(color = guide_legend(title = NULL)) + # legend without a title
  theme_minimal()

# arrows

# Score plot with hand-drawn PC1 / PC2 axes through the origin and a
# relabelled, recoloured Gender legend.
pca_fit |>
  augment(pokemon) |> # adds the .fittedPC* score columns to the original data
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = Gender)) +

  # PC1 and PC2 axis arrows. annotate() draws each arrow exactly once;
  # geom_segment() with constant aes() values redraws it for every data row.
  annotate(
    "segment",
    x = -4.9, y = 0, xend = 5, yend = 0,
    arrow = arrow(type = "closed", length = unit(0.1, "inches")),
    color = "black"
  ) +
  annotate(
    "segment",
    x = 0, y = -2.5, xend = 0, yend = 4,
    arrow = arrow(type = "closed", length = unit(0.1, "inches")),
    color = "black"
  ) +

  # Text labels at the arrow tips, nudged off the arrow heads.
  annotate(
    "text",
    x = 5, y = 0, label = "PC1",
    vjust = -0.5, color = "black"
  ) +
  annotate(
    "text",
    x = 0, y = 4, label = "PC2",
    hjust = -0.5, color = "black"
  ) +

  # Guarantee at least the -5..5 by -4..4 window but let the axes grow to fit
  # the data: hard xlim()/ylim() silently discarded one observation.
  expand_limits(x = c(-5, 5), y = c(-4, 4)) +
  xlab("PC1") +
  ylab("PC2") +
  guides(color = guide_legend(title = NULL)) + # legend without a title
  scale_color_manual(
    values = c("F" = "darkorange", "M" = "blue", "N" = "purple"),
    labels = c("F" = "Female pokemon", "M" = "Male pokemon", "N" = "Unknown")
  ) +
  theme_minimal()

Note: "Unknown" in Gender is not an NA observation; it actually means the gender of the Pokemon is unknown.

#Rotation Matrix

# Arrow head for the loading vectors. ends = "first" puts the head at the
# segment's start, i.e. at the loading coordinates rather than at the origin.
arrow_style <- arrow(
  type = "closed",
  ends = "first",
  angle = 20,
  length = grid::unit(8, "pt")
)

# Loading plot: one arrow per original variable, showing its weights on PC1
# and PC2.
pca_fit |>
  tidy(matrix = "rotation") |>
  # reshape to one row per variable and one column per component
  pivot_wider(names_from = "PC", names_prefix = "PC", values_from = "value") |>
  ggplot(mapping = aes(x = PC1, y = PC2)) +
  geom_segment(xend = 0, yend = 0, arrow = arrow_style) +
  geom_text(aes(label = column), hjust = 1) +
  scale_x_continuous(limits = c(-1.5, 0.5)) +
  scale_y_continuous(limits = c(-1, 1)) +
  coord_fixed() + # equal units on both axes
  theme_minimal()

5. PCA Analysis Summary

What does PC1 represent?

Pokemon of different genders separate somewhat along PC1 in the PCA plot. Since PCA centers the data, this separation happens around zero. Pokemon with positive PC1 values tend to be male, and those with negative values are usually female. However, the males still overlap considerably with the other groups; even so, the trend is clear: there is some degree of separation.

What does PC2 represent?

Along PC2, there is no clear separation between Male, Female, and Unknown Pokemon, as they appear anywhere along the axis. However, we do see that the points are heavily clustered at positive values of PC2.

6. Variance Explained Plot

# bargraph

# Scree plot: share of the total variance captured by each principal component.
pca_fit |>
  tidy(matrix = "eigenvalues") |>
  ggplot(aes(PC, percent)) +
  geom_col(fill = "lightblue") +
  scale_x_continuous(
    breaks = 1:3 # one tick per component
  ) +
  scale_y_continuous(
    name = "variance explained",
    breaks = seq(0, 1, by = 0.1),
    # argument spelled out in full; `label` only worked via partial matching
    labels = scales::label_percent(accuracy = 1)
  ) +
  xlab("Principal Component (PC)") +
  theme_minimal()

# The numbers behind the plot: standard deviation, proportion of variance and
# cumulative proportion for each component.
pca_fit |>
  tidy(matrix = "eigenvalues")
## # A tibble: 3 × 4
##      PC std.dev percent cumulative
##   <dbl>   <dbl>   <dbl>      <dbl>
## 1     1   1.46   0.713       0.713
## 2     2   0.909  0.276       0.988
## 3     3   0.187  0.0117      1

Together, PC1 and PC2 explain more than 98% of the total variation in the dataset (cumulative proportion 0.988).