library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.3

Introduction

The dataset I selected is palmerpengiuns, a dataset collected by Dr. Kristen Gorman. The dataset covers information such as bill lenght, bill depth, species, which island the penguin is from, etc. The purpose of using this data to perform PCA would be to see the correlation between the species and the bill length and depth. Specifically, which species has a higher bill length and which has a higher bill depth. Essentially, I want to have an idea in my head of what the penguin’s bill might look like without looking up the species.

setwd("~/data 110")
penguins <- read_csv("penguins(1).csv")
## Rows: 344 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (4): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(penguins)
## # A tibble: 6 × 7
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <chr>   <chr>              <dbl>         <dbl>             <dbl>       <dbl>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           NA            NA                  NA          NA
## 5 Adelie  Torgersen           36.7          19.3               193        3450
## 6 Adelie  Torgersen           39.3          20.6               190        3650
## # ℹ 1 more variable: sex <chr>
str(penguins)
## spc_tbl_ [344 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ species          : chr [1:344] "Adelie" "Adelie" "Adelie" "Adelie" ...
##  $ island           : chr [1:344] "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: num [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : num [1:344] 3750 3800 3250 NA 3450 ...
##  $ sex              : chr [1:344] "MALE" "FEMALE" "FEMALE" NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   species = col_character(),
##   ..   island = col_character(),
##   ..   bill_length_mm = col_double(),
##   ..   bill_depth_mm = col_double(),
##   ..   flipper_length_mm = col_double(),
##   ..   body_mass_g = col_double(),
##   ..   sex = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Data Preparation

penguins_clean <- penguins |>
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, 
                            body_mass_g, species) |> 
  drop_na()
head(penguins_clean)
## # A tibble: 6 × 5
##   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species
##            <dbl>         <dbl>             <dbl>       <dbl> <chr>  
## 1           39.1          18.7               181        3750 Adelie 
## 2           39.5          17.4               186        3800 Adelie 
## 3           40.3          18                 195        3250 Adelie 
## 4           36.7          19.3               193        3450 Adelie 
## 5           39.3          20.6               190        3650 Adelie 
## 6           38.9          17.8               181        3625 Adelie

PCA Setup

pca_fit <- penguins_clean |> 
  select(where(is.numeric)) |>
  scale() |>                  
  prcomp()                   

pca_fit
## Standard deviations (1, .., p=4):
## [1] 1.6594442 0.8789293 0.6043475 0.3293816
## 
## Rotation (n x k) = (4 x 4):
##                          PC1          PC2        PC3        PC4
## bill_length_mm     0.4552503 -0.597031143 -0.6443012  0.1455231
## bill_depth_mm     -0.4003347 -0.797766572  0.4184272 -0.1679860
## flipper_length_mm  0.5760133 -0.002282201  0.2320840 -0.7837987
## body_mass_g        0.5483502 -0.084362920  0.5966001  0.5798821

PCA Visualizations

unique(penguins_clean$species)
## [1] "Adelie"    "Chinstrap" "Gentoo"
count(penguins_clean, species)
## # A tibble: 3 × 2
##   species       n
##   <chr>     <int>
## 1 Adelie      151
## 2 Chinstrap    68
## 3 Gentoo      123
library(broom)

pca_fit |>
  # add PCs to the original dataset
  augment(penguins_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = species), alpha = 0.5) +
  
  # Add the PCA1 and PCA2 arrows
  geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0), 
               arrow = arrow(type = "closed", length = unit(0.1, "inches")),
               color = "black") +  # PC1 arrow
  geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4), 
               arrow = arrow(type = "closed", length = unit(0.1, "inches")),
               color = "black") +  # PC2 arrow
  
   # Add text labels- Positioning and color of the label
  geom_text(aes(x = 5, y = 0, label = "PC1"), 
            vjust = -0.5, color = "black") +  
   
  geom_text(aes(x = 0, y = 4, label = "PC2"), 
            hjust = -0.5, color = "black") +  
  xlim(-5, 5) +
  ylim(-4, 4) +
  xlab("PC1") +
  ylab("PC2") +
  guides(color = guide_legend(title = NULL)) +  # No title for the legend
  scale_color_manual(values = c("Adelie" = "#a2681a", "Chinstrap" = "#1a94a2", "Gentoo" = "black"))+
  theme_minimal()

PCA analysis summary

PC1 is representing bill length, and PC2 is representing bill depth. It appears that overall, adelie and chinstrap penguins have a lower bill length, and a moderate bill depth. However, Gentoo penguins have a lower bill length, though they still have a similar bill depth as the other two species.

For funsies

arrow_style <- arrow(
  angle = 20, length = grid::unit(8, "pt"),
  ends = "first", type = "closed"
)
pca_fit |>
  # extract rotation matrix
  tidy(matrix = "rotation") |>
  pivot_wider(
    names_from = "PC", values_from = "value",
    names_prefix = "PC"
  ) |>
  ggplot(aes(PC1, PC2)) +
  geom_segment(
    xend = 0, yend = 0,
    arrow = arrow_style
  ) +
  geom_text(aes(label = column), hjust = 1) +
  xlim(-1.5, 0.5) + ylim(-1, 1) + 
  coord_fixed()+
  theme_minimal()

Variance Explained Plot

pca_fit |>
  # extract eigenvalues
  tidy(matrix = "eigenvalues") |>
  ggplot(aes(PC, percent)) + 
  geom_col(fill= "#1a94a2") + 
  scale_x_continuous(
    # create one axis tick per PC
    breaks = 1:6
  ) +
  scale_y_continuous(
    name = "variance explained",
    breaks = seq(0, 1, by = 0.1), 
    # format y axis ticks as percent values
    label = scales::label_percent(accuracy = 1)
  )+
  xlab("Principal Component (PC)") +
  theme_stata()

pca_fit |>
  tidy(matrix = "eigenvalues")
## # A tibble: 4 × 4
##      PC std.dev percent cumulative
##   <dbl>   <dbl>   <dbl>      <dbl>
## 1     1   1.66   0.688       0.688
## 2     2   0.879  0.193       0.882
## 3     3   0.604  0.0913      0.973
## 4     4   0.329  0.0271      1

PC1 explains 68.84 percent of the variance and PC2 explains 19.31 percent of the variance. Together, they make up 88.16 percent of the variance.