Introduction-

The dataset I am using contains measurements of three different penguin species from different islands. Variables include bill length, bill depth, flipper length, body mass, and species type. Using PCA we can find patterns in the data. PCA helps to visualize how the species differ based on their physical features and identifies the main traits that separate them.

setwd("/Users/natalia/Stats/Datasets")


library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
penguins <- read_csv("penguins.csv")
## Rows: 344 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (4): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
penguin_clean <- penguins |>
  drop_na() |>
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, species)



head(penguin_clean)
## # A tibble: 6 × 5
##   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species
##            <dbl>         <dbl>             <dbl>       <dbl> <chr>  
## 1           39.1          18.7               181        3750 Adelie 
## 2           39.5          17.4               186        3800 Adelie 
## 3           40.3          18                 195        3250 Adelie 
## 4           36.7          19.3               193        3450 Adelie 
## 5           39.3          20.6               190        3650 Adelie 
## 6           38.9          17.8               181        3625 Adelie
pca_fit2 <- penguin_clean |> 
  select(where(is.numeric)) |>
  scale() |>                   
  prcomp()                      



pca_fit2
## Standard deviations (1, .., p=4):
## [1] 1.6569115 0.8821095 0.6071594 0.3284579
## 
## Rotation (n x k) = (4 x 4):
##                          PC1         PC2        PC3        PC4
## bill_length_mm     0.4537532 -0.60019490 -0.6424951  0.1451695
## bill_depth_mm     -0.3990472 -0.79616951  0.4258004 -0.1599044
## flipper_length_mm  0.5768250 -0.00578817  0.2360952 -0.7819837
## body_mass_g        0.5496747 -0.07646366  0.5917374  0.5846861
summary(pca_fit2)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.6569 0.8821 0.60716 0.32846
## Proportion of Variance 0.6863 0.1945 0.09216 0.02697
## Cumulative Proportion  0.6863 0.8809 0.97303 1.00000
library(broom)  

pca_fit2 |>
  
  augment(penguin_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = species))+
  xlim(-5,5)+
  ylim(-4,4)+
  xlab("PC1")+
  ylab("PC2")+
  guides(color = guide_legend(title = NULL))+
  theme_minimal()

arrow_style <- arrow(
  angle = 20, length = grid::unit(8, "pt"),
  ends = "first", type = "closed"
)

# Extract rotation matrix for your pca_fit2
rotation <- pca_fit2 |>
  tidy(matrix = "rotation") |>
  pivot_wider(
    names_from = "PC", values_from = "value",
    names_prefix = "PC"
  )

# Make the plot, Chatgpt and life expectancy notes
pca_fit2 |>
  augment(penguin_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_segment(
    data = rotation,
    aes(x = PC1, y = PC2, xend = 0, yend = 0),  # arrows point *to* the origin
    arrow = arrow_style,
    inherit.aes = FALSE
  ) +
  geom_text(
    data = rotation,
    aes(x = PC1, y = PC2, label = column),
    hjust = 1,
    inherit.aes = FALSE
  ) +
  xlim(-5, 5) +
  ylim(-4, 4) +
  xlab("PC1") +
  ylab("PC2") +
  coord_fixed() +
  guides(color = guide_legend(title = NULL)) +
  theme_minimal()

PC1 reflects differences in body size. Penguins with longer flippers, heavier weight, and longer bills have higher PC1 scores, while smaller penguins have a lower PC1

PC2 separates penguins based on how deep their bills are

pca_fit2 |>
  # extract eigenvalues
  tidy(matrix = "eigenvalues") |>
  ggplot(aes(PC, percent)) + 
  geom_col(fill= "#2e0354") + 
  scale_x_continuous(
    # create one axis tick per PC
    breaks = 1:6
  ) +
  scale_y_continuous(
    name = "variance explained",
    breaks = seq(0, 1, by = 0.1), 
    # format y axis ticks as percent values
    label = scales::label_percent(accuracy = 1)
  )+
  xlab("Principal Component (PC)") +
  theme_minimal()

pca_fit2 |>
 tidy(matrix = "eigenvalues")
## # A tibble: 4 × 4
##      PC std.dev percent cumulative
##   <dbl>   <dbl>   <dbl>      <dbl>
## 1     1   1.66   0.686       0.686
## 2     2   0.882  0.195       0.881
## 3     3   0.607  0.0922      0.973
## 4     4   0.328  0.0270      1