library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
setwd("C:/Users/MCuser/Downloads")
penguin <- read_csv("penguins(1).csv")
## Rows: 344 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (4): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

I selected the dataset penguins which looks bill length, bill depth, flipper length, and body mass based on 3 species. I am performing a PCA to see any relationships or patterns among the data in relation to one point, species.

# Subset only the numeric variables you want to visualize
pgn <- penguin[, c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", 
                            "body_mass_g", "species")] |>
  drop_na()
head(pgn)
## # A tibble: 6 × 5
##   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species
##            <dbl>         <dbl>             <dbl>       <dbl> <chr>  
## 1           39.1          18.7               181        3750 Adelie 
## 2           39.5          17.4               186        3800 Adelie 
## 3           40.3          18                 195        3250 Adelie 
## 4           36.7          19.3               193        3450 Adelie 
## 5           39.3          20.6               190        3650 Adelie 
## 6           38.9          17.8               181        3625 Adelie
#plot
ggpairs(pgn, aes(color = species, alpha = 0.7))+
   theme_bw(base_size = 8)  # Smaller font size

library(broom)  # for augment(), tidy()

pca_fit <- pgn |>
  select(where(is.numeric)) |>
  scale() |>
  prcomp()
print(pca_fit)
## Standard deviations (1, .., p=4):
## [1] 1.6594442 0.8789293 0.6043475 0.3293816
## 
## Rotation (n x k) = (4 x 4):
##                          PC1          PC2        PC3        PC4
## bill_length_mm     0.4552503 -0.597031143 -0.6443012  0.1455231
## bill_depth_mm     -0.4003347 -0.797766572  0.4184272 -0.1679860
## flipper_length_mm  0.5760133 -0.002282201  0.2320840 -0.7837987
## body_mass_g        0.5483502 -0.084362920  0.5966001  0.5798821
pca_fit |>
  # add PCs to the original dataset
  augment(pgn) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = species))+
  xlim(-5,5)+
  ylim(-4,4)+
  xlab("PC1")+
  ylab("PC2")+
  guides(color = guide_legend(title = NULL))+
  theme_minimal()

pca_fit |>
  # add PCs to the original dataset
  augment(pgn) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = species)) +
  
  # Add the PCA1 and PCA2 arrows
  geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0), 
               arrow = arrow(type = "closed", length = unit(0.1, "inches")),
               color = "black") +  # PC1 arrow
  geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4), 
               arrow = arrow(type = "closed", length = unit(0.1, "inches")),
               color = "black") +  # PC2 arrow
  
   # Add text labels- Positioning and color of the label
  geom_text(aes(x = 5, y = 0, label = "PC1"), 
            vjust = -0.5, color = "black") +  
   
  geom_text(aes(x = 0, y = 4, label = "PC2"), 
            hjust = -0.5, color = "black") +  
  xlim(-5, 5) +
  ylim(-4, 4) +
  xlab("PC1") +
  ylab("PC2") +
  guides(color = guide_legend(title = NULL)) +  # No title for the legend
  scale_color_manual(values = c("Adelie" = "maroon", "Gentoo" = "lightblue", "Chinstrap" = "yellow"))+
  theme_minimal()

Rotation arrow

arrow_style <- arrow(
  angle = 20, length = grid::unit(8, "pt"),
  ends = "first", type = "closed"
)
pca_fit |>
  # extract rotation matrix
  tidy(matrix = "rotation") |>
  pivot_wider(
    names_from = "PC", values_from = "value",
    names_prefix = "PC"
  ) |>
  ggplot(aes(PC1, PC2)) +
  geom_segment(
    xend = 0, yend = 0,
    arrow = arrow_style
  ) +
  geom_text(aes(label = column), hjust = 1) +
  xlim(-1.5, 0.5) + ylim(-1, 1) + 
  coord_fixed()+
  theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).

The dot plot is very clustered by species showing that species does dictate penguin size. The arrows indicate that the difference between species is really only dictated bill length and depth. The length of the arrow corresponding to bill depth shows that bill depth accounts for more of a difference between species than bill length. Both length and depth are negative along PC2 indicating that as PC2 increases, length and depth decrease. When looking at this in the context of the dot plot that means that penguins higher up on PC2 have smaller bills so that would be the Adelie and Gentoo penguins. Bill length is positive along PC1, while depth is negative that means that as penguins increase in bill length they decrease in depth and vice versa.

pca_fit |>
  # extract eigenvalues
  tidy(matrix = "eigenvalues") |>
  ggplot(aes(PC, percent)) + 
  geom_col(fill= "darkred") + 
  scale_x_continuous(
    # create one axis tick per PC
    breaks = 1:6
  ) +
  scale_y_continuous(
    name = "variance explained",
    breaks = seq(0, 1, by = 0.1), 
    # format y axis ticks as percent values
    label = scales::label_percent(accuracy = 1)
  )+
  xlab("Principal Component (PC)") +
  theme_minimal()

PC1 explains about 69% of variance, and PC2 explains about 19% of variance, together they explain about 88% of variance.