library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
setwd("C:/Users/MCuser/Downloads")
penguin <- read_csv("penguins(1).csv")
## Rows: 344 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (4): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
I selected the dataset penguins which looks bill length, bill depth, flipper length, and body mass based on 3 species. I am performing a PCA to see any relationships or patterns among the data in relation to one point, species.
# Subset only the numeric variables you want to visualize
pgn <- penguin[, c("bill_length_mm", "bill_depth_mm", "flipper_length_mm",
"body_mass_g", "species")] |>
drop_na()
head(pgn)
## # A tibble: 6 × 5
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 39.1 18.7 181 3750 Adelie
## 2 39.5 17.4 186 3800 Adelie
## 3 40.3 18 195 3250 Adelie
## 4 36.7 19.3 193 3450 Adelie
## 5 39.3 20.6 190 3650 Adelie
## 6 38.9 17.8 181 3625 Adelie
#plot
ggpairs(pgn, aes(color = species, alpha = 0.7))+
theme_bw(base_size = 8) # Smaller font size
library(broom) # for augment(), tidy()
pca_fit <- pgn |>
select(where(is.numeric)) |>
scale() |>
prcomp()
print(pca_fit)
## Standard deviations (1, .., p=4):
## [1] 1.6594442 0.8789293 0.6043475 0.3293816
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## bill_length_mm 0.4552503 -0.597031143 -0.6443012 0.1455231
## bill_depth_mm -0.4003347 -0.797766572 0.4184272 -0.1679860
## flipper_length_mm 0.5760133 -0.002282201 0.2320840 -0.7837987
## body_mass_g 0.5483502 -0.084362920 0.5966001 0.5798821
pca_fit |>
# add PCs to the original dataset
augment(pgn) |>
ggplot(aes(.fittedPC1, .fittedPC2)) +
geom_point(aes(color = species))+
xlim(-5,5)+
ylim(-4,4)+
xlab("PC1")+
ylab("PC2")+
guides(color = guide_legend(title = NULL))+
theme_minimal()
pca_fit |>
# add PCs to the original dataset
augment(pgn) |>
ggplot(aes(.fittedPC1, .fittedPC2)) +
geom_point(aes(color = species)) +
# Add the PCA1 and PCA2 arrows
geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0),
arrow = arrow(type = "closed", length = unit(0.1, "inches")),
color = "black") + # PC1 arrow
geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4),
arrow = arrow(type = "closed", length = unit(0.1, "inches")),
color = "black") + # PC2 arrow
# Add text labels- Positioning and color of the label
geom_text(aes(x = 5, y = 0, label = "PC1"),
vjust = -0.5, color = "black") +
geom_text(aes(x = 0, y = 4, label = "PC2"),
hjust = -0.5, color = "black") +
xlim(-5, 5) +
ylim(-4, 4) +
xlab("PC1") +
ylab("PC2") +
guides(color = guide_legend(title = NULL)) + # No title for the legend
scale_color_manual(values = c("Adelie" = "maroon", "Gentoo" = "lightblue", "Chinstrap" = "yellow"))+
theme_minimal()
Rotation arrow
arrow_style <- arrow(
angle = 20, length = grid::unit(8, "pt"),
ends = "first", type = "closed"
)
pca_fit |>
# extract rotation matrix
tidy(matrix = "rotation") |>
pivot_wider(
names_from = "PC", values_from = "value",
names_prefix = "PC"
) |>
ggplot(aes(PC1, PC2)) +
geom_segment(
xend = 0, yend = 0,
arrow = arrow_style
) +
geom_text(aes(label = column), hjust = 1) +
xlim(-1.5, 0.5) + ylim(-1, 1) +
coord_fixed()+
theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_segment()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).
The dot plot is very clustered by species showing that species does dictate penguin size. The arrows indicate that the difference between species is really only dictated bill length and depth. The length of the arrow corresponding to bill depth shows that bill depth accounts for more of a difference between species than bill length. Both length and depth are negative along PC2 indicating that as PC2 increases, length and depth decrease. When looking at this in the context of the dot plot that means that penguins higher up on PC2 have smaller bills so that would be the Adelie and Gentoo penguins. Bill length is positive along PC1, while depth is negative that means that as penguins increase in bill length they decrease in depth and vice versa.
pca_fit |>
# extract eigenvalues
tidy(matrix = "eigenvalues") |>
ggplot(aes(PC, percent)) +
geom_col(fill= "darkred") +
scale_x_continuous(
# create one axis tick per PC
breaks = 1:6
) +
scale_y_continuous(
name = "variance explained",
breaks = seq(0, 1, by = 0.1),
# format y axis ticks as percent values
label = scales::label_percent(accuracy = 1)
)+
xlab("Principal Component (PC)") +
theme_minimal()
PC1 explains about 69% of variance, and PC2 explains about 19% of variance, together they explain about 88% of variance.