library(tidyverse)
## Warning: package 'purrr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.3
The dataset I selected is palmerpengiuns, a dataset collected by Dr. Kristen Gorman. The dataset covers information such as bill lenght, bill depth, species, which island the penguin is from, etc. The purpose of using this data to perform PCA would be to see the correlation between the species and the bill length and depth. Specifically, which species has a higher bill length and which has a higher bill depth. Essentially, I want to have an idea in my head of what the penguin’s bill might look like without looking up the species.
setwd("~/data 110")
penguins <- read_csv("penguins(1).csv")
## Rows: 344 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (4): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(penguins)
## # A tibble: 6 × 7
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## # ℹ 1 more variable: sex <chr>
str(penguins)
## spc_tbl_ [344 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ species : chr [1:344] "Adelie" "Adelie" "Adelie" "Adelie" ...
## $ island : chr [1:344] "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: num [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : num [1:344] 3750 3800 3250 NA 3450 ...
## $ sex : chr [1:344] "MALE" "FEMALE" "FEMALE" NA ...
## - attr(*, "spec")=
## .. cols(
## .. species = col_character(),
## .. island = col_character(),
## .. bill_length_mm = col_double(),
## .. bill_depth_mm = col_double(),
## .. flipper_length_mm = col_double(),
## .. body_mass_g = col_double(),
## .. sex = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
penguins_clean <- penguins |>
select(bill_length_mm, bill_depth_mm, flipper_length_mm,
body_mass_g, species) |>
drop_na()
head(penguins_clean)
## # A tibble: 6 × 5
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 39.1 18.7 181 3750 Adelie
## 2 39.5 17.4 186 3800 Adelie
## 3 40.3 18 195 3250 Adelie
## 4 36.7 19.3 193 3450 Adelie
## 5 39.3 20.6 190 3650 Adelie
## 6 38.9 17.8 181 3625 Adelie
pca_fit <- penguins_clean |>
select(where(is.numeric)) |>
scale() |>
prcomp()
pca_fit
## Standard deviations (1, .., p=4):
## [1] 1.6594442 0.8789293 0.6043475 0.3293816
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## bill_length_mm 0.4552503 -0.597031143 -0.6443012 0.1455231
## bill_depth_mm -0.4003347 -0.797766572 0.4184272 -0.1679860
## flipper_length_mm 0.5760133 -0.002282201 0.2320840 -0.7837987
## body_mass_g 0.5483502 -0.084362920 0.5966001 0.5798821
unique(penguins_clean$species)
## [1] "Adelie" "Chinstrap" "Gentoo"
count(penguins_clean, species)
## # A tibble: 3 × 2
## species n
## <chr> <int>
## 1 Adelie 151
## 2 Chinstrap 68
## 3 Gentoo 123
library(broom)
pca_fit |>
# add PCs to the original dataset
augment(penguins_clean) |>
ggplot(aes(.fittedPC1, .fittedPC2)) +
geom_point(aes(color = species), alpha = 0.5) +
# Add the PCA1 and PCA2 arrows
geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0),
arrow = arrow(type = "closed", length = unit(0.1, "inches")),
color = "black") + # PC1 arrow
geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4),
arrow = arrow(type = "closed", length = unit(0.1, "inches")),
color = "black") + # PC2 arrow
# Add text labels- Positioning and color of the label
geom_text(aes(x = 5, y = 0, label = "PC1"),
vjust = -0.5, color = "black") +
geom_text(aes(x = 0, y = 4, label = "PC2"),
hjust = -0.5, color = "black") +
xlim(-5, 5) +
ylim(-4, 4) +
xlab("PC1") +
ylab("PC2") +
guides(color = guide_legend(title = NULL)) + # No title for the legend
scale_color_manual(values = c("Adelie" = "#a2681a", "Chinstrap" = "#1a94a2", "Gentoo" = "black"))+
theme_minimal()
PC1 is representing bill length, and PC2 is representing bill depth. It appears that overall, adelie and chinstrap penguins have a lower bill length, and a moderate bill depth. However, Gentoo penguins have a lower bill length, though they still have a similar bill depth as the other two species.
For funsies
arrow_style <- arrow(
angle = 20, length = grid::unit(8, "pt"),
ends = "first", type = "closed"
)
pca_fit |>
# extract rotation matrix
tidy(matrix = "rotation") |>
pivot_wider(
names_from = "PC", values_from = "value",
names_prefix = "PC"
) |>
ggplot(aes(PC1, PC2)) +
geom_segment(
xend = 0, yend = 0,
arrow = arrow_style
) +
geom_text(aes(label = column), hjust = 1) +
xlim(-1.5, 0.5) + ylim(-1, 1) +
coord_fixed()+
theme_minimal()
pca_fit |>
# extract eigenvalues
tidy(matrix = "eigenvalues") |>
ggplot(aes(PC, percent)) +
geom_col(fill= "#1a94a2") +
scale_x_continuous(
# create one axis tick per PC
breaks = 1:6
) +
scale_y_continuous(
name = "variance explained",
breaks = seq(0, 1, by = 0.1),
# format y axis ticks as percent values
label = scales::label_percent(accuracy = 1)
)+
xlab("Principal Component (PC)") +
theme_stata()
pca_fit |>
tidy(matrix = "eigenvalues")
## # A tibble: 4 × 4
## PC std.dev percent cumulative
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1.66 0.688 0.688
## 2 2 0.879 0.193 0.882
## 3 3 0.604 0.0913 0.973
## 4 4 0.329 0.0271 1
PC1 explains 68.84 percent of the variance and PC2 explains 19.31 percent of the variance. Together, they make up 88.16 percent of the variance.