setwd("/Users/natalia/Stats/Datasets")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
penguins <- read_csv("penguins.csv")
## Rows: 344 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (4): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
penguin_clean <- penguins |>
drop_na() |>
select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, species)
head(penguin_clean)
## # A tibble: 6 × 5
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 39.1 18.7 181 3750 Adelie
## 2 39.5 17.4 186 3800 Adelie
## 3 40.3 18 195 3250 Adelie
## 4 36.7 19.3 193 3450 Adelie
## 5 39.3 20.6 190 3650 Adelie
## 6 38.9 17.8 181 3625 Adelie
pca_fit2 <- penguin_clean |>
select(where(is.numeric)) |>
scale() |>
prcomp()
pca_fit2
## Standard deviations (1, .., p=4):
## [1] 1.6569115 0.8821095 0.6071594 0.3284579
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## bill_length_mm 0.4537532 -0.60019490 -0.6424951 0.1451695
## bill_depth_mm -0.3990472 -0.79616951 0.4258004 -0.1599044
## flipper_length_mm 0.5768250 -0.00578817 0.2360952 -0.7819837
## body_mass_g 0.5496747 -0.07646366 0.5917374 0.5846861
summary(pca_fit2)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.6569 0.8821 0.60716 0.32846
## Proportion of Variance 0.6863 0.1945 0.09216 0.02697
## Cumulative Proportion 0.6863 0.8809 0.97303 1.00000
library(broom)
pca_fit2 |>
augment(penguin_clean) |>
ggplot(aes(.fittedPC1, .fittedPC2)) +
geom_point(aes(color = species))+
xlim(-5,5)+
ylim(-4,4)+
xlab("PC1")+
ylab("PC2")+
guides(color = guide_legend(title = NULL))+
theme_minimal()
arrow_style <- arrow(
angle = 20, length = grid::unit(8, "pt"),
ends = "first", type = "closed"
)
# Extract rotation matrix for your pca_fit2
rotation <- pca_fit2 |>
tidy(matrix = "rotation") |>
pivot_wider(
names_from = "PC", values_from = "value",
names_prefix = "PC"
)
# Make the plot
pca_fit2 |>
augment(penguin_clean) |>
ggplot(aes(.fittedPC1, .fittedPC2)) +
geom_segment(
data = rotation,
aes(x = PC1, y = PC2, xend = 0, yend = 0), # arrows point *to* the origin
arrow = arrow_style,
inherit.aes = FALSE
) +
geom_text(
data = rotation,
aes(x = PC1, y = PC2, label = column),
hjust = 1,
inherit.aes = FALSE
) +
xlim(-5, 5) +
ylim(-4, 4) +
xlab("PC1") +
ylab("PC2") +
coord_fixed() +
guides(color = guide_legend(title = NULL)) +
theme_minimal()
#chatgpt and life expectancy notes
PC1 reflects differences in body size. Penguins with longer flippers, heavier weight, and longer bills have higher PC1 scores, while smaller penguins have a lower PC1
PC2 separates penguins based on how deep their bills are
pca_fit2 |>
# extract eigenvalues
tidy(matrix = "eigenvalues") |>
ggplot(aes(PC, percent)) +
geom_col(fill= "#2e0354") +
scale_x_continuous(
# create one axis tick per PC
breaks = 1:6
) +
scale_y_continuous(
name = "variance explained",
breaks = seq(0, 1, by = 0.1),
# format y axis ticks as percent values
label = scales::label_percent(accuracy = 1)
)+
xlab("Principal Component (PC)") +
theme_minimal()
pca_fit2 |>
tidy(matrix = "eigenvalues")
## # A tibble: 4 × 4
## PC std.dev percent cumulative
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1.66 0.686 0.686
## 2 2 0.882 0.195 0.881
## 3 3 0.607 0.0922 0.973
## 4 4 0.328 0.0270 1
How much variation is explained by PC1 and PC2?
PC1+PC2= 88% is explained by PC1 and PC2