setwd("/Users/natalia/Stats/Datasets")


library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
penguins <- read_csv("penguins.csv")
## Rows: 344 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): species, island, sex
## dbl (4): bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
penguin_clean <- penguins |>
  drop_na() |>
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, species)

head(penguin_clean)
## # A tibble: 6 × 5
##   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g species
##            <dbl>         <dbl>             <dbl>       <dbl> <chr>  
## 1           39.1          18.7               181        3750 Adelie 
## 2           39.5          17.4               186        3800 Adelie 
## 3           40.3          18                 195        3250 Adelie 
## 4           36.7          19.3               193        3450 Adelie 
## 5           39.3          20.6               190        3650 Adelie 
## 6           38.9          17.8               181        3625 Adelie
pca_fit2 <- penguin_clean |> 
  select(where(is.numeric)) |>
  scale() |>                   
  prcomp()                      

pca_fit2
## Standard deviations (1, .., p=4):
## [1] 1.6569115 0.8821095 0.6071594 0.3284579
## 
## Rotation (n x k) = (4 x 4):
##                          PC1         PC2        PC3        PC4
## bill_length_mm     0.4537532 -0.60019490 -0.6424951  0.1451695
## bill_depth_mm     -0.3990472 -0.79616951  0.4258004 -0.1599044
## flipper_length_mm  0.5768250 -0.00578817  0.2360952 -0.7819837
## body_mass_g        0.5496747 -0.07646366  0.5917374  0.5846861
summary(pca_fit2)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.6569 0.8821 0.60716 0.32846
## Proportion of Variance 0.6863 0.1945 0.09216 0.02697
## Cumulative Proportion  0.6863 0.8809 0.97303 1.00000
library(broom)  

pca_fit2 |>
  
  augment(penguin_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = species))+
  xlim(-5,5)+
  ylim(-4,4)+
  xlab("PC1")+
  ylab("PC2")+
  guides(color = guide_legend(title = NULL))+
  theme_minimal()

arrow_style <- arrow(
  angle = 20, length = grid::unit(8, "pt"),
  ends = "first", type = "closed"
)

# Extract rotation matrix for your pca_fit2
rotation <- pca_fit2 |>
  tidy(matrix = "rotation") |>
  pivot_wider(
    names_from = "PC", values_from = "value",
    names_prefix = "PC"
  )

# Make the plot
pca_fit2 |>
  augment(penguin_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_segment(
    data = rotation,
    aes(x = PC1, y = PC2, xend = 0, yend = 0),  # arrows point *to* the origin
    arrow = arrow_style,
    inherit.aes = FALSE
  ) +
  geom_text(
    data = rotation,
    aes(x = PC1, y = PC2, label = column),
    hjust = 1,
    inherit.aes = FALSE
  ) +
  xlim(-5, 5) +
  ylim(-4, 4) +
  xlab("PC1") +
  ylab("PC2") +
  coord_fixed() +
  guides(color = guide_legend(title = NULL)) +
  theme_minimal()

#chatgpt and life expectancy notes

PC1 reflects differences in body size. Penguins with longer flippers, heavier weight, and longer bills have higher PC1 scores, while smaller penguins have a lower PC1

PC2 separates penguins based on how deep their bills are

pca_fit2 |>
  # extract eigenvalues
  tidy(matrix = "eigenvalues") |>
  ggplot(aes(PC, percent)) + 
  geom_col(fill= "#2e0354") + 
  scale_x_continuous(
    # create one axis tick per PC
    breaks = 1:6
  ) +
  scale_y_continuous(
    name = "variance explained",
    breaks = seq(0, 1, by = 0.1), 
    # format y axis ticks as percent values
    label = scales::label_percent(accuracy = 1)
  )+
  xlab("Principal Component (PC)") +
  theme_minimal()

pca_fit2 |>
 tidy(matrix = "eigenvalues")
## # A tibble: 4 × 4
##      PC std.dev percent cumulative
##   <dbl>   <dbl>   <dbl>      <dbl>
## 1     1   1.66   0.686       0.686
## 2     2   0.882  0.195       0.881
## 3     3   0.607  0.0922      0.973
## 4     4   0.328  0.0270      1

How much variation is explained by PC1 and PC2?

PC1+PC2= 88% is explained by PC1 and PC2