Project 3

PCA Test

Introduction

The happiness2019 data set is a ranking of 156 countries based on how happy their citizens precieve themselves to be. The data set is sourced from The World Happiness Report from 2019, and has a 9 variables. Preforming a PCA test on this data set will allow us to determine the variables that contribute the most to variance in happiness scores.

Data Preparation

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load in data set
setwd("~/Data 110 101/csv")
happiness19 <- read.csv("happiness2019.csv")

# Clean variable names
names(happiness19) <- tolower(names(happiness19))

head(happiness19)
##   overall.rank country.or.region score gdp.per.capita social.support
## 1            1           Finland 7.769          1.340          1.587
## 2            2           Denmark 7.600          1.383          1.573
## 3            3            Norway 7.554          1.488          1.582
## 4            4           Iceland 7.494          1.380          1.624
## 5            5       Netherlands 7.488          1.396          1.522
## 6            6       Switzerland 7.480          1.452          1.526
##   healthy.life.expectancy freedom.to.make.life.choices generosity
## 1                   0.986                        0.596      0.153
## 2                   0.996                        0.592      0.252
## 3                   1.028                        0.603      0.271
## 4                   1.026                        0.591      0.354
## 5                   0.999                        0.557      0.322
## 6                   1.052                        0.572      0.263
##   perceptions.of.corruption
## 1                     0.393
## 2                     0.410
## 3                     0.341
## 4                     0.118
## 5                     0.298
## 6                     0.343
# Turn overall rank into a categorical variable; only keep numeric columns + identifiers
happiness_clean <- 
  happiness19 |>
  mutate(ranking = case_when(
    overall.rank %in% 1:52 ~ "high", 
    overall.rank %in% 52:104 ~ "middle",
    overall.rank %in% 104:156 ~ "low"
  )) |>
  # Since I want to keep most of the variables I'm just removing the ones I don't want
  select(-overall.rank, -country.or.region) |>
  drop_na()

Principal Component Analysis (PCA) Setup

# Perform PCA on numeric variables
pca_fit <- happiness_clean |>
  select(where(is.numeric)) |>
  scale() |>
  prcomp()

# View result
pca_fit
## Standard deviations (1, .., p=7):
## [1] 1.9525737 1.1946293 0.7828699 0.7458601 0.5119599 0.4150977 0.3958726
## 
## Rotation (n x k) = (7 x 7):
##                                      PC1         PC2           PC3          PC4
## score                        -0.47586069  0.02837147  0.0715050421  0.007975002
## gdp.per.capita               -0.45482480  0.21337704 -0.0495984407 -0.242907697
## social.support               -0.43658226  0.20714812  0.2586453423  0.059169155
## healthy.life.expectancy      -0.45015043  0.17785645  0.0008731876 -0.277780251
## freedom.to.make.life.choices -0.33220068 -0.36212980  0.1063586174  0.807843752
## generosity                   -0.04823293 -0.69380874  0.5770086303 -0.422495145
## perceptions.of.corruption    -0.24651130 -0.51634628 -0.7624157220 -0.170750456
##                                      PC5         PC6         PC7
## score                         0.08099658  0.85414928 -0.17732359
## gdp.per.capita               -0.20442963 -0.06863729  0.79977368
## social.support                0.74155642 -0.36575462 -0.11137694
## healthy.life.expectancy      -0.53252340 -0.31810206 -0.55117960
## freedom.to.make.life.choices -0.25689731 -0.14826167  0.08126056
## generosity                   -0.01038139 -0.03484952  0.05949494
## perceptions.of.corruption     0.22816001 -0.08692841 -0.05071198

Visualize PCA Results

Scatter Plots

library(broom)

# Plot fitted data; Code from life expenctancy activity)
pca_fit |>
 augment(happiness_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2, color = ranking)) +
  geom_point(aes(color = ranking)) +
  xlab("PC1") +
  ylab("PC2") +
  theme_minimal() +
  # Added 2d density to see where the points are the most dense
  geom_density_2d(bins = 5, alpha = 0.5)

# Plot fitted data with axes (Code from life expenctancy activity)

pca_fit |>
  # Add PCs to the original dataset
  augment(happiness_clean) |>
  ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = ranking)) +  # Use status for coloring

  # Add the PCA1 and PCA2 arrows
  geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0), 
               arrow = arrow(type = "closed", length = unit(0.1, "inches")),
               color = "black") +  # PC1 arrow
  geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4), 
               arrow = arrow(type = "closed", length = unit(0.1, "inches")),
               color = "black") +  # PC2 arrow

  # Add text labels
  geom_text(aes(x = 5, y = 0, label = "PC1"), 
            vjust = -0.5, color = "black") +  

  geom_text(aes(x = 0, y = 4, label = "PC2"), 
            hjust = -0.5, color = "black") +  

  # Customize the plot
  xlab("PC1") +
  ylab("PC2") +
  theme_minimal()

Rotation Arrows

#Rotation Matrix

arrow_style <- arrow(
  angle = 20, length = grid::unit(8, "pt"),
  ends = "first", type = "closed"
)
pca_fit |>
  # extract rotation matrix
  tidy(matrix = "rotation") |>
  pivot_wider(
    names_from = "PC", values_from = "value",
    names_prefix = "PC"
  ) |>
  ggplot(aes(PC1, PC2)) +
  geom_segment(
    xend = 0, yend = 0,
    arrow = arrow_style
  ) +
  geom_text(aes(label = column), hjust = 1) +
  xlim(-1, 0.5) + ylim(-1, 0.7) + 
  coord_fixed()+
  theme_minimal()

PCA Analysis Summary

  • PC1 shows a variation in happiness scores across the x-axis. High ranking scores have a negative PC1 while low ranking scores have are positive. Mid level rankings are mostly centered around 0. The rotation arrows show that higher happiness scores are associated with higher levels of all indicators, as all the indicators point toward negative PC1 values.

  • Positive PC2 values are associated with high life expectancy, gdp, and social support. Negative PC2 values are aassociated with high freedom, perceptions of curruption and generosity. There are dense clusters of high and low ranking points in both positive and negative PC2 quaderants, however the majority of middle ranking points have a positive PC2 value.

  • Both PC1 and PC2 indicate that countries that have high happiness levels generally have higher values of the indicators.

Variance Explained Plot

# Plot the variance explained by each PC- bar-graph (Code from life expectancy activity)

pca_fit |>
  # extract eigenvalues
  tidy(matrix = "eigenvalues") |>
  ggplot(aes(PC, percent)) + 
  geom_col(fill= "lightpink") + 
  scale_x_continuous(
    # create one axis tick per PC
    breaks = 1:9
  ) +
  scale_y_continuous(
    name = "variance explained",
    breaks = seq(0, 1, by = 0.1), 
    # format y axis ticks as percent values
    label = scales::label_percent(accuracy = 1)
  )+
  xlab("Principal Component (PC)") +
  theme_minimal()

pca_fit |>
  tidy(matrix = "eigenvalues")
## # A tibble: 7 × 4
##      PC std.dev percent cumulative
##   <dbl>   <dbl>   <dbl>      <dbl>
## 1     1   1.95   0.545       0.545
## 2     2   1.19   0.204       0.749
## 3     3   0.783  0.0876      0.836
## 4     4   0.746  0.0795      0.916
## 5     5   0.512  0.0374      0.953
## 6     6   0.415  0.0246      0.978
## 7     7   0.396  0.0224      1
  • PC1 and PC2 explain about 75% of the variation in the dataset.