1:

The dataset I chose was the World Happiness Report 2019, this dataset is a survery on the state of global happiness that ranks 156 countries by how happy their citizens perceive themselves to be. This dataset focuses on happiness and the community, how happiness has changed with our society, technology, social norms, conflicts and goverment policy.

2:

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
happiness <- read.csv("L:/R_Datasets/happiness2019.csv")

happiness_pca <- happiness |> select("GDP.per.capita", "Social.support", "Healthy.life.expectancy", 
                                "Freedom.to.make.life.choices", "Generosity", "Perceptions.of.corruption")
head(happiness_pca)
##   GDP.per.capita Social.support Healthy.life.expectancy
## 1          1.340          1.587                   0.986
## 2          1.383          1.573                   0.996
## 3          1.488          1.582                   1.028
## 4          1.380          1.624                   1.026
## 5          1.396          1.522                   0.999
## 6          1.452          1.526                   1.052
##   Freedom.to.make.life.choices Generosity Perceptions.of.corruption
## 1                        0.596      0.153                     0.393
## 2                        0.592      0.252                     0.410
## 3                        0.603      0.271                     0.341
## 4                        0.591      0.354                     0.118
## 5                        0.557      0.322                     0.298
## 6                        0.572      0.263                     0.343

3:

# Perform PCA on numeric variables
pca_fit <- happiness_pca |> 
 select(where(is.numeric)) |> # retain only numeric columns
 scale() |> 
# scale to zero mean and unit variance
 prcomp() 

# do PCA
pca_fit
## Standard deviations (1, .., p=6):
## [1] 1.7290436 1.1939862 0.7809297 0.7458369 0.5111981 0.3966832
## 
## Rotation (n x k) = (6 x 6):
##                                      PC1        PC2         PC3        PC4
## GDP.per.capita               -0.51459462 -0.2278181 -0.02380878  0.2404018
## Social.support               -0.49064918 -0.2202837  0.28141961 -0.0633134
## Healthy.life.expectancy      -0.51056655 -0.1922719  0.02808632  0.2748057
## Freedom.to.make.life.choices -0.38095770  0.3521218  0.11855036 -0.8104252
## Generosity                   -0.05948407  0.6935067  0.58081716  0.4189146
## Perceptions.of.corruption    -0.29173692  0.5076063 -0.75368730  0.1743609
##                                       PC5         PC6
## GDP.per.capita                0.183763758 -0.76887077
## Social.support               -0.771259058  0.18081020
## Healthy.life.expectancy       0.509355608  0.60547761
## Freedom.to.make.life.choices  0.240066817 -0.04905325
## Generosity                    0.005250993 -0.05142470
## Perceptions.of.corruption    -0.232996913  0.06701907

4:

library(broom)

pca_fit |>
 # add PCs to the original dataset
 augment(happiness_pca) |>
 ggplot(aes(.fittedPC1, .fittedPC2)) +
 geom_point(aes(color = Healthy.life.expectancy)) +
 
 # Add the PCA1 and PCA2 arrows
 geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0), 
 
arrow = arrow(type = "closed", length = unit(0.1, "inches")),
 
color = "black") + # PC1 arrow
 geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4), 
 
arrow = arrow(type = "closed", length = unit(0.1, "inches")),
 
color = "black") + # PC2 arrow
 
 # Add text labels- Positioning and color of the label
 geom_text(aes(x = 5, y = 0, label = "PC1"), 
 
vjust = -0.5, color = "black") + 
 
 geom_text(aes(x = 0, y = 4, label = "PC2"), 
 
hjust = -0.5, color = "black") + 
 xlim(-5, 5) +
 ylim(-4, 4) +
 xlab("PC1") +
 ylab("PC2") +
 guides(color = guide_legend(title = NULL)) + # No title for the legend
 theme_minimal()
## Warning in geom_segment(aes(x = -4.9, y = 0, xend = 5, yend = 0), arrow = arrow(type = "closed", : All aesthetics have length 1, but the data has 156 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## Warning in geom_segment(aes(x = 0, y = -2.5, xend = 0, yend = 4), arrow = arrow(type = "closed", : All aesthetics have length 1, but the data has 156 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## Warning in geom_text(aes(x = 5, y = 0, label = "PC1"), vjust = -0.5, color = "black"): All aesthetics have length 1, but the data has 156 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## Warning in geom_text(aes(x = 0, y = 4, label = "PC2"), hjust = -0.5, color = "black"): All aesthetics have length 1, but the data has 156 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

arrow_style <- arrow(
 angle = 20, length = grid::unit(8, "pt"),
 ends = "first", type = "closed"
)
pca_fit |>
 # extract rotation matrix
 tidy(matrix = "rotation") |>
 pivot_wider(
 
names_from = "PC", values_from = "value",
 
names_prefix = "PC"
 ) |>
 ggplot(aes(PC1, PC2)) +
 geom_segment(
 
xend = 0, yend = 0,
 
arrow = arrow_style
 ) +
 geom_text(aes(label = column), hjust = 1) +
 xlim(-1.5, 0.5) + ylim(-1, 1) + 
 coord_fixed()+
 theme_minimal()

5: PC1 represents overall happiness and development of a country. Countries which are richer, happier, and healthier have a higher PC1 value. PC1 is mainly influenced by happiness score, GDP, health, social support and happiness rank.

PC2 represents social and political factors, such as trust, generosity, sense of freedom and perception of corruption. Countries scoring high on PC2 scores are places where people feel free to make life choices and are more generous.

This analysis shows us that PC1 acts as a development axis, and can help us tell wealthy, healthy societies from developing ones. PC2 shows social culture and personal freedoms, independent of the economic status of a country.

6:

pca_fit |>
 # extract eigenvalues
 tidy(matrix = "eigenvalues") |>
 ggplot(aes(PC, percent)) + 
 geom_col(fill= "lightblue") + 
 scale_x_continuous(
 
# create one axis tick per PC
 
breaks = 1:6
 ) +
 scale_y_continuous(
 
name = "variance explained",
 
breaks = seq(0, 1, by = 0.1), 
 
# format y axis ticks as percent values
 
label = scales::label_percent(accuracy = 1)
 )+
 xlab("Principal Component (PC)") +
 theme_minimal()

Around 75% of the variation is explained by PC1 and PC2 meaning we have a good model.