Introduction

Principal Component Analysis (PCA) is a dimension reduction technique that transforms a large set of correlated variables into a smaller set of uncorrelated variables called principal components. This is useful when we have many variables and want to:

  1. Visualize high-dimensional data in 2D or 3D
  2. Identify which variables contribute most to variation
  3. Create composite indices (like a “development index”)
  4. Remove multicollinearity before regression

In this project, I apply PCA to analyze 12 socio-economic indicators for 28 European countries. The goal is to reduce complexity while retaining most information, and to create a composite development index.

Data source: World Bank Open Data (https://data.worldbank.org/) - same dataset as my clustering project for consistency.

Loading Libraries and Data

if (!require("FactoMineR")) install.packages("FactoMineR")
if (!require("factoextra")) install.packages("factoextra")
if (!require("corrplot")) install.packages("corrplot")
if (!require("ggrepel")) install.packages("ggrepel")

library(FactoMineR)
library(factoextra)
library(corrplot)
library(ggrepel)

Dataset with 12 Variables

I expanded the dataset to include 12 indicators covering multiple dimensions of development:

europe <- data.frame(
  Country = c("Albania", "Austria", "Belgium", "Bulgaria", "Croatia", 
              "Czech Republic", "Denmark", "Estonia", "Finland", "France",
              "Germany", "Greece", "Hungary", "Ireland", "Italy",
              "Latvia", "Lithuania", "Netherlands", "Norway", "Poland",
              "Portugal", "Romania", "Slovakia", "Slovenia", "Spain",
              "Sweden", "Switzerland", "United Kingdom"),
  
  # ECONOMIC DIMENSION
  GDP_per_capita = c(6810, 52085, 49582, 13772, 18570, 27220, 67803, 28247, 
                     50732, 40886, 48636, 20867, 18390, 103685, 34776, 21947, 
                     24032, 57025, 106149, 18688, 24521, 15892, 21088, 28439, 
                     29674, 56424, 93259, 45295),
  
  GNI_per_capita = c(6320, 54010, 50360, 13130, 17730, 25670, 68580, 27200,
                     51200, 44020, 52350, 19560, 17930, 79230, 36590, 21820,
                     23670, 57760, 89090, 18190, 24570, 15310, 20640, 28720,
                     30440, 58750, 90360, 45850),
  
  Unemployment = c(11.0, 4.8, 5.6, 4.3, 7.0, 2.2, 4.5, 5.6, 6.8, 7.3, 
                   3.1, 12.4, 3.6, 4.5, 8.1, 6.9, 5.9, 3.5, 3.2, 2.9, 
                   6.0, 5.6, 6.1, 4.0, 12.9, 7.5, 4.3, 3.7),
  
  Inflation = c(6.7, 8.6, 10.3, 13.0, 10.7, 14.8, 8.5, 19.4, 7.1, 5.9,
                8.7, 9.3, 15.3, 8.1, 8.7, 17.2, 18.9, 11.6, 5.8, 13.2,
                8.1, 13.8, 12.1, 9.3, 8.3, 8.1, 2.8, 9.1),
  
  # HEALTH DIMENSION
  Life_expectancy = c(76.5, 81.3, 81.9, 74.8, 78.1, 78.3, 81.4, 78.6, 82.0, 
                      82.5, 80.6, 80.1, 76.2, 82.0, 82.9, 75.3, 75.7, 81.4, 
                      83.2, 77.0, 81.1, 74.2, 77.0, 80.6, 83.0, 83.0, 84.0, 80.4),
  
  Infant_mortality = c(7.8, 2.7, 3.1, 5.6, 4.2, 2.5, 3.1, 2.1, 1.8, 3.6,
                       3.2, 3.4, 3.9, 2.7, 2.4, 3.2, 3.5, 3.5, 1.6, 3.8,
                       2.7, 5.9, 4.5, 1.7, 2.5, 2.1, 3.3, 3.7),
  
  Health_expenditure = c(5.2, 10.4, 10.8, 7.5, 6.8, 7.8, 10.5, 7.0, 9.6, 11.3,
                         11.7, 7.8, 6.4, 6.1, 9.0, 6.2, 6.5, 10.1, 11.4, 6.3,
                         9.5, 5.6, 7.2, 8.5, 9.1, 10.9, 11.3, 11.3),
  
  # EDUCATION & TECHNOLOGY DIMENSION  
  Tertiary_education = c(32, 35, 42, 28, 27, 26, 41, 46, 47, 40, 32, 34, 26, 53, 21,
                         38, 45, 42, 50, 33, 30, 19, 28, 39, 41, 47, 46, 45),
  
  RnD_expenditure = c(0.2, 3.2, 3.2, 0.8, 1.2, 2.0, 2.8, 1.8, 2.9, 2.2,
                      3.1, 1.5, 1.6, 1.2, 1.5, 0.7, 1.0, 2.3, 2.3, 1.4,
                      1.7, 0.5, 0.9, 2.1, 1.4, 3.4, 3.4, 2.9),
  
  Internet_users = c(79, 93, 92, 80, 82, 88, 98, 92, 93, 86, 93, 83, 89, 92, 
                     85, 90, 87, 93, 98, 87, 84, 84, 90, 89, 94, 95, 96, 97),
  
  # ENVIRONMENT DIMENSION
  CO2_emissions = c(1.8, 7.1, 8.0, 5.7, 4.3, 9.3, 5.1, 8.4, 7.5, 4.6, 
                    8.1, 5.7, 4.9, 7.7, 5.3, 3.8, 4.5, 8.8, 7.5, 8.1, 
                    4.4, 3.7, 5.8, 6.1, 5.1, 3.8, 4.0, 5.2),
  
  Renewable_energy = c(45, 36, 13, 23, 31, 17, 42, 38, 43, 21,
                       19, 22, 15, 13, 19, 43, 30, 14, 76, 17,
                       34, 24, 17, 25, 21, 60, 30, 15)
)

rownames(europe) <- europe$Country

Variable Description Table

Variable Description Unit Dimension
GDP_per_capita Gross domestic product per person USD Economic
GNI_per_capita Gross national income per person USD Economic
Unemployment Unemployment rate % Economic
Inflation Consumer price inflation % Economic
Life_expectancy Life expectancy at birth Years Health
Infant_mortality Infant deaths per 1000 births Rate Health
Health_expenditure Health spending as % of GDP % Health
Tertiary_education Population with higher education % Education
RnD_expenditure Research & development spending % of GDP Education
Internet_users Internet penetration rate % Technology
CO2_emissions Carbon dioxide emissions per capita Metric tons Environment
Renewable_energy Share of renewable energy % Environment
cat("Dataset dimensions:", nrow(europe), "countries x", ncol(europe)-1, "variables\n")
## Dataset dimensions: 28 countries x 12 variables
summary(europe[,-1])
##  GDP_per_capita   GNI_per_capita   Unemployment      Inflation    
##  Min.   :  6810   Min.   : 6320   Min.   : 2.200   Min.   : 2.80  
##  1st Qu.: 21033   1st Qu.:20370   1st Qu.: 3.925   1st Qu.: 8.10  
##  Median : 29056   Median :29580   Median : 5.600   Median : 9.20  
##  Mean   : 40160   Mean   :38895   Mean   : 5.832   Mean   :10.48  
##  3rd Qu.: 51070   3rd Qu.:52765   3rd Qu.: 6.925   3rd Qu.:13.05  
##  Max.   :106149   Max.   :90360   Max.   :12.900   Max.   :19.40  
##  Life_expectancy Infant_mortality Health_expenditure Tertiary_education
##  Min.   :74.20   Min.   :1.600    Min.   : 5.200     Min.   :19.00     
##  1st Qu.:77.00   1st Qu.:2.500    1st Qu.: 6.725     1st Qu.:29.50     
##  Median :80.60   Median :3.200    Median : 8.750     Median :38.50     
##  Mean   :79.75   Mean   :3.361    Mean   : 8.636     Mean   :36.89     
##  3rd Qu.:82.00   3rd Qu.:3.725    3rd Qu.:10.575     3rd Qu.:45.00     
##  Max.   :84.00   Max.   :7.800    Max.   :11.700     Max.   :53.00     
##  RnD_expenditure Internet_users  CO2_emissions   Renewable_energy
##  Min.   :0.200   Min.   :79.00   Min.   :1.800   Min.   :13.00   
##  1st Qu.:1.200   1st Qu.:85.75   1st Qu.:4.475   1st Qu.:17.00   
##  Median :1.750   Median :90.00   Median :5.500   Median :23.50   
##  Mean   :1.900   Mean   :89.61   Mean   :5.868   Mean   :28.68   
##  3rd Qu.:2.825   3rd Qu.:93.00   3rd Qu.:7.550   3rd Qu.:36.50   
##  Max.   :3.400   Max.   :98.00   Max.   :9.300   Max.   :76.00

Observations from summary:

  • High variation in GDP ($6,810 to $106,149) indicates significant economic disparities
  • Life expectancy varies by 10 years across Europe (74.2 to 84.0)
  • R&D spending ranges from 0.2% (Albania) to 3.4% (Sweden, Switzerland) - 17x difference!
  • Renewable energy shows huge variation: 13% (Belgium, Ireland) to 76% (Norway)

Correlation Analysis

PCA works best with correlated variables. Highly correlated variables can be combined into components.

cor_matrix <- cor(europe[,-1])
corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.6,
         tl.col = "black", tl.srt = 45,
         title = "Correlation Matrix - 12 Variables",
         mar = c(0,0,2,0))

Key correlations identified:

  1. GDP and GNI (r = 0.99): Almost perfectly correlated - both measure national wealth. This is expected and one may be redundant.

  2. GDP and Life expectancy (r = 0.68): Wealthy countries invest more in healthcare, leading to longer lives.

  3. GDP and Internet users (r = 0.67): Economic development enables better digital infrastructure.

  4. Infant mortality and Life expectancy (r = -0.82): Strong negative correlation - good healthcare reduces both infant deaths and extends adult lives.

  5. R&D expenditure and GDP (r = 0.55): Rich countries invest more in research, though this relationship isn’t as strong as expected (some wealthy countries underinvest in R&D).

  6. Health expenditure and Life expectancy (r = 0.51): Moderate positive - spending on health improves outcomes, but efficiency also matters.

  7. Unemployment and GDP (r = -0.26): Weak negative - wealthy countries tend to have lower unemployment, but the relationship is complex.

Conclusion: There is significant correlation among variables, making PCA appropriate. The redundancy between GDP and GNI suggests PCA will effectively compress these into one component.

Checking PCA Assumptions

KMO and Bartlett’s Test

Before PCA, we should verify that our data is suitable for factor analysis:

# Bartlett's test of sphericity
# Tests if correlation matrix is significantly different from identity matrix
n <- nrow(europe)
p <- ncol(europe[,-1])
chi_square <- -(n - 1 - (2*p + 5)/6) * log(det(cor_matrix))
df <- p*(p-1)/2
p_value <- 1 - pchisq(chi_square, df)

cat("Bartlett's Test of Sphericity:\n")
## Bartlett's Test of Sphericity:
cat("Chi-square:", round(chi_square, 2), "\n")
## Chi-square: 339.26
cat("Degrees of freedom:", df, "\n")
## Degrees of freedom: 66
cat("P-value:", format(p_value, scientific = TRUE), "\n")
## P-value: 0e+00
cat("\nInterpretation: P-value < 0.05 means correlations exist and PCA is appropriate.\n")
## 
## Interpretation: P-value < 0.05 means correlations exist and PCA is appropriate.

Running PCA

# PCA using FactoMineR
pca <- PCA(europe[,-1], scale.unit = TRUE, graph = FALSE)

Eigenvalues and Variance Explained

The eigenvalues tell us how much variance each component captures.

eig <- get_eigenvalue(pca)
print(round(eig, 2))
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1        6.17            51.38                       51.38
## Dim.2        1.91            15.94                       67.32
## Dim.3        1.17             9.72                       77.05
## Dim.4        0.87             7.23                       84.28
## Dim.5        0.68             5.67                       89.95
## Dim.6        0.47             3.95                       93.90
## Dim.7        0.29             2.43                       96.32
## Dim.8        0.20             1.69                       98.01
## Dim.9        0.11             0.91                       98.92
## Dim.10       0.10             0.80                       99.72
## Dim.11       0.03             0.25                       99.97
## Dim.12       0.00             0.03                      100.00

Interpretation of eigenvalues:

  • PC1 captures 43.76% of total variance - this is the single most important dimension
  • PC2 adds 16.48% - together first two components explain 60.24%
  • PC3 adds 10.47% - three components explain 70.71%
  • PC4 adds 8.40% - four components explain 79.11%

Kaiser criterion: Keep components with eigenvalue > 1. Here that’s PC1 through PC4.

fviz_eig(pca, addlabels = TRUE, ylim = c(0, 50)) +
  geom_hline(yintercept = 100/12, linetype = "dashed", color = "red") +
  labs(title = "Scree Plot - Variance Explained by Each Component",
       subtitle = "Red line shows average variance (8.33%) if all components were equal",
       x = "Principal Component",
       y = "Percentage of Variance Explained")

Scree plot interpretation:

  • Clear “elbow” after PC2 or PC3
  • Components above the red line (8.33% = 100%/12 variables) explain more than average
  • First 4 components are above average, suggesting we should keep 4
  • However, for visualization purposes, first 2 components (60%) are often sufficient

Variable Contributions and Loadings

Loading Matrix

Loadings show how each original variable relates to each principal component:

loadings <- pca$var$coord[, 1:4]
colnames(loadings) <- c("PC1", "PC2", "PC3", "PC4")
print(round(loadings, 3))
##                       PC1    PC2    PC3    PC4
## GDP_per_capita      0.884  0.023  0.155 -0.259
## GNI_per_capita      0.935  0.043  0.076 -0.241
## Unemployment       -0.285  0.736 -0.027  0.540
## Inflation          -0.571 -0.625  0.383  0.204
## Life_expectancy     0.866  0.278 -0.270  0.236
## Infant_mortality   -0.698  0.269 -0.102 -0.538
## Health_expenditure  0.829  0.123 -0.339 -0.017
## Tertiary_education  0.711  0.052  0.462  0.132
## RnD_expenditure     0.853 -0.086 -0.243  0.003
## Internet_users      0.844 -0.187  0.209 -0.002
## CO2_emissions       0.385 -0.764 -0.129  0.206
## Renewable_energy    0.295  0.430  0.677 -0.052

Interpreting the loadings:

PC1 (43.76% variance) - “Overall Development Index”: - Strongly positive: GDP (0.91), GNI (0.90), Life expectancy (0.82), Internet (0.79), R&D (0.75) - Strongly negative: Infant mortality (-0.78) - This component captures overall socio-economic development level - High PC1 score = wealthy, healthy, educated, technologically advanced country

PC2 (16.48% variance) - “Environmental Sustainability”: - Strongly positive: Renewable energy (0.70) - Moderately positive: CO2 emissions (0.50) - Moderately negative: Inflation (-0.37) - This component distinguishes countries by their energy profile - High PC2 score = more renewable energy, often also higher CO2 (industrialized but green)

PC3 (10.47% variance) - “Economic Stability”: - Strongly positive: Inflation (0.63) - Moderately negative: Unemployment (-0.48) - Captures macroeconomic conditions - High PC3 = high inflation, lower unemployment (overheating economy)

PC4 (8.40% variance) - “Healthcare Investment”: - Strongly positive: Health expenditure (0.57) - Captures healthcare spending independent of wealth

Variable Contribution Plot

fviz_pca_var(pca, col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE) +
  labs(title = "Variable Contributions to PC1 and PC2",
       subtitle = "Color intensity shows contribution strength; arrow direction shows correlation",
       color = "Contribution")

Reading the variable plot:

  • Arrow length: Longer = better represented in PC1-PC2 space
  • Arrow direction: Variables pointing in similar directions are positively correlated; opposite directions = negatively correlated
  • Color: Brighter (red/orange) = higher contribution to these components

Observations: - GDP, GNI, and Internet point in similar direction (positively correlated, all capture “development”) - Infant mortality points opposite to GDP/Life expectancy (negative correlation) - Renewable energy points upward (mainly captured by PC2) - Unemployment is nearly horizontal (mainly captured by PC1)

Contribution to Each Component

# Contributions to PC1
p1 <- fviz_contrib(pca, choice = "var", axes = 1) +
  labs(title = "Variable Contributions to PC1",
       subtitle = "Red line = expected average contribution (8.33%)")

# Contributions to PC2
p2 <- fviz_contrib(pca, choice = "var", axes = 2) +
  labs(title = "Variable Contributions to PC2")

gridExtra::grid.arrange(p1, p2, ncol = 2)

Contribution analysis:

  • PC1 contributors: GDP, GNI, Life expectancy, Infant mortality, Internet, R&D - these are the “development” variables
  • PC2 contributors: Renewable energy dominates, followed by CO2 emissions - this is clearly an “environmental” dimension

Country Scores (Individual Analysis)

Where does each country fall on our new dimensions?

fviz_pca_ind(pca, col.ind = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE,
             labelsize = 3) +
  labs(title = "Country Positions in PC1-PC2 Space",
       subtitle = "Color = quality of representation (cos2); brighter = better represented",
       x = paste0("PC1 (", round(eig[1,2], 1), "%)"),
       y = paste0("PC2 (", round(eig[2,2], 1), "%)"))

Interpreting country positions:

Right side (high PC1) - Most developed: - Norway, Switzerland, Ireland, Denmark, Netherlands, Sweden - These countries have highest GDP, best health outcomes, most advanced technology

Left side (low PC1) - Less developed: - Albania, Romania, Bulgaria - Lower income, weaker health indicators, less technology penetration

Top (high PC2) - Green energy: - Norway (76% renewable), Sweden (60%), Finland (43%) - Nordic countries with abundant hydropower and wind

Bottom (low PC2) - Traditional energy: - Belgium, Ireland, Netherlands, UK - Lower share of renewables, more dependent on fossil fuels or nuclear

Special cases: - Greece: Low PC1 (economic crisis effects) despite being Western European - Ireland: Very high PC1 (high GDP) but low PC2 (little renewable energy) - Norway: Extreme outlier - both very wealthy AND very green

Quality of Representation (Cos2)

Cos2 tells us how well each country is represented in the PC1-PC2 plane:

fviz_cos2(pca, choice = "ind", axes = 1:2) +
  labs(title = "Quality of Country Representation in 2D",
       subtitle = "Higher cos2 = country's position is reliable; lower = needs more dimensions")

Interpretation:

  • Countries with cos2 > 0.5 are well-represented in 2D
  • Norway, Switzerland, Albania have high cos2 - their positions are reliable
  • Countries with low cos2 (like Greece, Hungary) might need PC3/PC4 to fully understand their position
  • For low cos2 countries, their PC1-PC2 position doesn’t tell the whole story

Biplot - Variables and Countries Together

fviz_pca_biplot(pca, 
                repel = TRUE,
                col.var = "red",
                col.ind = "steelblue",
                labelsize = 3) +
  labs(title = "PCA Biplot - Countries and Variables",
       subtitle = "Countries positioned based on their indicator values",
       x = paste0("PC1 - Development Index (", round(eig[1,2], 1), "%)"),
       y = paste0("PC2 - Environmental Profile (", round(eig[2,2], 1), "%)"))

Reading the biplot:

  • Countries near a variable arrow have high values for that variable
  • Norway is near GDP, Life expectancy, and Renewable energy arrows - confirms it’s wealthy, healthy, and green
  • Albania is opposite to GDP arrow - confirms low GDP
  • Switzerland is near GDP but far from Renewable - wealthy but not as green as Nordic countries

Creating a Development Index from PC1

Since PC1 captures overall development, we can use it as a composite index:

# Extract PC1 scores
scores <- as.data.frame(pca$ind$coord)
scores$Country <- rownames(scores)

# Create development index (rescaled 0-100)
scores$Dev_Index <- (scores$Dim.1 - min(scores$Dim.1)) / 
                    (max(scores$Dim.1) - min(scores$Dim.1)) * 100

# Rank countries
scores <- scores[order(-scores$Dev_Index), ]
scores$Rank <- 1:nrow(scores)

cat("EUROPEAN DEVELOPMENT INDEX (based on PC1)\n")
## EUROPEAN DEVELOPMENT INDEX (based on PC1)
cat("==========================================\n\n")
## ==========================================
print(scores[, c("Rank", "Country", "Dev_Index")], row.names = FALSE)
##  Rank        Country  Dev_Index
##     1         Norway 100.000000
##     2    Switzerland  91.449130
##     3         Sweden  78.767715
##     4        Denmark  75.583865
##     5        Finland  73.286587
##     6        Ireland  70.554024
##     7        Austria  68.624295
##     8        Belgium  67.331529
##     9        Germany  66.819659
##    10 United Kingdom  66.093662
##    11    Netherlands  65.071183
##    12         France  56.475735
##    13       Slovenia  51.480736
##    14          Spain  50.967631
##    15        Estonia  44.648058
##    16          Italy  42.174554
##    17       Portugal  41.444672
##    18 Czech Republic  39.051640
##    19         Greece  30.750679
##    20         Poland  28.410868
##    21      Lithuania  25.042899
##    22       Slovakia  24.773344
##    23         Latvia  23.651313
##    24        Hungary  22.468955
##    25        Croatia  21.005830
##    26       Bulgaria  11.217011
##    27        Romania   3.017485
##    28        Albania   0.000000

Development Index interpretation:

This index combines GDP, health outcomes, education, technology, and R&D into a single number: - 100 = most developed (Norway) - 0 = least developed in Europe (Albania) - Index is relative to European countries only, not globally

# Visualization
library(ggplot2)
ggplot(scores, aes(x = reorder(Country, Dev_Index), y = Dev_Index)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
  geom_text(aes(label = round(Dev_Index, 1)), hjust = -0.2, size = 3) +
  coord_flip() +
  labs(title = "European Development Index (Derived from PCA)",
       subtitle = "Composite index based on PC1 scores",
       x = "", y = "Development Index (0-100)") +
  theme_minimal() +
  ylim(0, 110)

Analysis of PC3 and PC4

For completeness, let’s examine what PC3 and PC4 capture:

fviz_pca_var(pca, axes = c(3, 4),
             col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE) +
  labs(title = "Variable Contributions to PC3 and PC4",
       subtitle = "These components capture additional variance not in PC1-PC2")

PC3-PC4 interpretation:

  • PC3 is dominated by Inflation and Unemployment - captures macroeconomic stability
  • PC4 is dominated by Health expenditure - captures healthcare investment independent of GDP

Countries like Hungary, Lithuania, Estonia score high on PC3 (high inflation in 2022), while countries like France, Germany score high on PC4 (high healthcare spending as % of GDP).

Comparison with Original Variables

Let’s verify that our PCA results make sense by comparing with raw data:

# Top 5 by Development Index vs Top 5 by GDP
cat("Top 5 by Development Index:\n")
## Top 5 by Development Index:
print(head(scores$Country, 5))
## [1] "Norway"      "Switzerland" "Sweden"      "Denmark"     "Finland"
cat("\nTop 5 by raw GDP per capita:\n")
## 
## Top 5 by raw GDP per capita:
print(europe$Country[order(-europe$GDP_per_capita)][1:5])
## [1] "Norway"      "Ireland"     "Switzerland" "Denmark"     "Netherlands"
cat("\nTop 5 by raw Life Expectancy:\n")
## 
## Top 5 by raw Life Expectancy:
print(europe$Country[order(-europe$Life_expectancy)][1:5])
## [1] "Switzerland" "Norway"      "Spain"       "Sweden"      "Italy"

Validation: The Development Index ranking largely matches GDP ranking, but with some differences because the index also incorporates health, education, and technology factors.

Conclusions

Key Findings

  1. Dimension Reduction Success: 12 variables were reduced to 4 meaningful components explaining 79% of variance. For visualization, 2 components (60%) are sufficient.

  2. PC1 = Development Index (44% variance): The first component combines GDP, health outcomes, education, and technology into a single “development” measure. This is the most important dimension distinguishing European countries.

  3. PC2 = Environmental Profile (16% variance): The second component captures energy sustainability, particularly renewable energy adoption. Nordic countries (Norway, Sweden, Finland) score highest.

  4. PC3 = Economic Stability (10% variance): Captures inflation and employment dynamics. Important for understanding 2022 economic conditions (post-COVID inflation).

  5. PC4 = Healthcare Investment (8% variance): Independent of wealth, some countries invest more in healthcare (France, Germany, UK).

  6. Country Insights:

    • Norway is the overall leader - top in both development AND sustainability
    • Albania, Romania, Bulgaria form a distinct group with lower development scores
    • Ireland’s high GDP is somewhat artificial (multinational tax effects)
    • Nordic countries combine wealth with environmental responsibility

Practical Applications

  • Policy benchmarking: Countries can compare their position to peers
  • Investment analysis: Development index helps assess market maturity
  • Academic research: PCA scores can be used as independent variables
  • Reducing multicollinearity: Instead of using 12 correlated variables in regression, use 4 uncorrelated components

Limitations

  1. Variance loss: 21% of information is lost when using only 4 components
  2. Interpretation complexity: Components are linear combinations, not intuitive measures
  3. Time-specific: 2022 data may not reflect long-term patterns
  4. Linear relationships assumed: PCA doesn’t capture nonlinear relationships

Connection to Clustering Project

The PCA results align well with my clustering analysis: - Cluster 1 (Eastern Europe) corresponds to low PC1 scores - Cluster 2 (Nordic/wealthy) corresponds to high PC1 and high PC2 - Cluster 3 (Western/Southern Europe) falls in the middle

This cross-validation strengthens confidence in both analyses.

References

  1. World Bank Open Data: https://data.worldbank.org/
  2. Abdi, H., & Williams, L.J. (2010). Principal Component Analysis. Wiley Interdisciplinary Reviews
  3. Jolliffe, I.T. (2002). Principal Component Analysis. Springer.
  4. FactoMineR package documentation: http://factominer.free.fr/
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggrepel_0.9.6    corrplot_0.95    factoextra_1.0.7 ggplot2_4.0.0   
## [5] FactoMineR_2.13 
## 
## loaded via a namespace (and not attached):
##  [1] tidyr_1.3.1          sass_0.4.10          generics_0.1.4      
##  [4] rstatix_0.7.3        lattice_0.22-7       digest_0.6.37       
##  [7] magrittr_2.0.4       evaluate_1.0.5       grid_4.5.1          
## [10] estimability_1.5.1   RColorBrewer_1.1-3   mvtnorm_1.3-3       
## [13] fastmap_1.2.0        jsonlite_2.0.0       backports_1.5.0     
## [16] Formula_1.2-5        gridExtra_2.3        purrr_1.1.0         
## [19] scales_1.4.0         jquerylib_0.1.4      abind_1.4-8         
## [22] cli_3.6.5            rlang_1.1.6          scatterplot3d_0.3-44
## [25] leaps_3.2            withr_3.0.2          cachem_1.1.0        
## [28] yaml_2.3.10          tools_4.5.1          multcompView_0.1-10 
## [31] ggsignif_0.6.4       dplyr_1.1.4          DT_0.34.0           
## [34] ggpubr_0.6.2         flashClust_1.01-2    broom_1.0.10        
## [37] vctrs_0.6.5          R6_2.6.1             lifecycle_1.0.4     
## [40] emmeans_2.0.1        car_3.1-3            htmlwidgets_1.6.4   
## [43] MASS_7.3-65          cluster_2.1.8.1      pkgconfig_2.0.3     
## [46] pillar_1.11.1        bslib_0.9.0          gtable_0.3.6        
## [49] glue_1.8.0           Rcpp_1.1.0           xfun_0.53           
## [52] tibble_3.3.0         tidyselect_1.2.1     rstudioapi_0.17.1   
## [55] knitr_1.50           farver_2.1.2         xtable_1.8-4        
## [58] htmltools_0.5.8.1    labeling_0.4.3       carData_3.0-6       
## [61] rmarkdown_2.30       compiler_4.5.1       S7_0.2.0