Introduction

The goal of this project is to identify natural groupings among European countries based on their socio-economic characteristics. Clustering analysis helps us understand which countries share similar development patterns and can inform policy decisions, investment strategies, and economic cooperation.

I will apply three different clustering methods (K-means, Hierarchical, PAM) and compare their results to ensure robustness of findings.

Research questions:

  • How many distinct groups of European countries exist based on economic and social indicators?
  • Which countries are most similar to each other?
  • What characteristics define each cluster?

Data source: World Bank Open Data (https://data.worldbank.org/) - official international statistics, 2022 values.

Loading Libraries and Data

# install packages if needed
if (!require("cluster")) install.packages("cluster")
if (!require("factoextra")) install.packages("factoextra")
if (!require("corrplot")) install.packages("corrplot")
if (!require("NbClust")) install.packages("NbClust")
if (!require("dendextend")) install.packages("dendextend")

library(cluster)
library(factoextra)
library(corrplot)
library(NbClust)
library(dendextend)

Dataset Creation

I collected 10 variables for 28 European countries covering different aspects of development:

# Build the analysis data set: one row per country (28 rows), with the country
# name followed by 10 numeric indicators. Each vector below lists its values
# in the same order as `Country`, so position i always refers to country i.
europe <- data.frame(
  Country = c("Albania", "Austria", "Belgium", "Bulgaria", "Croatia", 
              "Czech Republic", "Denmark", "Estonia", "Finland", "France",
              "Germany", "Greece", "Hungary", "Ireland", "Italy",
              "Latvia", "Lithuania", "Netherlands", "Norway", "Poland",
              "Portugal", "Romania", "Slovakia", "Slovenia", "Spain",
              "Sweden", "Switzerland", "United Kingdom"),
  
  # Economic indicators: GDP per person (USD)
  GDP_per_capita = c(6810, 52085, 49582, 13772, 18570, 27220, 67803, 28247, 
                     50732, 40886, 48636, 20867, 18390, 103685, 34776, 21947, 
                     24032, 57025, 106149, 18688, 24521, 15892, 21088, 28439, 
                     29674, 56424, 93259, 45295),
  
  # Health indicators: life expectancy at birth (years) and
  # infant deaths per 1000 live births
  Life_expectancy = c(76.5, 81.3, 81.9, 74.8, 78.1, 78.3, 81.4, 78.6, 82.0, 
                      82.5, 80.6, 80.1, 76.2, 82.0, 82.9, 75.3, 75.7, 81.4, 
                      83.2, 77.0, 81.1, 74.2, 77.0, 80.6, 83.0, 83.0, 84.0, 80.4),
  
  Infant_mortality = c(7.8, 2.7, 3.1, 5.6, 4.2, 2.5, 3.1, 2.1, 1.8, 3.6,
                       3.2, 3.4, 3.9, 2.7, 2.4, 3.2, 3.5, 3.5, 1.6, 3.8,
                       2.7, 5.9, 4.5, 1.7, 2.5, 2.1, 3.3, 3.7),
  
  # Labor market: unemployment rate (%)
  Unemployment = c(11.0, 4.8, 5.6, 4.3, 7.0, 2.2, 4.5, 5.6, 6.8, 7.3, 
                   3.1, 12.4, 3.6, 4.5, 8.1, 6.9, 5.9, 3.5, 3.2, 2.9, 
                   6.0, 5.6, 6.1, 4.0, 12.9, 7.5, 4.3, 3.7),
  
  # Education: share of population with higher education (%)
  Tertiary_education = c(32, 35, 42, 28, 27, 26, 41, 46, 47, 40, 32, 34, 26, 53, 21,
                         38, 45, 42, 50, 33, 30, 19, 28, 39, 41, 47, 46, 45),
  
  # Technology: internet penetration (%)
  Internet_users = c(79, 93, 92, 80, 82, 88, 98, 92, 93, 86, 93, 83, 89, 92, 
                     85, 90, 87, 93, 98, 87, 84, 84, 90, 89, 94, 95, 96, 97),
  
  # Environment: carbon emissions per capita (metric tons)
  CO2_emissions = c(1.8, 7.1, 8.0, 5.7, 4.3, 9.3, 5.1, 8.4, 7.5, 4.6, 
                    8.1, 5.7, 4.9, 7.7, 5.3, 3.8, 4.5, 8.8, 7.5, 8.1, 
                    4.4, 3.7, 5.8, 6.1, 5.1, 3.8, 4.0, 5.2),
  
  # Demographics: share of people living in cities (%)
  Urban_population = c(63, 59, 98, 76, 58, 74, 88, 69, 86, 81,
                       78, 80, 72, 64, 71, 68, 68, 93, 83, 60,
                       67, 54, 54, 55, 81, 88, 74, 84),
  
  # Trade openness: imports + exports as % of GDP
  Trade_percent_GDP = c(76, 114, 176, 123, 107, 152, 117, 149, 81, 65,
                        91, 77, 167, 242, 63, 126, 153, 166, 76, 120,
                        88, 92, 187, 169, 70, 90, 119, 63),
  
  # Government: government spending as % of GDP
  Govt_expenditure = c(29, 50, 52, 36, 47, 44, 49, 40, 54, 58,
                       49, 50, 46, 25, 57, 40, 36, 44, 50, 42,
                       46, 35, 43, 45, 47, 49, 32, 44)
)

# Use the country names as row names so that objects derived from this data
# frame (scaled matrix, distance matrix, dendrogram) are labelled by country.
rownames(europe) <- europe$Country

Variable Description

Variable Description Unit
GDP_per_capita Gross Domestic Product per person USD
Life_expectancy Average life expectancy at birth Years
Infant_mortality Deaths per 1000 live births Rate
Unemployment Unemployment rate %
Tertiary_education Population with higher education %
Internet_users Internet penetration %
CO2_emissions Carbon emissions per capita Metric tons
Urban_population People living in cities %
Trade_percent_GDP Imports+Exports as % of GDP %
Govt_expenditure Government spending as % of GDP %
# Data overview
str(europe)
## 'data.frame':    28 obs. of  11 variables:
##  $ Country           : chr  "Albania" "Austria" "Belgium" "Bulgaria" ...
##  $ GDP_per_capita    : num  6810 52085 49582 13772 18570 ...
##  $ Life_expectancy   : num  76.5 81.3 81.9 74.8 78.1 78.3 81.4 78.6 82 82.5 ...
##  $ Infant_mortality  : num  7.8 2.7 3.1 5.6 4.2 2.5 3.1 2.1 1.8 3.6 ...
##  $ Unemployment      : num  11 4.8 5.6 4.3 7 2.2 4.5 5.6 6.8 7.3 ...
##  $ Tertiary_education: num  32 35 42 28 27 26 41 46 47 40 ...
##  $ Internet_users    : num  79 93 92 80 82 88 98 92 93 86 ...
##  $ CO2_emissions     : num  1.8 7.1 8 5.7 4.3 9.3 5.1 8.4 7.5 4.6 ...
##  $ Urban_population  : num  63 59 98 76 58 74 88 69 86 81 ...
##  $ Trade_percent_GDP : num  76 114 176 123 107 152 117 149 81 65 ...
##  $ Govt_expenditure  : num  29 50 52 36 47 44 49 40 54 58 ...
summary(europe[,-1])
##  GDP_per_capita   Life_expectancy Infant_mortality  Unemployment   
##  Min.   :  6810   Min.   :74.20   Min.   :1.600    Min.   : 2.200  
##  1st Qu.: 21033   1st Qu.:77.00   1st Qu.:2.500    1st Qu.: 3.925  
##  Median : 29056   Median :80.60   Median :3.200    Median : 5.600  
##  Mean   : 40160   Mean   :79.75   Mean   :3.361    Mean   : 5.832  
##  3rd Qu.: 51070   3rd Qu.:82.00   3rd Qu.:3.725    3rd Qu.: 6.925  
##  Max.   :106149   Max.   :84.00   Max.   :7.800    Max.   :12.900  
##  Tertiary_education Internet_users  CO2_emissions   Urban_population
##  Min.   :19.00      Min.   :79.00   Min.   :1.800   Min.   :54.00   
##  1st Qu.:29.50      1st Qu.:85.75   1st Qu.:4.475   1st Qu.:63.75   
##  Median :38.50      Median :90.00   Median :5.500   Median :73.00   
##  Mean   :36.89      Mean   :89.61   Mean   :5.868   Mean   :73.07   
##  3rd Qu.:45.00      3rd Qu.:93.00   3rd Qu.:7.550   3rd Qu.:81.50   
##  Max.   :53.00      Max.   :98.00   Max.   :9.300   Max.   :98.00   
##  Trade_percent_GDP Govt_expenditure
##  Min.   : 63.0     Min.   :25.00   
##  1st Qu.: 80.0     1st Qu.:40.00   
##  Median :115.5     Median :45.50   
##  Mean   :118.5     Mean   :44.25   
##  3rd Qu.:152.2     3rd Qu.:49.25   
##  Max.   :242.0     Max.   :58.00

Observations from summary statistics:

  • GDP per capita varies enormously - from $6,810 (Albania) to $106,149 (Norway). This 15x difference shows huge economic disparities in Europe.
  • Life expectancy ranges from 74.2 (Romania) to 84.0 (Switzerland) - a 10-year gap that reflects healthcare quality differences.
  • Unemployment shows high variation (2.2% to 12.9%), indicating different labor market conditions.

Exploratory Data Analysis

Correlation Analysis

Before clustering, we need to understand relationships between variables:

# Pairwise correlations between every column except the first (Country).
cor_matrix <- cor(europe[, -1])

# Upper-triangle colour map with the coefficient printed inside each cell.
corrplot(
  cor_matrix,
  title = "Correlation Matrix of Variables",
  mar = c(0, 0, 2, 0),
  method = "color",
  type = "upper",
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.srt = 45
)

Key findings from correlation analysis:

  1. GDP and Life expectancy (r = 0.68): Strong positive correlation. Wealthier countries can invest more in healthcare, nutrition, and living conditions, leading to longer lives.

  2. GDP and Internet users (r = 0.67): Rich countries have better digital infrastructure and higher technology adoption rates.

  3. Infant mortality and Life expectancy (r = -0.82): Strong negative correlation, as expected. Countries with good healthcare have both low infant deaths and high life expectancy.

  4. Infant mortality and GDP (r = -0.65): Wealth allows investment in maternal and child healthcare.

  5. CO2 emissions and GDP (r = 0.40): Moderate positive correlation - industrialized economies produce more emissions, though this relationship is weakening as countries adopt green technologies.

These correlations suggest our variables capture related but distinct aspects of development. Some multicollinearity exists, which is fine for clustering but important to note.

Distribution of Key Variables

# Draw a 2 x 3 grid of histograms, one panel per key indicator.
# par() returns the settings it replaces, so they are saved here and restored
# afterwards instead of forcing a 1 x 1 layout on whatever was active before.
old_par <- par(mfrow = c(2, 3))

# One entry per panel: source column, plot title, x-axis label, fill colour.
hist_specs <- list(
  list(variable = "GDP_per_capita", main = "GDP per Capita Distribution",
       xlab = "USD", fill = "steelblue"),
  list(variable = "Life_expectancy", main = "Life Expectancy Distribution",
       xlab = "Years", fill = "darkgreen"),
  list(variable = "Unemployment", main = "Unemployment Distribution",
       xlab = "%", fill = "coral"),
  list(variable = "Tertiary_education", main = "Tertiary Education Distribution",
       xlab = "%", fill = "purple"),
  list(variable = "Internet_users", main = "Internet Users Distribution",
       xlab = "%", fill = "orange"),
  list(variable = "Trade_percent_GDP", main = "Trade Openness Distribution",
       xlab = "% of GDP", fill = "darkred")
)

for (spec in hist_specs) {
  hist(europe[[spec[["variable"]]]],
       main = spec[["main"]],
       xlab = spec[["xlab"]],
       col = spec[["fill"]],
       breaks = 10)
}

# Restore the graphics layout that was in effect before this section.
par(old_par)

Distribution insights:

  • GDP: Right-skewed distribution with most countries in $15,000-50,000 range, but a few wealthy outliers (Norway, Ireland, Switzerland).
  • Life expectancy: Roughly normal, centered around 80 years. Most European countries have achieved relatively high life expectancy.
  • Unemployment: Most countries have low unemployment (2-7%), but some outliers like Spain and Greece have persistent high unemployment.
  • Internet users: Left-skewed - most European countries have 85%+ internet penetration.

Data Standardization

Since variables have different scales (GDP in thousands vs percentages), we must standardize before clustering:

# Standardize every indicator (all columns except the first, Country) so each
# has mean 0 and standard deviation 1; the checks that follow confirm this.
data_scaled <- scale(europe[,-1])

# Check standardization worked
cat("Means after scaling (should be ~0):\n")
## Means after scaling (should be ~0):
round(colMeans(data_scaled), 10)
##     GDP_per_capita    Life_expectancy   Infant_mortality       Unemployment 
##                  0                  0                  0                  0 
## Tertiary_education     Internet_users      CO2_emissions   Urban_population 
##                  0                  0                  0                  0 
##  Trade_percent_GDP   Govt_expenditure 
##                  0                  0
cat("\nStandard deviations after scaling (should be 1):\n")
## 
## Standard deviations after scaling (should be 1):
round(apply(data_scaled, 2, sd), 10)
##     GDP_per_capita    Life_expectancy   Infant_mortality       Unemployment 
##                  1                  1                  1                  1 
## Tertiary_education     Internet_users      CO2_emissions   Urban_population 
##                  1                  1                  1                  1 
##  Trade_percent_GDP   Govt_expenditure 
##                  1                  1

After standardization, all variables have mean = 0 and standard deviation = 1. This ensures no single variable dominates the clustering just because of its scale.

Determining Optimal Number of Clusters

Choosing the right number of clusters is crucial. I will use multiple methods and compare their recommendations.

Elbow Method

The elbow method looks at total within-cluster sum of squares (WSS). We want to find where adding more clusters stops providing significant improvement.

# Elbow plot: total within-cluster sum of squares for k = 1..10.
# k-means starts from random centres, so fix the seed and use the same number
# of random restarts (nstart = 25) as the gap statistic and the final model;
# extra arguments to fviz_nbclust() are forwarded to kmeans(). Without this the
# curve can change from run to run and reflect poor local optima.
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "wss", k.max = 10, nstart = 25) +
  geom_vline(xintercept = 3, linetype = 2, color = "red") +
  labs(title = "Elbow Method for Optimal k",
       subtitle = "Looking for the 'elbow' where improvement slows down")

Interpretation: The curve shows diminishing returns after k=3. The WSS drops sharply from k=1 to k=2 to k=3, but then flattens. This suggests 3 clusters capture most of the structure in the data.

Silhouette Method

Silhouette analysis measures how similar objects are to their own cluster compared to other clusters. Higher values indicate better clustering.

# Average silhouette width for k = 2..10.
# Seed and restarts match the gap statistic and the final k-means model so the
# comparison across k is reproducible; nstart is forwarded to kmeans().
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "silhouette", k.max = 10,
             nstart = 25) +
  labs(title = "Silhouette Method for Optimal k",
       subtitle = "Higher silhouette = better cluster separation")

Interpretation: The silhouette method suggests k=2 has highest average silhouette width. However, k=3 is very close and provides more granular insights. The difference between k=2 and k=3 silhouette scores is small (~0.02).

Gap Statistic

Gap statistic compares the observed clustering to what we would expect from random uniform data.

# Gap statistic for k = 1..10 using 50 reference data sets; the seed is set
# directly before the call because both k-means and the reference sampling
# draw random numbers.
set.seed(123)
gap_stat <- clusGap(
  data_scaled,
  FUNcluster = kmeans,
  K.max = 10,
  B = 50,
  nstart = 25
)

gap_plot <- fviz_gap_stat(gap_stat)
gap_plot +
  labs(title = "Gap Statistic Method",
       subtitle = "Comparing observed clustering to random expectation")

Interpretation: Gap statistic shows the largest gap at k=3, confirming our choice. The gap value indicates how much better our clustering is compared to random data.

Decision on Number of Clusters

Method Suggested k
Elbow 3
Silhouette 2-3
Gap Statistic 3

Final decision: k = 3 clusters

Three clusters provide a good balance between parsimony (not too many groups) and interpretability (meaningful distinctions between groups).

Clustering Analysis

K-Means Clustering

K-means is the most popular clustering algorithm. It partitions data into k clusters by minimizing within-cluster variance.

# Fit the final 3-cluster k-means model on the standardized indicators.
# The seed is set immediately before the call so the 25 random starts
# (nstart) are reproducible.
set.seed(123)
km <- kmeans(data_scaled, centers = 3, nstart = 25)

# Cluster sizes
cat("Cluster sizes:\n")
## Cluster sizes:
table(km$cluster)
## 
##  1  2  3 
##  9  5 14

Cluster distribution: We have clusters of 9, 5, and 14 countries. This is a reasonable distribution - no cluster is too small to be meaningful.

# Visualization: scatter of the k-means partition with a convex hull drawn
# around each cluster, one colour per cluster.
kmeans_palette <- c("#E74C3C", "#3498DB", "#2ECC71")

kmeans_plot <- fviz_cluster(
  km,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = kmeans_palette,
  ggtheme = theme_minimal()
)

kmeans_plot +
  labs(title = "K-Means Clustering Results (k=3)",
       subtitle = "Countries grouped by socio-economic similarity")

Visualization interpretation:

  • Dim1 (41.9%) and Dim2 (17.9%) are the first two principal components, together explaining ~60% of variance.
  • The three clusters are reasonably well-separated, though some overlap exists (which is normal for real-world data).
  • Countries at cluster boundaries (like Greece) share characteristics with multiple groups.

Which Countries in Each Cluster?

# Store the k-means assignment on the data frame as a factor.
europe$cluster <- as.factor(km$cluster)

# Group the country names by cluster once, then print each group in turn.
members_by_cluster <- split(europe$Country, europe$cluster)

for (cluster_id in seq_len(3)) {
  members <- members_by_cluster[[as.character(cluster_id)]]
  cat("\n========== CLUSTER", cluster_id, "==========\n")
  cat("Countries:", paste(members, collapse = ", "), "\n")
  cat("Number of countries:", length(members), "\n")
}
## 
## ========== CLUSTER 1 ==========
## Countries: Albania, Bulgaria, Croatia, Hungary, Latvia, Lithuania, Poland, Romania, Slovakia 
## Number of countries: 9 
## 
## ========== CLUSTER 2 ==========
## Countries: France, Greece, Italy, Portugal, Spain 
## Number of countries: 5 
## 
## ========== CLUSTER 3 ==========
## Countries: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, Germany, Ireland, Netherlands, Norway, Slovenia, Sweden, Switzerland, United Kingdom 
## Number of countries: 14

Cluster Profiles - Detailed Analysis

# Columns 2..11 of `europe` hold the ten numeric indicators.
indicator_cols <- 2:11

# Per-cluster mean of every indicator, rounded to one decimal place.
cluster_profiles <- aggregate(
  europe[, indicator_cols],
  by = list(Cluster = europe$cluster),
  FUN = mean
)
cluster_profiles[-1] <- round(cluster_profiles[-1], 1)

# Means over all countries, for comparison against the cluster means.
overall_means <- round(colMeans(europe[, indicator_cols]), 1)

print(cluster_profiles)
##   Cluster GDP_per_capita Life_expectancy Infant_mortality Unemployment
## 1       1        17687.7            76.1              4.7          5.9
## 2       2        30144.8            81.9              2.9          9.3
## 3       3        58184.4            81.3              2.6          4.5
##   Tertiary_education Internet_users CO2_emissions Urban_population
## 1               30.7           85.3           4.7             63.7
## 2               33.2           86.4           5.0             76.0
## 3               42.2           93.5           6.9             78.1
##   Trade_percent_GDP Govt_expenditure
## 1             127.9             39.3
## 2              72.6             51.6
## 3             128.9             44.8
cat("\nOverall European means for comparison:\n")
## 
## Overall European means for comparison:
print(overall_means)
##     GDP_per_capita    Life_expectancy   Infant_mortality       Unemployment 
##            40160.5               79.8                3.4                5.8 
## Tertiary_education     Internet_users      CO2_emissions   Urban_population 
##               36.9               89.6                5.9               73.1 
##  Trade_percent_GDP   Govt_expenditure 
##              118.5               44.2

Detailed cluster interpretation:

# Visual comparison of cluster profiles

# Cluster means in the original units.
cluster_data <- aggregate(europe[,2:11], by = list(Cluster = europe$cluster), FUN = mean)

# Cluster means of the country-level z-scores. `data_scaled` is standardized
# across all countries, so 0 is the average over all countries and a bar above
# 0 really means "above average", as the plot subtitle states. Re-scaling the
# three cluster means among themselves would instead centre on the unweighted
# mean of the cluster means and exaggerate differences between clusters.
cluster_scaled <- aggregate(as.data.frame(data_scaled),
                            by = list(Cluster = europe$cluster),
                            FUN = mean)

# Reshape to long format (one row per cluster/variable pair) for plotting
library(reshape2)
cluster_melted <- melt(cluster_scaled, id.vars = "Cluster")

ggplot(cluster_melted, aes(x = variable, y = value, fill = Cluster)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Cluster Profiles Comparison (Standardized Values)",
       subtitle = "Values above 0 = above average, below 0 = below average",
       x = "Variable", y = "Standardized Mean") +
  scale_fill_manual(values = c("#E74C3C", "#3498DB", "#2ECC71"))

Cluster Naming and Characteristics

Based on the analysis, I name the clusters as follows:

Cluster 1: “Developing European Economies”

  • Lowest GDP per capita (~$17,000)
  • Lower life expectancy (76-77 years)
  • Higher infant mortality
  • Lower tertiary education rates
  • Countries: Albania, Bulgaria, Croatia, Hungary, Latvia, Lithuania, Poland, Romania, Slovakia
  • These are primarily Eastern European countries still catching up economically with Western Europe.

Cluster 2: “Southern European Economies”

  • Medium GDP per capita (~$30,000)
  • Highest life expectancy (~82 years)
  • Highest unemployment (~9%)
  • Lowest trade openness (~73% of GDP) and highest government expenditure (~52% of GDP)
  • Countries: France, Greece, Italy, Portugal, Spain
  • Established economies with excellent health outcomes but weaker labor markets and a larger role for government spending.

Cluster 3: “Highly Developed Northern & Western Economies”

  • Highest GDP per capita (~$58,000)
  • High life expectancy (~81 years)
  • Lowest infant mortality and lowest unemployment (~4.5%)
  • Highest tertiary education (~42%) and internet use (~94%)
  • Countries: Austria, Belgium, Czech Republic, Denmark, Estonia, Finland, Germany, Ireland, Netherlands, Norway, Slovenia, Sweden, Switzerland, United Kingdom
  • These countries represent the most developed economies with strong social indicators; the group also includes a few Central European countries (Czech Republic, Estonia, Slovenia) whose GDP is well below the cluster average.

Hierarchical Clustering

Hierarchical clustering builds a tree of clusters, allowing us to see relationships at different levels.

# Pairwise Euclidean distances between the standardized country profiles
dist_matrix <- dist(data_scaled, method = "euclidean")

# Agglomerative clustering on those distances with the "ward.D2" linkage
hc <- hclust(dist_matrix, method = "ward.D2")

# Dendrogram with branches coloured by the 3-group cut
branch_palette <- c("#E74C3C", "#3498DB", "#2ECC71")
dend <- color_branches(as.dendrogram(hc), k = 3, col = branch_palette)

plot(
  dend,
  main = "Hierarchical Clustering Dendrogram (Ward's Method)",
  sub = "Height represents dissimilarity between clusters",
  ylab = "Height (Dissimilarity)"
)

# Outline the same three groups on the plotted tree
rect.hclust(hc, k = 3, border = branch_palette)

Dendrogram interpretation:

  • The height at which clusters merge indicates dissimilarity. Higher merge = more different clusters.
  • Norway, Switzerland, and Ireland merge very late, indicating they are quite different from other countries (they are the wealthiest).
  • Eastern European countries (Albania, Romania, Bulgaria) cluster together early, showing high similarity.
  • The three-cluster solution is supported by a clear gap in merge heights around height 8-10.
# Cut the tree into 3 groups and store the assignment as a factor column
hc_clusters <- cutree(hc, k = 3)
europe[["hc_cluster"]] <- as.factor(hc_clusters)

# Compare with K-means
cat("Comparison of K-Means and Hierarchical Clustering:\n\n")
## Comparison of K-Means and Hierarchical Clustering:
table(KMeans = europe$cluster, Hierarchical = europe$hc_cluster)
##       Hierarchical
## KMeans 1 2 3
##      1 3 6 0
##      2 0 5 0
##      3 0 5 9

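Agreement analysis: The cross-tabulation shows how clusters from both methods align. Cluster numbers are arbitrary labels, so what matters is whether each row concentrates in a single column. Here hierarchical cluster 2 absorbs 16 of the 28 countries (6 from K-means cluster 1, all 5 from cluster 2, and 5 from cluster 3), while hierarchical cluster 1 holds 3 countries from K-means cluster 1 and hierarchical cluster 3 holds 9 from K-means cluster 3. The two methods therefore agree only partially: 17 of 28 countries (60.7%) fall on the diagonal.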

PAM Clustering (Partitioning Around Medoids)

PAM is similar to K-means but uses actual data points (medoids) as cluster centers. This makes it more robust to outliers.

# Fit PAM with 3 medoids and store each country's assignment as a factor
pam_result <- pam(data_scaled, k = 3)
europe[["pam_cluster"]] <- as.factor(pam_result$clustering)


cat("Medoids (representative countries for each cluster):\n")
## Medoids (representative countries for each cluster):
europe$Country[pam_result$id.med]
## [1] "Croatia" "Estonia" "Denmark"

Medoid interpretation: The medoids are the most “typical” countries in each cluster. These are real countries whose characteristics best represent their cluster’s average profile.

# Scatter of the PAM partition with a convex hull around each cluster
pam_palette <- c("#E74C3C", "#3498DB", "#2ECC71")

pam_plot <- fviz_cluster(
  pam_result,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = pam_palette,
  ggtheme = theme_minimal()
)

pam_plot +
  labs(title = "PAM Clustering Results",
       subtitle = "More robust to outliers than K-means")

Cluster Validation

Silhouette Analysis

Silhouette analysis helps us understand how well each country fits its assigned cluster.

# Silhouette widths of the k-means assignment, measured on the Euclidean
# distance matrix of the standardized data
sil <- silhouette(km$cluster, dist_matrix)

sil_palette <- c("#E74C3C", "#3498DB", "#2ECC71")
sil_plot <- fviz_silhouette(sil, palette = sil_palette)

sil_plot +
  labs(title = "Silhouette Analysis",
       subtitle = "Values near 1 = well-clustered, near 0 = borderline, negative = possibly misclassified")
##   cluster size ave.sil.width
## 1       1    9          0.25
## 2       2    5          0.31
## 3       3   14          0.14

cat("Average silhouette width:", round(mean(sil[,3]), 3), "\n\n")
## Average silhouette width: 0.207
cat("Silhouette width interpretation:\n")
## Silhouette width interpretation:
cat("  0.71-1.00: Strong structure\n")
##   0.71-1.00: Strong structure
cat("  0.51-0.70: Reasonable structure\n")
##   0.51-0.70: Reasonable structure
cat("  0.26-0.50: Weak structure\n")
##   0.26-0.50: Weak structure
cat("  < 0.25: No substantial structure\n")
##   < 0.25: No substantial structure

Silhouette interpretation:

Our average silhouette width of ~0.21 falls below 0.25, which indicates weak cluster structure; cluster 3 is the least cohesive (0.14). This is typical for real-world socio-economic data where countries don’t fall into perfectly distinct groups. Some countries (like Greece, Estonia) have low silhouette values, meaning they sit between clusters.

Comparing All Three Methods

# Side-by-side cluster assignments from the three methods
comparison <- data.frame(
  Country = europe$Country,
  KMeans = europe$cluster,
  Hierarchical = europe$hc_cluster,
  PAM = europe$pam_cluster
)

# Percentage of observations on which two partitions agree, after matching
# their cluster labels.
#
# Cluster IDs are arbitrary: "cluster 1" from one method need not be "cluster 1"
# from another, so comparing raw IDs with `==` measures how the labels happen
# to be numbered rather than how similar the partitions are. This helper tries
# every one-to-one mapping between the two label sets and reports the best
# overlap. The search is exhaustive (k! mappings), which is fine for the small
# k used here but not intended for many clusters.
#
# labels_a, labels_b: equal-length vectors/factors of cluster assignments.
# Returns a single number in [0, 100].
cluster_agreement_pct <- function(labels_a, labels_b) {
  stopifnot(length(labels_a) == length(labels_b), length(labels_a) > 0)

  # All orderings of `items`, returned as a list of vectors.
  label_permutations <- function(items) {
    if (length(items) <= 1L) {
      return(list(items))
    }
    do.call(c, lapply(seq_along(items), function(i) {
      lapply(label_permutations(items[-i]), function(rest) c(items[i], rest))
    }))
  }

  # Recode both labelings to 1..k so the contingency table is square even when
  # the two partitions use different numbers of clusters.
  ids_a <- match(as.character(labels_a), unique(as.character(labels_a)))
  ids_b <- match(as.character(labels_b), unique(as.character(labels_b)))
  k <- max(ids_a, ids_b)
  overlap <- table(factor(ids_a, levels = seq_len(k)),
                   factor(ids_b, levels = seq_len(k)))

  matched_counts <- vapply(
    label_permutations(seq_len(k)),
    function(perm) sum(overlap[cbind(seq_len(k), perm)]),
    numeric(1)
  )
  max(matched_counts) / length(ids_a) * 100
}

# Calculate agreement (label-invariant)
kmeans_hc_agree <- cluster_agreement_pct(europe$cluster, europe$hc_cluster)
kmeans_pam_agree <- cluster_agreement_pct(europe$cluster, europe$pam_cluster)
hc_pam_agree <- cluster_agreement_pct(europe$hc_cluster, europe$pam_cluster)

cat("Method Agreement:\n")
## Method Agreement:
cat("K-Means vs Hierarchical:", round(kmeans_hc_agree, 1), "%\n")
## K-Means vs Hierarchical: 60.7 %
cat("K-Means vs PAM:", round(kmeans_pam_agree, 1), "%\n")
## K-Means vs PAM: 57.1 %
cat("Hierarchical vs PAM:", round(hc_pam_agree, 1), "%\n")
## Hierarchical vs PAM: 60.7 %

Validation conclusion: High agreement between methods (typically >70%) indicates robust clustering. Here agreement is only moderate (57–61%), so the three-cluster solution is only partially stable across methods. Countries classified similarly across all three methods are clearly members of their cluster. Countries with disagreement are borderline cases.

Results Summary

Final Cluster Assignment

# Summary table: selected indicators plus the k-means cluster, sorted by
# cluster and then by GDP per capita from highest to lowest.
report_cols <- c("Country", "GDP_per_capita", "Life_expectancy",
                 "Unemployment", "cluster")
final_results <- europe[, report_cols]
names(final_results)[names(final_results) == "cluster"] <- "Cluster"

row_order <- order(final_results$Cluster, -final_results$GDP_per_capita)
final_results <- final_results[row_order, ]
print(final_results)
##                       Country GDP_per_capita Life_expectancy Unemployment
## Lithuania           Lithuania          24032            75.7          5.9
## Latvia                 Latvia          21947            75.3          6.9
## Slovakia             Slovakia          21088            77.0          6.1
## Poland                 Poland          18688            77.0          2.9
## Croatia               Croatia          18570            78.1          7.0
## Hungary               Hungary          18390            76.2          3.6
## Romania               Romania          15892            74.2          5.6
## Bulgaria             Bulgaria          13772            74.8          4.3
## Albania               Albania           6810            76.5         11.0
## France                 France          40886            82.5          7.3
## Italy                   Italy          34776            82.9          8.1
## Spain                   Spain          29674            83.0         12.9
## Portugal             Portugal          24521            81.1          6.0
## Greece                 Greece          20867            80.1         12.4
## Norway                 Norway         106149            83.2          3.2
## Ireland               Ireland         103685            82.0          4.5
## Switzerland       Switzerland          93259            84.0          4.3
## Denmark               Denmark          67803            81.4          4.5
## Netherlands       Netherlands          57025            81.4          3.5
## Sweden                 Sweden          56424            83.0          7.5
## Austria               Austria          52085            81.3          4.8
## Finland               Finland          50732            82.0          6.8
## Belgium               Belgium          49582            81.9          5.6
## Germany               Germany          48636            80.6          3.1
## United Kingdom United Kingdom          45295            80.4          3.7
## Slovenia             Slovenia          28439            80.6          4.0
## Estonia               Estonia          28247            78.6          5.6
## Czech Republic Czech Republic          27220            78.3          2.2
##                Cluster
## Lithuania            1
## Latvia               1
## Slovakia             1
## Poland               1
## Croatia              1
## Hungary              1
## Romania              1
## Bulgaria             1
## Albania              1
## France               2
## Italy                2
## Spain                2
## Portugal             2
## Greece               2
## Norway               3
## Ireland              3
## Switzerland          3
## Denmark              3
## Netherlands          3
## Sweden               3
## Austria              3
## Finland              3
## Belgium              3
## Germany              3
## United Kingdom       3
## Slovenia             3
## Estonia              3
## Czech Republic       3

Key Findings

  1. Three distinct groups exist in European countries based on socio-economic indicators. This division roughly corresponds to:

    • Eastern Europe (developing)
    • Southern Europe plus France (high life expectancy, high unemployment)
    • Northern, Western and parts of Central Europe (wealthiest, highly developed)
  2. GDP is the strongest differentiator between clusters, but health outcomes and education also play important roles.

  3. Geographic patterns emerge: Clustering largely follows geographic regions, reflecting historical economic development patterns and EU membership timing.

  4. Some countries are borderline cases:

    • Greece: High unemployment and debt crisis effects place it between developed and developing clusters
    • Estonia, Czech Republic: These “transition economies” show characteristics of both Eastern and Western Europe
    • Ireland: Extremely high GDP (partly due to multinational tax arrangements) makes it an outlier
  5. The three clustering methods agree on only about 57–61% of assignments, so the cluster boundaries should be interpreted with caution.

Conclusions

Main Takeaways

This analysis reveals that European countries naturally group into three development tiers. The clustering is driven primarily by economic output (GDP), but health outcomes (life expectancy, infant mortality) and education levels also contribute significantly.

Practical implications:

  • For policymakers: Countries in Cluster 1 (Eastern Europe) may benefit most from EU structural funds and development programs.
  • For investors: Different clusters require different investment strategies - Cluster 3 offers stability while Cluster 1 offers growth potential.
  • For researchers: The strong geographic pattern suggests that historical and institutional factors play a major role in economic development.

Limitations

  1. Single time point: Using 2022 data only. Time series analysis could reveal convergence/divergence trends.
  2. Variable selection: Different indicators might produce different clusters.
  3. Missing countries: Some European countries excluded due to data availability.
  4. Outlier sensitivity: Countries like Ireland and Norway with extreme GDP values may distort results.

Future Work

  • Include more years to analyze trends
  • Add more social indicators (inequality, happiness indices)
  • Compare with other clustering methods (DBSCAN, Gaussian Mixture Models)
  • Analyze stability of clusters over time

References

  1. World Bank Open Data: https://data.worldbank.org/
  2. Kaufman, L., & Rousseeuw, P. J. (1990). Finding Groups in Data: An Introduction to Cluster Analysis
  3. Kassambara, A. (2017). Practical Guide to Cluster Analysis in R
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape2_1.4.5    dendextend_1.19.1 NbClust_3.0.1     corrplot_0.95    
## [5] factoextra_1.0.7  ggplot2_4.0.0     cluster_2.1.8.1  
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.6.5      sass_0.4.10        generics_0.1.4     tidyr_1.3.1       
##  [5] rstatix_0.7.3      stringi_1.8.7      digest_0.6.37      magrittr_2.0.4    
##  [9] evaluate_1.0.5     grid_4.5.1         RColorBrewer_1.1-3 fastmap_1.2.0     
## [13] plyr_1.8.9         jsonlite_2.0.0     ggrepel_0.9.6      backports_1.5.0   
## [17] Formula_1.2-5      gridExtra_2.3      purrr_1.1.0        viridisLite_0.4.2 
## [21] scales_1.4.0       jquerylib_0.1.4    abind_1.4-8        cli_3.6.5         
## [25] rlang_1.1.6        withr_3.0.2        cachem_1.1.0       yaml_2.3.10       
## [29] tools_4.5.1        ggsignif_0.6.4     dplyr_1.1.4        ggpubr_0.6.2      
## [33] broom_1.0.10       vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4   
## [37] stringr_1.5.2      car_3.1-3          pkgconfig_2.0.3    pillar_1.11.1     
## [41] bslib_0.9.0        gtable_0.3.6       glue_1.8.0         Rcpp_1.1.0        
## [45] xfun_0.53          tibble_3.3.0       tidyselect_1.2.1   rstudioapi_0.17.1 
## [49] knitr_1.50         farver_2.1.2       htmltools_0.5.8.1  labeling_0.4.3    
## [53] rmarkdown_2.30     carData_3.0-6      compiler_4.5.1     S7_0.2.0