library(readxl) # Reading excel files
library(skimr) # Summary statistics
library(tidyverse) # Pack of useful tools
library(mclust) # Model based clustering
library(cluster) # Cluster analysis
library(factoextra) # Visualizing distances
Import Dataset as a dataframe
dataset <- read_excel("Data_Aeroports_Clustersv1.xlsx")
df <- data.frame(dataset)
Data exploratary
Summary Statistics
Data summary
| Name |
df |
| Number of rows |
32 |
| Number of columns |
21 |
| _______________________ |
|
| Column type frequency: |
|
| character |
2 |
| numeric |
19 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| Code |
0 |
1 |
3 |
3 |
0 |
32 |
0 |
| Airport |
0 |
1 |
4 |
35 |
0 |
32 |
0 |
Variable type: numeric
| Ordem |
0 |
1 |
16.50 |
9.38 |
1.00 |
8.75 |
16.50 |
24.25 |
32.00 |
▇▇▇▇▇ |
| Passengers |
0 |
1 |
20750710.88 |
17601931.34 |
456698.00 |
8927021.50 |
17275317.50 |
28666511.50 |
67054745.00 |
▇▅▂▂▁ |
| Movements |
0 |
1 |
205111.16 |
143564.45 |
5698.00 |
82765.75 |
191742.50 |
258654.50 |
518018.00 |
▇▅▇▂▃ |
| Numberofairlines |
0 |
1 |
57.81 |
40.42 |
1.00 |
22.50 |
55.50 |
90.25 |
136.00 |
▇▆▅▆▃ |
| Mainairlineflightspercentage |
0 |
1 |
33.78 |
22.08 |
12.00 |
22.00 |
28.50 |
33.00 |
95.00 |
▇▆▁▁▂ |
| Maximumpercentageoftrafficpercountry |
0 |
1 |
17.47 |
7.31 |
9.00 |
12.00 |
15.00 |
22.25 |
35.00 |
▇▂▃▂▂ |
| NumberofLCCflightsweekly |
0 |
1 |
397.59 |
221.56 |
37.00 |
226.25 |
366.50 |
546.75 |
776.00 |
▅▇▅▅▆ |
| NumberofLowCostAirlines |
0 |
1 |
11.59 |
5.60 |
1.00 |
7.75 |
12.00 |
16.00 |
23.00 |
▃▇▇▆▃ |
| LowCostAirlinespercentage |
0 |
1 |
36.44 |
30.10 |
6.25 |
16.29 |
19.59 |
50.36 |
100.00 |
▇▂▁▁▂ |
| Destinations |
0 |
1 |
167.62 |
80.13 |
20.00 |
109.25 |
168.50 |
222.50 |
301.00 |
▃▇▆▇▆ |
| Average_Route_Distance |
0 |
1 |
2275.19 |
930.28 |
1225.00 |
1599.50 |
2152.00 |
2765.00 |
5635.00 |
▇▆▂▁▁ |
| DistancetoclosestAirport |
0 |
1 |
90.19 |
64.56 |
13.84 |
45.83 |
66.50 |
111.61 |
244.50 |
▇▇▃▁▂ |
| DistancetoclosestSimilarAirport |
0 |
1 |
248.64 |
183.60 |
38.16 |
97.74 |
206.12 |
376.15 |
635.05 |
▇▅▃▁▃ |
| AirportRegionalrelevance |
0 |
1 |
0.73 |
0.23 |
0.19 |
0.58 |
0.80 |
0.91 |
0.99 |
▁▃▁▆▇ |
| Distancetocitykm |
0 |
1 |
25.81 |
25.44 |
3.00 |
9.75 |
14.50 |
35.00 |
100.00 |
▇▂▁▁▁ |
| Inhanbitantscorrected |
0 |
1 |
4528561.95 |
2590542.88 |
329240.50 |
2856960.30 |
4532760.00 |
6733158.88 |
9870818.00 |
▆▆▇▇▁ |
| numberofvisitorscorrected |
0 |
1 |
2766002.58 |
2549773.72 |
80232.50 |
1018390.89 |
1896295.60 |
3450491.78 |
9732062.00 |
▇▃▁▂▁ |
| GDPcorrected |
0 |
1 |
30160.75 |
10510.93 |
8500.00 |
25000.00 |
31150.00 |
35550.00 |
56600.00 |
▃▅▇▃▁ |
| Cargoton |
0 |
1 |
236531.76 |
478310.12 |
0.00 |
10325.00 |
72749.85 |
153372.85 |
1819000.00 |
▇▁▁▁▁ |
Sneak the plot
plot(Numberofairlines ~ Destinations, df) #plot
text(Numberofairlines ~ Destinations, df, label = Airport, pos = 4, cex = 0.6) #labels over the previous plot

Data preparation before clustering
Missing data
table(is.na(df)) # FALSE <-- No missing value; if yes, using listwise detection df <- na.omit(df)
##
## FALSE
## 672
Continuous variables
# Leave only continuous variables, make order as the row ID variable
df_reduced = df[,!(names(df) %in% c("Code", "Airport"))]
df_reduced = data.frame(df_reduced, row.names = 1)
head(df_reduced)
## Passengers Movements Numberofairlines Mainairlineflightspercentage
## 1 9830987 119322 64 18
## 2 9742300 132200 29 33
## 3 9155665 101557 47 17
## 4 9139479 74281 35 29
## 5 9129053 83013 11 37
## 6 8320927 115934 36 31
## Maximumpercentageoftrafficpercountry NumberofLCCflightsweekly
## 1 20 256
## 2 13 351
## 3 26 259
## 4 23 300
## 5 22 227
## 6 14 341
## NumberofLowCostAirlines LowCostAirlinespercentage Destinations
## 1 18 28.12500 104
## 2 12 41.37931 189
## 3 19 40.42553 116
## 4 18 51.42857 160
## 5 8 72.72727 87
## 6 7 19.44445 111
## Average_Route_Distance DistancetoclosestAirport
## 1 1253 23.66681
## 2 1721 63.45766
## 3 3143 122.58936
## 4 1701 63.09924
## 5 1582 45.13247
## 6 1460 244.49577
## DistancetoclosestSimilarAirport AirportRegionalrelevance Distancetocitykm
## 1 223.83824 0.8698581 6
## 2 63.45766 0.5127419 15
## 3 132.45082 0.7840877 19
## 4 134.50558 0.8098081 9
## 5 45.13247 0.1947903 55
## 6 559.31000 0.9810450 10
## Inhanbitantscorrected numberofvisitorscorrected GDPcorrected Cargoton
## 1 3551805.0 2152829.8 26300 11223.39
## 2 4180133.5 1151381.6 30100 562.00
## 3 705807.8 1678968.6 20700 25994.00
## 4 1508358.6 1944196.8 25000 3199.73
## 5 1562709.8 181063.5 32000 28698.00
## 6 6626197.0 770720.5 11200 82756.54
Standardize variables
mean <- apply(df_reduced, 2, mean) # "2" select the columns. MARGIN: c(1,2)
sd <- apply(df_reduced, 2, sd)
df_scaled <- scale(df_reduced, mean, sd)
Hierarchical Clustering
Euclidean distances
# Measuring Similarity through Euclidean distances
distance <- dist(df_scaled, method = "euclidean")
Visualise distance in heatmap
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"), order = FALSE)

Types of hierarchical clustering
Single linkage (neareast neighbor) clustering algorithm
# Based on a bottom-up approach, by linking two clusters that have the closest distance between each other.
models <- hclust(distance, "single")
plot(models, labels = df$Airport, xlab = "Distance - Single linkage", cex=0.6, hang = -1)
rect.hclust(models, 4, border = "purple") # Visualize the cut on the dendogram, with 4 clusters

Complete linkage (Farthest neighbor) clustering algorithm
# Complete linkage is based on the maximum distance between observations in each cluster.
modelc <- hclust(distance, "complete")
plot(modelc, labels = df$Airport, xlab = "Distance - Complete linkage", cex=0.6, hang = -1)
rect.hclust(modelc, 4, border = "blue")

Average linkage between groups
# The average linkage considers the distance between clusters to be the average of the distances between observations in one cluster to all the members in the other cluster.
modela <- hclust(distance, "average")
plot(modela, labels = df$Airport, xlab = "Distance - Average linkage", cex=0.6, hang = -1)
rect.hclust(modelc, 4, border = "red")

Ward’s method
modelw <- hclust(distance, "ward.D2")
plot(modelw, labels = df$Airport, xlab = "Distance - Ward method", cex=0.6, hang = -1)
rect.hclust(modelw, 4, border = "orange")

Centroid method
modelcen <- hclust(distance, "centroid")
plot(modelcen, labels = df$Airport, xlab = "Distance - Centroid method", cex=0.6, hang = -1)
rect.hclust(modelcen, 4, border = "darkgreen")

Comparing results from different hierarchical methods
# evaluate the membership of each observation with the cutree function for each method.
member_single <- cutree(models, 4)
member_com <- cutree(modelc, 4)
member_av <- cutree(modela, 4)
member_ward <- cutree(modelw, 4)
member_cen <- cutree(modelcen, 4)
table(member_com, member_av) # compare the complete linkage with the average linkage
## member_av
## member_com 1 2 3 4
## 1 14 0 0 0
## 2 11 0 0 0
## 3 0 3 0 0
## 4 0 0 3 1
Silhouette Plots: evaluate which method is more appropriate
plot(silhouette(member_single, distance))

plot(silhouette(member_com, distance))

plot(silhouette(member_av, distance))

plot(silhouette(member_ward, distance))

plot(silhouette(member_cen, distance))

Non-hirarchical Clustering
Case 1: K-means clustering with k
km_clust <- kmeans(df_scaled, 3) # k=3
km_clust #print the results
## K-means clustering with 3 clusters of sizes 18, 7, 7
##
## Cluster means:
## Passengers Movements Numberofairlines Mainairlineflightspercentage
## 1 -0.2469314 -0.2011272 -0.004982515 -0.3699491
## 2 -0.9456035 -1.0500969 -1.239468064 1.4006677
## 3 1.5805700 1.5672811 1.252280245 -0.4493699
## Maximumpercentageoftrafficpercountry NumberofLCCflightsweekly
## 1 -0.1629508 0.04370786
## 2 0.9913460 -1.21033588
## 3 -0.5723298 1.09794424
## NumberofLowCostAirlines LowCostAirlinespercentage Destinations
## 1 0.3305562 -0.2689855 0.07609389
## 2 -1.2287547 1.4725164 -1.31999250
## 3 0.3787531 -0.7808392 1.12432250
## Average_Route_Distance DistancetoclosestAirport
## 1 -0.1062034 0.06631895
## 2 -0.8907198 0.20629602
## 3 1.1638144 -0.37683045
## DistancetoclosestSimilarAirport AirportRegionalrelevance Distancetocitykm
## 1 -0.07737412 0.1407640 -0.3136475
## 2 -0.67596963 -0.9617675 1.0125397
## 3 0.87493166 0.5998030 -0.2060176
## Inhanbitantscorrected numberofvisitorscorrected GDPcorrected Cargoton
## 1 -0.06258292 -0.2290652 -0.1117858 -0.3186842
## 2 -0.92990235 -0.7382133 -0.5168124 -0.4251367
## 3 1.09082987 1.3272381 0.8042615 1.2446104
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 1 1
## 27 28 29 30 31 32
## 2 2 1 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 150.32291 67.16167 70.90186
## (between_SS / total_SS = 48.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Case 2: Find k
k <- list()
for(i in 1:10){ #detect how many clusters from 1 - 10 explains more variance
k[[i]] <- kmeans(df_scaled, i)
}
# Plot between_SS / total_SS into a scree plot, see how it varies when you add clusters.
betSS_totSS <- list()
for(i in 1:10){
betSS_totSS[[i]] <- k[[i]]$betweenss/k[[i]]$totss
}
plot(1:10, betSS_totSS, type = "b", ylab = "Between SS / Total SS", xlab = "Number of clusters")

Detect outliers
Examine the boxplots
par(cex.axis=0.6, mar=c(11,2,1,1))# Make labels fit in the boxplot
boxplot(df_scaled, las = 2) #labels rotated to vertical

Detect the outliers
outliers <- boxplot.stats(df_scaled)$out
outliers
## [1] 2.630622 2.772024 2.772024 3.611626 2.916185 2.523102 2.732030 2.515406
## [9] 3.308457 3.285459
Remove rows with outliers
## [1] 32
out_ind <- which(df_scaled %in% c(outliers)) #the row.names that contain outliers
df_no_outliers = df_scaled[-c(out_ind),] #remove those rows from the df_scaled
nrow(df_no_outliers) #31
## [1] 31
K-means cluster
km_no_outliers <- kmeans(df_no_outliers, 3)
km_no_outliers
## K-means clustering with 3 clusters of sizes 11, 15, 5
##
## Cluster means:
## Passengers Movements Numberofairlines Mainairlineflightspercentage
## 1 -0.6095999 -0.6567742 -0.5981361 -0.1917995
## 2 0.5890127 0.7260264 0.8111878 -0.5183690
## 3 -0.9520429 -1.0882984 -1.3214660 1.9931973
## Maximumpercentageoftrafficpercountry NumberofLCCflightsweekly
## 1 0.5577814 -0.5451081
## 2 -0.6296646 0.7116404
## 3 0.6473374 -1.2772631
## NumberofLowCostAirlines LowCostAirlinespercentage Destinations
## 1 0.07256111 0.1215978 -0.4548142
## 2 0.46550744 -0.6675400 0.7801097
## 3 -1.39205703 1.9302799 -1.4105623
## Average_Route_Distance DistancetoclosestAirport
## 1 -0.4759152 0.4952267
## 2 0.4294196 -0.4088129
## 3 -0.9635706 0.2765206
## DistancetoclosestSimilarAirport AirportRegionalrelevance Distancetocitykm
## 1 0.04572232 0.2308600 -0.4143005
## 2 0.27904593 0.2668308 -0.2520645
## 3 -0.71705981 -1.2364486 1.6976272
## Inhanbitantscorrected numberofvisitorscorrected GDPcorrected Cargoton
## 1 -0.4802512 -0.5070902 -0.6566693 -0.4356553
## 2 0.6091294 0.4709562 0.4324943 0.3093463
## 3 -0.9802943 -0.7587213 -0.3558915 -0.4142745
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 19 20 21 22 23 24 25 26 27
## 1 1 1 1 3 1 1 1 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 3
## 28 29 30 31 32
## 3 2 1 3 3
##
## Within cluster sum of squares by cluster:
## [1] 81.96037 135.98599 41.82151
## (between_SS / total_SS = 48.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Plot the clusters
K-means with outliers
plot(Numberofairlines ~ Destinations, df, col = km_clust$cluster)
with(df, text(Numberofairlines ~ Destinations, label = Airport, pos = 1, cex = 0.6))

K-means without outliers
plot(Numberofairlines ~ Destinations, df, col = km_no_outliers$cluster)
with(df, text(Numberofairlines ~ Destinations, label = Airport, pos = 1, cex = 0.6))
