library(readxl) # Reading excel files
library(skimr) # Summary statistics
library(tidyverse) # Pack of useful tools
library(mclust) # Model based clustering
library(cluster) # Cluster analysis
library(factoextra) # Visualizing distances

1 Import Dataset as a dataframe

dataset <- read_excel("Data_Aeroports_Clustersv1.xlsx")
df <- data.frame(dataset)

2 Data exploratary

2.1 Summary Statistics

skim(df)

Data summary
Name	df
Number of rows	32
Number of columns	21
_______________________
Column type frequency:
character	2
numeric	19
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Code	0	1	3	3	0	32	0
Airport	0	1	4	35	0	32	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Ordem	1	16.50	9.38	1.00	8.75	16.50	24.25	32.00	▇▇▇▇▇
Passengers	1	20750710.88	17601931.34	456698.00	8927021.50	17275317.50	28666511.50	67054745.00	▇▅▂▂▁
Movements	1	205111.16	143564.45	5698.00	82765.75	191742.50	258654.50	518018.00	▇▅▇▂▃
Numberofairlines	1	57.81	40.42	1.00	22.50	55.50	90.25	136.00	▇▆▅▆▃
Mainairlineflightspercentage	1	33.78	22.08	12.00	22.00	28.50	33.00	95.00	▇▆▁▁▂
Maximumpercentageoftrafficpercountry	1	17.47	7.31	9.00	12.00	15.00	22.25	35.00	▇▂▃▂▂
NumberofLCCflightsweekly	1	397.59	221.56	37.00	226.25	366.50	546.75	776.00	▅▇▅▅▆
NumberofLowCostAirlines	1	11.59	5.60	1.00	7.75	12.00	16.00	23.00	▃▇▇▆▃
LowCostAirlinespercentage	1	36.44	30.10	6.25	16.29	19.59	50.36	100.00	▇▂▁▁▂
Destinations	1	167.62	80.13	20.00	109.25	168.50	222.50	301.00	▃▇▆▇▆
Average_Route_Distance	1	2275.19	930.28	1225.00	1599.50	2152.00	2765.00	5635.00	▇▆▂▁▁
DistancetoclosestAirport	1	90.19	64.56	13.84	45.83	66.50	111.61	244.50	▇▇▃▁▂
DistancetoclosestSimilarAirport	1	248.64	183.60	38.16	97.74	206.12	376.15	635.05	▇▅▃▁▃
AirportRegionalrelevance	1	0.73	0.23	0.19	0.58	0.80	0.91	0.99	▁▃▁▆▇
Distancetocitykm	1	25.81	25.44	3.00	9.75	14.50	35.00	100.00	▇▂▁▁▁
Inhanbitantscorrected	1	4528561.95	2590542.88	329240.50	2856960.30	4532760.00	6733158.88	9870818.00	▆▆▇▇▁
numberofvisitorscorrected	1	2766002.58	2549773.72	80232.50	1018390.89	1896295.60	3450491.78	9732062.00	▇▃▁▂▁
GDPcorrected	1	30160.75	10510.93	8500.00	25000.00	31150.00	35550.00	56600.00	▃▅▇▃▁
Cargoton	1	236531.76	478310.12	0.00	10325.00	72749.85	153372.85	1819000.00	▇▁▁▁▁

2.2 Sneak the plot

plot(Numberofairlines ~ Destinations, df) #plot
text(Numberofairlines ~ Destinations, df, label = Airport, pos = 4, cex = 0.6) #labels over the previous plot

3 Data preparation before clustering

3.1 Missing data

table(is.na(df)) # FALSE <-- No missing value; if yes, using listwise detection df <- na.omit(df)

## 
## FALSE 
##   672

3.2 Continuous variables

# Leave only continuous variables, make order as the row ID variable
df_reduced = df[,!(names(df) %in% c("Code", "Airport"))]
df_reduced = data.frame(df_reduced, row.names = 1)

head(df_reduced)

##   Passengers Movements Numberofairlines Mainairlineflightspercentage
## 1    9830987    119322               64                           18
## 2    9742300    132200               29                           33
## 3    9155665    101557               47                           17
## 4    9139479     74281               35                           29
## 5    9129053     83013               11                           37
## 6    8320927    115934               36                           31
##   Maximumpercentageoftrafficpercountry NumberofLCCflightsweekly
## 1                                   20                      256
## 2                                   13                      351
## 3                                   26                      259
## 4                                   23                      300
## 5                                   22                      227
## 6                                   14                      341
##   NumberofLowCostAirlines LowCostAirlinespercentage Destinations
## 1                      18                  28.12500          104
## 2                      12                  41.37931          189
## 3                      19                  40.42553          116
## 4                      18                  51.42857          160
## 5                       8                  72.72727           87
## 6                       7                  19.44445          111
##   Average_Route_Distance DistancetoclosestAirport
## 1                   1253                 23.66681
## 2                   1721                 63.45766
## 3                   3143                122.58936
## 4                   1701                 63.09924
## 5                   1582                 45.13247
## 6                   1460                244.49577
##   DistancetoclosestSimilarAirport AirportRegionalrelevance Distancetocitykm
## 1                       223.83824                0.8698581                6
## 2                        63.45766                0.5127419               15
## 3                       132.45082                0.7840877               19
## 4                       134.50558                0.8098081                9
## 5                        45.13247                0.1947903               55
## 6                       559.31000                0.9810450               10
##   Inhanbitantscorrected numberofvisitorscorrected GDPcorrected Cargoton
## 1             3551805.0                 2152829.8        26300 11223.39
## 2             4180133.5                 1151381.6        30100   562.00
## 3              705807.8                 1678968.6        20700 25994.00
## 4             1508358.6                 1944196.8        25000  3199.73
## 5             1562709.8                  181063.5        32000 28698.00
## 6             6626197.0                  770720.5        11200 82756.54

3.3 Standardize variables

mean <- apply(df_reduced, 2, mean) # "2" select the columns. MARGIN: c(1,2)
sd <- apply(df_reduced, 2, sd)
df_scaled <- scale(df_reduced, mean, sd)

4 Hierarchical Clustering

4.1 Euclidean distances

# Measuring Similarity through Euclidean distances
distance <- dist(df_scaled, method = "euclidean")

4.2 Visualise distance in heatmap

fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"), order = FALSE)

4.3 Types of hierarchical clustering

4.3.1 Single linkage (neareast neighbor) clustering algorithm

# Based on a bottom-up approach, by linking two clusters that have the closest distance between each other.

models <- hclust(distance, "single")
plot(models, labels = df$Airport, xlab = "Distance - Single linkage", cex=0.6, hang = -1)

rect.hclust(models, 4, border = "purple") # Visualize the cut on the dendogram, with 4 clusters

4.3.2 Complete linkage (Farthest neighbor) clustering algorithm

# Complete linkage is based on the maximum distance between observations in each cluster.
modelc <- hclust(distance, "complete")
plot(modelc, labels = df$Airport, xlab = "Distance - Complete linkage", cex=0.6, hang = -1)
rect.hclust(modelc, 4, border = "blue")

4.3.3 Average linkage between groups

# The average linkage considers the distance between clusters to be the average of the distances between observations in one cluster to all the members in the other cluster.
modela <- hclust(distance, "average")
plot(modela, labels = df$Airport, xlab = "Distance - Average linkage", cex=0.6, hang = -1)
rect.hclust(modelc, 4, border = "red")

4.3.4 Ward’s method

modelw <- hclust(distance, "ward.D2")
plot(modelw, labels = df$Airport, xlab = "Distance - Ward method", cex=0.6, hang = -1)
rect.hclust(modelw, 4, border = "orange")

4.3.5 Centroid method

modelcen <- hclust(distance, "centroid")
plot(modelcen, labels = df$Airport, xlab = "Distance - Centroid method", cex=0.6, hang = -1)
rect.hclust(modelcen, 4, border = "darkgreen")

4.3.6 Comparing results from different hierarchical methods

# evaluate the membership of each observation with the cutree function for each method.

member_single <- cutree(models, 4)
member_com <- cutree(modelc, 4)
member_av <- cutree(modela, 4)
member_ward <- cutree(modelw, 4)
member_cen <- cutree(modelcen, 4)

table(member_com, member_av) # compare the complete linkage with the average linkage

##           member_av
## member_com  1  2  3  4
##          1 14  0  0  0
##          2 11  0  0  0
##          3  0  3  0  0
##          4  0  0  3  1

4.3.7 Silhouette Plots: evaluate which method is more appropriate

plot(silhouette(member_single, distance))

plot(silhouette(member_com, distance))

plot(silhouette(member_av, distance))

plot(silhouette(member_ward, distance))

plot(silhouette(member_cen, distance))

5 Non-hirarchical Clustering

5.1 Case 1: K-means clustering with k

km_clust <- kmeans(df_scaled, 3) # k=3
km_clust #print the results

## K-means clustering with 3 clusters of sizes 18, 7, 7
## 
## Cluster means:
##   Passengers  Movements Numberofairlines Mainairlineflightspercentage
## 1 -0.2469314 -0.2011272     -0.004982515                   -0.3699491
## 2 -0.9456035 -1.0500969     -1.239468064                    1.4006677
## 3  1.5805700  1.5672811      1.252280245                   -0.4493699
##   Maximumpercentageoftrafficpercountry NumberofLCCflightsweekly
## 1                           -0.1629508               0.04370786
## 2                            0.9913460              -1.21033588
## 3                           -0.5723298               1.09794424
##   NumberofLowCostAirlines LowCostAirlinespercentage Destinations
## 1               0.3305562                -0.2689855   0.07609389
## 2              -1.2287547                 1.4725164  -1.31999250
## 3               0.3787531                -0.7808392   1.12432250
##   Average_Route_Distance DistancetoclosestAirport
## 1             -0.1062034               0.06631895
## 2             -0.8907198               0.20629602
## 3              1.1638144              -0.37683045
##   DistancetoclosestSimilarAirport AirportRegionalrelevance Distancetocitykm
## 1                     -0.07737412                0.1407640       -0.3136475
## 2                     -0.67596963               -0.9617675        1.0125397
## 3                      0.87493166                0.5998030       -0.2060176
##   Inhanbitantscorrected numberofvisitorscorrected GDPcorrected   Cargoton
## 1           -0.06258292                -0.2290652   -0.1117858 -0.3186842
## 2           -0.92990235                -0.7382133   -0.5168124 -0.4251367
## 3            1.09082987                 1.3272381    0.8042615  1.2446104
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  1  1  1  2  1  1  2  1  1  1  1  1  1  1  1  1  3  3  3  3  3  3  3  1  1 
## 27 28 29 30 31 32 
##  2  2  1  2  2  2 
## 
## Within cluster sum of squares by cluster:
## [1] 150.32291  67.16167  70.90186
##  (between_SS / total_SS =  48.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

5.2 Case 2: Find k

k <- list()
  for(i in 1:10){     #detect how many clusters from 1 - 10 explains more variance
    k[[i]] <- kmeans(df_scaled, i)
  }

# Plot between_SS / total_SS into a scree plot, see how it varies when you add clusters.
betSS_totSS <- list()
for(i in 1:10){
betSS_totSS[[i]] <- k[[i]]$betweenss/k[[i]]$totss
}
plot(1:10, betSS_totSS, type = "b", ylab = "Between SS / Total SS", xlab = "Number of clusters")

5.3 Detect outliers

5.3.1 Examine the boxplots

par(cex.axis=0.6, mar=c(11,2,1,1))# Make labels fit in the boxplot
boxplot(df_scaled, las = 2) #labels rotated to vertical

5.3.2 Detect the outliers

outliers <- boxplot.stats(df_scaled)$out
outliers

##  [1] 2.630622 2.772024 2.772024 3.611626 2.916185 2.523102 2.732030 2.515406
##  [9] 3.308457 3.285459

5.3.3 Remove rows with outliers

nrow(df_scaled)

## [1] 32

out_ind <- which(df_scaled %in% c(outliers)) #the row.names that contain outliers
df_no_outliers = df_scaled[-c(out_ind),] #remove those rows from the df_scaled
nrow(df_no_outliers) #31

## [1] 31

5.3.4 K-means cluster

km_no_outliers <- kmeans(df_no_outliers, 3)
km_no_outliers

## K-means clustering with 3 clusters of sizes 11, 15, 5
## 
## Cluster means:
##   Passengers  Movements Numberofairlines Mainairlineflightspercentage
## 1 -0.6095999 -0.6567742       -0.5981361                   -0.1917995
## 2  0.5890127  0.7260264        0.8111878                   -0.5183690
## 3 -0.9520429 -1.0882984       -1.3214660                    1.9931973
##   Maximumpercentageoftrafficpercountry NumberofLCCflightsweekly
## 1                            0.5577814               -0.5451081
## 2                           -0.6296646                0.7116404
## 3                            0.6473374               -1.2772631
##   NumberofLowCostAirlines LowCostAirlinespercentage Destinations
## 1              0.07256111                 0.1215978   -0.4548142
## 2              0.46550744                -0.6675400    0.7801097
## 3             -1.39205703                 1.9302799   -1.4105623
##   Average_Route_Distance DistancetoclosestAirport
## 1             -0.4759152                0.4952267
## 2              0.4294196               -0.4088129
## 3             -0.9635706                0.2765206
##   DistancetoclosestSimilarAirport AirportRegionalrelevance Distancetocitykm
## 1                      0.04572232                0.2308600       -0.4143005
## 2                      0.27904593                0.2668308       -0.2520645
## 3                     -0.71705981               -1.2364486        1.6976272
##   Inhanbitantscorrected numberofvisitorscorrected GDPcorrected   Cargoton
## 1            -0.4802512                -0.5070902   -0.6566693 -0.4356553
## 2             0.6091294                 0.4709562    0.4324943  0.3093463
## 3            -0.9802943                -0.7587213   -0.3558915 -0.4142745
## 
## Clustering vector:
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 20 21 22 23 24 25 26 27 
##  1  1  1  1  3  1  1  1  1  2  2  2  1  2  2  2  1  2  2  2  2  2  2  2  2  3 
## 28 29 30 31 32 
##  3  2  1  3  3 
## 
## Within cluster sum of squares by cluster:
## [1]  81.96037 135.98599  41.82151
##  (between_SS / total_SS =  48.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

5.3.5 Plot the clusters

5.3.5.1 K-means with outliers

plot(Numberofairlines ~ Destinations, df, col = km_clust$cluster)
with(df, text(Numberofairlines ~ Destinations, label = Airport, pos = 1, cex = 0.6))

5.3.5.2 K-means without outliers

plot(Numberofairlines ~ Destinations, df, col = km_no_outliers$cluster)
with(df, text(Numberofairlines ~ Destinations, label = Airport, pos = 1, cex = 0.6))

6 Reference

https://github.com/U-Shift/Transport-Demand-Modelling/blob/master/4-ClusterAnalysis.md

Exploratory Data Analysis

Apr 2, 2022

1 Import Dataset as a dataframe

2 Data exploratary

2.1 Summary Statistics

2.2 Sneak the plot

3 Data preparation before clustering

3.1 Missing data

3.2 Continuous variables

3.3 Standardize variables

4 Hierarchical Clustering

4.1 Euclidean distances

4.2 Visualise distance in heatmap

4.3 Types of hierarchical clustering

4.3.1 Single linkage (neareast neighbor) clustering algorithm

4.3.2 Complete linkage (Farthest neighbor) clustering algorithm

4.3.3 Average linkage between groups

4.3.4 Ward’s method

4.3.5 Centroid method

4.3.6 Comparing results from different hierarchical methods

4.3.7 Silhouette Plots: evaluate which method is more appropriate

5 Non-hirarchical Clustering

5.1 Case 1: K-means clustering with k

5.2 Case 2: Find k

5.3 Detect outliers

5.3.1 Examine the boxplots

5.3.2 Detect the outliers

5.3.3 Remove rows with outliers

5.3.4 K-means cluster

5.3.5 Plot the clusters

5.3.5.1 K-means with outliers

5.3.5.2 K-means without outliers

6 Reference