Source: Data retrieved on Jan, 29th, 2024, from: https://www.kaggle.com/datasets/mysarahmadbhat/airline-passenger-satisfaction/data

library(readr)
AirplaneSatisfaction <- read_csv("~/Bootcamp_working/AirplaneSatisfaction.csv")
## Rows: 129880 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Gender, Customer Type, Type of Travel, Class, Satisfaction
## dbl (19): ID, Age, Flight Distance, Departure Delay, Arrival Delay, Departur...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mydata <- data.frame(AirplaneSatisfaction)
nrow(mydata)
## [1] 129880
head(mydata)
##   ID Gender Age Customer.Type Type.of.Travel    Class Flight.Distance
## 1  1   Male  48    First-time       Business Business             821
## 2  2 Female  35     Returning       Business Business             821
## 3  3   Male  41     Returning       Business Business             853
## 4  4   Male  50     Returning       Business Business            1905
## 5  5 Female  49     Returning       Business Business            3470
## 6  6   Male  43     Returning       Business Business            3788
##   Departure.Delay Arrival.Delay Departure.and.Arrival.Time.Convenience
## 1               2             5                                      3
## 2              26            39                                      2
## 3               0             0                                      4
## 4               0             0                                      2
## 5               0             1                                      3
## 6               0             0                                      4
##   Ease.of.Online.Booking Check.in.Service Online.Boarding Gate.Location
## 1                      3                4               3             3
## 2                      2                3               5             2
## 3                      4                4               5             4
## 4                      2                3               4             2
## 5                      3                3               5             3
## 6                      4                3               5             4
##   On.board.Service Seat.Comfort Leg.Room.Service Cleanliness Food.and.Drink
## 1                3            5                2           5              5
## 2                5            4                5           5              3
## 3                3            5                3           5              5
## 4                5            5                5           4              4
## 5                3            4                4           5              4
## 6                4            4                4           3              3
##   In.flight.Service In.flight.Wifi.Service In.flight.Entertainment
## 1                 5                      3                       5
## 2                 5                      2                       5
## 3                 3                      4                       3
## 4                 5                      2                       5
## 5                 3                      3                       3
## 6                 4                      4                       4
##   Baggage.Handling            Satisfaction
## 1                5 Neutral or Dissatisfied
## 2                5               Satisfied
## 3                3               Satisfied
## 4                5               Satisfied
## 5                3               Satisfied
## 6                4               Satisfied

Description of the data set:

Our data set includes 129880 units (rows) and 24 variables (columns).

# Convert columns to numeric

mydata[c(1,3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)] <- lapply(mydata[c(1,3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)], as.numeric)
set.seed(1) #Setting initial point of sampling
mydata <- mydata[sample(nrow(mydata), 300), ]    #Random sample of 300 units

mydata$ID <- seq(1, nrow(mydata))
rownames(mydata) <- mydata$ID
any(is.na(mydata))
## [1] TRUE
library(tidyr)
mydata <- drop_na(mydata)
any(is.na(mydata))
## [1] FALSE
table(mydata$Gender) 
## 
## Female   Male 
##    147    151
table(mydata$Customer.Type)
## 
## First-time  Returning 
##         44        254
table(mydata$Type.of.Travel)
## 
## Business Personal 
##      199       99
table(mydata$Class)
## 
##     Business      Economy Economy Plus 
##          146          127           25
table(mydata$Satisfaction)
## 
## Neutral or Dissatisfied               Satisfied 
##                     167                     131

This was done to see, which levels of the variables should be set a a reference group, when factoring.

mydata$GenderF <- factor(mydata$Gender,            #Factoring the categorical variables
                             levels = c("Male", "Female"), 
                             labels = c("Male", "Female"))

mydata$Customer.TypeF <- factor(mydata$Customer.Type, 
                             levels = c( "Returning", "First-time"), 
                             labels = c("Returning", "First-time"))

mydata$Type.of.TravelF <- factor(mydata$Type.of.Travel, 
                             levels = c("Business", "Personal"), 
                             labels = c("Business", "Personal"))

mydata$ClassF <- factor(mydata$Class, 
                             levels = c("Business","Economy","Economy Plus"), 
                             labels = c("Business","Economy","Economy Plus"))
                             
mydata$SatisfactionF <- factor(mydata$Satisfaction, 
                             levels = c("Neutral or Dissatisfied", "Satisfied"), 
                             labels = c("Neutral or Dissatisfied", "Satisfied"))

summary(mydata[ , c(-2, -4, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24)]) 
##        ID              Age        Type.of.Travel        Class          
##  Min.   :  1.00   Min.   : 7.00   Length:298         Length:298        
##  1st Qu.: 75.25   1st Qu.:28.25   Class :character   Class :character  
##  Median :149.50   Median :40.50   Mode  :character   Mode  :character  
##  Mean   :149.98   Mean   :40.31                                        
##  3rd Qu.:223.75   3rd Qu.:51.00                                        
##  Max.   :300.00   Max.   :79.00                                        
##  Flight.Distance  Departure.Delay    GenderF       Customer.TypeF
##  Min.   :  67.0   Min.   :  0.00   Male  :151   Returning :254   
##  1st Qu.: 375.5   1st Qu.:  0.00   Female:147   First-time: 44   
##  Median : 765.0   Median :  0.00                                 
##  Mean   :1142.2   Mean   : 11.03                                 
##  3rd Qu.:1649.2   3rd Qu.:  6.00                                 
##  Max.   :3989.0   Max.   :211.00                                 
##  Type.of.TravelF          ClassF                    SatisfactionF
##  Business:199    Business    :146   Neutral or Dissatisfied:167  
##  Personal: 99    Economy     :127   Satisfied              :131  
##                  Economy Plus: 25                                
##                                                                  
##                                                                  
## 
mydata$Flight.Distance_z <- scale(mydata$Flight.Distance)  #Standardizing the variables used
mydata$Departure.Delay_z <- scale(mydata$Departure.Delay)
mydata$Seat.Comfort_z <- scale(mydata$Seat.Comfort)
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(as.matrix(mydata[ , c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z")]),
      type = "pearson")
##                   Flight.Distance_z Departure.Delay_z Seat.Comfort_z
## Flight.Distance_z              1.00              0.02           0.08
## Departure.Delay_z              0.02              1.00          -0.07
## Seat.Comfort_z                 0.08             -0.07           1.00
## 
## n= 298 
## 
## 
## P
##                   Flight.Distance_z Departure.Delay_z Seat.Comfort_z
## Flight.Distance_z                   0.7301            0.1919        
## Departure.Delay_z 0.7301                              0.2437        
## Seat.Comfort_z    0.1919            0.2437

We see a weak correlation between the variables, which is what we aim to see here.

mydata$Dissimilarity <- sqrt(mydata$Flight.Distance_z^2 + mydata$Departure.Delay_z^2 + mydata$Seat.Comfort_z^2)
head(mydata[order(-mydata$Dissimilarity), c(1, 30, 31, 32, 33)], 10)   #Check for the outliers based on the Dissimilarity index
##      ID Flight.Distance_z Departure.Delay_z Seat.Comfort_z Dissimilarity
## 145 145       -0.23415541         7.0150083     -0.2987280      7.025269
## 203 203        0.05848708         6.1379851      0.4431132      6.154237
## 219 219       -0.62335001         5.9625805     -1.0405691      6.084712
## 43   43       -0.54571017         4.5593434     -1.7824103      4.925688
## 196 196       -0.83934804         4.4190197      0.4431132      4.519800
## 40   40        0.36207878         3.9980486      1.1849543      4.185643
## 254 256       -0.13660791         3.6472393     -0.2987280      3.662001
## 79   79       -0.27397071         3.3315109     -1.0405691      3.500973
## 146 146        2.73407554        -0.3870674     -1.7824103      3.286636
## 234 236        0.21177600         2.6649733     -1.7824103      3.213085
mydata1 <- mydata[c(-145, -203, -219, -43), ]   #Deleting the units
mydata1$Flight.Distance_z <- scale(mydata1$Flight.Distance_z)  #Standardizing the variables AGAIN
mydata1$Departure.Delay_z <- scale(mydata1$Departure.Delay_z)
mydata1$Seat.Comfort_z <- scale(mydata1$Seat.Comfort_z)
library(ggplot2)
#install.packages("factoextra")
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(mydata1[c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z")],
                     method = "euclidian")       #Calculating Euclidean distances and adding them to the dataset

distance2 <- distance^2                          #Creating squared Euclidean distances

fviz_dist(distance2)

From the graph we can see the formation of squares but to determine if the data really is clusterable, we need to check it with Hopkins statistic.

get_clust_tendency(mydata1[c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z")],
                   n = nrow(mydata1) - 1,
                   graph = FALSE)
## $hopkins_stat
## [1] 0.8703446
## 
## $plot
## NULL

We wish to have a Hopkins statistic above 0.5, which is achieved here. The higher the Hopkins, the more the data is clusterable. With the value of 0.87, our data is clusterable and we can continue with analysis on how many clusters we should have.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
WARD <- mydata1[c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z")] %>%
           get_dist(method = "euclidian") %>%
           hclust(method = "ward.D2")

WARD
## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 294
fviz_dend(WARD)  #Draw a dendrogram 
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fviz_dend(WARD,
          k = 4,
          cex = 0.5,
          palette = "jama",
          color_labels_by_k = TRUE,
          rect = TRUE)

mydata1$ClusterWard <- cutree(WARD,
                             k = 4)
head(mydata1[c(1, 7, 8, 16, 34)])
##   ID Flight.Distance Departure.Delay Seat.Comfort ClusterWard
## 1  1             669               0            3           1
## 2  2             854              15            5           1
## 3  3            1917               0            5           2
## 4  4             387               8            4           1
## 5  5            2475              15            4           2
## 6  6             468              92            3           3

We got a new variable called ClusterWard, which tells us which of the cluster groups the unit was assigned to.

Initial_leaders <- aggregate(mydata1[ , c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z")],
                             by = list(mydata1$ClusterWard),
                             FUN = mean)
Initial_leaders
##   Group.1 Flight.Distance_z Departure.Delay_z Seat.Comfort_z
## 1       1       -0.58260538        -0.3399479      0.5715136
## 2       2        1.46980516        -0.3313738      0.6355359
## 3       3        0.26337499         2.4171960      0.2807911
## 4       4       -0.02815888        -0.1149369     -1.2982233

Here, we have our initial leaders, which are used for K-Means clustering technique later on.

K_MEANS <- hkmeans(mydata1[c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z")], 
                   k = 4,
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")

K_MEANS
## Hierarchical K-means clustering with 4 clusters of sizes 149, 54, 19, 72
## 
## Cluster means:
##   Flight.Distance_z Departure.Delay_z Seat.Comfort_z
## 1        -0.4972643        -0.2694633      0.5191469
## 2         1.6547944        -0.2010963      0.4069226
## 3         0.2463884         3.1825929      0.1608489
## 4        -0.2770541        -0.1313894     -1.4219838
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   2   1   2   3   1   1   1   2   4   2   1   2   1   1   2   1   2   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   4   4   1   4   4   1   3   1   2   1   1   2   4   1   1   2   2   4   1   3 
##  41  42  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61 
##   2   1   4   1   1   1   4   4   1   1   1   4   1   2   1   4   1   1   1   4 
##  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81 
##   1   1   1   2   1   1   4   2   1   1   2   2   1   1   2   2   1   3   4   4 
##  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 
##   1   4   4   1   1   1   1   2   1   1   4   2   4   2   2   1   4   1   2   1 
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 
##   1   1   2   2   1   2   1   1   3   1   1   1   1   3   1   1   4   4   1   1 
## 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 
##   1   4   1   1   1   1   1   2   1   2   1   1   1   1   4   4   1   2   2   4 
## 142 143 144 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   4   4   4   2   1   4   1   1   1   1   1   1   2   1   4   4   1   2   4   2 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 
##   4   1   1   1   2   1   1   1   1   4   1   2   1   4   4   1   4   2   4   1 
## 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 
##   1   1   1   4   1   1   2   4   1   1   1   4   1   3   4   1   4   1   1   2 
## 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 220 221 222 223 224 
##   4   4   4   1   4   3   1   1   1   1   4   4   4   4   1   3   2   1   1   1 
## 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 
##   3   4   2   4   4   1   4   1   1   3   4   2   4   1   1   3   2   3   3   1 
## 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 
##   2   1   4   4   1   1   4   1   1   3   3   1   1   4   4   1   3   1   2   4 
## 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 
##   2   2   2   1   1   1   1   3   1   2   1   1   1   2   4   1   2   1   1   4 
## 285 286 287 288 289 290 291 292 293 294 295 296 297 298 
##   4   1   1   2   4   3   4   1   1   1   4   4   1   2 
## 
## Within cluster sum of squares by cluster:
## [1] 90.30902 69.43067 62.22226 63.77169
##  (between_SS / total_SS =  67.5 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "data"        
## [11] "hclust"
fviz_cluster(K_MEANS, 
             palette = "jama", 
             repel = FALSE,
             ggtheme = theme_classic())

Now, let’s check the reclassifications with the following tables.

mydata1$ClusterK_Means <- K_MEANS$cluster
head(mydata1[c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z")])
##   Flight.Distance_z Departure.Delay_z Seat.Comfort_z
## 1        -0.4730148        -0.4257827     -0.3081386
## 2        -0.2898817         0.3051249      1.1769884
## 3         0.7623915        -0.4257827      1.1769884
## 4        -0.7521692        -0.0359653      0.4344249
## 5         1.3147607         0.3051249      0.4344249
## 6        -0.6719865         4.0571175     -0.3081386
table(mydata1$ClusterWard)      
## 
##   1   2   3   4 
## 130  48  29  87

Those are our initial numbers of units in each of the four clusters, done with hierarchical clustering.

table(mydata1$ClusterK_Means)
## 
##   1   2   3   4 
## 149  54  19  72

Those are our final numbers on units in each of the four clusters, which was done by K-Means. We see some of the reclassifications.

table(mydata1$ClusterWard, mydata1$ClusterK_Means)
##    
##       1   2   3   4
##   1 130   0   0   0
##   2   5  43   0   0
##   3   7   3  19   0
##   4   7   8   0  72

The explanation looks the same for all other rows and columns, except that when looking at the third and fourth column, there was no reclassification happening.

Centroids <- K_MEANS$centers
Centroids
##   Flight.Distance_z Departure.Delay_z Seat.Comfort_z
## 1        -0.4972643        -0.2694633      0.5191469
## 2         1.6547944        -0.2010963      0.4069226
## 3         0.2463884         3.1825929      0.1608489
## 4        -0.2770541        -0.1313894     -1.4219838

Those values will be used for a better graphical visualization of the differences between the groups

library(ggplot2)
library(tidyr)

Figure <- as.data.frame(Centroids)
Figure$Id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z"))

Figure$Groups <- factor(Figure$Id, 
                        levels = c(1, 2, 3, 4), 
                        labels = c("1", "2", "3", "4"))

Figure$nameFactor <- factor(Figure$name, 
                            levels = c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z"), 
                            labels = c("Flight.Distance_z", "Departure.Delay_z", "Seat.Comfort_z"))

ggplot(Figure, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Groups, col = Groups), size = 4) +
  geom_line(aes(group = Id), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-1.5, 3.5)

fit <- aov(cbind(Flight.Distance_z, Departure.Delay_z, Seat.Comfort_z) ~ as.factor(ClusterK_Means), 
                 data = mydata1)

summary(fit)
##  Response 1 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   3 191.39  63.798  182.09 < 2.2e-16 ***
## Residuals                 290 101.61   0.350                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                            Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   3 206.695  68.898  231.51 < 2.2e-16 ***
## Residuals                 290  86.305   0.298                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##                            Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   3 195.177  65.059  192.87 < 2.2e-16 ***
## Residuals                 290  97.823   0.337                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA:

H0: All means are equal.

H1: At least one of the means differs.

As we can see from the p-value, in all three tests, we can reject the null hypothesis. It means that the means differ among our cluster groups (p < 0.001).

Validation

aggregate(mydata1$Age,                     #Validation with age
          by = list(mydata1$ClusterK_Means), 
          FUN = "mean")
##   Group.1        x
## 1       1 41.58389
## 2       2 41.50000
## 3       3 46.36842
## 4       4 35.19444

Here we see the mean of each group. With the following ANOVa test, we can see if those values have a statistical effect on the groups, which were classified with the help of K-Means.

fit <- aov(Age ~ as.factor(ClusterK_Means), 
           data = mydata1)
summary(fit)
##                            Df Sum Sq Mean Sq F value  Pr(>F)   
## as.factor(ClusterK_Means)   3   2900   966.6   4.452 0.00447 **
## Residuals                 290  62963   217.1                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The hypotheses:

H0: The means of age are are the same across all clusters.

H1: At least one mean differs.

We reject the null hypothesis (p = 0.004) and conclude that at least one mean differs from others. We found a statistical effect of age on the clusters.

The third group has on average the oldest passengers and the fourth group has the youngest, on average.

chisq_results <- chisq.test(mydata1$GenderF, as.factor(mydata1$GenderF))  #Validation with the Gender

chisq_results
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mydata1$GenderF and as.factor(mydata1$GenderF)
## X-squared = 290.01, df = 1, p-value < 2.2e-16

The hypotheses:

H0: There is no association between the gender of the passenger is flying in and the clusters made with K-Means.

H1: There is an association between the gender of the passenger is flying in and the clusters made with K-Means.

Based on the data set, we reject the null hypothesis at p < 0.001, and conclude that there is an association between the gender the passenger is flying in and the clusters made with K-Means.

round(chisq_results$expected, 2)
##                as.factor(mydata1$GenderF)
## mydata1$GenderF  Male Female
##          Male   75.51  73.49
##          Female 73.49  71.51

All expected frequencies are above 5, which is great.

chisq_results1 <- chisq.test(mydata1$Type.of.TravelF, as.factor(mydata1$Type.of.TravelF))   #Validation with a Type of Travel

chisq_results1
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mydata1$Type.of.TravelF and as.factor(mydata1$Type.of.TravelF)
## X-squared = 289.52, df = 1, p-value < 2.2e-16

The hypotheses:

H0: There is no association between the passenger’s type of travel and the clusters made with K-Means.

H1: There is an association between the passenger’s type of travel and the clusters made with K-Means.

Based on the data set, we reject the null hypothesis at p < 0.001, and conclude that there is an association between the passenger’s type of travel (purpose) and the clusters made by K-Means. There is a statistically significant difference in types between the groups.

addmargins(chisq_results1$observed)     #Observed frequencies 
##                        
## mydata1$Type.of.TravelF Business Personal Sum
##                Business      196        0 196
##                Personal        0       98  98
##                Sum           196       98 294
round(chisq_results1$expected, 2)     #Expected frequencies
##                        
## mydata1$Type.of.TravelF Business Personal
##                Business   130.67    65.33
##                Personal    65.33    32.67

Conclusion for GROUP 1:

The segment size is the biggest - 149 units, which is 51% (149/294). Passengers in the first group are on average 42 years old.