Load data
# Read the airline data set from the CSV file (path is relative to the
# working directory).
airlines <- read.csv(file = "AirlinesCluster.csv")
# Show the mean of every column on the original, unnormalized scale.
print(colMeans(airlines))
## Balance QualMiles BonusMiles BonusTrans
## 73601.327582 144.114529 17144.846212 11.601900
## FlightMiles FlightTrans DaysSinceEnroll
## 460.055764 1.373593 4118.559390
Normalize the data, then check the column means, maxima, and minima of the normalized values:
library(caret)
## Warning: package 'caret' was built under R version 3.1.3
## Loading required package: lattice
## Loading required package: ggplot2
# Estimate the centering and scaling parameters from the raw data. The method
# is spelled out here; it is the same as preProcess()'s default.
preproc <- preProcess(airlines, method = c("center", "scale"))
# Apply the stored transformation to obtain the normalized data frame.
airlinesNorm <- predict(preproc, newdata = airlines)
# After centering, every column mean should be zero up to rounding error.
print(colMeans(airlinesNorm))
## Balance QualMiles BonusMiles BonusTrans
## 2.748375e-17 2.663813e-17 -3.802077e-17 -7.652564e-17
## FlightMiles FlightTrans DaysSinceEnroll
## 1.440723e-17 1.217733e-17 6.999624e-17
# Largest normalized value in each column. vapply() walks the data frame
# column by column, so nothing is coerced to a matrix first, and it stops with
# an error if a column's maximum is not a single number.
print(vapply(airlinesNorm, max, FUN.VALUE = numeric(1)))
## Balance QualMiles BonusMiles BonusTrans
## 16.186811 14.223084 10.208293 7.746727
## FlightMiles FlightTrans DaysSinceEnroll
## 21.680292 13.610351 2.022842
# Smallest normalized value in each column. vapply() walks the data frame
# column by column, so nothing is coerced to a matrix first, and it stops with
# an error if a column's minimum is not a single number.
print(vapply(airlinesNorm, min, FUN.VALUE = numeric(1)))
## Balance QualMiles BonusMiles BonusTrans
## -0.7303482 -0.1862754 -0.7099031 -1.2080518
## FlightMiles FlightTrans DaysSinceEnroll
## -0.3285622 -0.3621226 -1.9933614
Generate Dendrogram
# Pairwise Euclidean distances between the normalized observations
# ("euclidean" is dist()'s default, written out for clarity).
distances <- dist(airlinesNorm, method = "euclidean")
# Agglomerative hierarchical clustering on those distances.
# NOTE(review): per the hclust() documentation, "ward.D2" is the variant that
# implements Ward's criterion on unsquared distances; "ward.D" is kept here so
# the recorded results below remain valid.
airlineClust <- hclust(d = distances, method = "ward.D")
# Draw the dendrogram.
plot(airlineClust)
Divide into 5 clusters
# Cut the dendrogram so that each observation gets one of 5 cluster labels.
tree <- cutree(tree = airlineClust, k = 5)
# Partition the normalized data into one data frame per cluster label.
airlineClusters <- split(x = airlinesNorm, f = tree)
# Number of observations that fall into the first cluster.
print(nrow(airlineClusters[[1]]))
## [1] 776
Compute average of each variable in each cluster:
# Partition the original (unnormalized) data by the same cluster labels so the
# averages below are on the original scale of each variable.
airlineUnnormClusters <- split(airlines, tree)
# One column of variable means per cluster. vapply() pins the result to a
# numeric vector with one entry per variable, so the output is always a
# variables-by-clusters matrix instead of depending on sapply()'s
# simplification rules.
vapply(airlineUnnormClusters, colMeans, FUN.VALUE = numeric(ncol(airlines)))
## 1 2 3 4
## Balance 5.786690e+04 1.106693e+05 1.981916e+05 52335.913594
## QualMiles 6.443299e-01 1.065983e+03 3.034615e+01 4.847926
## BonusMiles 1.036012e+04 2.288176e+04 5.579586e+04 20788.766129
## BonusTrans 1.082345e+01 1.822929e+01 1.966397e+01 17.087558
## FlightMiles 8.318428e+01 2.613418e+03 3.276761e+02 111.573733
## FlightTrans 3.028351e-01 7.402697e+00 1.068826e+00 0.344470
## DaysSinceEnroll 6.235365e+03 4.402414e+03 5.615709e+03 2840.822581
## 5
## Balance 3.625591e+04
## QualMiles 2.511177e+00
## BonusMiles 2.264788e+03
## BonusTrans 2.973174e+00
## FlightMiles 1.193219e+02
## FlightTrans 4.388972e-01
## DaysSinceEnroll 3.060081e+03
K-means clustering
# Fix the RNG seed so the randomly chosen starting centers are reproducible.
set.seed(88)
# K-means on the normalized data with 5 clusters and up to 1000 iterations.
kmClust <- kmeans(x = airlinesNorm, centers = 5, iter.max = 1000)
# Partition the normalized data by k-means cluster assignment.
kmClusters <- split(x = airlinesNorm, f = kmClust$cluster)
# Count how many clusters contain more than 1000 observations.
bigClusters <- kmClust$size > 1000
sum(bigClusters)
## [1] 2
Compare cluster centroids to each other:
# Cluster centroids, one row per cluster, in normalized units.
print(kmClust[["centers"]])
## Balance QualMiles BonusMiles BonusTrans FlightMiles FlightTrans
## 1 1.44439706 0.51115730 1.8769284 1.0331951 0.1169945 0.1444636
## 2 1.00054098 0.68382234 0.6144780 1.7214887 3.8559798 4.1196141
## 3 -0.05580605 -0.14104391 0.3041358 0.7108744 -0.1218278 -0.1287569
## 4 -0.13331742 -0.11491607 -0.3492669 -0.3373455 -0.1833989 -0.1961819
## 5 -0.40579897 -0.02281076 -0.5816482 -0.7619054 -0.1989602 -0.2196582
## DaysSinceEnroll
## 1 0.7198040
## 2 0.2742394
## 3 -0.3398209
## 4 0.9640923
## 5 -0.8897747