Segmentation for Airline

Reproducible notes on customer segmentation for airlines

Anil Kumar

Source File code Connect connect

PRELIMINARIES

Load the libraries that are required for the assignment:

library("tm")
library("SnowballC")

library("caTools")
library("rpart")
library("rpart.plot")
library("ROCR")
library("randomForest")
library("caret")

INTRODUCTION

Market segmentation is a strategy that divides a broad target market of customers into smaller, more similar groups, and then designs a marketing strategy specifically for each group. Clustering is a common technique for market segmentation since it automatically finds similar groups given a data set.

There are seven different variables in the dataset, summarized below:

Load data

# Read the frequent-flyer data set and print a per-column summary.
airlines <- read.csv(file = "AirlinesCluster.csv")
summary(object = airlines)
##     Balance          QualMiles         BonusMiles       BonusTrans  
##  Min.   :      0   Min.   :    0.0   Min.   :     0   Min.   : 0.0  
##  1st Qu.:  18528   1st Qu.:    0.0   1st Qu.:  1250   1st Qu.: 3.0  
##  Median :  43097   Median :    0.0   Median :  7171   Median :12.0  
##  Mean   :  73601   Mean   :  144.1   Mean   : 17145   Mean   :11.6  
##  3rd Qu.:  92404   3rd Qu.:    0.0   3rd Qu.: 23800   3rd Qu.:17.0  
##  Max.   :1704838   Max.   :11148.0   Max.   :263685   Max.   :86.0  
##   FlightMiles       FlightTrans     DaysSinceEnroll
##  Min.   :    0.0   Min.   : 0.000   Min.   :   2   
##  1st Qu.:    0.0   1st Qu.: 0.000   1st Qu.:2330   
##  Median :    0.0   Median : 0.000   Median :4096   
##  Mean   :  460.1   Mean   : 1.374   Mean   :4119   
##  3rd Qu.:  311.0   3rd Qu.: 1.000   3rd Qu.:5790   
##  Max.   :30817.0   Max.   :53.000   Max.   :8296

Create a normalized data frame called “airlinesNorm”:

# Step 1: learn the normalization parameters from the raw data.
preproc <- preProcess(x = airlines)
# Step 2: apply those parameters to obtain the normalized data frame.
airlinesNorm <- predict(object = preproc, newdata = airlines)

The first command computes the pre-processing parameters (by default, caret centers and scales each variable), and the second command applies the normalization to the data.

# Sanity check: after normalization every column should report a mean of 0.
summary(object = airlinesNorm)
##     Balance          QualMiles         BonusMiles        BonusTrans      
##  Min.   :-0.7303   Min.   :-0.1863   Min.   :-0.7099   Min.   :-1.20805  
##  1st Qu.:-0.5465   1st Qu.:-0.1863   1st Qu.:-0.6581   1st Qu.:-0.89568  
##  Median :-0.3027   Median :-0.1863   Median :-0.4130   Median : 0.04145  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.1866   3rd Qu.:-0.1863   3rd Qu.: 0.2756   3rd Qu.: 0.56208  
##  Max.   :16.1868   Max.   :14.2231   Max.   :10.2083   Max.   : 7.74673  
##   FlightMiles       FlightTrans       DaysSinceEnroll   
##  Min.   :-0.3286   Min.   :-0.36212   Min.   :-1.99336  
##  1st Qu.:-0.3286   1st Qu.:-0.36212   1st Qu.:-0.86607  
##  Median :-0.3286   Median :-0.36212   Median :-0.01092  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.:-0.1065   3rd Qu.:-0.09849   3rd Qu.: 0.80960  
##  Max.   :21.6803   Max.   :13.61035   Max.   : 2.02284

HIERARCHICAL CLUSTERING

# Pairwise Euclidean distances between customers, on the normalized variables.
distances <- dist(x = airlinesNorm, method = "euclidean")
# Agglomerative clustering with the "ward.D" linkage.
# NOTE(review): per the hclust documentation, "ward.D" applied to unsquared
# distances does not implement Ward's criterion ("ward.D2" does). It is kept
# as-is because the reported cluster results depend on it -- confirm intent.
hierClust <- hclust(d = distances, method = "ward.D")
# Draw the dendrogram.
plot(hierClust)

[plot of chunk clust]

Divide the data points into 5 clusters by using the cutree function.

# Cut the dendrogram so that each observation is assigned to one of 5 clusters.
clusterGroups <- cutree(tree = hierClust, k = 5)

Use tapply to compare the average value of each variable across the 5 clusters:

# Per-cluster means, computed on the original (un-normalized) values so the
# numbers stay in their natural units.
tapply(X = airlines[["Balance"]], INDEX = clusterGroups, FUN = mean)
##         1         2         3         4         5 
##  57866.90 110669.27 198191.57  52335.91  36255.91
tapply(X = airlines[["QualMiles"]], INDEX = clusterGroups, FUN = mean)
##            1            2            3            4            5 
##    0.6443299 1065.9826590   30.3461538    4.8479263    2.5111773
tapply(X = airlines[["BonusMiles"]], INDEX = clusterGroups, FUN = mean)
##         1         2         3         4         5 
## 10360.124 22881.763 55795.860 20788.766  2264.788
tapply(X = airlines[["BonusTrans"]], INDEX = clusterGroups, FUN = mean)
##         1         2         3         4         5 
## 10.823454 18.229287 19.663968 17.087558  2.973174
tapply(X = airlines[["FlightMiles"]], INDEX = clusterGroups, FUN = mean)
##          1          2          3          4          5 
##   83.18428 2613.41811  327.67611  111.57373  119.32191
tapply(X = airlines[["FlightTrans"]], INDEX = clusterGroups, FUN = mean)
##         1         2         3         4         5 
## 0.3028351 7.4026975 1.0688259 0.3444700 0.4388972
tapply(X = airlines[["DaysSinceEnroll"]], INDEX = clusterGroups, FUN = mean)
##        1        2        3        4        5 
## 6235.365 4402.414 5615.709 2840.823 3060.081

Alternatively, we can subset the data by cluster and apply colMeans:

# Column means of the original data, one cluster at a time. Rows are selected
# with a logical index on the cluster assignment.
colMeans(airlines[clusterGroups == 1, ])
##         Balance       QualMiles      BonusMiles      BonusTrans 
##    5.786690e+04    6.443299e-01    1.036012e+04    1.082345e+01 
##     FlightMiles     FlightTrans DaysSinceEnroll 
##    8.318428e+01    3.028351e-01    6.235365e+03
colMeans(airlines[clusterGroups == 2, ])
##         Balance       QualMiles      BonusMiles      BonusTrans 
##    1.106693e+05    1.065983e+03    2.288176e+04    1.822929e+01 
##     FlightMiles     FlightTrans DaysSinceEnroll 
##    2.613418e+03    7.402697e+00    4.402414e+03
colMeans(airlines[clusterGroups == 3, ])
##         Balance       QualMiles      BonusMiles      BonusTrans 
##    1.981916e+05    3.034615e+01    5.579586e+04    1.966397e+01 
##     FlightMiles     FlightTrans DaysSinceEnroll 
##    3.276761e+02    1.068826e+00    5.615709e+03
colMeans(airlines[clusterGroups == 4, ])
##         Balance       QualMiles      BonusMiles      BonusTrans 
##    52335.913594        4.847926    20788.766129       17.087558 
##     FlightMiles     FlightTrans DaysSinceEnroll 
##      111.573733        0.344470     2840.822581
colMeans(airlines[clusterGroups == 5, ])
##         Balance       QualMiles      BonusMiles      BonusTrans 
##    3.625591e+04    2.511177e+00    2.264788e+03    2.973174e+00 
##     FlightMiles     FlightTrans DaysSinceEnroll 
##    1.193219e+02    4.388972e-01    3.060081e+03

K-MEANS CLUSTERING

# Fix the RNG seed so the random initial centers (and thus the clusters) are
# reproducible.
set.seed(88)
# Partition the normalized data into 5 clusters, allowing up to 1000 iterations.
kmeansClust <- kmeans(x = airlinesNorm, centers = 5, iter.max = 1000)
# Number of observations assigned to each cluster.
table(kmeansClust[["cluster"]])
## 
##    1    2    3    4    5 
##  408  141  993 1182 1275