Source File code Connect connect
Load the libraries that are required in the assignment:
library("tm")
library("SnowballC")
library("caTools")
library("rpart")
library("rpart.plot")
library("ROCR")
library("randomForest")
library("caret")
Market segmentation is a strategy that divides a broad target market of customers into smaller, more similar groups, and then designs a marketing strategy specifically for each group. Clustering is a common technique for market segmentation since it automatically finds similar groups given a data set.
There are seven different variables in the dataset, described below:
# Load the data set from AirlinesCluster.csv (path is relative to the
# current working directory).
airlines <- read.csv(file = "AirlinesCluster.csv")
# Per-column summary statistics of the raw, unnormalized variables.
summary(object = airlines)
## Balance QualMiles BonusMiles BonusTrans
## Min. : 0 Min. : 0.0 Min. : 0 Min. : 0.0
## 1st Qu.: 18528 1st Qu.: 0.0 1st Qu.: 1250 1st Qu.: 3.0
## Median : 43097 Median : 0.0 Median : 7171 Median :12.0
## Mean : 73601 Mean : 144.1 Mean : 17145 Mean :11.6
## 3rd Qu.: 92404 3rd Qu.: 0.0 3rd Qu.: 23800 3rd Qu.:17.0
## Max. :1704838 Max. :11148.0 Max. :263685 Max. :86.0
## FlightMiles FlightTrans DaysSinceEnroll
## Min. : 0.0 Min. : 0.000 Min. : 2
## 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:2330
## Median : 0.0 Median : 0.000 Median :4096
## Mean : 460.1 Mean : 1.374 Mean :4119
## 3rd Qu.: 311.0 3rd Qu.: 1.000 3rd Qu.:5790
## Max. :30817.0 Max. :53.000 Max. :8296
Create a normalized data frame called “airlinesNorm”:
# Fit the pre-processing transform on the raw data. No `method` is given,
# so caret's preProcess() uses its default of centering and scaling.
preproc <- preProcess(x = airlines)
# Apply the fitted transform to obtain the normalized data frame.
airlinesNorm <- predict(object = preproc, newdata = airlines)
The first command pre-processes the data (it computes the centering and scaling parameters for each variable), and the second command performs the normalization.
# Check the normalized variables: each column should now have mean 0.
summary(object = airlinesNorm)
## Balance QualMiles BonusMiles BonusTrans
## Min. :-0.7303 Min. :-0.1863 Min. :-0.7099 Min. :-1.20805
## 1st Qu.:-0.5465 1st Qu.:-0.1863 1st Qu.:-0.6581 1st Qu.:-0.89568
## Median :-0.3027 Median :-0.1863 Median :-0.4130 Median : 0.04145
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.1866 3rd Qu.:-0.1863 3rd Qu.: 0.2756 3rd Qu.: 0.56208
## Max. :16.1868 Max. :14.2231 Max. :10.2083 Max. : 7.74673
## FlightMiles FlightTrans DaysSinceEnroll
## Min. :-0.3286 Min. :-0.36212 Min. :-1.99336
## 1st Qu.:-0.3286 1st Qu.:-0.36212 1st Qu.:-0.86607
## Median :-0.3286 Median :-0.36212 Median :-0.01092
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.1065 3rd Qu.:-0.09849 3rd Qu.: 0.80960
## Max. :21.6803 Max. :13.61035 Max. : 2.02284
# Pairwise Euclidean distances between the normalized observations.
distances <- dist(x = airlinesNorm, method = "euclidean")
# Agglomerative hierarchical clustering on those distances.
# NOTE(review): per ?hclust, "ward.D" on unsquared distances does not
# implement Ward's (1963) criterion, whereas "ward.D2" does. Kept as
# "ward.D" because the cluster summaries recorded in this document were
# produced with it; changing the method would change the cluster labels.
hierClust <- hclust(d = distances, method = "ward.D")
# Draw the dendrogram.
plot(x = hierClust)
Divide the data points into 5 clusters by using the cutree function.
# Cut the dendrogram into 5 groups; yields one cluster label per row.
clusterGroups <- cutree(tree = hierClust, k = 5)
Use tapply to compare the average value of each variable across the 5 clusters:
# Mean of Balance within each hierarchical cluster.
tapply(X = airlines[["Balance"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5
## 57866.90 110669.27 198191.57 52335.91 36255.91
# Mean of QualMiles within each hierarchical cluster.
tapply(X = airlines[["QualMiles"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5
## 0.6443299 1065.9826590 30.3461538 4.8479263 2.5111773
# Mean of BonusMiles within each hierarchical cluster.
tapply(X = airlines[["BonusMiles"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5
## 10360.124 22881.763 55795.860 20788.766 2264.788
# Mean of BonusTrans within each hierarchical cluster.
tapply(X = airlines[["BonusTrans"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5
## 10.823454 18.229287 19.663968 17.087558 2.973174
# Mean of FlightMiles within each hierarchical cluster.
tapply(X = airlines[["FlightMiles"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5
## 83.18428 2613.41811 327.67611 111.57373 119.32191
# Mean of FlightTrans within each hierarchical cluster.
tapply(X = airlines[["FlightTrans"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5
## 0.3028351 7.4026975 1.0688259 0.3444700 0.4388972
# Mean of DaysSinceEnroll within each hierarchical cluster.
tapply(X = airlines[["DaysSinceEnroll"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5
## 6235.365 4402.414 5615.709 2840.823 3060.081
Alternatively, we can use colMeans on the subset of rows belonging to each cluster:
# Column means of the raw variables for the rows labelled cluster 1.
colMeans(x = airlines[clusterGroups == 1, , drop = FALSE])
## Balance QualMiles BonusMiles BonusTrans
## 5.786690e+04 6.443299e-01 1.036012e+04 1.082345e+01
## FlightMiles FlightTrans DaysSinceEnroll
## 8.318428e+01 3.028351e-01 6.235365e+03
# Column means of the raw variables for the rows labelled cluster 2.
colMeans(x = airlines[clusterGroups == 2, , drop = FALSE])
## Balance QualMiles BonusMiles BonusTrans
## 1.106693e+05 1.065983e+03 2.288176e+04 1.822929e+01
## FlightMiles FlightTrans DaysSinceEnroll
## 2.613418e+03 7.402697e+00 4.402414e+03
# Column means of the raw variables for the rows labelled cluster 3.
colMeans(x = airlines[clusterGroups == 3, , drop = FALSE])
## Balance QualMiles BonusMiles BonusTrans
## 1.981916e+05 3.034615e+01 5.579586e+04 1.966397e+01
## FlightMiles FlightTrans DaysSinceEnroll
## 3.276761e+02 1.068826e+00 5.615709e+03
# Column means of the raw variables for the rows labelled cluster 4.
colMeans(x = airlines[clusterGroups == 4, , drop = FALSE])
## Balance QualMiles BonusMiles BonusTrans
## 52335.913594 4.847926 20788.766129 17.087558
## FlightMiles FlightTrans DaysSinceEnroll
## 111.573733 0.344470 2840.822581
# Column means of the raw variables for the rows labelled cluster 5.
colMeans(x = airlines[clusterGroups == 5, , drop = FALSE])
## Balance QualMiles BonusMiles BonusTrans
## 3.625591e+04 2.511177e+00 2.264788e+03 2.973174e+00
## FlightMiles FlightTrans DaysSinceEnroll
## 1.193219e+02 4.388972e-01 3.060081e+03
# Fix the RNG seed so the randomly chosen initial centers are reproducible.
set.seed(seed = 88)
# k-means on the normalized data with 5 clusters and up to 1000 iterations.
kmeansClust <- kmeans(x = airlinesNorm, centers = 5, iter.max = 1000)
# Number of observations assigned to each k-means cluster.
table(kmeansClust[["cluster"]])
##
## 1 2 3 4 5
## 408 141 993 1182 1275