Segmentation Process

Data Processing
Initial Diagnostics
Cluster Solutions
- Convergent K-Means
- Cluster Reproducibility
Typing Tool

Data Processing

#read in all data

all <- read_excel(path = "./data.xlsx", sheet = 1)

#select full variable set and segmentation variables

full.vars <- c("uuid",
          "record",
          "qdma",
          "qS1",
          "qS2r1",
          "qS2r2",
          "qS2r3",
          "qS2r4",
          "qS4r1",
          "qS4r2",
          "qS4r3",
          "qS4r4",
          "qS4r5",
          "qS4r6",
          "qS4r7",
          "qS4r8",
          "qS4r9",
          "qS4r10",
          "qS4r11",
          "qS4r12",
          "qS4r13",
          "qS4r14",
          "qS6",
          "q3r1",
          "q3r2",
          "q3r3",
          "q3r4",
          "q3r5",
          "q3r6",
          "q3r7",
          "q3r8",
          "q3r9",
          "q3r10",
          "q3r11",
          "q9_1p1r1",
          "q9_1p1r2",
          "q9_1p1r3",
          "q9_1p1r4",
          "q9_1p1r5",
          "q9_1p1r6",
          "q9_1p1r7",
          "q9_1p1r8",
          "q9_1p1r9",
          "q9_1p1r10",
          "q9_1p1r11",
          "q9_1p1r12",
          "q9_1p1r13",
          "q9_1p1r14",
          "q11_1p3r1",
          "q11_1p3r2",
          "q11_1p3r3",
          "q11_1p3r4",
          "q11_1p3r5",
          "q11_1p3r6",
          "q11_1p3r7",
          "q11_1p3r8",
          "q11_1p3r9",
          "q11_1p3r10",
          "q11_1p3r11",
          "q11_1p3r12",
          "q11_1p3r13",
          "q11_1p3r14",
          "q11_1p3r15",
          "q11_1p3r16",
          "q11_1p3r17",
          "q11_1p3r18",
          "q11_1p3r19",
          "q11_1p3r20",
          "q12", 
          "q14r4",
          "q15r4",
          "q16_1p4r1",
          "q16_1p4r2",
          "q16_1p4r3",
          "q16_1p4r4",
          "q16_1p4r5",
          "q16_1p4r6",
          "q16_1p4r7",
          "q16_1p4r8",
          "q16_1p4r9",
          "q16_1p4r10",
          "q16_1p4r11",
          "q16_1p4r12",
          "q16_1p4r13",
          "q16_1p4r14",
          "q16_1p4r15",
          "q16_1p4r16",
          "q16_1p4r17",
          "q16_1p4r18",
          "q16_1p4r19",
          "q16_1p4r20",
          "q17r1",
          "q17r2",
          "q17r3",
          "q17r4",
          "q17r5",
          "q17r6", 
          "qD1",
          "qD2",
          "qD3",
          "qD4",
          "qD7r1",
          "qD7r2",
          "qD7r3",
          "qD7r4",
          "qD7r5",
          "qD7r6",
          "qD8",
          "qD9",
          "qD10")

seg.vars <- c("q9_1p1r1",
          "q9_1p1r2",
          "q9_1p1r3",
          "q9_1p1r4",
          "q9_1p1r5",
          "q9_1p1r6",
          "q9_1p1r7",
          "q9_1p1r8",
          "q9_1p1r9",
          "q9_1p1r10",
          "q9_1p1r11",
          "q9_1p1r12",
          "q9_1p1r13",
          "q9_1p1r14",
          "q11_1p3r1",
          "q11_1p3r2",
          "q11_1p3r3",
          "q11_1p3r4",
          "q11_1p3r5",
          "q11_1p3r6",
          "q11_1p3r7",
          "q11_1p3r8",
          "q11_1p3r9",
          "q11_1p3r10",
          "q11_1p3r11",
          "q11_1p3r12",
          "q11_1p3r13",
          "q11_1p3r14",
          "q11_1p3r15",
          "q11_1p3r16",
          "q11_1p3r17",
          "q11_1p3r18",
          "q11_1p3r19",
          "q11_1p3r20")

#Subset data set

dat <- all[,full.vars]
seg <- all[,full.vars]

Initial Diagnostics

Scale-use Bias

Likert scale ratings notoriously suffer from scale-use bias. That is, respondents use the scale differently from each other. Some tend to give low ratings, others medium ratings, and still others high ratings.

A simple check for this scale-use bias is to run an initial 3-segment solution on the segmentation variables.

#Run initial clustering
k3 <- kmeans(seg[,seg.vars], centers = 3, nstart = 25)
seg[,"segment"] <- k3$cluster
seg <- as.data.frame(seg[,c(seg.vars, "segment")])
seg[,"segment"] <- as.factor(seg[,"segment"])

#Calculate F-ratio for each clustering variable
F <- vector()
for(i in 1:length(seg.vars)){
        res.aov <- aov(seg[,i] ~ segment, data = seg)
        
        F[i] <- summary(res.aov)[[1]]$F[1]
}

#Group Means and F-Ratio
agg <- aggregate(seg[,1:length(seg.vars)], by = list(seg[,"segment"]), FUN = mean)
agg <-t(agg)
agg <- data.frame(agg)[2:length(seg),]
agg <- data.frame(cbind(agg, F))
agg[,1] <- as.numeric(paste(agg[,1]))
agg[,2] <- as.numeric(paste(agg[,2]))
agg[,3] <- as.numeric(paste(agg[,3]))
write.xlsx(cbind(rownames(agg), agg), file = "InitialCluster.xlsx")

## Note: zip::zip() is deprecated, please use zip::zipr() instead

#Group Means as Deviations From Grand Mean Table and F-Ratio
##Subtract variable's grand mean from each row
agg2 <- agg
for(i in 1:length(seg.vars)){
        agg2[i,1:3] <- agg[i,1:3] - mean(seg[,i])
}
write.xlsx(cbind(rownames(agg2), agg2), file = "InitialClusterCentered.xlsx")

The table below shows this initial 3-segment solution and all 34 clustering variables. Means were calculated for each variable (by segment) and the grand mean subtracted from each. Clearly, X1 gives medium ratings, X2 gives low ratings, and X3 high ratings.

print(agg2)

##                   X1       X2        X3         F
## q9_1p1r1    0.074918 0.720510 -0.853222  82.70414
## q9_1p1r2    0.119807 0.091275 -0.266667  12.31861
## q9_1p1r3    0.029072 0.522993 -0.582944  34.46203
## q9_1p1r4   -0.074715 0.564685 -0.476889  34.03387
## q9_1p1r5   -0.147768 0.805933 -0.621500  59.70285
## q9_1p1r6    0.019498 0.777530 -0.832556  72.02029
## q9_1p1r7    0.034396 1.099597 -1.187222 113.70756
## q9_1p1r8   -0.061072 0.874537 -0.817111  61.58387
## q9_1p1r9   -0.222995 0.965638 -0.678611  53.28241
## q9_1p1r10  -0.217043 1.380564 -1.116500 111.51846
## q9_1p1r11   0.033517 0.854805 -0.932667  88.34221
## q9_1p1r12  -0.075005 0.944416 -0.869389  81.93023
## q9_1p1r13   0.102512 0.748591 -0.921944  85.02383
## q9_1p1r14   0.193865 0.777919 -1.083611  98.46678
## q11_1p3r1   0.092966 0.296027 -0.439944  32.57725
## q11_1p3r2  -0.146367 1.014188 -0.839000  71.12988
## q11_1p3r3   0.100232 0.154604 -0.304056  15.70019
## q11_1p3r4   0.004551 0.629490 -0.657889  54.24556
## q11_1p3r5   0.028870 0.989477 -1.065333 102.05182
## q11_1p3r6   0.074386 0.205275 -0.319333  20.43901
## q11_1p3r7   0.125845 0.754027 -0.961111 118.68725
## q11_1p3r8   0.107787 1.197208 -1.393722 196.70876
## q11_1p3r9   0.063884 0.368913 -0.473556  37.23672
## q11_1p3r10 -0.070831 1.229302 -1.170167 120.18032
## q11_1p3r11  0.041739 1.009933 -1.105000 108.63904
## q11_1p3r12  0.156599 0.271893 -0.506444  41.92379
## q11_1p3r13  0.005266 1.178725 -1.227222 121.82861
## q11_1p3r14  0.058686 0.341181 -0.437389  32.51510
## q11_1p3r15  0.004667 0.653436 -0.682833  29.94654
## q11_1p3r16  0.046773 1.127342 -1.233722 130.75987
## q11_1p3r17 -0.140135 1.497611 -1.348167 156.07739
## q11_1p3r18  0.149372 0.942550 -1.190000 152.28682
## q11_1p3r19  0.160039 0.373799 -0.616833  52.84421
## q11_1p3r20 -0.036676 0.561450 -0.528222  57.93319

Mean Centering (at the respondent level)

To remedy the scale-use bias, all likert ratings are mean centered at the respondent level. That is, the row mean (i.e. a respondent’s average rating across all of the clustering variables) is subtracted from each value.

#Remove initial clustering solution
seg <- seg[,(-length(seg))]

#Subtract row means
rMean <- rowMeans(seg)
for(i in 1:(count(seg)[[1]])){
        seg[i,1:(length(seg))] <- seg[i,1:(length(seg))] - rMean[i]
}

Hopkins Statistic (“Clumpiness” Diagnostic)

The Hopkins statistic measures the extent that the points are randomly spaced (in 34-dimensional space) or the degree that the points clump together. A score of .50 indicates that the data is random. Anything above .45 usually fails to produce reliable clusters, ideally the data yield a score below .35.

#Calcluate the Hopkins Statistic
hopkins(seg, n=nrow(seg)-1)

## $H
## [1] 0.3126098

Predict Number of Clusters (Cluster Fit Statistics)

Utilize the elbow, silhoutte, and gap statistic methods for predicting the best number of clusters. The following analysis is suggestive of 2-cluster, 3-cluster, and 5-cluster solutions.

##Elbow method
suppressMessages(fviz_nbclust(seg[,seg.vars], kmeans, method = "wss"))

##Silhoutte method
suppressMessages(fviz_nbclust(seg[,seg.vars], kmeans, method = "silhouette"))

##Gap Statistic
gap_stat <- suppressWarnings(clusGap(seg[,seg.vars], FUN = kmeans, nstart = 25, K.max = 10, B = 50))
suppressMessages(fviz_gap_stat(gap_stat))

Cluster Solutions

Convergent K-Means

A convergent k-means technique was used for clustering. This modifies the basic k-means algorithm, which only uses one set of cluster starting points. Clusters are senstive to these starting points, so instead 30 sets of starting points are used and the one most in common with all others (convergent validity) is chosen as the final solution.

We start with 3-segment, 4-segment, and 5-segment solutions. The 4-segment solution ended up as the winner.

k3 <- kmeans(seg[,seg.vars], centers = 3, nstart = 30)
k4 <- kmeans(seg[,seg.vars], centers = 4, nstart = 30)
k5 <- kmeans(seg[,seg.vars], centers = 5, nstart = 30)

Each solution assigns a segment to each respondent. These segment assignments are matched to the corresponding respondents’ UUID and sent to data processing for tabulation. The tabber is instructed to put the segments in the banners so that internal (i.e. clustering) variables and external variables can be compared between the different segments. This over and under-indexing of variables determines the validity and feasibility of the overall segment solution.

#Combine UUID and segment assignment
k3_solution <- cbind(uuid = dat[,"uuid"], segment = k3$cluster)
write.csv(x = k3_solution, file = "C:/Users/t.kassab/Desktop/Great Wolf Lodge/k3_solution", quote = FALSE)

k4_solution <- cbind(uuid = dat[,"uuid"], segment = k4$cluster)
write.csv(x = k4_solution, file = "C:/Users/t.kassab/Desktop/Great Wolf Lodge/k4_solution", quote = FALSE)

k5_solution <- cbind(uuid = dat[,"uuid"], segment = k5$cluster)
write.csv(x = k5_solution, file = "C:/Users/t.kassab/Desktop/Great Wolf Lodge/k5_solution", quote = FALSE)

#Calculate F-Ratio for Column means (grouped by segment)
##3-Segment
seg[,"segment"] <- k3$cluster
seg <- as.data.frame(seg[,c(seg.vars, "segment")])
seg[,"segment"] <- as.factor(seg[,"segment"])

F <- vector()
for(i in 1:length(seg.vars)){
        res.aov <- aov(seg[,i] ~ segment, data = seg)
        
        F[i] <- summary(res.aov)[[1]]$F[1]
}

Mean_3 <- as.data.frame(seg %>% 
              group_by(segment) %>% 
              summarise_all("mean"))

k3_stats <- cbind(as.data.frame(t(Mean_3))[2:length(Mean_3),], F)
k3_stats[,1] <- as.numeric(paste(k3_stats[,1]))
k3_stats[,2] <- as.numeric(paste(k3_stats[,2]))
k3_stats[,3] <- as.numeric(paste(k3_stats[,3]))
write.xlsx(cbind(rownames(k3_stats), k3_stats), file = "k3_stats.xlsx")

##4-Segment
seg[,"segment"] <- k4$cluster
seg <- as.data.frame(seg[,c(seg.vars, "segment")])
seg[,"segment"] <- as.factor(seg[,"segment"])

F <- vector()
for(i in 1:length(seg.vars)){
        res.aov <- aov(seg[,i] ~ segment, data = seg)
        
        F[i] <- summary(res.aov)[[1]]$F[1]
}

Mean_4 <- as.data.frame(seg %>% 
              group_by(segment) %>% 
              summarise_all("mean"))

k4_stats <- cbind(as.data.frame(t(Mean_4))[2:length(Mean_4),], F)
k4_stats[,1] <- as.numeric(paste(k4_stats[,1]))
k4_stats[,2] <- as.numeric(paste(k4_stats[,2]))
k4_stats[,3] <- as.numeric(paste(k4_stats[,3]))
write.xlsx(cbind(rownames(k4_stats), k4_stats), file = "k4_stats.xlsx")

##5-Segment
seg[,"segment"] <- k5$cluster
seg <- as.data.frame(seg[,c(seg.vars, "segment")])
seg[,"segment"] <- as.factor(seg[,"segment"])

F <- vector()
for(i in 1:length(seg.vars)){
        res.aov <- aov(seg[,i] ~ segment, data = seg)
        
        F[i] <- summary(res.aov)[[1]]$F[1]
}

Mean_5 <- as.data.frame(seg %>% 
              group_by(segment) %>% 
              summarise_all("mean"))

k5_stats <- cbind(as.data.frame(t(Mean_5))[2:length(Mean_5),], F)
k5_stats[,1] <- as.numeric(paste(k5_stats[,1]))
k5_stats[,2] <- as.numeric(paste(k5_stats[,2]))
k5_stats[,3] <- as.numeric(paste(k5_stats[,3]))
write.xlsx(cbind(rownames(k5_stats), k5_stats), file = "k5_stats.xlsx")

Cluster Reproducibility

Cluster reproducibility measures the relative quality of the cluster solution. Reproducibility is thought of in terms of the overlap in segments between the two clustering solutions. If the segments perfectly overlap (i.e. all of the respondents that are grouped together in solution B are also grouped together in solution B), then the clusters are said to be 100% “reproducible”. This report averages an Adjusted Rand Index (ARI) for 250 cluster solution pairs. Sawtooth suggests a similar method using a confusion matrix, where the % accuracy serves as the reproducibility score.

This convergent k-means method yields highly reproducible segments with the 4-segment solution at 89.9% (compared to 25% for a “random chance” classification).

rand <- vector()
for(i in 1:250) {
set.seed(i)
k_x <- kmeans(seg[,seg.vars], centers = 3, nstart = 30)

fit1 <- as.numeric(k_x$cluster)

set.seed(1000 + i)
k_x <- kmeans(seg[,seg.vars], centers = 3, nstart = 30)

fit2 <- as.numeric(k_x$cluster)


clust_stats <- cluster.stats(d = dist(seg[,1:34]), 
                             fit1, fit2)

rand[i] <- clust_stats$corrected.rand
}
k3_ARI <- mean(rand)
k3_ARI

## [1] 0.9990292

rand <- vector()
for(i in 1:250) {
set.seed(i)
k_x <- kmeans(seg[,seg.vars], centers = 4, nstart = 30)

fit1 <- as.numeric(k_x$cluster)

set.seed(1000 + i)
k_x <- kmeans(seg[,seg.vars], centers = 4, nstart = 30)

fit2 <- as.numeric(k_x$cluster)


clust_stats <- cluster.stats(d = dist(seg[,1:34]), 
                             fit1, fit2)

rand[i] <- clust_stats$corrected.rand
}
k4_ARI <- mean(rand)
k4_ARI

## [1] 0.8961499

rand <- vector()
for(i in 1:250) {
set.seed(i)
k_x <- kmeans(seg[,seg.vars], centers = 5, nstart = 30)

fit1 <- as.numeric(k_x$cluster)

set.seed(1000 + i)
k_x <- kmeans(seg[,seg.vars], centers = 5, nstart = 30)

fit2 <- as.numeric(k_x$cluster)


clust_stats <- cluster.stats(d = dist(seg[,1:34]), 
                             fit1, fit2)

rand[i] <- clust_stats$corrected.rand
}

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

k5_ARI <- mean(rand)
k5_ARI

## [1] 0.8578425

Typing Tool

Variable Importance

First, the 4-segment solution assignments are merged back into the full data frame (containing the clustering variables) using respondent UUID.

#merge segment solution (4-group) with df containing clustering variables

dat <- merge(dat, k4_solution, by = "uuid" )

dat[,"segment"] <- as.factor(dat[,"segment"])

A classification tree is run on this data set (D.V. = segment / I.V.’s = clustering variables), which generates a variable importance metric. This metric allows the rank ordering of the variables by “importance”, used below to achieve the most parsimonious model (fewest variables) with the greatest accuracy.

#decision tree for variable importance

tree <- rpart(dat$segment~., data = dat[,seg.vars])
imp <- varImp(tree)
imp <- as.data.frame(cbind(rownames(imp),imp[1]))
imp[,1] <- as.character(imp[,1])
imp <- arrange(imp,desc(imp[,2]))

Classification Accuracies

Classification accuracy is calculated using a confusion matrix. Accuracy measures the overlap between the actual segment (from the clustering solution) and the segment assignment by the classification algorithm.

Here we use Linear Discriminant Analysis (LDA) as our classification algortihm, which accepts the clustering variables as its argument.

First, the data is partitioned into train/test sets (80%/20%) for accuracy calculation. A partition index is supplied to the linDA function call, and classification and accuracy are computed all at once.

Next, the for loop steps through the clustering variables in order of greatest to least importance. The accuracy is saved for each of the runs.

Finally, the object “acc” shows the cumulative accuracy of the classification algorithm as the number of variables included are incremented.

# partition training and test

index <- createDataPartition(dat$segment, p = .8, list = FALSE)

#determine accuracies for different variable subsets 

acc <- data.frame()
for(i in 2:34){
        seg.reduced <- imp[1:i,1]
        
        linda <- linDA(group = dat$segment, 
                         variables = as.data.frame(dat[,seg.reduced]),
                         learn = as.vector(index),
                         test = as.vector(which("FALSE" == as.vector(1:500) %in% index)), 
                         validation = "learntest")
        
        accuracy = 1 - linda$error_rate
        
        acc[i,1] <- seg.reduced[i]
        acc[i,2] <- accuracy
        
}
print(acc)

##            V1        V2
## 1        <NA>        NA
## 2  q11_1p3r17 0.6632653
## 3  q11_1p3r10 0.6734694
## 4  q11_1p3r16 0.6734694
## 5   q11_1p3r2 0.7040816
## 6   q9_1p1r10 0.7653061
## 7    q9_1p1r8 0.7244898
## 8   q9_1p1r14 0.7959184
## 9   q11_1p3r6 0.8163265
## 10 q11_1p3r13 0.8469388
## 11  q11_1p3r8 0.8367347
## 12  q9_1p1r13 0.8469388
## 13 q11_1p3r18 0.8775510
## 14   q9_1p1r1 0.8673469
## 15  q11_1p3r3 0.8877551
## 16  q9_1p1r11 0.8979592
## 17   q9_1p1r2 0.8979592
## 18   q9_1p1r7 0.8877551
## 19 q11_1p3r19 0.8979592
## 20   q9_1p1r6 0.8979592
## 21  q11_1p3r9 0.9183673
## 22   q9_1p1r3 0.9183673
## 23 q11_1p3r20 0.9081633
## 24 q11_1p3r14 0.9183673
## 25 q11_1p3r12 0.9081633
## 26  q9_1p1r12 0.9285714
## 27  q11_1p3r1 0.9183673
## 28  q11_1p3r4 0.9081633
## 29  q11_1p3r7 0.9183673
## 30   q9_1p1r4 0.8877551
## 31   q9_1p1r5 0.8877551
## 32  q11_1p3r5 0.8775510
## 33 q11_1p3r11 0.8877551
## 34 q11_1p3r15 0.8775510

Most researchers insist on 80% segment classification accuracy. This requires that we include a minimum of 11 variables in the classification algorithm. These 11-variables will constitute the “reduced battery” in the typing tool.

Linear Discriminant Analysis (Classification Algorithm)

#first 11 variables for 80% accuracy

print(imp[1:11,1]) #first 11

##  [1] "q9_1p1r9"   "q11_1p3r17" "q11_1p3r10" "q11_1p3r16" "q11_1p3r2" 
##  [6] "q9_1p1r10"  "q9_1p1r8"   "q9_1p1r14"  "q11_1p3r6"  "q11_1p3r13"
## [11] "q11_1p3r8"

print(acc[11,2]) #accuracy

## [1] 0.8367347

An LDA is called one last time using the 11 most important clustering variables. The linear equations (one corresponding to each segment) are printed to an excel file for incorporation in the typing tool.

Each linear discriminant equation calculates the probability that a respondent is classified to that particular segment, given their responses to the 11 clustering variables.

#Final model using top 11 variables for 80% accuracy

final.var <- imp[1:11,1]

linda.final <- linDA(group = dat$segment, 
                         variables = as.data.frame(dat[,final.var]),
                         learn = as.vector(index),
                         test = as.vector(which("FALSE" == as.vector(1:500) %in% index)), 
                         validation = "learntest")

linda.write <- cbind(row.names(linda.final$functions), linda.final$functions)

write.xlsx(linda.write , 
           file = "/Users/traviskassab/Desktop/Great Wolf Lodge/DiscriminantFunctions.xlsx")

## Warning in file.create(to[okay]): cannot create file '/Users/traviskassab/
## Desktop/Great Wolf Lodge/DiscriminantFunctions.xlsx', reason 'No such file
## or directory'