Code not included, but packages used include: factoextra, ggplot2, cluster, dplyr, caret and ranger. Link to subscriber data: https://northeastern.instructure.com/courses/203836/files/32770207/download?download_frd=1
# MANAGE / STANDARDIZE DATA & PERFORM PCA BASED ON SUBSCRIPTION TYPE
pca.results<- prcomp(subscribers [,c(1:2,4:10)], scale.=TRUE)
# Convert pca.results to data frame
pca.data<- data.frame(pca.results$x)
# MAKE PCA PLOT BASED ON SUBSCRIPTION TYPE
ggplot(pca.data, aes(x = PC1, y = PC2, color = as.factor(subscribers$SubscriptionPlan))) +
geom_point(size = 2) +
labs(title = "PCA of Subscription Dataset",
x = "Principal Component 1",
y = "Principal Component 2",
color = "Subscription Plan")+theme_minimal()
# NOW WE WILL PERFORM A K-MEANS CLUSTERING TO IDENTIFY CUSTOMER SEGEMENTS.
# LET'S START BY USING THE ELBOW METHOD FOR CHOOSING K
k.values<- 2:8
wss.values<- sapply(k.values, function(k) {kmeans.model<-
kmeans(pca.data, centers = k, nstart = 25)
return(kmeans.model$tot.withinss)})
# CREATE AN ELBOW PLOT
elbow_plot <- ggplot(data.frame(k = k.values, WSS = wss.values), aes(x = k, y = WSS)) +
geom_point() +
geom_line() +
labs(title = "Elbow method for choosing k",
x = "Number of Clusters (k)",
y = "Total Within-Cluster Sum of Squares") +
theme_minimal()
# Display the plot
print(elbow_plot)
# Define function to compute average silhouette width for each value k
compute_silhouette <- function(pca.data, k) {
km <- kmeans(pca.data, centers = k, nstart = 25)
silhouette_score <- silhouette(km$cluster, dist(pca.data))
return(mean(silhouette_score[, 3])) # Extract average silhouette width
}
# Compute silhouette scores
sil_scores <- sapply(k.values, function(k) compute_silhouette(subscribers[c(1:2,4:10)], k))
# Create silhouette score plot
sil_plot <- ggplot(data.frame(k = k.values, Silhouette = sil_scores),
aes(x = k, y = Silhouette)) +
geom_point() +
geom_line() +
labs(title = "Silhouette method for choosing k",
x = "Number of Clusters (k)",
y = "Average Silhouette Score") +
theme_minimal()
# Display the plot
print(sil_plot)
# Compute Gap Statistic
set.seed(123) # For reproducibility
gap_stat <- clusGap(pca.data[c(1:9)], FUN = kmeans, K.max = 10, B = 50)
# Plot Gap Statistic
fviz_gap_stat(gap_stat, linecolor = "black")+ ggtitle("Gap statistic method for choosing k")
# Let's go with k=4 based on the outputs above
kmeans4.output<- kmeans(pca.data, centers = 4, nstart = 25)
Cluster.ID<- kmeans4.output$cluster
#Add labels
pca.data$SubscriptionPlan<- subscribers$SubscriptionPlan
# MAKE CLUSTER PLOT BASED ON CLUTER ID
ggplot(pca.data, aes(x = PC1, y = PC2, color = as.factor(kmeans4.output$cluster))) +
geom_point(size = 2) +
labs(title = "K-Means Output",
x = "Principal Component 1",
y = "Principal Component 2",
color = "Cluster ID")+theme_minimal()
# Add in cluster id as a column to the original data set
subscribers<- cbind(subscribers, Cluster.ID)
# Summarize cluster statistics — A company marketer would use this data to
# inform further marketing decisions
stats.by.plan<- subscribers %>%
group_by(Cluster.ID) %>%
summarise(avg.watch.time = mean(WatchTime),
median.unique.shows = median(UniqueShows),
number.subscribers = n(), avg.churn.risk = mean(ChurnRiskScore),
avg.buffer.time = mean(BufferingTime), avg.adusage = mean(AdUsagePct),
avg.genre.pref = mean(GenrePreferenceScore))
print(stats.by.plan)
## # A tibble: 4 × 8
## Cluster.ID avg.watch.time median.unique.shows number.subscribers
## <int> <dbl> <dbl> <int>
## 1 1 33.9 6 63
## 2 2 64.2 9 59
## 3 3 77.6 14 98
## 4 4 49.2 10 80
## # ℹ 4 more variables: avg.churn.risk <dbl>, avg.buffer.time <dbl>,
## # avg.adusage <dbl>, avg.genre.pref <dbl>
# THIS IS EXPERIMENTAL — NOT SURE IF THIS IS A GOOD USE OF THE
# K-MEANS RESULTS. BUT LETS TRY AND MAKE A PREDICTIVE MODEL,
# WITH CHURN RISK SCORE AS THE OUTPUT VARIABLE
# AND SOME CLUSTER CHARACTERISTICS AS INPUT VARIABLES
set.seed(123)
pca.data$Cluster.ID<- Cluster.ID
pca.data$ChurnRiskScore<- subscribers$ChurnRiskScore
random.vec <- runif(nrow(pca.data))
training.vec <- ifelse(random.vec < 0.7, 1, 0)
train.data <- pca.data[training.vec==1,]
test.data <- pca.data[training.vec==0,]
# ONE CHARACTERISTIC WE CAN USE IS CENTROID DISTANCE
centers_assigned <- kmeans4.output$centers[kmeans4.output$cluster[training.vec == 1], ]
train.data$Cluster_Distance <- rowSums((train.data[, 1:9] - centers_assigned)^2)
test.data$Cluster_Distance <- rowSums((test.data[,1:9] - centers_assigned)^2)
# ANOTHER ONE WE CAN USE IS AVERAGE CLUSTER CHURN RATE
cluster_churn_rates <- aggregate(ChurnRiskScore ~ Cluster.ID, data = train.data, FUN = mean)
train.data$Cluster_Churn_Rate <- cluster_churn_rates$ChurnRiskScore[train.data$Cluster.ID]
test.data$Cluster_Churn_Rate <- cluster_churn_rates$ChurnRiskScore[test.data$Cluster.ID]
# MAKE THE MODEL
log.reg<- glm(ChurnRiskScore ~ . - Cluster.ID - SubscriptionPlan - ChurnRiskScore,
data = train.data, family = 'gaussian')
summary(log.reg)
##
## Call:
## glm(formula = ChurnRiskScore ~ . - Cluster.ID - SubscriptionPlan -
## ChurnRiskScore, family = "gaussian", data = train.data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.819e+01 1.452e-14 3.318e+15 <2e-16 ***
## PC1 -6.248e+00 1.974e-15 -3.165e+15 <2e-16 ***
## PC2 6.736e+00 2.377e-15 2.834e+15 <2e-16 ***
## PC3 -1.604e+01 2.158e-15 -7.433e+15 <2e-16 ***
## PC4 3.065e+00 2.210e-15 1.387e+15 <2e-16 ***
## PC5 1.329e+01 2.511e-15 5.293e+15 <2e-16 ***
## PC6 2.078e+00 2.724e-15 7.626e+14 <2e-16 ***
## PC7 -1.599e-01 2.732e-15 -5.855e+13 <2e-16 ***
## PC8 2.931e+00 2.937e-15 9.979e+14 <2e-16 ***
## PC9 -6.386e+00 3.423e-15 -1.866e+15 <2e-16 ***
## Cluster_Distance -2.239e-18 7.874e-16 -3.000e-03 0.998
## Cluster_Churn_Rate 6.254e-16 2.808e-16 2.227e+00 0.027 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 9.398434e-28)
##
## Null deviance: 1.2630e+05 on 215 degrees of freedom
## Residual deviance: 1.9173e-25 on 204 degrees of freedom
## AIC: -12815
##
## Number of Fisher Scoring iterations: 1
# REFINE LOGREG
log.reg<- step(log.reg, direction = 'backward')
## Start: AIC=-12815.44
## ChurnRiskScore ~ (PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 +
## PC9 + SubscriptionPlan + Cluster.ID + Cluster_Distance +
## Cluster_Churn_Rate) - Cluster.ID - SubscriptionPlan - ChurnRiskScore
##
## Df Deviance AIC
## - Cluster_Churn_Rate 1 0 -12846.8
## - Cluster_Distance 1 0 -12817.4
## <none> 0 -12815.4
## - PC7 1 3 -271.4
## - PC6 1 547 837.5
## - PC8 1 936 953.7
## - PC4 1 1807 1095.8
## - PC9 1 3271 1224.0
## - PC2 1 7548 1404.6
## - PC1 1 9413 1452.3
## - PC5 1 26330 1674.5
## - PC3 1 51928 1821.2
##
## Step: AIC=-12846.77
## ChurnRiskScore ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 +
## PC9 + Cluster_Distance
##
## Df Deviance AIC
## <none> 0 -12846.8
## - Cluster_Distance 1 0 -12846.5
## - PC7 1 3 -272.8
## - PC6 1 554 838.5
## - PC8 1 936 951.8
## - PC4 1 1813 1094.5
## - PC9 1 3287 1223.0
## - PC2 1 10607 1476.1
## - PC1 1 26821 1676.5
## - PC5 1 28621 1690.5
## - PC3 1 52637 1822.1
summary(log.reg)
##
## Call:
## glm(formula = ChurnRiskScore ~ PC1 + PC2 + PC3 + PC4 + PC5 +
## PC6 + PC7 + PC8 + PC9 + Cluster_Distance, family = "gaussian",
## data = train.data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.819e+01 4.547e-15 1.060e+16 <2e-16 ***
## PC1 -6.248e+00 1.090e-15 -5.731e+15 <2e-16 ***
## PC2 6.736e+00 1.869e-15 3.604e+15 <2e-16 ***
## PC3 -1.604e+01 1.998e-15 -8.029e+15 <2e-16 ***
## PC4 3.065e+00 2.057e-15 1.490e+15 <2e-16 ***
## PC5 1.329e+01 2.245e-15 5.921e+15 <2e-16 ***
## PC6 2.078e+00 2.522e-15 8.239e+14 <2e-16 ***
## PC7 -1.599e-01 2.543e-15 -6.289e+13 <2e-16 ***
## PC8 2.931e+00 2.737e-15 1.071e+15 <2e-16 ***
## PC9 -6.386e+00 3.183e-15 -2.006e+15 <2e-16 ***
## Cluster_Distance -1.452e-16 7.315e-16 -1.990e-01 0.843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 8.165144e-28)
##
## Null deviance: 1.2630e+05 on 215 degrees of freedom
## Residual deviance: 1.6739e-25 on 205 degrees of freedom
## AIC: -12847
##
## Number of Fisher Scoring iterations: 1
# MAKE PREDICTIONS
log.reg.preds<- predict(log.reg, test.data, type = 'response')
predictionsTF<- (log.reg.preds > 50)
Actuals<- (test.data$ChurnRiskScore > 50)
pred.table<- table(predictionsTF, Actuals)
output.table<- cbind(Actuals, predictionsTF)
cbind(output.table, test.data$ChurnRiskScore)
## Actuals predictionsTF
## 2 1 1 74.09
## 4 0 0 38.28
## 5 0 0 15.82
## 8 0 0 41.21
## 11 0 0 49.07
## 16 1 1 100.00
## 20 0 0 33.62
## 21 0 0 49.63
## 24 1 1 60.14
## 26 1 1 62.77
## 31 0 0 38.05
## 32 1 1 60.00
## 34 0 0 28.62
## 37 0 0 14.02
## 50 1 1 55.54
## 53 0 0 39.18
## 58 1 1 62.24
## 59 1 1 50.07
## 65 1 1 57.47
## 67 0 0 41.48
## 68 1 1 64.62
## 69 1 1 65.95
## 71 0 0 41.99
## 73 0 0 38.68
## 84 0 0 0.00
## 87 1 1 65.21
## 88 0 0 22.17
## 89 1 1 82.64
## 97 1 1 83.84
## 104 0 0 28.71
## 106 1 1 62.79
## 107 0 0 44.00
## 111 1 1 79.46
## 114 0 0 40.43
## 115 0 0 16.34
## 118 1 1 72.63
## 126 1 1 58.49
## 132 0 0 28.11
## 134 1 1 76.46
## 137 1 1 62.28
## 138 0 0 46.37
## 139 1 1 52.55
## 145 0 0 47.21
## 150 0 0 19.74
## 151 1 1 55.46
## 167 0 0 31.57
## 173 1 1 98.48
## 174 1 1 51.65
## 179 1 1 67.42
## 181 1 1 81.15
## 183 0 0 2.38
## 189 0 0 23.73
## 190 0 0 40.33
## 193 0 0 3.42
## 195 0 0 16.60
## 202 0 0 40.81
## 206 0 0 0.00
## 216 1 1 82.81
## 219 0 0 42.51
## 220 1 1 71.57
## 222 1 1 74.54
## 223 0 0 44.72
## 230 0 0 40.59
## 238 1 1 91.62
## 240 1 1 86.14
## 246 0 0 16.59
## 248 1 1 69.58
## 249 1 1 51.02
## 250 1 1 76.88
## 256 0 0 33.86
## 260 1 1 52.86
## 261 0 0 32.11
## 262 1 1 70.02
## 264 1 1 70.08
## 271 1 1 75.94
## 275 1 1 69.16
## 276 0 0 20.48
## 277 0 0 35.12
## 281 0 0 45.48
## 294 1 1 69.61
## 295 0 0 0.00
## 296 0 0 42.90
## 297 0 0 43.48
## 300 1 1 79.56
confusionMatrix(pred.table, positive = 'TRUE')
## Confusion Matrix and Statistics
##
## Actuals
## predictionsTF FALSE TRUE
## FALSE 43 0
## TRUE 0 41
##
## Accuracy : 1
## 95% CI : (0.957, 1)
## No Information Rate : 0.5119
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4881
## Detection Rate : 0.4881
## Detection Prevalence : 0.4881
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : TRUE
##
# MAKE THE MODEL
ran.for<- ranger(ChurnRiskScore ~. - SubscriptionPlan - Cluster.ID - ChurnRiskScore,
data = train.data, num.trees = 200, importance = 'impurity')
# MAKE PREDICTIONS
ran.for.preds<- predict(ran.for, test.data, type = 'response')
ran.for.predsTF<- (ran.for.preds$predictions > 50)
ran.for.table<- table(ran.for.predsTF, Actuals)
confusionMatrix(ran.for.table, positive = 'TRUE')
## Confusion Matrix and Statistics
##
## Actuals
## ran.for.predsTF FALSE TRUE
## FALSE 39 3
## TRUE 4 38
##
## Accuracy : 0.9167
## 95% CI : (0.8358, 0.9658)
## No Information Rate : 0.5119
## P-Value [Acc > NIR] : 1.335e-15
##
## Kappa : 0.8333
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9268
## Specificity : 0.9070
## Pos Pred Value : 0.9048
## Neg Pred Value : 0.9286
## Prevalence : 0.4881
## Detection Rate : 0.4524
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9169
##
## 'Positive' Class : TRUE
##
You’ll see that the logistic regression model performed better in every category, with perfect accuracy. It is important to note that I’ve taken any churn risk value higher than 50 as a positive prediction, given that churn risk runs on a scale from 0-100. This is a bold assumption and should be taken with a grain of salt. Prior to finalizing this code, I attempted using the ClusterID as a predictor variable, but it became clear that the characteristics that go into the cluster ID were not considered by the machine, and while there are cluster characteristics that are great predictors, the ID itself is not one. Changing the input variables from simply cluster ID to average cluster churn rate and centroid distance improved random forest prediction accuracy from 76% to ~92%, and logistic regression from 75% to 100%.
Using cluster statistics, we can develop retention strategies. Ie. For cluster 1, which has the lowest average watch time, the highest churn risk, and lowest genre affinity, we could offer promotions which decrease ad frequency or subscription renewal price depending on the number of shows they watch.