Code not included, but packages used include: factoextra, ggplot2, cluster, dplyr, caret and ranger. Link to subscriber data: https://northeastern.instructure.com/courses/203836/files/32770207/download?download_frd=1

# MANAGE / STANDARDIZE DATA & PERFORM PCA BASED ON SUBSCRIPTION TYPE
pca.results<- prcomp(subscribers [,c(1:2,4:10)], scale.=TRUE)

# Convert pca.results to data frame
pca.data<- data.frame(pca.results$x)

# MAKE PCA PLOT BASED ON SUBSCRIPTION TYPE 
ggplot(pca.data, aes(x = PC1, y = PC2, color = as.factor(subscribers$SubscriptionPlan))) +
  geom_point(size = 2) +
  labs(title = "PCA of Subscription Dataset",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "Subscription Plan")+theme_minimal()

# NOW WE WILL PERFORM A K-MEANS CLUSTERING TO IDENTIFY CUSTOMER SEGEMENTS. 
# LET'S START BY USING THE ELBOW METHOD FOR CHOOSING K
k.values<- 2:8
wss.values<- sapply(k.values, function(k) {kmeans.model<- 
  kmeans(pca.data, centers = k, nstart = 25)
return(kmeans.model$tot.withinss)})

# CREATE AN ELBOW PLOT
elbow_plot <- ggplot(data.frame(k = k.values, WSS = wss.values), aes(x = k, y = WSS)) +
  geom_point() +
  geom_line() +
  labs(title = "Elbow method for choosing k",
       x = "Number of Clusters (k)",
       y = "Total Within-Cluster Sum of Squares") +
  theme_minimal()

# Display the plot
print(elbow_plot)

# Define function to compute average silhouette width for each value k
compute_silhouette <- function(pca.data, k) {
  km <- kmeans(pca.data, centers = k, nstart = 25)
  silhouette_score <- silhouette(km$cluster, dist(pca.data))
  return(mean(silhouette_score[, 3]))  # Extract average silhouette width
}

# Compute silhouette scores
sil_scores <- sapply(k.values, function(k) compute_silhouette(subscribers[c(1:2,4:10)], k))

# Create silhouette score plot
sil_plot <- ggplot(data.frame(k = k.values, Silhouette = sil_scores), 
                   aes(x = k, y = Silhouette)) + 
  geom_point() +
  geom_line() +
  labs(title = "Silhouette method for choosing k",
       x = "Number of Clusters (k)",
       y = "Average Silhouette Score") +
  theme_minimal()

# Display the plot
print(sil_plot)

# Compute Gap Statistic
set.seed(123)  # For reproducibility
gap_stat <- clusGap(pca.data[c(1:9)], FUN = kmeans, K.max = 10, B = 50)

# Plot Gap Statistic
fviz_gap_stat(gap_stat, linecolor = "black")+ ggtitle("Gap statistic method for choosing k")

# Let's go with k=4 based on the outputs above
kmeans4.output<- kmeans(pca.data, centers = 4, nstart = 25)
Cluster.ID<- kmeans4.output$cluster

#Add labels
pca.data$SubscriptionPlan<- subscribers$SubscriptionPlan

# MAKE CLUSTER PLOT BASED ON CLUTER ID
ggplot(pca.data, aes(x = PC1, y = PC2, color = as.factor(kmeans4.output$cluster))) +
  geom_point(size = 2) +
  labs(title = "K-Means Output",
       x = "Principal Component 1",
       y = "Principal Component 2",
       color = "Cluster ID")+theme_minimal()

# Add in cluster id as a column to the original data set
subscribers<- cbind(subscribers, Cluster.ID)

# Summarize cluster statistics — A company marketer would use this data to 
# inform further marketing decisions
stats.by.plan<- subscribers %>%
  group_by(Cluster.ID) %>%
  summarise(avg.watch.time = mean(WatchTime), 
            median.unique.shows = median(UniqueShows), 
            number.subscribers = n(), avg.churn.risk = mean(ChurnRiskScore),
            avg.buffer.time = mean(BufferingTime), avg.adusage = mean(AdUsagePct),
            avg.genre.pref = mean(GenrePreferenceScore))
print(stats.by.plan)
## # A tibble: 4 × 8
##   Cluster.ID avg.watch.time median.unique.shows number.subscribers
##        <int>          <dbl>               <dbl>              <int>
## 1          1           33.9                   6                 63
## 2          2           64.2                   9                 59
## 3          3           77.6                  14                 98
## 4          4           49.2                  10                 80
## # ℹ 4 more variables: avg.churn.risk <dbl>, avg.buffer.time <dbl>,
## #   avg.adusage <dbl>, avg.genre.pref <dbl>
# THIS IS EXPERIMENTAL — NOT SURE IF THIS IS A GOOD USE OF THE 
# K-MEANS RESULTS. BUT LETS TRY AND MAKE A PREDICTIVE MODEL, 
# WITH CHURN RISK SCORE AS THE OUTPUT VARIABLE
# AND SOME CLUSTER CHARACTERISTICS AS INPUT VARIABLES
set.seed(123)
pca.data$Cluster.ID<- Cluster.ID
pca.data$ChurnRiskScore<- subscribers$ChurnRiskScore
random.vec <- runif(nrow(pca.data))
training.vec <- ifelse(random.vec < 0.7, 1, 0)
train.data <- pca.data[training.vec==1,]
test.data <- pca.data[training.vec==0,]

# ONE CHARACTERISTIC WE CAN USE IS CENTROID DISTANCE
centers_assigned <- kmeans4.output$centers[kmeans4.output$cluster[training.vec == 1], ]
train.data$Cluster_Distance <- rowSums((train.data[, 1:9] - centers_assigned)^2)
test.data$Cluster_Distance <- rowSums((test.data[,1:9] - centers_assigned)^2)

# ANOTHER ONE WE CAN USE IS AVERAGE CLUSTER CHURN RATE
cluster_churn_rates <- aggregate(ChurnRiskScore ~ Cluster.ID, data = train.data, FUN = mean)
train.data$Cluster_Churn_Rate <- cluster_churn_rates$ChurnRiskScore[train.data$Cluster.ID]
test.data$Cluster_Churn_Rate <- cluster_churn_rates$ChurnRiskScore[test.data$Cluster.ID]

# MAKE THE MODEL
log.reg<- glm(ChurnRiskScore ~ . - Cluster.ID - SubscriptionPlan - ChurnRiskScore,
              data = train.data, family = 'gaussian')
summary(log.reg)
## 
## Call:
## glm(formula = ChurnRiskScore ~ . - Cluster.ID - SubscriptionPlan - 
##     ChurnRiskScore, family = "gaussian", data = train.data)
## 
## Coefficients:
##                      Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)         4.819e+01  1.452e-14  3.318e+15   <2e-16 ***
## PC1                -6.248e+00  1.974e-15 -3.165e+15   <2e-16 ***
## PC2                 6.736e+00  2.377e-15  2.834e+15   <2e-16 ***
## PC3                -1.604e+01  2.158e-15 -7.433e+15   <2e-16 ***
## PC4                 3.065e+00  2.210e-15  1.387e+15   <2e-16 ***
## PC5                 1.329e+01  2.511e-15  5.293e+15   <2e-16 ***
## PC6                 2.078e+00  2.724e-15  7.626e+14   <2e-16 ***
## PC7                -1.599e-01  2.732e-15 -5.855e+13   <2e-16 ***
## PC8                 2.931e+00  2.937e-15  9.979e+14   <2e-16 ***
## PC9                -6.386e+00  3.423e-15 -1.866e+15   <2e-16 ***
## Cluster_Distance   -2.239e-18  7.874e-16 -3.000e-03    0.998    
## Cluster_Churn_Rate  6.254e-16  2.808e-16  2.227e+00    0.027 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.398434e-28)
## 
##     Null deviance: 1.2630e+05  on 215  degrees of freedom
## Residual deviance: 1.9173e-25  on 204  degrees of freedom
## AIC: -12815
## 
## Number of Fisher Scoring iterations: 1
# REFINE LOGREG
log.reg<- step(log.reg, direction = 'backward')
## Start:  AIC=-12815.44
## ChurnRiskScore ~ (PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + 
##     PC9 + SubscriptionPlan + Cluster.ID + Cluster_Distance + 
##     Cluster_Churn_Rate) - Cluster.ID - SubscriptionPlan - ChurnRiskScore
## 
##                      Df Deviance      AIC
## - Cluster_Churn_Rate  1        0 -12846.8
## - Cluster_Distance    1        0 -12817.4
## <none>                         0 -12815.4
## - PC7                 1        3   -271.4
## - PC6                 1      547    837.5
## - PC8                 1      936    953.7
## - PC4                 1     1807   1095.8
## - PC9                 1     3271   1224.0
## - PC2                 1     7548   1404.6
## - PC1                 1     9413   1452.3
## - PC5                 1    26330   1674.5
## - PC3                 1    51928   1821.2
## 
## Step:  AIC=-12846.77
## ChurnRiskScore ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + 
##     PC9 + Cluster_Distance
## 
##                    Df Deviance      AIC
## <none>                       0 -12846.8
## - Cluster_Distance  1        0 -12846.5
## - PC7               1        3   -272.8
## - PC6               1      554    838.5
## - PC8               1      936    951.8
## - PC4               1     1813   1094.5
## - PC9               1     3287   1223.0
## - PC2               1    10607   1476.1
## - PC1               1    26821   1676.5
## - PC5               1    28621   1690.5
## - PC3               1    52637   1822.1
summary(log.reg)
## 
## Call:
## glm(formula = ChurnRiskScore ~ PC1 + PC2 + PC3 + PC4 + PC5 + 
##     PC6 + PC7 + PC8 + PC9 + Cluster_Distance, family = "gaussian", 
##     data = train.data)
## 
## Coefficients:
##                    Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)       4.819e+01  4.547e-15  1.060e+16   <2e-16 ***
## PC1              -6.248e+00  1.090e-15 -5.731e+15   <2e-16 ***
## PC2               6.736e+00  1.869e-15  3.604e+15   <2e-16 ***
## PC3              -1.604e+01  1.998e-15 -8.029e+15   <2e-16 ***
## PC4               3.065e+00  2.057e-15  1.490e+15   <2e-16 ***
## PC5               1.329e+01  2.245e-15  5.921e+15   <2e-16 ***
## PC6               2.078e+00  2.522e-15  8.239e+14   <2e-16 ***
## PC7              -1.599e-01  2.543e-15 -6.289e+13   <2e-16 ***
## PC8               2.931e+00  2.737e-15  1.071e+15   <2e-16 ***
## PC9              -6.386e+00  3.183e-15 -2.006e+15   <2e-16 ***
## Cluster_Distance -1.452e-16  7.315e-16 -1.990e-01    0.843    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 8.165144e-28)
## 
##     Null deviance: 1.2630e+05  on 215  degrees of freedom
## Residual deviance: 1.6739e-25  on 205  degrees of freedom
## AIC: -12847
## 
## Number of Fisher Scoring iterations: 1
# MAKE PREDICTIONS
log.reg.preds<- predict(log.reg, test.data, type = 'response')
predictionsTF<- (log.reg.preds > 50)
Actuals<- (test.data$ChurnRiskScore > 50)
pred.table<- table(predictionsTF, Actuals)
output.table<- cbind(Actuals, predictionsTF)
cbind(output.table, test.data$ChurnRiskScore)
##     Actuals predictionsTF       
## 2         1             1  74.09
## 4         0             0  38.28
## 5         0             0  15.82
## 8         0             0  41.21
## 11        0             0  49.07
## 16        1             1 100.00
## 20        0             0  33.62
## 21        0             0  49.63
## 24        1             1  60.14
## 26        1             1  62.77
## 31        0             0  38.05
## 32        1             1  60.00
## 34        0             0  28.62
## 37        0             0  14.02
## 50        1             1  55.54
## 53        0             0  39.18
## 58        1             1  62.24
## 59        1             1  50.07
## 65        1             1  57.47
## 67        0             0  41.48
## 68        1             1  64.62
## 69        1             1  65.95
## 71        0             0  41.99
## 73        0             0  38.68
## 84        0             0   0.00
## 87        1             1  65.21
## 88        0             0  22.17
## 89        1             1  82.64
## 97        1             1  83.84
## 104       0             0  28.71
## 106       1             1  62.79
## 107       0             0  44.00
## 111       1             1  79.46
## 114       0             0  40.43
## 115       0             0  16.34
## 118       1             1  72.63
## 126       1             1  58.49
## 132       0             0  28.11
## 134       1             1  76.46
## 137       1             1  62.28
## 138       0             0  46.37
## 139       1             1  52.55
## 145       0             0  47.21
## 150       0             0  19.74
## 151       1             1  55.46
## 167       0             0  31.57
## 173       1             1  98.48
## 174       1             1  51.65
## 179       1             1  67.42
## 181       1             1  81.15
## 183       0             0   2.38
## 189       0             0  23.73
## 190       0             0  40.33
## 193       0             0   3.42
## 195       0             0  16.60
## 202       0             0  40.81
## 206       0             0   0.00
## 216       1             1  82.81
## 219       0             0  42.51
## 220       1             1  71.57
## 222       1             1  74.54
## 223       0             0  44.72
## 230       0             0  40.59
## 238       1             1  91.62
## 240       1             1  86.14
## 246       0             0  16.59
## 248       1             1  69.58
## 249       1             1  51.02
## 250       1             1  76.88
## 256       0             0  33.86
## 260       1             1  52.86
## 261       0             0  32.11
## 262       1             1  70.02
## 264       1             1  70.08
## 271       1             1  75.94
## 275       1             1  69.16
## 276       0             0  20.48
## 277       0             0  35.12
## 281       0             0  45.48
## 294       1             1  69.61
## 295       0             0   0.00
## 296       0             0  42.90
## 297       0             0  43.48
## 300       1             1  79.56
confusionMatrix(pred.table, positive = 'TRUE')
## Confusion Matrix and Statistics
## 
##              Actuals
## predictionsTF FALSE TRUE
##         FALSE    43    0
##         TRUE      0   41
##                                     
##                Accuracy : 1         
##                  95% CI : (0.957, 1)
##     No Information Rate : 0.5119    
##     P-Value [Acc > NIR] : < 2.2e-16 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : 1.0000    
##             Specificity : 1.0000    
##          Pos Pred Value : 1.0000    
##          Neg Pred Value : 1.0000    
##              Prevalence : 0.4881    
##          Detection Rate : 0.4881    
##    Detection Prevalence : 0.4881    
##       Balanced Accuracy : 1.0000    
##                                     
##        'Positive' Class : TRUE      
## 
# MAKE THE MODEL
ran.for<- ranger(ChurnRiskScore ~. - SubscriptionPlan - Cluster.ID - ChurnRiskScore, 
                 data = train.data, num.trees = 200, importance = 'impurity')

# MAKE PREDICTIONS
ran.for.preds<- predict(ran.for, test.data, type = 'response')
ran.for.predsTF<- (ran.for.preds$predictions > 50)
ran.for.table<- table(ran.for.predsTF, Actuals)
confusionMatrix(ran.for.table, positive = 'TRUE')
## Confusion Matrix and Statistics
## 
##                Actuals
## ran.for.predsTF FALSE TRUE
##           FALSE    39    3
##           TRUE      4   38
##                                           
##                Accuracy : 0.9167          
##                  95% CI : (0.8358, 0.9658)
##     No Information Rate : 0.5119          
##     P-Value [Acc > NIR] : 1.335e-15       
##                                           
##                   Kappa : 0.8333          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9268          
##             Specificity : 0.9070          
##          Pos Pred Value : 0.9048          
##          Neg Pred Value : 0.9286          
##              Prevalence : 0.4881          
##          Detection Rate : 0.4524          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9169          
##                                           
##        'Positive' Class : TRUE            
## 

You’ll see that the logistic regression model performed better in every category, with perfect accuracy. It is important to note that I’ve taken any churn risk value higher than 50 as a positive prediction, given that churn risk runs on a scale from 0-100. This is a bold assumption and should be taken with a grain of salt. Prior to finalizing this code, I attempted using the ClusterID as a predictor variable, but it became clear that the characteristics that go into the cluster ID were not considered by the machine, and while there are cluster characteristics that are great predictors, the ID itself is not one. Changing the input variables from simply cluster ID to average cluster churn rate and centroid distance improved random forest prediction accuracy from 76% to ~92%, and logistic regression from 75% to 100%.

Using cluster statistics, we can develop retention strategies. Ie. For cluster 1, which has the lowest average watch time, the highest churn risk, and lowest genre affinity, we could offer promotions which decrease ad frequency or subscription renewal price depending on the number of shows they watch.