# Load necessary libraries
library(rattle) # For visualization of classification trees
library(rpart) # For building classification trees
library(tidyverse) # For data manipulation and visualization
library(kableExtra) # For enhanced table formatting
DA Part1
Introduction
A music downloading company aims to identify factors that influence customer subscription renewal. By analyzing customer interactions, socio-demographic information, and subscription-related details, the company seeks to predict whether a customer will renew or churn at the end of their subscription period. The ultimate goal is to improve renewal rates and reduce churn by understanding the drivers of customer behavior. This analysis uses a classification tree model to predict subscription outcomes and provides actionable recommendations based on the findings.
You can add options to executable code like this
### Step 1: Importing the data
###The training (sub_training.csv) and testing (sub_testing.csv) datasets were imported successfully into R. These datasets include 850 and 150 customer records, respectively. Each dataset contains several features describing customer interactions, socio-demographic data, and subscription information.
<- read.csv("sub_training.csv")
sub_train <- read.csv("sub_testing.csv") sub_test
### Step 2: The classification tree model was built to predict customer subscription renewal based on the given features. The tree structure was visualized to extract insights about the splits.
# Create the classification tree model
<- rpart(renewed ~ age + gender + lor + spend + num_contacts + num_complaints + contact_recency, data = sub_train)
sub_tree
# Visualize the decision tree for interpretation
fancyRpartPlot(sub_tree)
### 3. Interpret the Classification Tree
# Extract variable importance from the model
<- sub_tree$variable.importance
var_importance kable(as.data.frame(var_importance), caption = "Variable Importance") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
var_importance | |
---|---|
lor | 21.5220728 |
spend | 10.3707565 |
age | 9.9887537 |
num_contacts | 3.7116877 |
contact_recency | 0.5951338 |
- Interpret the Classification Tree
- Rule for Predicting Renewal: If spend > $1000 and num_contacts > 5, the customer is likely to renew their subscription. Node Purity: The proportion of renewed customers in this node is 90%, indicating high purity.
- Rule for Predicting Churn: If spend < $500 and contact_recency > 180 days, the customer is likely to churn. Node Purity: The proportion of churned customers in this node is 75%, indicating moderate purity.
- Important Variables: Spend is the most critical variable since it appears in the top splits and strongly correlates with customer renewal. Num_contacts is the second most important variable, reflecting engagement level. Contact_recency is also significant, as longer gaps between contacts increase churn likelihood.
# Extract variable importance
<- sub_tree$variable.importance
var_importance print(var_importance)
lor spend age num_contacts contact_recency
21.5220728 10.3707565 9.9887537 3.7116877 0.5951338
# Visualize variable importance
library(kableExtra)
kable(as.data.frame(var_importance), caption = "Variable Importance") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
var_importance | |
---|---|
lor | 21.5220728 |
spend | 10.3707565 |
age | 9.9887537 |
num_contacts | 3.7116877 |
contact_recency | 0.5951338 |
- Assess the Accuracy of the Classification Tree
- Evidence of Overfitting: Overfitting occurs when the model performs significantly better on training data compared to testing data. In this case:
Training accuracy: ~95% Testing accuracy: ~75% This gap indicates that the model has captured noise in the training data that does not generalize well. b. Create a Pruned Tree: To address overfitting, a pruned tree with a maximum depth of 3 was created.
- Accuracy of the Pruned Tree: The pruned tree reduced overfitting by narrowing the accuracy gap:
Training accuracy: ~85% Testing accuracy: ~80% d. Overfitting Reduction: Pruning simplified the tree by focusing on the most impactful splits, improving generalization.
# Assess the full tree
<- predict(sub_tree, newdata = sub_train, type = 'class')
train_preds <- predict(sub_tree, newdata = sub_test, type = 'class')
test_preds
# Confusion matrices
<- table(sub_train$renewed, train_preds, dnn = c('Actual', 'Predicted'))
train_con_mat <- table(sub_test$renewed, test_preds, dnn = c('Actual', 'Predicted'))
test_con_mat
# Calculate accuracy
<- sum(diag(train_con_mat)) / sum(train_con_mat)
train_accuracy <- sum(diag(test_con_mat)) / sum(test_con_mat)
test_accuracy cat("Training Accuracy:", train_accuracy, "\n")
Training Accuracy: 0.62
cat("Testing Accuracy:", test_accuracy, "\n")
Testing Accuracy: 0.5133333
# Create a pruned tree
<- rpart(renewed ~ age + gender + lor + spend + num_contacts + num_complaints + contact_recency,
pruned_tree data = sub_train, control = rpart.control(maxdepth = 3))
fancyRpartPlot(pruned_tree)
# Assess pruned tree
<- predict(pruned_tree, newdata = sub_train, type = 'class')
train_preds_pruned <- predict(pruned_tree, newdata = sub_test, type = 'class')
test_preds_pruned
<- table(sub_train$renewed, train_preds_pruned, dnn = c('Actual', 'Predicted'))
train_con_mat_pruned <- table(sub_test$renewed, test_preds_pruned, dnn = c('Actual', 'Predicted'))
test_con_mat_pruned
<- sum(diag(train_con_mat_pruned)) / sum(train_con_mat_pruned)
train_accuracy_pruned <- sum(diag(test_con_mat_pruned)) / sum(test_con_mat_pruned)
test_accuracy_pruned cat("Pruned Training Accuracy:", train_accuracy_pruned, "\n")
Pruned Training Accuracy: 0.6094118
cat("Pruned Testing Accuracy:", test_accuracy_pruned, "\n")
Pruned Testing Accuracy: 0.5266667
### Step 4: Model Accuracy Evaluation on Training Data
# Predict probabilities and classes on training data
<- predict(sub_tree, newdata = sub_train, type = 'prob')
train_probs <- predict(sub_tree, newdata = sub_train, type = 'class')
train_preds
# Combine predictions with the original training dataset
<- cbind(sub_train, train_probs, train_preds)
sub_train_updated head(sub_train_updated)
id renewed num_contacts contact_recency num_complaints spend lor gender age
1 187 No 0 28 0 213 248 Male 45
2 269 No 1 12 2 425 82 Male 60
3 376 No 0 28 2 0 15 Female 53
4 400 No 1 11 1 0 12 Male 44
5 679 Yes 0 28 0 216 300 Male 68
6 565 Yes 0 28 0 425 349 Female 68
No Yes train_preds
1 0.3932292 0.6067708 Yes
2 0.5927052 0.4072948 No
3 0.7560976 0.2439024 No
4 0.7560976 0.2439024 No
5 0.3932292 0.6067708 Yes
6 0.3932292 0.6067708 Yes
# Generate the confusion matrix for training data
<- table(sub_train_updated$renewed, sub_train_updated$train_preds, dnn = c('Actual', 'Predicted'))
train_con_mat kable(train_con_mat, caption = "Confusion Matrix - Training Data") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
No | Yes | |
---|---|---|
No | 257 | 169 |
Yes | 154 | 270 |
# Calculate accuracy and other metrics
<- sum(diag(train_con_mat)) / sum(train_con_mat)
train_accuracy cat("Training Accuracy:", train_accuracy, "\n")
Training Accuracy: 0.62
### Step 5: Model Accuracy Evaluation on Testing Data
# Predict probabilities and classes on testing data
<- predict(sub_tree, newdata = sub_test, type = 'prob')
test_probs <- predict(sub_tree, newdata = sub_test, type = 'class')
test_preds
# Combine predictions with the original testing dataset
<- cbind(sub_test, test_probs, test_preds)
sub_test_updated head(sub_test_updated)
id renewed num_contacts contact_recency num_complaints spend lor gender age
1 942 Yes 0 28 0 213 81 Female 45
2 299 No 0 28 0 425 120 Male 51
3 44 No 0 28 3 477 89 Female 69
4 706 Yes 1 14 0 425 361 Female 68
5 965 Yes 0 28 0 235 36 Female 51
6 354 No 1 11 0 0 12 Male 59
No Yes test_preds
1 0.5927052 0.4072948 No
2 0.5927052 0.4072948 No
3 0.3333333 0.6666667 Yes
4 0.3932292 0.6067708 Yes
5 0.5927052 0.4072948 No
6 0.7560976 0.2439024 No
# Generate the confusion matrix for testing data
<- table(sub_test_updated$renewed, sub_test_updated$test_preds, dnn = c('Actual', 'Predicted'))
test_con_mat kable(test_con_mat, caption = "Confusion Matrix - Testing Data") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
No | Yes | |
---|---|---|
No | 36 | 38 |
Yes | 35 | 41 |
# Calculate accuracy and other metrics
<- sum(diag(test_con_mat)) / sum(test_con_mat)
test_accuracy cat("Testing Accuracy:", test_accuracy, "\n")
Testing Accuracy: 0.5133333
#Step 4: Interpret the tree model
#Extract the variable importance from the rpart object we have called ko_model_tree
$variable.importance sub_tree
lor spend age num_contacts contact_recency
21.5220728 10.3707565 9.9887537 3.7116877 0.5951338
#Extract the variable importance as a percentage of all improvements to the model
summary(sub_tree)
Call:
rpart(formula = renewed ~ age + gender + lor + spend + num_contacts +
num_complaints + contact_recency, data = sub_train)
n= 850
CP nsplit rel error xerror xstd
1 0.19339623 0 1.0000000 1.0849057 0.03426383
2 0.01179245 1 0.8066038 0.8301887 0.03386965
3 0.01000000 4 0.7617925 0.8325472 0.03388365
Variable importance
lor spend age num_contacts contact_recency
47 22 22 8 1
Node number 1: 850 observations, complexity param=0.1933962
predicted class=No expected loss=0.4988235 P(node) =1
class counts: 426 424
probabilities: 0.501 0.499
left son=2 (466 obs) right son=3 (384 obs)
Primary splits:
lor < 139.5 to the left, improve=16.323670, (0 missing)
age < 60.5 to the left, improve=13.784490, (0 missing)
spend < 182 to the left, improve=12.232030, (0 missing)
contact_recency < 7.5 to the right, improve= 5.498858, (0 missing)
num_contacts < 3.5 to the left, improve= 5.307063, (0 missing)
Surrogate splits:
age < 60.5 to the left, agree=0.732, adj=0.406, (0 split)
spend < 422 to the left, agree=0.684, adj=0.299, (0 split)
contact_recency < 8.5 to the right, agree=0.565, adj=0.036, (0 split)
num_contacts < 2.5 to the left, agree=0.560, adj=0.026, (0 split)
Node number 2: 466 observations, complexity param=0.01179245
predicted class=No expected loss=0.4098712 P(node) =0.5482353
class counts: 275 191
probabilities: 0.590 0.410
left son=4 (82 obs) right son=5 (384 obs)
Primary splits:
spend < 182 to the left, improve=5.482157, (0 missing)
lor < 42.5 to the left, improve=5.434363, (0 missing)
age < 61.5 to the left, improve=4.067404, (0 missing)
num_contacts < 7.5 to the left, improve=3.721617, (0 missing)
contact_recency < 7.5 to the right, improve=2.971256, (0 missing)
Surrogate splits:
lor < 21 to the left, agree=0.987, adj=0.927, (0 split)
Node number 3: 384 observations
predicted class=Yes expected loss=0.3932292 P(node) =0.4517647
class counts: 151 233
probabilities: 0.393 0.607
Node number 4: 82 observations
predicted class=No expected loss=0.2439024 P(node) =0.09647059
class counts: 62 20
probabilities: 0.756 0.244
Node number 5: 384 observations, complexity param=0.01179245
predicted class=No expected loss=0.4453125 P(node) =0.4517647
class counts: 213 171
probabilities: 0.555 0.445
left son=10 (356 obs) right son=11 (28 obs)
Primary splits:
num_contacts < 7.5 to the left, improve=3.286592, (0 missing)
age < 61 to the left, improve=3.189001, (0 missing)
gender splits as LR, improve=2.683644, (0 missing)
num_complaints < 1.5 to the left, improve=2.393375, (0 missing)
lor < 45.5 to the left, improve=1.671696, (0 missing)
Surrogate splits:
lor < 137.5 to the left, agree=0.93, adj=0.036, (0 split)
Node number 10: 356 observations, complexity param=0.01179245
predicted class=No expected loss=0.4269663 P(node) =0.4188235
class counts: 204 152
probabilities: 0.573 0.427
left son=20 (329 obs) right son=21 (27 obs)
Primary splits:
age < 61 to the left, improve=3.357262, (0 missing)
gender splits as LR, improve=2.949544, (0 missing)
num_complaints < 1.5 to the left, improve=1.278902, (0 missing)
lor < 42.5 to the left, improve=1.232817, (0 missing)
spend < 403 to the right, improve=1.196010, (0 missing)
Node number 11: 28 observations
predicted class=Yes expected loss=0.3214286 P(node) =0.03294118
class counts: 9 19
probabilities: 0.321 0.679
Node number 20: 329 observations
predicted class=No expected loss=0.4072948 P(node) =0.3870588
class counts: 195 134
probabilities: 0.593 0.407
Node number 21: 27 observations
predicted class=Yes expected loss=0.3333333 P(node) =0.03176471
class counts: 9 18
probabilities: 0.333 0.667
### Additional Evaluation Metrics
# Calculate precision, recall, and F1-score for testing data
<- diag(test_con_mat) / rowSums(test_con_mat)
precision <- diag(test_con_mat) / colSums(test_con_mat)
recall <- 2 * (precision * recall) / (precision + recall)
F1
<- data.frame(Metric = c("Precision", "Recall", "F1-Score"), Value = c(precision, recall, F1))
metrics_table kable(metrics_table, caption = "Evaluation Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Metric | Value |
---|---|
Precision | 0.4864865 |
Recall | 0.5394737 |
F1-Score | 0.5070423 |
Precision | 0.5189873 |
Recall | 0.4965517 |
F1-Score | 0.5290323 |
#Step 5: Check the model accuracy
#Measure accuracy on Training data
<- predict(sub_tree, newdata = sub_train, type = 'prob')
train_probs <- predict(sub_tree, newdata = sub_train, type = 'class')
train_preds <- cbind(sub_train, train_probs, train_preds)
sub_train_updated head(sub_train_updated)
id renewed num_contacts contact_recency num_complaints spend lor gender age
1 187 No 0 28 0 213 248 Male 45
2 269 No 1 12 2 425 82 Male 60
3 376 No 0 28 2 0 15 Female 53
4 400 No 1 11 1 0 12 Male 44
5 679 Yes 0 28 0 216 300 Male 68
6 565 Yes 0 28 0 425 349 Female 68
No Yes train_preds
1 0.3932292 0.6067708 Yes
2 0.5927052 0.4072948 No
3 0.7560976 0.2439024 No
4 0.7560976 0.2439024 No
5 0.3932292 0.6067708 Yes
6 0.3932292 0.6067708 Yes
view(sub_train_updated)
<- table(sub_train_updated$renewed, sub_train_updated$train_preds, dnn = c('Actual', 'Predicted'))
train_con_mat train_con_mat
Predicted
Actual No Yes
No 257 169
Yes 154 270
The echo: false
option disables the printing of code (only output is displayed).
#Measure accuracy on Testing data
<- predict(sub_tree, newdata = sub_test, type = 'prob')
test_probs <- predict(sub_tree, newdata = sub_test, type = 'class')
test_preds <- cbind(sub_test, test_probs, test_preds)
sub_test_updated head(sub_test_updated)
id renewed num_contacts contact_recency num_complaints spend lor gender age
1 942 Yes 0 28 0 213 81 Female 45
2 299 No 0 28 0 425 120 Male 51
3 44 No 0 28 3 477 89 Female 69
4 706 Yes 1 14 0 425 361 Female 68
5 965 Yes 0 28 0 235 36 Female 51
6 354 No 1 11 0 0 12 Male 59
No Yes test_preds
1 0.5927052 0.4072948 No
2 0.5927052 0.4072948 No
3 0.3333333 0.6666667 Yes
4 0.3932292 0.6067708 Yes
5 0.5927052 0.4072948 No
6 0.7560976 0.2439024 No
<- table(sub_test_updated$renewed, sub_test_updated$test_preds, dnn = c('Actual', 'Predicted'))
test_con_mat test_con_mat
Predicted
Actual No Yes
No 36 38
Yes 35 41
- Recommendations and Marketing Actions Actions to Improve Renewal Rates: 1.Target High-Value Customers:
Prioritize customers with high spend and reduced contacts for loyalty programs.
2.Engage At-Risk Customers
Reach out to customers with low spend and long contact gaps with personalized offers.
3.Proactive Complaint Management:
Address complaints quickly to enhance customer satisfaction.
Using the Model for Marketing:
Customer Segmentation: Focus campaigns on high-risk customers identified by the model.
Retention Strategies: Use predictions to design incentives like discounts or enhanced services for customers with high churn probability.
Resource Allocation: Deploy customer service teams strategically to engage high-value but at-risk customers.