setwd("/Users/harrychang/Downloads/BT4211/Assignment/Assignment 2")
library(readxl)
cellphone = read_excel("cellphone_billing_student.xlsx")
subscriber = read_excel("subscriber_info_student.xlsx")
zip = read_excel("zip_info_student.xlsx")
Exploring the cellphone billing dataset
cellphone
## # A tibble: 195,878 × 11
## cust_id bill_year bill_month churn plan_chosen plan1 plan2 plan3 plan4
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 19164958 2015 9 0 1 1 0 0 0
## 2 19164958 2015 10 0 1 1 0 0 0
## 3 19164958 2015 11 0 1 1 0 0 0
## 4 19164958 2015 12 0 1 1 0 0 0
## 5 19164958 2016 1 0 1 1 0 0 0
## 6 19164958 2016 2 0 1 1 0 0 0
## 7 19164958 2016 3 0 1 1 0 0 0
## 8 19164958 2016 4 0 1 1 0 0 0
## 9 19164958 2016 5 0 1 1 0 0 0
## 10 19164958 2016 6 0 1 1 0 0 0
## # … with 195,868 more rows, and 2 more variables: total_minute_peak <dbl>,
## # promo_lag1 <dbl>
summary(cellphone)
## cust_id bill_year bill_month churn
## Min. :19164958 Min. :2015 Min. : 1.00 Min. :0.00000
## 1st Qu.:74944945 1st Qu.:2016 1st Qu.: 3.00 1st Qu.:0.00000
## Median :81475998 Median :2016 Median : 5.00 Median :0.00000
## Mean :81935115 Mean :2016 Mean : 5.99 Mean :0.02508
## 3rd Qu.:88623109 3rd Qu.:2016 3rd Qu.: 9.00 3rd Qu.:0.00000
## Max. :88705192 Max. :2017 Max. :12.00 Max. :1.00000
## plan_chosen plan1 plan2 plan3
## Min. :1.000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.000 Median :1.0000 Median :0.00000 Median :0.0000
## Mean :2.021 Mean :0.5096 Mean :0.03194 Mean :0.3859
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :4.000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## plan4 total_minute_peak promo_lag1
## Min. :0.00000 Min. : 0.0 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 58.0 1st Qu.:0.0000
## Median :0.00000 Median : 139.0 Median :0.0000
## Mean :0.07253 Mean : 171.4 Mean :0.1347
## 3rd Qu.:0.00000 3rd Qu.: 244.0 3rd Qu.:0.0000
## Max. :1.00000 Max. :1500.0 Max. :1.0000
Descriptive statistics
mean_total_minute_peak <- mean(cellphone$total_minute_peak)
median_total_minute_peak <- median(cellphone$total_minute_peak)
sd_total_minute_peak <- sd(cellphone$total_minute_peak)
cat("Mean of total_minute_peak:", mean_total_minute_peak, "\n")
## Mean of total_minute_peak: 171.4271
cat("Median of total_minute_peak:", median_total_minute_peak, "\n")
## Median of total_minute_peak: 139
cat("Standard deviation of total_minute_peak:", sd_total_minute_peak, "\n")
## Standard deviation of total_minute_peak: 150.8134
# Each row of `cellphone` is a customer-month billing record, so these are
# proportions of records rather than of unique customers
record_count <- nrow(cellphone)
plan_proportions <- table(cellphone$plan_chosen) / record_count
cat("Proportions of billing records in each plan:\n")
## Proportions of billing records in each plan:
print(plan_proportions)
##
## 1 2 3 4
## 0.50957739 0.03194335 0.38594942 0.07252984
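Because each customer contributes one row per billing month, the shares above are weighted by tenure. A quick base-R sketch of the corresponding unique-customer shares (assuming, as the data suggest, that each customer keeps a single plan):
plan_share_customers <- table(unique(cellphone[, c("cust_id", "plan_chosen")])$plan_chosen)
plan_share_customers / sum(plan_share_customers)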
library(ggplot2)
ggplot(cellphone, aes(x = total_minute_peak)) +
  geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Distribution of Total Peak Minutes", x = "Total Peak Minutes", y = "Count")

Churn analysis
churn_rate_by_plan <- with(cellphone, tapply(churn, plan_chosen, function(x) mean(x == 1)))
cat("Churn rate by plan:\n")
## Churn rate by plan:
print(churn_rate_by_plan)
## 1 2 3 4
## 0.020257476 0.006233019 0.028174976 0.050749630
churn_rate_by_plan_df <- as.data.frame(churn_rate_by_plan)
churn_rate_by_plan_df$plan_chosen <- row.names(churn_rate_by_plan_df)
ggplot(churn_rate_by_plan_df, aes(x = plan_chosen, y = churn_rate_by_plan, fill = plan_chosen)) +
  geom_bar(stat = "identity", width = 0.7) +
  theme_minimal() +
  labs(title = "Churn Rate by Plan", x = "Plan", y = "Churn Rate") +
  geom_text(aes(label = round(churn_rate_by_plan, 3)), vjust = 1.4, color = "white") +
  scale_fill_discrete(name = "Plan")

churn_rate_by_promo <- with(cellphone, tapply(churn, promo_lag1, function(x) mean(x == 1)))
cat("Churn rate by promo_lag1:\n")
## Churn rate by promo_lag1:
print(churn_rate_by_promo)
## 0 1
## 0.02622635 0.01769476
churn_rate_by_promo_df <- as.data.frame(churn_rate_by_promo)
churn_rate_by_promo_df$promo_lag1 <- row.names(churn_rate_by_promo_df)
ggplot(churn_rate_by_promo_df, aes(x = promo_lag1, y = churn_rate_by_promo, fill = promo_lag1)) +
  geom_bar(stat = "identity", width = 0.7) +
  theme_minimal() +
  labs(title = "Churn Rate by Promo Lag", x = "Promo Lag", y = "Churn Rate") +
  geom_text(aes(label = round(churn_rate_by_promo, 3)), vjust = 1.4, color = "white") +
  scale_fill_discrete(name = "Promo Lag")
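The promo gap (2.62% vs. 1.77%) looks material; a quick two-sample proportion test over the customer-month records (a sketch that treats months as independent observations) checks whether it is statistically significant:
# Columns reordered so that "success" = churn
churn_promo_tab <- table(cellphone$promo_lag1, cellphone$churn)[, c("1", "0")]
prop.test(churn_promo_tab)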

Customer segmentation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Note: plan_chosen is a categorical plan code; scale() treats it as numeric,
# which is a simplification accepted here for a quick segmentation
data_scaled <- cellphone %>%
  select(total_minute_peak, plan_chosen, promo_lag1) %>%
  scale()
set.seed(123) # For reproducibility
k <- 3 # Number of clusters
kmeans_result <- kmeans(data_scaled, centers = k)
cellphone$cluster <- kmeans_result$cluster
cluster_summary <- cellphone %>%
  group_by(cluster) %>%
  summarise(
    count = n(),
    avg_total_minute_peak = mean(total_minute_peak),
    avg_churn = mean(churn),
    avg_promo_lag1 = mean(promo_lag1),
    plan1_prop = mean(plan1),
    plan2_prop = mean(plan2),
    plan3_prop = mean(plan3),
    plan4_prop = mean(plan4)
  )
cat("Customer segmentation summary:\n")
## Customer segmentation summary:
print(cluster_summary)
## # A tibble: 3 × 9
## cluster count avg_total_minut… avg_churn avg_promo_lag1 plan1_prop plan2_prop
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 37530 408. 0.0245 0.144 0.0467 0.0359
## 2 2 55969 124. 0.0361 0.161 0 0.0106
## 3 3 102379 111. 0.0193 0.117 0.958 0.0422
## # … with 2 more variables: plan3_prop <dbl>, plan4_prop <dbl>
ggplot(cellphone, aes(x = total_minute_peak, y = plan_chosen, color = factor(cluster))) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Customer Segmentation", x = "Total Peak Minutes", y = "Plan Chosen", color = "Cluster")

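Here k = 3 was fixed a priori; an elbow plot of the total within-cluster sum of squares (a sketch over k = 1 to 8) is one common way to sanity-check that choice:
set.seed(123)
wss <- sapply(1:8, function(k) kmeans(data_scaled, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster SS", main = "Elbow plot for choosing k")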
Detailed descriptive analysis
- How many customers are there for each type of phone service
plan?
# Note: this tallies customer-month billing records per plan (each customer
# contributes one row per month); see the distinct-count sketch above for
# unique customers
customers_per_plan <- table(cellphone$plan_chosen)
cat("Number of billing records for each plan:\n")
## Number of billing records for each plan:
print(customers_per_plan)
##
## 1 2 3 4
## 99815 6257 75599 14207
- What is the average number of customers acquired for each phone
service plan across the months of August to December 2015?
# Filter customers acquired between August and December 2015
data_2015 <- cellphone %>% filter(bill_year == 2015 & bill_month >= 8 & bill_month <= 12)
# Remove duplicate customer IDs and count unique customers per plan
unique_customers_2015 <- data_2015 %>% distinct(cust_id, .keep_all = TRUE)
customers_per_plan_2015 <- table(unique_customers_2015$plan_chosen)
# Calculate the average number of customers acquired per plan
avg_customers_per_plan_2015 <- customers_per_plan_2015 / 5
cat("Average number of customers acquired for each plan (August to December 2015):\n")
## Average number of customers acquired for each plan (August to December 2015):
print(avg_customers_per_plan_2015)
##
## 1 3 4
## 459.2 853.0 81.2
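The approach above takes each customer's first record inside the window; a stricter sketch derives the acquisition month as the customer's overall first billing month and then keeps only acquisitions falling in August to December 2015:
first_month <- cellphone %>%
  group_by(cust_id) %>%
  arrange(bill_year, bill_month, .by_group = TRUE) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  filter(bill_year == 2015, bill_month >= 8, bill_month <= 12)
table(first_month$plan_chosen) / 5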
- What is the average number of months a customer stays with the
cellular phone service company from the start till the end of a phone
service subscription (i.e., only focus on customers who churned)?
churned_customers <- cellphone %>% filter(churn == 1)
# The churn flag marks only a customer's final month, so count every billing
# month of the customers who eventually churned
churned_customers_months <- cellphone %>%
  semi_join(churned_customers, by = "cust_id") %>%
  group_by(cust_id) %>%
  summarise(months_active = n())
avg_months_churned <- mean(churned_customers_months$months_active)
cat("Average number of months a churned customer stays with the company:", avg_months_churned, "\n")
- For each type of phone service plan, what is the number of customers
who churned?
churned_by_plan <- table(churned_customers$plan_chosen)
cat("Number of churned customers by plan:\n")
## Number of churned customers by plan:
print(churned_by_plan)
##
## 1 2 3 4
## 2022 39 2130 721
- For each type of phone service plan, what is the average number of
months a customer stays with the company from the start till the end of
a service subscription (i.e., only focus on customers who churned)?
avg_months_churned_by_plan <- churned_customers_months %>%
  left_join(select(churned_customers, cust_id, plan_chosen), by = "cust_id") %>%
  group_by(plan_chosen) %>%
  summarise(avg_months_active = mean(months_active))
#cat("Average number of months churned customers stay with the company by plan:\n")
print(avg_months_churned_by_plan)
- What is the average number of peak minutes used in a month under
each type of phone service plan?
avg_peak_minutes_by_plan <- cellphone %>% group_by(plan_chosen) %>% summarise(avg_peak_minutes = mean(total_minute_peak))
#cat("Average number of peak minutes used in a month by plan:\n")
print(avg_peak_minutes_by_plan)
## # A tibble: 4 × 2
## plan_chosen avg_peak_minutes
## <dbl> <dbl>
## 1 1 117.
## 2 2 185.
## 3 3 214.
## 4 4 320.
- How many customers are over-utilizing their allocated maximum peak
time minutes (by more than 5%) for each type of phone service plan?
# Calculate maximum peak minutes allowed for each plan
max_minutes <- c(200, 300, 350, 500)
cellphone$max_peak_minutes <- max_minutes[cellphone$plan_chosen]
# Flag customer-months that exceed the plan's peak-minute allowance by more than 5%
cellphone$over_utilizing <- cellphone$total_minute_peak > cellphone$max_peak_minutes * 1.05
# Count over-utilizing customer-month records for each plan
over_utilizing_customers_by_plan <- cellphone %>%
  filter(over_utilizing) %>%
  group_by(plan_chosen) %>%
  summarise(count = n())
#cat("Number of customers over-utilizing their allocated maximum peak time minutes by plan:\n")
print(over_utilizing_customers_by_plan)
## # A tibble: 4 × 2
## plan_chosen count
## <dbl> <int>
## 1 1 13275
## 2 2 611
## 3 3 10365
## 4 4 2343
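These are counts of customer-month records, so larger plans mechanically show larger counts; a small sketch of the per-plan over-utilization rate puts the plans on a comparable scale:
cellphone %>%
  group_by(plan_chosen) %>%
  summarise(over_rate = mean(over_utilizing))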
- What is the average total phone bill (fixed subscription charge plus
variable excess usage fees) per customer across all months under each
type of phone service plan?
fixed_prices <- c(30, 35, 40, 50) # monthly subscription charge for plans 1-4
excess_prices <- 0.40 # fee per peak minute beyond the plan allowance
cellphone$fixed_charge <- fixed_prices[cellphone$plan_chosen]
cellphone$excess_usage <- pmax(0, cellphone$total_minute_peak - cellphone$max_peak_minutes) * excess_prices
cellphone$total_bill <- cellphone$fixed_charge + cellphone$excess_usage
avg_total_bill_by_plan <- cellphone %>%
  group_by(plan_chosen) %>%
  summarise(avg_total_bill = mean(total_bill))
#cat("Average total phone bill per customer across all months by plan:\n")
print(avg_total_bill_by_plan)
## # A tibble: 4 × 2
## plan_chosen avg_total_bill
## <dbl> <dbl>
## 1 1 35.6
## 2 2 38.6
## 3 3 47.2
## 4 4 61.6
- What is the average profit per customer across all months under each
type of phone service plan?
profit_margin <- 0.53
cellphone$profit <- cellphone$total_bill * profit_margin
avg_profit_by_plan <- cellphone %>%
  group_by(plan_chosen) %>%
  summarise(avg_profit = mean(profit))
#cat("Average profit per customer across all months by plan:\n")
print(avg_profit_by_plan)
## # A tibble: 4 × 2
## plan_chosen avg_profit
## <dbl> <dbl>
## 1 1 18.9
## 2 2 20.4
## 3 3 25.0
## 4 4 32.7
- For customers who have churned, what is the average customer
lifetime value under each type of phone service plan? Assume a monthly
discount rate of 1% and compute the lifetime value as of the month of
customer acquisition onward.
monthly_discount_rate <- 0.01
# Use every billing month of the customers who churned (the churn flag marks
# only the final month of a subscription)
churned_customers <- cellphone %>%
  semi_join(filter(cellphone, churn == 1), by = "cust_id") %>%
  group_by(cust_id, plan_chosen) %>%
  summarise(
    months_active = n(),
    total_profit = sum(profit),
    .groups = "drop"
  )
# Present value of the average monthly profit over the active months,
# assuming profits are roughly constant from month to month
avg_monthly_profit <- churned_customers$total_profit / churned_customers$months_active
annuity_factor <- (1 - (1 + monthly_discount_rate)^(-churned_customers$months_active)) / monthly_discount_rate
churned_customers$lifetime_value <- avg_monthly_profit * annuity_factor
avg_lifetime_value_by_plan <- churned_customers %>%
  group_by(plan_chosen) %>%
  summarise(avg_lifetime_value = mean(lifetime_value))
#cat("Average customer lifetime value for churned customers by plan:\n")
print(avg_lifetime_value_by_plan)
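The annuity formula above assumes a constant monthly profit; an exact alternative (a sketch) discounts each observed month's profit individually, indexing months from acquisition:
churned_ltv_exact <- cellphone %>%
  semi_join(filter(cellphone, churn == 1), by = "cust_id") %>%
  group_by(cust_id) %>%
  arrange(bill_year, bill_month, .by_group = TRUE) %>%
  mutate(month_index = row_number()) %>%
  summarise(
    plan_chosen = first(plan_chosen),
    lifetime_value = sum(profit / (1 + monthly_discount_rate)^(month_index - 1))
  ) %>%
  group_by(plan_chosen) %>%
  summarise(avg_lifetime_value = mean(lifetime_value))
churned_ltv_exact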
Creating an aggregated dataset from the three originally provided datasets
# Aggregate the panel-level customer billing data into a cross-sectional one, keeping the last observation of each customer that records the churn status (churn):
billing_data_cross_sectional <- cellphone %>%
  group_by(cust_id) %>%
  arrange(bill_year, bill_month, .by_group = TRUE) %>% # ensure chronological order within customer
  slice_tail(n = 1)
# Compute average monthly peak minutes used for each customer (ave_minute_peak):
average_minutes <- cellphone %>%
  group_by(cust_id) %>%
  summarise(ave_minute_peak = mean(total_minute_peak))
billing_data_cross_sectional <- left_join(billing_data_cross_sectional, average_minutes, by = "cust_id")
# Compute the percentage of months a customer received marketing promotions (promo_pct):
promo_percentage <- cellphone %>%
  group_by(cust_id) %>%
  summarise(promo_pct = mean(promo_lag1))
billing_data_cross_sectional <- left_join(billing_data_cross_sectional, promo_percentage, by = "cust_id")
# Merge the cross-sectional billing data with the subscriber information data:
merged_data <- left_join(billing_data_cross_sectional, subscriber, by = "cust_id")
# Create the customer age variable (age):
current_year <- 2017
merged_data$age <- current_year - merged_data$birth_year
# Further merge the data set created in Step 5 above with the ZIP code data:
merged_data <- left_join(merged_data, zip, by = "zip_code")
# Create the commercial ZIP code type dummy variable (zip_comm):
merged_data$zip_comm <- ifelse(merged_data$zip_type == 2, 1, 0)
merged_data
## # A tibble: 12,495 × 33
## # Groups: cust_id [12,495]
## cust_id bill_year bill_month churn plan_chosen plan1 plan2 plan3 plan4
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 19164958 2017 4 1 1 1 0 0 0
## 2 39244924 2016 2 1 3 0 0 1 0
## 3 39578413 2016 4 1 3 0 0 1 0
## 4 40992265 2016 5 1 3 0 0 1 0
## 5 43061957 2016 10 1 3 0 0 1 0
## 6 47196850 2017 5 0 2 0 1 0 0
## 7 51236987 2017 2 1 3 0 0 1 0
## 8 51326773 2016 4 1 3 0 0 1 0
## 9 54271247 2016 11 1 1 1 0 0 0
## 10 70765025 2016 11 1 3 0 0 1 0
## # … with 12,485 more rows, and 24 more variables: total_minute_peak <dbl>,
## # promo_lag1 <dbl>, cluster <int>, max_peak_minutes <dbl>,
## # over_utilizing <lgl>, fixed_charge <dbl>, excess_usage <dbl>,
## # total_bill <dbl>, profit <dbl>, ave_minute_peak <dbl>, promo_pct <dbl>,
## # birth_year <dbl>, birth_month <dbl>, zip_code <dbl>, newcell <dbl>,
## # oldcell <dbl>, prizm_code <dbl>, age <dbl>, ruca2 <dbl>, zip_type <dbl>,
## # zip_popn5 <dbl>, state_code <dbl>, state_abbr <chr>, zip_comm <dbl>
# Check dimensions of merged dataset
dim(merged_data)
## [1] 12495 33
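Since the subscriber and ZIP merges are left joins, a quick NA audit (a one-line sketch) flags customers with missing subscriber or ZIP information before modeling:
colSums(is.na(merged_data))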
Using logit and probit models on merged dataset
- Estimate the binomial logit and probit models of customer churn
decision:
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Binomial logit model. Note: with an intercept, the four plan dummies are
# perfectly collinear (they sum to 1), so R reports NA for plan4; the refits
# below drop one dummy as the baseline.
logit_model <- glm(churn ~ plan1 + plan2 + plan3 + plan4 + ave_minute_peak +
                     promo_pct + age + newcell + ruca2 + zip_comm,
                   family = binomial(link = "logit"), data = merged_data)
# Binomial probit model
probit_model <- glm(churn ~ plan1 + plan2 + plan3 + plan4 + ave_minute_peak +
                      promo_pct + age + newcell + ruca2 + zip_comm,
                    family = binomial(link = "probit"), data = merged_data)
# Create a dataframe with coefficients and model names
coefs <- data.frame(
  variable = names(logit_model$coefficients),
  logit_coef = logit_model$coefficients,
  probit_coef = probit_model$coefficients
)
coefs <- reshape2::melt(coefs, id.vars = "variable", variable.name = "model", value.name = "coefficient")
# Plot coefficients
ggplot(coefs, aes(x = variable, y = coefficient, fill = model)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Logit and Probit Model Coefficients", x = "Variable", y = "Coefficient", fill = "Model")
## Warning: Removed 2 rows containing missing values (`geom_col()`).

- Compare the model fit using AIC and accuracy:
library(caret)
## Loading required package: lattice
# Split data into training and testing sets
set.seed(123)
train_index <- createDataPartition(merged_data$churn, p = 0.8, list = FALSE)
train_data <- merged_data[train_index, ]
test_data <- merged_data[-train_index, ]
# Fit the models on the training data
logit_model <- glm(churn ~ plan1 + plan2 + plan3 + ave_minute_peak + promo_pct +
                     age + newcell + ruca2 + zip_comm,
                   data = train_data, family = binomial(link = "logit"))
probit_model <- glm(churn ~ plan1 + plan2 + plan3 + ave_minute_peak + promo_pct +
                      age + newcell + ruca2 + zip_comm,
                    data = train_data, family = binomial(link = "probit"))
# Predictions on the test data
logit_pred <- ifelse(predict(logit_model, newdata = test_data, type = "response") > 0.5, 1, 0)
probit_pred <- ifelse(predict(probit_model, newdata = test_data, type = "response") > 0.5, 1, 0)
# Confusion matrices
logit_conf_mat <- table(Predicted = as.factor(logit_pred), Actual = as.factor(test_data$churn))
probit_conf_mat <- table(Predicted = as.factor(probit_pred), Actual = as.factor(test_data$churn))
# Accuracy
logit_accuracy <- sum(diag(logit_conf_mat)) / sum(logit_conf_mat)
probit_accuracy <- sum(diag(probit_conf_mat)) / sum(probit_conf_mat)
# Create a dataframe with AIC and accuracy
model_comparison <- data.frame(
  model = c("Logit", "Probit"),
  AIC = c(AIC(logit_model), AIC(probit_model)),
  accuracy = c(logit_accuracy, probit_accuracy)
)
model_comparison
## model AIC accuracy
## 1 Logit 11391.22 0.6562373
## 2 Probit 11388.33 0.6562373
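Accuracy at a fixed 0.5 cutoff can mask differences between the models; a threshold-free comparison via ROC AUC is sketched below (assumes the pROC package is installed):
library(pROC)
logit_auc <- auc(test_data$churn, predict(logit_model, newdata = test_data, type = "response"))
probit_auc <- auc(test_data$churn, predict(probit_model, newdata = test_data, type = "response"))
c(logit = as.numeric(logit_auc), probit = as.numeric(probit_auc))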
- Marginal effects of ave_minute_peak and promo_pct on the probability
of customer churn:
library(margins)
# Calculate marginal effects for the best-fit model (e.g., logit_model)
marginal_effects <- margins(logit_model, variables = c("ave_minute_peak", "promo_pct"))
# Summary of marginal effects
marginal_effects_summary <- summary(marginal_effects)
marginal_effects_summary
## factor AME SE z p lower upper
## ave_minute_peak 0.0004 0.0000 8.7750 0.0000 0.0003 0.0005
## promo_pct 0.0187 0.0187 1.0008 0.3169 -0.0180 0.0554
# Plot marginal effects
plot(marginal_effects)

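As a cross-check on margins(), the logit AME of a continuous regressor is its coefficient scaled by the average logistic density at the fitted linear predictor; the sketch below should reproduce the ave_minute_peak AME:
lin_pred <- predict(logit_model, type = "link") # linear predictor on training data
mean(dlogis(lin_pred)) * coef(logit_model)["ave_minute_peak"]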
- Multinomial logit model of customer plan choice decision:
library(nnet)
# Create a plan choice factor from the plan dummies
merged_data$plan_choice <- as.factor(ifelse(merged_data$plan1 == 1, 1,
                                     ifelse(merged_data$plan2 == 1, 2,
                                     ifelse(merged_data$plan3 == 1, 3, 4))))
# Remove rows with missing values (if any)
merged_data_clean <- merged_data %>%
  na.omit()
multinom_model <- multinom(plan_choice ~ ave_minute_peak + promo_pct + age +
                             newcell + ruca2 + zip_comm, data = merged_data_clean)
## # weights: 32 (21 variable)
## initial value 15791.279068
## iter 10 value 12580.890145
## iter 20 value 11644.332942
## iter 30 value 11518.606463
## final value 11518.584009
## converged
# Summary of the multinomial logit model
multinom_summary <- summary(multinom_model)
multinom_summary
## Call:
## multinom(formula = plan_choice ~ ave_minute_peak + promo_pct +
## age + newcell + ruca2 + zip_comm, data = merged_data_clean)
##
## Coefficients:
## (Intercept) ave_minute_peak promo_pct age newcell ruca2
## 2 -2.434356 0.007756583 0.2996368 0.0005083711 0.27851183 -0.01439087
## 3 -2.292736 0.014357492 0.4232658 -0.0055970647 -0.30487260 0.04846221
## 4 -5.044267 0.021095249 0.2469391 -0.0103360633 -0.07173525 0.06775428
## zip_comm
## 2 0.04336654
## 3 0.65914560
## 4 0.52568588
##
## Std. Errors:
## (Intercept) ave_minute_peak promo_pct age newcell ruca2
## 2 0.1644395 0.0004134632 0.1286716 0.002206126 0.07786722 0.02431938
## 3 0.1307836 0.0003395906 0.1018426 0.001775667 0.06958558 0.01772544
## 4 0.2086960 0.0004423015 0.1609257 0.002970345 0.10919213 0.02932189
## zip_comm
## 2 0.3612414
## 3 0.2510011
## 4 0.3739297
##
## Residual Deviance: 23037.17
## AIC: 23079.17
# Prepare the coefficients data for visualization
coefs <- coef(multinom_model)
coefs <- as.data.frame(t(coefs))
coefs$Variable <- row.names(coefs)
coefs_long <- reshape2::melt(coefs, id.vars = "Variable", variable.name = "Plan", value.name = "Coefficient")
# Visualize the coefficients
ggplot(coefs_long, aes(x = Variable, y = Coefficient, fill = Plan)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Multinomial Logit Model Coefficients", x = "Variable", y = "Coefficient", fill = "Plan")

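multinom() reports coefficients and standard errors but no significance tests; Wald z statistics and two-sided p-values can be derived by hand (a standard sketch):
z_stats <- multinom_summary$coefficients / multinom_summary$standard.errors
p_values <- 2 * (1 - pnorm(abs(z_stats)))
round(p_values, 4)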
Model estimations for duration models
- Plot the Kaplan-Meier survival function estimates for each plan
choice on the same graph
#install.packages("survival")
library(survival)
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
# Create a Surv object for survival analysis. Note: bill_month of each
# customer's last record serves as the duration measure here, a rough proxy
# for tenure in months.
surv_obj <- Surv(time = merged_data$bill_month, event = merged_data$churn)
# Compute the Kaplan-Meier survival function estimates for each plan choice
km_estimates <- survfit(surv_obj ~ merged_data$plan_chosen)
# Plot the Kaplan-Meier survival function estimates for each plan choice on the same graph
plot(km_estimates, col = 1:4, xlab = "Months", ylab = "Survival Probability",
     main = "Kaplan-Meier Survival Function Estimates by Plan Choice")

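Base-R survfit plots do not label the curves; running this line immediately after the plot call adds a matching legend (a small sketch):
legend("bottomleft", legend = paste("Plan", 1:4), col = 1:4, lty = 1)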
- Estimate the Cox proportional hazard (PH) type duration data models
using a semi-parametric approach.
# Estimate the Cox PH model
cox_ph_model <- coxph(surv_obj ~ plan_chosen + ave_minute_peak + promo_pct +
                        age + newcell + ruca2 + zip_comm, data = merged_data)
# Display the results
summary(cox_ph_model)
## Call:
## coxph(formula = surv_obj ~ plan_chosen + ave_minute_peak + promo_pct +
## age + newcell + ruca2 + zip_comm, data = merged_data)
##
## n= 12319, number of events= 4848
## (176 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## plan_chosen 0.1120465 1.1185649 0.0156654 7.152 8.52e-13 ***
## ave_minute_peak 0.0002078 1.0002079 0.0001454 1.429 0.15290
## promo_pct 0.1226752 1.1305172 0.0593211 2.068 0.03864 *
## age -0.0109161 0.9891432 0.0010917 -9.999 < 2e-16 ***
## newcell -0.1348020 0.8738889 0.0488949 -2.757 0.00583 **
## ruca2 0.0275484 1.0279314 0.0089252 3.087 0.00202 **
## zip_comm 0.2008662 1.2224612 0.0998352 2.012 0.04422 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## plan_chosen 1.1186 0.8940 1.0847 1.1534
## ave_minute_peak 1.0002 0.9998 0.9999 1.0005
## promo_pct 1.1305 0.8846 1.0064 1.2699
## age 0.9891 1.0110 0.9870 0.9913
## newcell 0.8739 1.1443 0.7940 0.9618
## ruca2 1.0279 0.9728 1.0101 1.0461
## zip_comm 1.2225 0.8180 1.0052 1.4867
##
## Concordance= 0.585 (se = 0.006 )
## Likelihood ratio test= 278.9 on 7 df, p=<2e-16
## Wald test = 272.8 on 7 df, p=<2e-16
## Score (logrank) test = 274.5 on 7 df, p=<2e-16
# Compute the AIC for the Cox PH model
cox_ph_aic <- AIC(cox_ph_model)
cox_ph_aic
## [1] 77001.51
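The Cox model rests on the proportional hazards assumption; survival::cox.zph() provides a standard diagnostic based on scaled Schoenfeld residuals (a sketch):
ph_test <- cox.zph(cox_ph_model)
ph_test
plot(ph_test, var = "age") # e.g., inspect how the age effect varies over time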
- Estimate the accelerated failure time (AFT) type duration data
models using the exponential and Weibull distributions.
# Estimate the AFT model with the exponential distribution
aft_exp_model <- survreg(surv_obj ~ plan_chosen + ave_minute_peak + promo_pct +
                           age + newcell + ruca2 + zip_comm,
                         data = merged_data, dist = "exponential")
# Display the results
summary(aft_exp_model)
##
## Call:
## survreg(formula = surv_obj ~ plan_chosen + ave_minute_peak +
## promo_pct + age + newcell + ruca2 + zip_comm, data = merged_data,
## dist = "exponential")
## Value Std. Error z p
## (Intercept) 2.207795 0.075942 29.07 < 2e-16
## plan_chosen -0.048248 0.015889 -3.04 0.0024
## ave_minute_peak -0.000925 0.000143 -6.45 1.1e-10
## promo_pct -0.025705 0.059485 -0.43 0.6656
## age 0.013740 0.001094 12.56 < 2e-16
## newcell 0.416840 0.048551 8.59 < 2e-16
## ruca2 -0.043480 0.008944 -4.86 1.2e-06
## zip_comm -0.185351 0.099721 -1.86 0.0631
##
## Scale fixed at 1
##
## Exponential distribution
## Loglik(model)= -17481.8 Loglik(intercept only)= -17725.5
## Chisq= 487.45 on 7 degrees of freedom, p= 4e-101
## Number of Newton-Raphson Iterations: 5
## n=12319 (176 observations deleted due to missingness)
# Compute the AIC for the AFT model with the exponential distribution
aft_exp_aic <- AIC(aft_exp_model)
# Estimate the AFT model with the Weibull distribution
aft_weibull_model <- survreg(surv_obj ~ plan_chosen + ave_minute_peak + promo_pct +
                               age + newcell + ruca2 + zip_comm,
                             data = merged_data, dist = "weibull")
# Display the results
summary(aft_weibull_model)
##
## Call:
## survreg(formula = surv_obj ~ plan_chosen + ave_minute_peak +
## promo_pct + age + newcell + ruca2 + zip_comm, data = merged_data,
## dist = "weibull")
## Value Std. Error z p
## (Intercept) 2.11e+00 3.20e-02 65.95 < 2e-16
## plan_chosen -2.94e-02 6.63e-03 -4.43 9.3e-06
## ave_minute_peak -1.77e-04 6.17e-05 -2.86 0.00426
## promo_pct -3.16e-02 2.50e-02 -1.26 0.20645
## age 4.45e-03 4.66e-04 9.54 < 2e-16
## newcell 9.20e-02 2.07e-02 4.45 8.6e-06
## ruca2 -1.41e-02 3.78e-03 -3.74 0.00018
## zip_comm -8.56e-02 4.21e-02 -2.03 0.04218
## Log(scale) -8.62e-01 1.07e-02 -80.75 < 2e-16
##
## Scale= 0.422
##
## Weibull distribution
## Loglik(model)= -15403 Loglik(intercept only)= -15528
## Chisq= 249.94 on 7 degrees of freedom, p= 2.9e-50
## Number of Newton-Raphson Iterations: 6
## n=12319 (176 observations deleted due to missingness)
# Compute the AIC for the AFT model with the Weibull distribution
aft_weibull_aic <- AIC(aft_weibull_model)
aft_weibull_aic
## [1] 30823.97
- Evaluate the estimated model fit and performance using the AIC
measure. (A caveat: the Cox model's AIC is based on a partial likelihood,
so it is not directly comparable with the fully parametric AFT models;
between the two AFT specifications, the Weibull fits markedly better.)
# Compare the AIC values
cat("AIC values:\nCox PH model:", cox_ph_aic, "\nAFT Exponential model:", aft_exp_aic, "\nAFT Weibull model:", aft_weibull_aic, "\n")
## AIC values:
## Cox PH model: 77001.51
## AFT Exponential model: 34979.55
## AFT Weibull model: 30823.97
- Interpret either the hazard ratio (for the PH model) or the time ratio (for
the AFT model) associated with the plan2 to plan4 dummies (relative to plan 1)
and age on the probability of or time to churning. (Because plan_chosen
entered the fitted models as a single numeric index, a dummy-based refit is
sketched after the confidence intervals below.)
# Compute time ratios for the best model (AFT Weibull)
time_ratios <- exp(coef(aft_weibull_model))
time_ratios
## (Intercept) plan_chosen ave_minute_peak promo_pct age
## 8.2692361 0.9710389 0.9998235 0.9688519 1.0044574
## newcell ruca2 zip_comm
## 1.0964031 0.9859539 0.9179447
# Compute 95% confidence intervals for time ratios
conf_ints <- confint(aft_weibull_model, level = 0.95)
ci_lower <- exp(conf_ints[, 1])
ci_upper <- exp(conf_ints[, 2])
# Combine time ratios and their confidence intervals
time_ratios_ci <- data.frame(TimeRatio = time_ratios, LowerCI = ci_lower, UpperCI = ci_upper)
time_ratios_ci
## TimeRatio LowerCI UpperCI
## (Intercept) 8.2692361 7.7660449 8.8050309
## plan_chosen 0.9710389 0.9585009 0.9837410
## ave_minute_peak 0.9998235 0.9997025 0.9999445
## promo_pct 0.9688519 0.9224392 1.0176000
## age 1.0044574 1.0035396 1.0053760
## newcell 1.0964031 1.0528486 1.1417594
## ruca2 0.9859539 0.9786799 0.9932821
## zip_comm 0.9179447 0.8451745 0.9969805
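As flagged above, plan_chosen enters the fitted models as a numeric index, which imposes a single log-linear effect across plans. To read off time ratios for plans 2 to 4 relative to plan 1 as the question asks, a sketch refits the Weibull AFT model with the plan dummies:
aft_weibull_dummies <- survreg(surv_obj ~ plan2 + plan3 + plan4 + ave_minute_peak +
                                 promo_pct + age + newcell + ruca2 + zip_comm,
                               data = merged_data, dist = "weibull")
exp(coef(aft_weibull_dummies)) # time ratios relative to the plan 1 baseline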
Using random forest classifier to predict churn
# Load the necessary library
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
merged_data_clean$churn <- as.factor(merged_data_clean$churn)
# Split the data into training and test sets
set.seed(123)
trainIndex <- sample(1:nrow(merged_data_clean), round(0.7 * nrow(merged_data_clean)), replace = FALSE)
trainData <- merged_data_clean[trainIndex, ]
testData <- merged_data_clean[-trainIndex, ]
# Train the model (all remaining columns enter as predictors)
rf_model <- randomForest(churn ~ ., data = trainData, importance = TRUE)
# Make predictions on the test set
rf_pred <- predict(rf_model, testData)
# Evaluate the model performance
rf_cm <- table(Predicted = rf_pred, Actual = testData$churn)
rf_accuracy <- sum(diag(rf_cm)) / sum(rf_cm)
print(paste0("Random Forest Classifier Accuracy: ", round(rf_accuracy * 100, 2), "%"))
## [1] "Random Forest Classifier Accuracy: 100%"
Perfect test accuracy here is a red flag for target leakage rather than genuine predictive power: predictors such as bill_year and bill_month come from each customer's final record, which largely reveals whether the panel ended in a churn event. A leakage-robust refit is sketched below.
# Plot variable importance
varImpPlot(rf_model)

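A leakage-robust sketch drops the identifier, calendar, and redundant plan-dummy columns before retraining; the resulting accuracy should be read as a more honest, if still rough, estimate:
leaky_cols <- c("cust_id", "bill_year", "bill_month", "cluster",
                "plan1", "plan2", "plan3", "plan4")
rf_data <- trainData %>% ungroup() %>% select(-any_of(leaky_cols))
rf_model2 <- randomForest(churn ~ ., data = rf_data, importance = TRUE)
rf_pred2 <- predict(rf_model2, testData)
mean(rf_pred2 == testData$churn) # honest accuracy estimate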