Dataset:
Title: Bank Marketing (with social/economic context)
This dataset is publicly available for research; its details are described in [Moro et al., 2014].
Citation: [Moro et al., 2014] S. Moro, P. Cortez, and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, 62:22-31, 2014. http://dx.doi.org/10.1016/j.dss.2014.03.001
This dataset is based on the "Bank Marketing" UCI dataset, enriched with five social and economic indicators published by the Banco de Portugal.
Number of Instances: 41188 for bank-additional-full.csv
Number of Attributes: 20 + output attribute.
Attribute information:
For more information, read [Moro et al., 2014].
Input variables:
# Bank client data:
1 - age (numeric)
2 - job: type of job (categorical: "admin.", "blue-collar", "entrepreneur", "housemaid", "management", "retired", "self-employed", "services", "student", "technician", "unemployed", "unknown")
3 - marital: marital status (categorical: "divorced", "married", "single", "unknown"; note: "divorced" means divorced or widowed)
4 - education (categorical: "basic.4y", "basic.6y", "basic.9y", "high.school", "illiterate", "professional.course", "university.degree", "unknown")
5 - default: has credit in default? (categorical: "no", "yes", "unknown")
6 - housing: has housing loan? (categorical: "no", "yes", "unknown")
7 - loan: has personal loan? (categorical: "no", "yes", "unknown")
# Related with the last contact of the current campaign:
8 - contact: contact communication type (categorical: "cellular", "telephone")
9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
10 - day_of_week: last contact day of the week (categorical: "mon", "tue", "wed", "thu", "fri")
11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration = 0 then y = "no"). Yet the duration is not known before a call is performed, and after the call ends y is obviously known. Thus this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# Other attributes:
12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13 - pdays: number of days that passed after the client was last contacted in a previous campaign (numeric; 999 means the client was not previously contacted)
14 - previous: number of contacts performed before this campaign and for this client (numeric)
15 - poutcome: outcome of the previous marketing campaign (categorical: "failure", "nonexistent", "success")
# Social and economic context attributes:
16 - emp.var.rate: employment variation rate, quarterly indicator (numeric)
17 - cons.price.idx: consumer price index, monthly indicator (numeric)
18 - cons.conf.idx: consumer confidence index, monthly indicator (numeric)
19 - euribor3m: euribor 3-month rate, daily indicator (numeric)
20 - nr.employed: number of employees, quarterly indicator (numeric)
Output variable (desired target):
21 - y: has the client subscribed a term deposit? (binary: "yes", "no")
Use Random Forest to build a predictive model that identifies whether a customer will subscribe to a term deposit.
Use OLS to identify the significant features influencing a customer's decision to subscribe to a term deposit.
Use Logit (logistic regression) to build a predictive model that identifies customers who are likely to churn.
Use CART to run a simulation and understand how changes in pdays affect a customer's decision to subscribe to a term deposit.
Create customer segments based on predicted subscription probability, and use Lasso to identify the top and bottom features influencing a customer's decision in each segment.
options(warn = -1)
Libraries
# Load required libraries
library(rpart)
library(rpart.plot)
library(caret)
library(dplyr)
library(ggplot2)
library(pROC)
library(stats)
library(randomForest)
library(caTools)
library(tidyr)
library(broom)
Data Import
data1 <- read.csv("bank-additional-full.csv", sep = ";")
data <- as.data.frame(data1)
Dropping and Pre-Processing
# Drop 'duration' as recommended for realistic predictive models
data <- data %>% select(-duration)
# Convert all character variables to factors for one-hot encoding
data <- data %>% mutate_if(is.character, as.factor)
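Before moving on, it can help to confirm the factor conversion and see how imbalanced the target is; a minimal sanity-check sketch using only the data frame built above:
# Quick check of the processed structure and the class balance of the target
str(data)
prop.table(table(data$y))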
EDA
# Function to plot graphs for each variable by outcome
plot_variable_by_outcome <- function(data, var_name, outcome_var = "y") {
if (is.factor(data[[var_name]]) || is.character(data[[var_name]])) {
# Categorical variable
p <- ggplot(data, aes_string(x = var_name, fill = outcome_var)) +
geom_bar(position = "dodge") + # Use position = "fill" for proportional stacking
scale_fill_manual(values = c("yes" = "steelblue", "no" = "skyblue")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste(var_name, "Distribution by Outcome"), x = var_name, y = "Count")
} else {
# Numerical variable, use faceting to create separate plots for each outcome
p <- ggplot(data, aes_string(x = var_name, fill = outcome_var)) +
geom_histogram(data = data[data[[outcome_var]] == 'yes', ], binwidth = diff(range(data[[var_name]]))/30, alpha = 0.5, fill = "steelblue") +
geom_histogram(data = data[data[[outcome_var]] == 'no', ], binwidth = diff(range(data[[var_name]]))/30, alpha = 0.5, fill = "skyblue") +
facet_wrap(~get(outcome_var), scales = "free_y") +
theme_minimal() +
labs(title = paste(var_name, "Distribution by Outcome"), x = var_name, y = "Count")
}
print(p)
}
# Loop over every input variable and plot its distribution against the outcome 'y'
for(var_name in names(data)) {
if(var_name != 'y') {
plot_variable_by_outcome(data, var_name, 'y')
}
}
# Split the data
set.seed(123) # Setting seed for reproducibility
indices <- sample.split(data$y, SplitRatio = 0.7)
train_data <- subset(data, indices == TRUE)
test_data <- subset(data, indices == FALSE)
# Build the model
rf_model <- randomForest(y ~ ., data = train_data, ntree = 100, mtry = floor(sqrt(ncol(train_data) - 1)))
predictions <- predict(rf_model, newdata = test_data)
confusionMatrix <- table(Actual = test_data$y, Predicted = predictions)
library(ggplot2)
library(dplyr)
plot_confusion_matrix_corrected <- function(cm, title = "Confusion Matrix - Percentages") {
# Ensure cm is a matrix
if(is.table(cm)) cm <- as.matrix(cm)
# Convert the confusion matrix to a dataframe
cm_df <- as.data.frame(as.table(cm))
colnames(cm_df) <- c("TrueLabels", "PredictedLabels", "Count")
# Calculate the total observations for normalization
total_counts <- sum(cm_df$Count)
# Calculate percentages of total observations
cm_df$Percentage <- (cm_df$Count / total_counts) * 100
# Plot
p <- ggplot(cm_df, aes(x = PredictedLabels, y = TrueLabels)) +
geom_tile(aes(fill = Percentage), color = "white") +
geom_text(aes(label = paste0(sprintf("%.1f%%", Percentage))), color = "black", size = 4) +
scale_fill_gradient(low = "skyblue", high = "blue") +
labs(title = title, x = "Predicted Labels", y = "True Labels", fill = "Percentage") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
plot_confusion_matrix_corrected(confusionMatrix, "Confusion Matrix - RF")
# Calculate accuracy
accuracy <- sum(diag(confusionMatrix)) / sum(confusionMatrix)
print(paste("Accuracy of Random Forest - ", accuracy))
## [1] "Accuracy of Random Forest - 0.900857882809971"
data2 <- as.data.frame(data)
# Convert the dependent variable 'y' from 'yes'/'no' to 1/0
data2$y_numeric <- ifelse(data$y == "yes", 1, 0)
data2 <- data2 %>% select(-y)
data2 <- data2 %>% mutate_if(is.character, as.factor)
# Load the necessary library
library(lmtest)
# Conduct OLS regression
ols_model <- lm(y_numeric ~ ., data = data2)
# Automatic identification
categorical_vars <- names(data2)[sapply(data2, is.factor)]
numeric_vars <- names(data2)[sapply(data2, is.numeric) | sapply(data2, is.integer)]
model_summary <- tidy(ols_model)
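Because the OLS here is a linear probability model, its errors are heteroskedastic by construction; a hedged sketch of an overall fit check and robust coefficient tests, assuming the sandwich package is available alongside lmtest:
# Overall fit of the linear probability model
glance(ols_model)
# Heteroskedasticity-robust standard errors (assumes the 'sandwich' package is installed)
library(sandwich)
coeftest(ols_model, vcov. = vcovHC(ols_model, type = "HC1"))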
# Plotting for categorical variables
library(stringr)
for(cat_var in categorical_vars) {
# Generate plot for each categorical variable
var_coefficients <- model_summary %>%
filter(str_detect(term, paste0("^", cat_var)))
if(nrow(var_coefficients) > 0) {
p <- ggplot(var_coefficients, aes(x = term, y = estimate)) +
geom_col(fill = "skyblue") +
geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error), width = 0.4) +
theme_minimal() +
labs(title = paste("Impact of", cat_var, "on Outcome"),
x = cat_var,
y = "Coefficient Estimate") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p)
}
}
# Plotting for numeric variables in one plot
num_coefficients <- model_summary %>% filter(term %in% numeric_vars)
ggplot(num_coefficients, aes(x = reorder(term, estimate), y = estimate)) +
geom_col(fill = "lightgreen") +
geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error), width = 0.4) +
theme_minimal() +
labs(title = "Impact of Numeric Variables on Outcome",
x = "Variable",
y = "Coefficient Estimate") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Simplify the coefficient data frame
simplified_coeffs <- model_summary %>%
select(term, estimate, p.value) %>%
mutate(significance = case_when(
p.value < .001 ~ '***',
p.value < .01 ~ '**',
p.value < .05 ~ '*',
TRUE ~ ''
))
simplified_coeffs <- subset(simplified_coeffs, term != "(Intercept)")
library(scales)
# Assuming 'simplified_coeffs' contains your significant variables
# Ensure it's filtered for significance if not already
significant_vars <- simplified_coeffs %>%
filter(significance != "") # Filter for significant variables if needed
# Creating a bar chart with color gradients based on 'estimate'
ggplot(significant_vars, aes(x = reorder(term, estimate), y = estimate, fill = estimate)) +
geom_col() + # Use geom_col for pre-summarized data; it's equivalent to geom_bar(stat = "identity")
scale_fill_gradient2(low = "red", mid = "yellow", high = "green", midpoint = median(significant_vars$estimate)) +
labs(title = "Significant Variables", x = "Term", y = "Estimate") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.title = element_text(size = 10),
legend.text = element_text(size = 8)) +
coord_flip()
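To answer the OLS objective more directly, the significant terms can also be ranked by the magnitude of their estimates; a short sketch on the data frame built above:
# Top significant terms by absolute effect size
significant_vars %>%
  arrange(desc(abs(estimate))) %>%
  head(10)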
# Predict probabilities for the entire dataset
full_prob_predictions <- predict(rf_model, newdata = data, type = "prob")
data3 <- as.data.frame(data)
# Add to the original data
data3$prob_yes <- full_prob_predictions[, 'yes']
# Create the 'clusters' column based on the 'probability' intervals
data3$clusters <- cut(data3$prob_yes, breaks = c(0, 0.25, 0.50, 0.75, 1), labels = c(1, 2, 3, 4), include.lowest = TRUE)
# Convert 'clusters' to numeric if necessary
data3$clusters <- as.numeric(as.character(data3$clusters))
library(dplyr)
# Calculate the average probability for each segment
average_probs_df <- data3 %>%
group_by(clusters) %>%
summarise(AverageProbability = mean(prob_yes, na.rm = TRUE)) %>%
ungroup() # to remove grouping structure from dataframe
# Print the resulting dataframe
print(average_probs_df)
## # A tibble: 4 × 2
## clusters AverageProbability
## <dbl> <dbl>
## 1 1 0.0338
## 2 2 0.361
## 3 3 0.637
## 4 4 0.860
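Alongside the average predicted probability, the size of each segment and its observed subscription rate give a quick check that the cut points are sensible; a minimal sketch:
# Segment sizes and observed subscription rates
data3 %>%
  group_by(clusters) %>%
  summarise(n = n(), observed_yes_rate = mean(y == "yes")) %>%
  ungroup()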
library(glmnet)
data4 <- as.data.frame(data3)
data4 <- data4 %>% select(-prob_yes)
# Create a model matrix for the predictors, excluding the clusters and response variable 'y'
# This automatically creates dummy variables for factor/categorical variables
predictors <- model.matrix(y ~ . - 1 - clusters, data = data4)
# Scale the predictors
predictors_scaled <- scale(predictors)
# The response variable
response <- data4$y
# Placeholder for results
results <- data.frame(Segment = character(), Increase = character(), Decrease = character())
for (i in 1:4) {
segment_indices <- data4$clusters == as.character(i)
if (sum(segment_indices) == 0) {
# If no data for this segment, skip
next
}
x_segment <- predictors_scaled[segment_indices, ]
y_segment <- response[segment_indices]
if (is.null(dim(x_segment)) || dim(x_segment)[1] < 2) {
# Not enough data to fit the model
next
}
# Fit Lasso model using cross-validation
cv_fit <- cv.glmnet(x_segment, y_segment, family = "binomial", alpha = 1)
# Extract coefficients at the optimal lambda, excluding the intercept
coef_lasso <- as.matrix(coef(cv_fit, s = "lambda.min"))
# Remove intercept and zero coefficients
coef_lasso <- coef_lasso[coef_lasso != 0, , drop = FALSE]
coef_lasso <- coef_lasso[-1, , drop = FALSE] # Removing intercept
if (is.null(coef_lasso) || length(coef_lasso) <= 1) {
# If no or only one non-zero coefficient
next
}
# Names of the coefficients
coef_names <- rownames(coef_lasso)
# Sorting coefficients
sorted_indices <- order(coef_lasso[, 1], decreasing = TRUE)
top_increase <- paste(coef_names[sorted_indices][1:2], collapse = ", ")
top_decrease <- paste(coef_names[sorted_indices][(length(sorted_indices)-1):length(sorted_indices)], collapse = ", ")
# Append to results dataframe
results <- rbind(results, data.frame(Segment = paste("Segment", i), Increase = top_increase, Decrease = top_decrease))
}
print(results)
## Segment Increase
## 1 Segment 1 poutcomenonexistent, educationuniversity.degree
## 2 Segment 2 emp.var.rate, monthmay
## 3 Segment 3 monthmay, pdays
## 4 Segment 4 defaultunknown, monthjul
## Decrease
## 1 campaign, contacttelephone
## 2 jobretired, poutcomesuccess
## 3 poutcomenonexistent, jobretired
## 4 jobunknown, euribor3m
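Before trusting the selected coefficients, the cross-validation curve behind lambda.min can be inspected for any one segment; a sketch that refits the Lasso for segment 1 (chosen purely as an example):
# Refit the Lasso for a single segment and plot the cross-validation curve
seg1_idx <- data4$clusters == 1
cv_seg1 <- cv.glmnet(predictors_scaled[seg1_idx, ], response[seg1_idx], family = "binomial", alpha = 1)
plot(cv_seg1) # binomial deviance across the lambda path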
library(glmnet)
data4 <- as.data.frame(data3)
data4 <- data4 %>% select(-prob_yes)
# Create a model matrix for the predictors, excluding the clusters and response variable 'y'
# This automatically creates dummy variables for factor/categorical variables
predictors <- model.matrix(y ~ . - 1 - clusters, data = data4)
# Scale the predictors
predictors_scaled <- scale(predictors)
# The response variable
response <- data4$y
results <- data.frame(Segment = character(), Top1 = character(), Top2 = character(), Last = character(), `2ndLast` = character())
for (i in 1:4) {
segment_indices <- data3$clusters == as.character(i)
if (sum(segment_indices) == 0) { next }
x_segment <- predictors_scaled[segment_indices, ]
y_segment <- response[segment_indices]
if (is.null(dim(x_segment)) || dim(x_segment)[1] < 2) { next }
cv_fit <- cv.glmnet(x_segment, y_segment, family = "binomial", alpha = 1)
coef_lasso <- as.matrix(coef(cv_fit, s = "lambda.min"))
coef_lasso <- coef_lasso[coef_lasso != 0, , drop = FALSE]
coef_lasso <- coef_lasso[-1, , drop = FALSE] # Removing intercept
if (is.null(coef_lasso) || length(coef_lasso) <= 1) { next }
coef_names <- rownames(coef_lasso)
sorted_indices <- order(coef_lasso[, 1], decreasing = TRUE)
results <- rbind(results, data.frame(
Segment = paste("Segment", i),
Top1 = coef_names[sorted_indices][1],
Top2 = coef_names[sorted_indices][2],
Last = coef_names[sorted_indices][length(sorted_indices)],
`2ndLast` = coef_names[sorted_indices][length(sorted_indices)-1]
))
}
results
## Segment Top1 Top2 Last
## 1 Segment 1 poutcomenonexistent educationuniversity.degree contacttelephone
## 2 Segment 2 emp.var.rate monthmay cons.price.idx
## 3 Segment 3 monthmay pdays jobretired
## 4 Segment 4 defaultunknown monthjul euribor3m
## X2ndLast
## 1 campaign
## 2 euribor3m
## 3 poutcomenonexistent
## 4 jobunknown
data5 <- as.data.frame(data)
library(caTools)
# Subset data for 'current customers'
current_customers <- subset(data5, poutcome == 'success' & pdays != 999)
current_customers <- subset(current_customers, education != "illiterate")
# Convert 'y' to a binary variable where 'yes' = 0 (not churning) and 'no' = 1 (churning)
current_customers$churn <- ifelse(current_customers$y == 'yes', 0, 1)
# Convert 'churn' to a factor for logistic regression
current_customers$churn <- as.factor(current_customers$churn)
current_customers <- current_customers %>% select(-y)
current_customers <- current_customers %>% select(-poutcome)
# Split the current customers into training and testing sets
set.seed(123) # Setting seed for reproducibility
train_indices <- sample.split(current_customers$churn, SplitRatio = 0.7)
train_data_cc <- subset(current_customers, train_indices == TRUE)
test_data_cc <- subset(current_customers, train_indices == FALSE)
train_data_cc <- train_data_cc %>% mutate_if(is.character, as.factor)
test_data_cc <- test_data_cc %>% mutate_if(is.character, as.factor)
# Function to align factor levels in test data with those in training data
align_factor_levels <- function(train, test) {
for (col_name in names(train)) {
if (is.factor(train[[col_name]])) {
# Re-map test values onto the training levels (values unseen in training become NA)
test[[col_name]] <- factor(test[[col_name]], levels = levels(train[[col_name]]))
if ("unknown" %in% levels(train[[col_name]])) {
test[[col_name]][is.na(test[[col_name]])] <- 'unknown'
} else {
test <- test[!is.na(test[[col_name]]),]
}
}
}
return(test)
}
# Align factor levels of test data with training data
test_data_cc <- align_factor_levels(train_data_cc, test_data_cc)
# Now, fit the logistic regression model using the training data
logit_model <- glm(churn ~ ., data = train_data_cc, family = 'binomial')
# Predict probabilities on the testing set
test_data_cc$predicted_prob <- predict(logit_model, newdata = test_data_cc, type = 'response')
# Create a classification threshold, for example, 0.5
threshold <- 0.5
test_data_cc$predicted_churn <- ifelse(test_data_cc$predicted_prob > threshold, 1, 0)
# Evaluate model performance
confusion_matrix <- table(Predicted = test_data_cc$predicted_churn, Actual = test_data_cc$churn)
plot_confusion_matrix_corrected(confusion_matrix, "Confusion Matrix - LR")
# Calculate accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(paste("Accuracy of the logistic model: ", accuracy))
## [1] "Accuracy of the logistic model: 0.66747572815534"
# Load the required library
library(rpart)
# Fit the CART model on the training data
# Assuming 'y' is the target variable and 'pdays' among other variables are predictors
data6 <- as.data.frame(data)
data6 <- subset(data6, pdays != 999) # keep only clients contacted in a previous campaign
set.seed(123) # Setting seed for reproducibility
indices <- sample.split(data6$y, SplitRatio = 0.7)
train_data1 <- subset(data6, indices == TRUE)
test_data1 <- subset(data6, indices == FALSE)
cart_model <- rpart(y ~., data = train_data1, method = "class")
# Validate the model on the holdout (test) data
predictions <- predict(cart_model, newdata = test_data1, type = "class")
# Calculate the confusion matrix to see the performance
conf_matrix <- table(test_data1$y, predictions)
plot_confusion_matrix_corrected(conf_matrix, "Confusion Matrix - CART")
# Calculate accuracy or other performance metrics as needed
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy: 0.700440528634361"
# Pre-calculate the baseline percentage of 'yes' predictions with pdays left as-is (999 = not previously contacted)
baseline_preds <- predict(cart_model, newdata = data5, type = "class")
baseline_yes_percentage <- (sum(baseline_preds == "yes") / length(baseline_preds))*100
# Initialize a vector to hold the percentage of 'yes' for each pdays scenario from 1 to 27
yes_percentages <- numeric(27)
# Loop through each scenario from 1 to 27
for(i in 1:27) {
# Copy the original dataset and change 'pdays' from 999 to i
modified_data <- data5
modified_data$pdays[modified_data$pdays == 999] <- i
# Apply the CART model to the modified data
scenario_preds <- predict(cart_model, newdata = modified_data, type = "class")
# Calculate and store the percentage of 'yes' predictions
yes_percentages[i] <- (sum(scenario_preds == "yes") / length(scenario_preds))*100
}
# Combine baseline and scenarios for plotting
all_percentages <- c(baseline_yes_percentage, yes_percentages)
scenarios <- 0:27 # Include baseline (0 for pdays = 999) and each scenario
# Create a data frame for plotting
plot_data <- data.frame(Scenario = scenarios, PercentageYes = all_percentages)
# Use ggplot2 to create the graph
library(ggplot2)
ggplot(plot_data, aes(x = Scenario, y = PercentageYes)) +
geom_line() + geom_point() +
scale_x_continuous(breaks = 0:27) +
labs(title = "Change in Percentage of 'Yes' Responses by Modification in 'pdays'",
x = "Pdays (with 0 representing baseline pdays=999)",
y = "Percentage of 'Yes' Responses") +
theme_minimal()