Part 1

Bank Marketing Campaign Analysis

Exploratory Data Analysis

Let us first look at our clients using socio-demographic parameters and indicators of in-bank activity. The value of interest here is the response to the marketing campaign, so this variable is shown by color in each graph.

# Data preparation: age category
databar <- data %>% dplyr::select(agecat, response) %>% 
              na.omit() %>% group_by(agecat) %>% 
              arrange(agecat) %>% 
              count(response) %>%                                     
              mutate(percent = n/sum(n))


plot1 <- ggplot(data = databar, mapping = aes(x = agecat, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Age category",                                              
         y = "Percentage",                                        
         title = "by age",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + coord_flip() + theme_test()

# Data preparation: job
databar <- data %>% dplyr::select(job, response) %>% 
              na.omit() %>% group_by(job) %>% 
              arrange(job) %>% 
              count(response) %>%                                     
              mutate(percent = round(n/sum(n), 5))


plot2 <- ggplot(data = databar, mapping = aes(x = job, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Job",                                              
         y = "Percentage",                                        
         title = "by type of job",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + coord_flip() + theme_test()

# Data preparation: balance category
databar <- data %>% dplyr::select(balcat, response) %>% 
              na.omit() %>% group_by(balcat) %>% 
              arrange(balcat) %>% 
              count(response) %>%                                     
              mutate(percent = n/sum(n))


plot3 <- ggplot(data = databar, mapping = aes(x = balcat, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Balance category",                                              
         y = "Percentage",                                        
         title = "by balance category",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + theme_test()


# Data preparation: education
databar <- data %>% dplyr::select(education, response) %>% 
              na.omit() %>% group_by(education) %>% 
              arrange(education) %>% 
              count(response) %>%                                     
              mutate(percent = n/sum(n))


plot4 <- ggplot(data = databar, mapping = aes(x = education, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Education",                                              
         y = "Percentage",                                        
         title = "by obtained education",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + theme_test()

figure1 <- ggarrange(plot1, plot2, plot3, plot4, 
                    labels = c("1", "2", "3", "4"),
                    ncol = 2, nrow = 2,  common.legend = TRUE, legend = "top")

annotate_figure(figure1,
               top = text_grob("Distribution of clients \nwith their status according to their clients' history", 
                               color = "midnightblue", face = "bold", size = 14))

# Data preparation: marital
databar <- data %>% dplyr::select(marital, response) %>% 
              na.omit() %>% group_by(marital) %>% 
              arrange(marital) %>% 
              count(response) %>%                                     
              mutate(percent = n/sum(n))


plot5 <- ggplot(data = databar, mapping = aes(x = marital, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Marital",                                              
         y = "Percentage",                                        
         title = "by marital status",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + theme_test()

# Data preparation: default
databar <- data %>% dplyr::select(default, response) %>% 
              na.omit() %>% group_by(default) %>% 
              arrange(default) %>% 
              count(response) %>%                                     
              mutate(percent = n/sum(n))

plot6 <- ggplot(data = databar, mapping = aes(x = default, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Having default",                                              
         y = "Percentage",                                        
         title = "by having default",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + theme_test()

# Data preparation: housing
databar <- data %>% dplyr::select(housing, response) %>% 
              na.omit() %>% group_by(housing) %>% 
              arrange(housing) %>% 
              count(response) %>%                                     
              mutate(percent = n/sum(n))


plot7 <- ggplot(data = databar, mapping = aes(x = housing, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Having housing",                                              
         y = "Percentage",                                        
         title = "by having housing",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + theme_test()

# Data preparation: loan
databar <- data %>% dplyr::select(loan, response) %>% 
              na.omit() %>% group_by(loan) %>% 
              arrange(loan) %>% 
              count(response) %>%                                     
              mutate(percent = n/sum(n))


plot8 <- ggplot(data = databar, mapping = aes(x = loan, y = percent, fill = response)) +
  geom_col() +
  geom_text(mapping = aes(label = percent(percent)),              
            size = 3,                                             
            position = position_stack(vjust = 0.5)) +                    
  labs(x = "Having loan",                                              
         y = "Percentage",                                        
         title = "by having loan",       
         fill = "Response") +                               
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values = c("#999999", "#E69F00")) + theme_test()


figure2 <- ggarrange(plot5, plot6, plot7, plot8, 
                    labels = c("5", "6", "7", "8"),
                    ncol = 2, nrow = 2, common.legend = TRUE, legend = "none")

figure2

remove(databar, plot1, plot2, plot3, plot4, plot5, plot6, plot7, plot8)

Socio-demographic parameters:

1.There are clear differences in subscriptions in categories age and type of job.

In details, seniors (65+) and youth (18-24) subscribed in 43.3% and 24.9% correspondingly, while the maximum for other age categories is 12.7% This observation is related to the distribution in the type of job. Thus, a higher subscription rate is observed for retired and students - 22.2% and 29.5% Also, categories in which the rate exceeds 10% can be selected (listed in descending order): unemployed, management, admin, self-employed, technician.

2.There are small differences in status and education rates.

For example, a single person subscribed in 14.6% of observations while married just in 9.8%. Concerning education, the graph shows the following trend - the higher the level of education, the more people subscribed.

Note: students & retired people are quite similar in the subscriptions rate. In further campaigns, it is possible to distinguish them as a separate segment of the audience and prepare a more targeted product. If the proposal remains unchanged in the next campaigns, it should focus on single people with a tertiary level of obtained education.

In-bank activity:

1.Those who have any type of credit (housing, loan, default) subscribed less. The data shows that such clients subscribed in less than 8% of observations.

2.There is a trend that clients with higher balance subscribed more. The subscription rate for High and Medium balance is 14.9% and 14.8%. Then, it drops by about 4% for Low and then for Negative categories.

Note: In the case of the unchanged proposal remains for the next campaign it is better to focus on clients with a balance higher than 1500 and no credits.

Time is important, and business sphere is not an exception. The next step is to consider the campaign itself. Here, the focus is on the number of calls and their duration.

df_time <- data %>% group_by(response) %>% 
                    summarise(total_duration = sum(duration)/360, 
                              total_num_calls = sum(campaign))

kable(df_time, caption = "Number of calls & their duration") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)

Number of calls & their duration
response	total_duration	total_num_calls
no	370.4944	103346
yes	117.6528	9953

On a roll call of all the customers, the Bank has spent around 488 hours. Meanwhile, for subscribers total duration is about 3 times less than for non-subscribers.

The total number of calls is 113 299. The most dramatic difference is observed in this indicator. For subscribers, the total number of calls is 9953 which about 10 times less than for non-subscribers.

df_time <- data %>% group_by(response) %>% 
                    summarise(median_time = median(duration), 
                              median_calls = median(campaign))

kable(df_time, 
      caption = "Number of calls & their duration \n (in average)") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)

Number of calls & their duration (in average)
response	median_time	median_calls
no	3	2
yes	7	2

Surprisingly, the median call duration for subscribers is 7 that is about 2 times higher than for non-subscribers. Hypothetically, it can be explained by the assumption that subscribers were clarifying information during a phone call when non-subscribers were trying to end the conversation sooner. At the same time, there is no any difference in the median number of calls.

To estimate the efficiency of the marketing campaign, more insights can be obtained by analyzing people who accepted the company’s offer. The following table demonstrates how much time was spent and how many calls were made for people, who accepted the offer, per their age group.

df_time <- data %>% filter(response == "yes") %>% 
                    group_by(agecat) %>% 
                    summarise(median_time = median(duration), 
                              median_calls = median(campaign))

kable(df_time) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)

agecat	median_time	median_calls
18-24	5	1
25-34	7	2
35-44	9	2
45-54	8	2
55-64	7	1
65+	6	1

According to the data, the emploees have to spend more time on people whose age are between 35-44 and 45-54 to provide they with information. While talking with the youngest audience is less time-consuming.

Continuing to consider the duration and number of calls let us plot the association between the number of calls and time spent to make them.

# Scatter plot
ggplot(data = data) +
  geom_point(aes(x = campaign, y = duration), color = "steelblue") + 
  facet_grid(~response) +
  ggtitle("Assosiation between number of calls and time spent to make them \n(a case of the last campaign with its result)") + 
  xlab("Number of calls") + ylab("Spent time") + theme_bw()

It is seen, that the distribution of subscribers and non-subscribers differs. Subscribers were contacted less and the densest part of the plot is up to 5 calls. It can be assumed that a large number of calls repels the client and he/she refuses to subscribe to the term deposit.

Further, the next metric that can be calculated to evaluate marketing campaign is a conversion rate. The conversion rate is a number of customers who accepted company’s offer divided to the total number of all clients and multiplied by 100%. The most common way to depict a conversion rate is a funnel plot.

# Data preparation
subscribers <- c("Non-subscribers", "New subscribers", "Returned subscribers")
df1 <- data %>% count()
df2 <- data %>% filter(response == "yes") %>% count()
df3 <- data %>% filter(poutcome == "success" & response == "yes") %>% count()
df <- rbind(df1, df2, df3)
df <- cbind(subscribers, df)
df <- df %>% rename(quantity = n)

# Funnel plot
library(plotly)

plot_ly(
  type = "funnelarea",
  text = df$subscribers,
  values = df$quantity) %>%
  layout(
    title = "Funnel plot: conversion rate",
    scene = list(
      xaxis = list(title = "Cos"),
      yaxis = list(title = "Sin"),
      zaxis = list(title = "Z")
    ))

# data %>% filter(previous != 0 & campaign != 0) %>% count() # 6044 people who took in both campaigns
# 896 / 6044 # 14.82%

# data %>% filter(previous == 0 & campaign != 0) %>% count() # 34797 - people who took part only in the last performed campaign
# 4639 / 34797 # 13.33%

All three parts of the presented above plot are about people who have been participated in the last marketing campaign. The total conversion rate for the last marketing campaign is 100% - 88.1% = 11.9% CR. Let us look at the plot more precisely. The blue area is a bit ambiguous at first glance. 88.1% of clients who participated in the last campaign do not subscribe. To depict the right percentage of this group the total number of all of the clients should be depicted in this part of the funnel chart in order to make its flow correct (that is why the size of clients in this part is equal to 40841).

The next part, that is colored in orange, stands for clients who decided to accept the offer that was suggested in the last marketing campaign and does not participate in the previous campaign. The number of these clients is 4639 that is 10% of the total number of clients that took part in the campaign.

Сonversation rate for new subscribers is 13.33% To calculate it, data were filtered by previous and campaign variables. It appeared, that the total amount of clients who took part only in the last performed campaign is 34 797. While only 4 639 of them subscribed to a term deposit.

1.93% is a percentage of people regarding the total number of clients who agreed to accept offers as a result of the two marketing campaigns.

Сonversation rate for returned subscribers is 14.82% To calculate it, data were filtered by previous and campaign variables. It appeared, that the total amount of clients who took part in both campaigns is 6 044. 896 of them subscribed to a term deposit the second time.

The next step is another plot. A Pareto chart is a great tool for decision making and focusing on the question of interest. We create a Pareto chart for the balance to know what categories of customers give the bank the most subscribers.

# Pareto chart 
ggpareto <- function(x) {
  
  title <- deparse(substitute(data))

  library(dplyr)
  
  Df <- data %>% group_by(balcat) %>% summarise(frequency = n()) %>% 
    arrange(desc(frequency))
  
  Df$balcat <- ordered(Df$balcat, levels = unlist(Df$balcat, use.names = F))
  
  Df <- Df %>% mutate(modality_int = as.integer(balcat), 
                      cumfreq = cumsum(frequency), cumperc = cumfreq/nrow(data) * 100)
  nr <- nrow(Df)
  N <- sum(Df$frequency)
  
  Df_ticks <- data.frame(xtick0 = rep(nr + .55, 11), xtick1 = rep(nr + .59, 11), 
                         ytick = seq(0, N, N/10))
  
  y2 <- c("  0%", " 10%", " 20%", " 30%", " 40%", " 50%", " 60%", " 70%", " 80%", " 90%", "100%")
  
  library(ggplot2)
  
  g <- ggplot(Df, aes(x = balcat, y = frequency)) + 
    geom_bar(stat = "identity", aes(fill = modality_int)) +
    geom_line(aes(x = modality_int, y = cumfreq, color = modality_int)) +
    geom_point(aes(x = modality_int, y = cumfreq, color = modality_int), pch = 19) +
    scale_y_continuous(breaks = seq(0, N, N/10), limits = c(-.02 * N, N * 1.02)) + 
    scale_x_discrete(breaks = Df$balcat) +
    guides(fill = FALSE, color = FALSE) + 
    annotate("rect", xmin = nr + .55, xmax = nr + 1, 
             ymin = -.02 * N, ymax = N * 1.02, fill = "white") +
    annotate("text", x = nr + .8, y = seq(0, N, N/10), label = y2, size = 3.5) +
    geom_segment(x = nr + .55, xend = nr + .55, y = -.02 * N, yend = N * 1.02, color = "grey50") +
    geom_segment(data = Df_ticks, aes(x = xtick0, y = ytick, xend = xtick1, yend = ytick)) +
    labs(title = paste0("Pareto Chart: results of the current campaign"), y = "Absolute frequency", x = "Category of balance") +
    theme_bw() + coord_flip()
  
  return(g)
}

# applying the function to the factor variable:
ggpareto(data$balcat)

Creating the Pareto Chart for the marketing campaign data and applying the 80/20 rule, it is seen from the chart that something is wrong.

It occurs that the company’s specialists communicated more with the clients whose balance is low (less than 1000 and not negative). There is 20% of them. Thinking logically, it is quite controversial as the company tried to promote an offer to make a debt to people who have pretty limited sum of money. These results contradict with first basic barplot which showed that Medium and High balance clients have a higher subscription rate.

Overall, it can be concluded that these results may be due to the fact that the marketing campaign was focused on the audience with a balance below 1000 and such customers in the sample were much more than others.

Note: In further campaigns, it is possible to focus on customers with High or Medium balance. Another way to improve the marketing campaign focused on a low-balanced audience is to prepare more targeted product for this particular segment.

Summing up

As it can be mentioned, subscription rate for clients for 18-24 and 65+ age category and for students and retired people strongly positively differs from the rest of the sample, so variables age and job should be included in further analysis.

It is clear that everyone who already has any type of credit had a smaller subscription rate compared to other customers and that there is a tendency to increase the subscription rate with an increase in the balance. These facts mean that it is necessary to consider the variables of activity in the bank along with the socio-demographical variables. Consideration of these variables together will be able to identify important relationships that affect the campaign

Based on the association between the number of calls and time spent to make them it was assumed that a large number of calls repels the client and he/she refuses to subscribe to the term deposit. Thus, to check this hypothesis variables about the marketing campaign process (as duration, pdays, etc) itself should be included in further analysis.

Predictive Models

One of the popular methods used for prediction is construction decision tree models.

1st tree

In order to get some insights about the probabilities of clients’ subscriptions the decision tree related to clients’ socio-demographic characteristics have been plotted.

To make a tree more representative, variable that is about age is used as categorical one. As a result, there are 5 variables in the tree: age, education, marital status, and job.

library(partykit)
library(tree)
library(rpart)
library(rpart.plot)
library(caret)

# Data preparation
data2 <- data %>% dplyr::select(agecat, education, jobsh, marital, response_binary) %>% na.omit()

# Test & Train
set.seed(100)
test_ind <-  createDataPartition(data2$response_binary, p = 0.2, list = F)
tree_test <- data2[test_ind,]
tree_train <- data2[-test_ind,]

# Tree 1
tree1 <- rpart(response_binary ~ ., method = "class", data = tree_train)

# Prediction & Confusion matrix
pred <- predict(tree1, tree_test, type = "class")

confusionMatrix(pred, tree_test$response_binary)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6001  777
##          1    0    0
##                                           
##                Accuracy : 0.8854          
##                  95% CI : (0.8775, 0.8929)
##     No Information Rate : 0.8854          
##     P-Value [Acc > NIR] : 0.5096          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8854          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8854          
##          Detection Rate : 0.8854          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
##

Above parameters of the classification decision tree based on socio-demographic characteristics can be seen. In this case a confusion matrix can be mainly considered. According to the matrix the tree did not predict correctly any clients who potentially could subsribed. Although accuracy is 88% (test sample), it is hard to rely on the results as “true negative” (if a client with a set of specific characteristics is subscribed, when a similar client is predicted to be subscribed) is 0.

2nd tree

Let us try to use all variables to create our first decision tree model. However, there are some remarks. We exclude such variables as day and month as we have no idea the tendency of how intensive the campaign has been hold. And we also should take into account a duration varible. If duration is 0, when a contact was not established. That is why the data was filtered, and the tree presented below is focused on the clients with whome contacts were made.

# Data preparation
data2 <- data %>% dplyr::select(-c(X, response, job, dur, age, balance, day, month)) %>% filter(duration != 0)

# Test & Train
set.seed(100)
test_ind <-  createDataPartition(data2$response_binary, p = 0.2, list = F)
tree_test <- data2[test_ind,]
tree_train <- data2[-test_ind,]

# Tree 2: almost all variables
tree2 <- rpart(response_binary ~ ., method = "class", data = tree_train)
prp(tree2, extra = 4)

# Prediction & Confusion matrix
pred <- predict(tree2, tree_test, type = "class")

confusionMatrix(pred, tree_test$response_binary)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6701  599
##          1  206  328
##                                           
##                Accuracy : 0.8972          
##                  95% CI : (0.8903, 0.9039)
##     No Information Rate : 0.8817          
##     P-Value [Acc > NIR] : 7.431e-06       
##                                           
##                   Kappa : 0.3968          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9702          
##             Specificity : 0.3538          
##          Pos Pred Value : 0.9179          
##          Neg Pred Value : 0.6142          
##              Prevalence : 0.8817          
##          Detection Rate : 0.8554          
##    Detection Prevalence : 0.9318          
##       Balanced Accuracy : 0.6620          
##                                           
##        'Positive' Class : 0               
##

Insights

The algorithm decided that the duration is a key to the prediction of customers’ subscription. The following interpretation can be said:

if the time spent on conversation was less than 8.5 min and a client had already participated in the campaign previously with no positive distinction for the company or had not taken part, there would be 94% of probability that he or she is not going to subscribe.
if the time spent on conversation was less than 2.5 min and a client had already participated in the previous campaign successfully for the company, then there is 70% of probability that the client is not subscribed.
if the time spent on conversation was more than 2.5 min and less than 8.5 and a client had already participated in the previous campaign successfully for the company, then there is 28% of probability that the client is subscribed.
if the time spent on conversation was less than 14 min but more than 8.5 min and a client had already participated in the campaign previously with no positive distinction for the company or had not taken part, then there is 66% of probability that the client is not subscribed.
if the time spent on conversation was less than 14 min but more than 8.5 min and a client had already participated in the previous campaign successfully for the company, then there is 17% of probability that the client is subscribed.
if the time spent on conversation was more than 14 min, then there is 40% of probability that the client is subscribed.

Overall, it can be inferred that the concept of consumer awareness works. Informing clients is a part of the corporate social responsibilities (CSR) regarding firms’ activities. This idea has two sides: (1) although consumers are not willing to pay a higher price, they will more likely purchase goods from firms that are more socially responsible, and (2) consumers are often not aware of a firm’s CSR activities (Servaes & Tamayo, 2013). Anyway, to make relationship between a client and a firm, it is essential to inform in time as well as keep exiting relationship.

Estimation

Accuracy of this model is 89.92% At first it may seem that it is good, but it is worth to pay attention to other parameters of evaluation. Looking at the confusion matrix, it is clear that the model predicts subscription to the term deposit right in 323 observation out of 7834. So, specificity is low - 34.84% However, the model is good in the prediction of churn (non-subscription to the term deposit). The model predicted 6721 non-subscribers and was wrong 186 times. Here, the sensitivity parameter is high - 97.31%. Kappa index shows that the model is in 39.95% better than random predictions. It should be noted, that positive class of model - “0” - those clients who do not subscribe to the terms deposit.

Also, the another metric for estimation can be used - Gini impurity. This estimation can give some insights how the classes in each segment has been shuffled. In case of ideal splitting, the Gini impurity is equal to 0. If it is 0.5, then splitting does not help anyhow.

# Calculate gini index
source("D:/data/compute_gini.R")
tree_test$pred <- pred
gini_split(data = tree_test, real = response_binary, predicted = pred)

## [1] 0.1726783

The Gini impurity (or Gini index) is 0.17. It means that the degree of homogeneity obtained as a result of separation in groups is extremely high. As follows, this index proves that the classes in the tree is classified correctly with a low degree of uncertanty.

At this step we also can define important variables as we use almost all of avaliable ones.

# Importance of variables
set.seed(100)
tree_train <- na.exclude(tree_train)
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
model <- train(response_binary ~ ., data = tree_train, 
               method = "rpart", 
               trControl = trctrl,
               tuneLength = 10)
importance <- varImp(model, scale = FALSE)
print(importance)

## rpart variable importance
## 
##   only 20 most important variables shown (out of 35)
## 
##                    Overall
## poutcomesuccess   1212.232
## duration          1045.798
## pdays              570.745
## previous           464.402
## poutcomeunknown    436.353
## housingyes         161.802
## agecat65+          148.953
## jobshst             95.040
## maritalmarried      49.158
## loanyes             31.664
## educationtertiary   23.970
## jobshrtr            20.670
## campaign            20.528
## maritalsingle       12.346
## balcatMedium        11.348
## balcatLow           10.877
## agecat25-34         10.662
## agecat35-44         10.475
## jobshse              6.390
## agecat55-64          5.379

plot(importance)

As was expected, the most important variables have appered those that were taken by the algorithm previously to create a decision tree.

Checking dependencies

Further, let us exclude duration, as well. Duration is not taken into account due to its impact on the output target (response).

# Correlation
data$response_binary <- as.numeric(data$response_binary)
cor(data$response_binary, data$duration)

## [1] 0.3998286

cor(data$response_binary, data$pdays)

## [1] 0.1085542

data$response_binary <- as.factor(data$response_binary)
chisq.test(data$response_binary, data$poutcome)

## 
##  Pearson's Chi-squared test
## 
## data:  data$response_binary and data$poutcome
## X-squared = 4072.1, df = 2, p-value < 2.2e-16

The correlation between response and duration is 0.4 that is thought as a low but still prove the existence of association. The correlation between response and a number of days that passed by after the client was last contacted from a previous campaign is pretty low (it is 0.1): there is no relationship. According to a chi-squared test there is an association between result of participating in the prevous campaign and response variable.

3rd tree

# Data preparation
data2 <- data %>% dplyr::select(-c(X, response, job, dur, age, balance, day, month, duration)) 

# Test & Train
set.seed(100)
test_ind <-  createDataPartition(data2$response_binary, p = 0.2, list = F)
tree_test <- data2[test_ind,]
tree_train <- data2[-test_ind,]

# Tree 2: almost all variables
tree2 <- rpart(response_binary ~ ., method = "class", data = tree_train)

# Prediction & Confusion matrix
pred <- predict(tree2, tree_test, type = "class")
prp(tree2, extra = 4)

confusionMatrix(pred, tree_test$response_binary)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 7144  741
##          2   97  187
##                                           
##                Accuracy : 0.8974          
##                  95% CI : (0.8906, 0.9039)
##     No Information Rate : 0.8864          
##     P-Value [Acc > NIR] : 0.0007844       
##                                           
##                   Kappa : 0.2697          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9866          
##             Specificity : 0.2015          
##          Pos Pred Value : 0.9060          
##          Neg Pred Value : 0.6585          
##              Prevalence : 0.8864          
##          Detection Rate : 0.8745          
##    Detection Prevalence : 0.9652          
##       Balanced Accuracy : 0.5941          
##                                           
##        'Positive' Class : 1               
##

Without duration variable a result of the previous marketing campaign comes into play. If a person took part previously and (s)he subscibed, there is 36% that (s)he would do it once again (and 64% that is pretty high that (s)he would not).

Accuracy (test sample) is higher than before and equals to 89%. The prediction was misleading regarding 750 out of 8166 potential clients. Sensitivity is 98.54%, and pecificity is 19.18%: most of the positive examples are correctly recognized but there are a lot of false positives.

# Calculate gini index
tree_test$pred <- pred
gini_split(data = tree_test, real = response_binary, predicted = pred)

## [1] 0.1800058

The Gini impurity (or Gini index) is 0.18. It means that the degree of homogeneity obtained as a result of separation in groups is still high.

4th tree

Here we exclude pdays variable (number of days that passed by after the client was last contacted), previous(), and result of participating before as they stand for information about results of previous campaign leading to bias because a majority of clients are actually new (did not participate before).

# Data preparation
data2 <- data %>% dplyr::select(-c(X, response, job, dur, age, balance, day, month, duration, pdays, poutcome, previous)) 

# Test & Train
set.seed(100)
test_ind <-  createDataPartition(data2$response_binary, p = 0.2, list = F)
tree_test <- data2[test_ind,]
tree_train <- data2[-test_ind,]

# Tree 2: almost all variables
tree2 <- rpart(response_binary ~ ., method = "class", data = tree_train)

# Prediction & Confusion matrix
pred <- predict(tree2, tree_test, type = "class")

confusionMatrix(pred, tree_test$response_binary)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 7241  928
##          2    0    0
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.8793, 0.8932)
##     No Information Rate : 0.8864          
##     P-Value [Acc > NIR] : 0.5087          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8864          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8864          
##          Detection Rate : 0.8864          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
##

According to the data and variables that are available in it, the algorithm cannot predict any possible subscription in favor of campaign results. There were 928 clients who subscribed but the model does not recognize them. The accuracy of this model is 88.64%. However, it does not make any sense because the Kappa index is equal to 0 which means the model no better than a random prediction. The true positive rate (the ability of a test to correctly identify) is 1. According to McNemar’s test, there is a difference between the paired cases (p-value is <2e-16). Overall, hardly can this prediction be valid in case of any decision-making about the marketing campaign.

Let us have a look at the importance of variables.

# Importance of variables
set.seed(100)
tree_train <- na.exclude(tree_train)
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
model <- train(response_binary ~ ., data = tree_train, 
               method = "rpart", 
               trControl = trctrl,
               tuneLength = 10)
importance <- varImp(model, scale = FALSE)
print(importance)

## rpart variable importance
## 
##   only 20 most important variables shown (out of 27)
## 
##                    Overall
## agecat65+          119.692
## housingyes         113.191
## jobshblc            41.523
## jobshst             38.689
## jobshrtr            38.059
## maritalmarried       7.911
## maritalsingle        5.484
## campaign             3.709
## educationsecondary   0.000
## loanyes              0.000
## jobshhm              0.000
## jobshoth             0.000
## `agecat35-44`        0.000
## `agecat65+`          0.000
## balcatMedium         0.000
## jobshmng             0.000
## jobshune             0.000
## `agecat45-54`        0.000
## jobshent             0.000
## defaultyes           0.000

plot(importance)

There are some interesting facts. Еhe algorithm does not allocate any variables and does not specify an importance parameter. This suggests that it is impossible to make any valid predictions and conclusions based on customer categories (sociodemographic and in-bank activity data)

Thus, there are not enough variables to evaluate efficiency of the telemarketing campaign.

General conclusion about decision tree models:

Testing 4 decision tree models on different sets of variables we came to the conclusion that it classifies customer response incorrectly. Therefore, it is worth checking other prediction algorithms, such as logistic regression.

Kranti Kumar who also works with this data came to the same conclusion.

Random forest

Another way that is pretty closed to decision trees is to make some prediction is to use Random forest algorithm.

Let us remove only duration variable (due to its impact on the outcome) and leave the others.

# Data preparation
data2 <- data %>% dplyr::select(-c(X, response, job, dur, age, balance, day, month, duration)) 

# Test & Train
set.seed(100)
test_ind <-  createDataPartition(data2$response_binary, p = 0.2, list = F)
tree_test <- data2[test_ind,]
tree_train <- data2[-test_ind,]

library(randomForest)

set.seed(123)
tree_train <- na.exclude(tree_train)
model.rf <- randomForest(response_binary ~., data = tree_train, importance = TRUE, mtry = 5, ntree = 100)
predTest.rf <- predict(model.rf, tree_test)

# Test sample
confusionMatrix(predTest.rf, tree_test$response_binary, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 5897  640
##          2  103  158
##                                         
##                Accuracy : 0.8907        
##                  95% CI : (0.883, 0.898)
##     No Information Rate : 0.8826        
##     P-Value [Acc > NIR] : 0.01925       
##                                         
##                   Kappa : 0.2553        
##                                         
##  Mcnemar's Test P-Value : < 2e-16       
##                                         
##             Sensitivity : 0.9828        
##             Specificity : 0.1980        
##          Pos Pred Value : 0.9021        
##          Neg Pred Value : 0.6054        
##              Prevalence : 0.8826        
##          Detection Rate : 0.8675        
##    Detection Prevalence : 0.9616        
##       Balanced Accuracy : 0.5904        
##                                         
##        'Positive' Class : 1             
##

Without duration variable and the other such as day and month, the results are better in this algorith than those were obtained before. Simply speaking, the key idea of the Random Forst algorith is to create many decision trees (here 100 of them) and take average values as a result.

The accuracy is 89.46% that is not so different from the other statistics of this type recieved previously. Kappa statistic that compares the accuracy of the system to the accuracy of a random system is 25.25%: the model is in 25.25% better than random predictions. In total, the model was wrong in 711 times out of 6798. However, 100 of potential clients were predicted as subscribed but they are out of 6798 that is still prove the extremily low efficiency of the marketing campaign.

What about importance in the case of the Random forest algorithm?

library(vip)

# Importance
fit.ranger <- ranger::ranger(
  response_binary ~ ., 
  data = tree_train, 
  importance = 'impurity',
  probability = TRUE
)

vip(fit.ranger, fill = "steelblue") + ggtitle("Ranger: Random forest") + theme_bw()

As was supposed, the variables that have been defined as important have remained almost unchanged. Moreover, the results of the previously done actions are considerable. Also, age, job and having housing loan are big deals. Balance of clients can be thought for predictions, as well.

Bayesian networks

The first Bayesian Network is based on logical assumptions. Variable stands for age is not influesced by anything as we cannot somehow influence this variable. But age influences educational level, as educational system is usually linked to age of a person. Than educational level can have an impact on job. If we perceive job as an indicator of income, it has an effect on balance. This is the first branch of the tree.

Another branche is devoted to last campaign. It seems logical that the outcome of previous campaign influence the number of days passed by after it: if it was not successful, it is likely that person does not want a new one.

A few variables were added becuase of an existing possibly fact that potential influensing on response, but not affectd by anything: they are duration, default and loan.

library(bnlearn)

bn = model2network("[agecat][poutcome][loan][default][pdays|poutcome][balcat|jobsh][jobsh|education][education|agecat][response_binary|balcat:pdays:loan:default]")
graphviz.plot(bn, shape = "ellipse")

dataCut = data[,c("agecat", "balcat", "education", "jobsh", "default", "loan", "poutcome", "pdays", "response_binary")]
dataCut$pdays = cut(as.numeric(dataCut$pdays), breaks = c(-1000, 250, 500, 750, 1000))

dataCut = as.data.frame(dataCut)
dataCut$education = as.factor(dataCut$education)
dataCut$default = as.factor(dataCut$default)
dataCut$loan = as.factor(dataCut$loan)
dataCut$poutcome = as.factor(dataCut$poutcome)

# cpt
fitted = bn.fit(bn, data = dataCut)

# Huge probability table is here:
# fitted$response_binary

And after it we can make some queries to see, how does the model work. For example, first of all we will look at the probability to make a subscription.

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = TRUE))

x
0.8891

Also it may be interesting to explore some particular segments of customers. For example, young and old people.

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (agecat == '18-24')))

x
0.8628571

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (agecat == '65+')))

x
0.875817

It is seen that they are a little bit more likely to subscribe than an average customer (11.76% in students to 10.93% on average). While the oldest category is less willing to do it.

And what about jobs?

Technical stuff

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'tch')))

x
0

Self-employed

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'se')))

x
0.8873563

Administrators

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'adm')))

x
0.8971831

Unemployed

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'une')))

x
0.8932292

Students

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'st')))

x
0.8853211

Retired

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'rtr')))

x
0.8695652

Managers

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'mng')))

x
0.8813222

Housemaid

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'hm')))

x
0.9017544

Enterpreneur

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'ent')))

x
0.8882979

Blue-collar

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (jobsh == 'bc')))

x
0

After checking all the jobs we have found that students have the highest subscription rate among all, while technical stuff and blue-collar do not subscribe at all!

What about balance? High

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (balcat == 'High')))

x
0.8385093

Medium

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (balcat == 'Medium')))

x
0.8538079

Low

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (balcat == 'Low')))

x
0.8970733

Negative

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (balcat == 'Negative')))

x
0.9495305

Very intersting, that people with medium balance nave the highest rate of subscriptions. As it was expected, those who have negative balance have the lowest rate of subscription.

What about number of days that passed by after the client was last contacted from a previous campaign?

knitr::kable(cpquery(fitted, event = (response_binary == "1"), evidence = (pdays == "(750,1e+03]")))

x
0.3333333

As it was expected, people who have not had any campaign for long, have extremely low chance to subscribe. Probably, they have churned already.

Exploring BNs

Firstly, let us include all the variables, occurred important in EDA, in the data, and make an automated network, based on data.

dataEDA <- data[,c("agecat", "balcat", "campaign", "previous", "default", "poutcome", "dur", "housing", "jobsh", "response_binary")]

dataEDA <- na.omit(dataEDA)
dataEDA[sapply(dataEDA, is.character)] <- lapply(dataEDA[sapply(dataEDA, is.character)], 
                                       as.factor)

dataEDA[sapply(dataEDA, is.integer)] <- lapply(dataEDA[sapply(dataEDA, is.integer)], 
                                       as.factor)

dataEDA <- discretize(dataEDA, breaks = 4, method = "interval")

So, here we build a new network, based on the previously revealed hypotheses, and try to do it based on data. We use Hill-Climbing approach here.

Here is an edited network because the one we built first had some inaccuracies in arcs.

learned = hc(dataEDA, whitelist = data.frame(from = c("agecat", "campaign", "balcat", "default", "previous", "poutcome", "dur"), 
                                                to = c('campaign', 'response_binary', 'response_binary','response_binary', 'poutcome', 'dur', 'response_binary')))
learned  = drop.arc(learned, from = "poutcome", to = "agecat")
learned = set.arc(learned, from = "campaign", to = "poutcome")
plot(learned, shape = "ellipse")

Great! Everything seems logical and consistent with previous hypotheses. We assume, that campaigns are targeted on people of certain age. Also it is logical, that age can have an effect on both job and housing. What is more, campaign is thout to affect the outcome of previous campaign. Also, tree revealed bank activities important, therefore we include them in the network.

Let’s explore our network a little bit and then move to what-if analysis, based on our improvement suggestions.

dataEDA = as.data.frame(dataEDA)
fit2 = bn.fit(learned, data = dataEDA)

library(gRain)
bn2 <- as.grain(fit2)
knitr::kable(querygrain(bn2, nodes=c("response_binary"), type="marginal"))

x
0.8846003
0.1153997

At this stage, we again see a low subscription rate.

knitr::kable(querygrain(bn2, nodes=c("response_binary", "housing"), type="conditional"))

no	yes
0.8819581	0.8866494
0.1180419	0.1133506

Here the difference between categories is very small. And what about default?

knitr::kable(querygrain(bn2, nodes=c("response_binary", "default"), type="conditional"))

no	yes
0.8838461	0.9237214
0.1161539	0.0762786

People with default are less willing to subscribe.

Well, finally, let’s have a look at any fixed category. For example, at people with high balance, which we highlighted in EDA.

net2 <- setEvidence(bn2, evidence=list(balcat = 'High'))
knitr::kable(querygrain(net2, nodes=c("response_binary"), type="marginal"))

x
0.8588456
0.1411544

They subscribe more than the average person!

Actually, this net has proved our hypothesis. Accordingly, these results will be useful in the What-If analysis

What-If’s

The next step is to make what-if analysis according to our subscription policies. After each suggestion we will check the relevance of our approach.

We propose the following improvements:

- it is necessary to specify the target audience.

For now, the bank should focus on next telemarketing campaigns on listed categories:

students, retired OR managers, and admin with a more targeted offer
clients with balance > 1000
clients with no loan in default

In case of perceptiveness, working with the young can be pretty advantageous due to some business and social reasons. For instance, the young people may be proned to have financial support not from parents or relatives as they are assumed to be independent (e.i. taking housing loan to live in separately from parents) (Heath, 2008). Secondly, nowadays there are many opportunities for clients due to highly rapid developing of some spheres such as mobile banking or e-commerce in which young people can be good at using and fostering development (Koenig-Lewis, Palmer, & Moll, 2010). As for retired people, working with this category of clients it might be relatively easy to promote a new offer for them and to persuade them to take a debt as their financial well-being may be less stable and lower regarding percived sum of money. According to the obtained results of performed marketing analysis, managers and admins can be also a potentially promising audience that is why they are also should be considered.

As for clients’ balance, the key idea is to suggest to take a debt for people who can afford it that is why it is a pretty weird strategic to focus on whom whose balance is low. The same can be said regarding people who have already had a loan: every client uderstand that the sum of money must be paid back and there is no sense to increase that sum if a person has not opportunity to make payment back soon.

The next step of the analysis is to increase the campaign for these particular people.

Let’s first focus on students:

pol2 <- setEvidence(bn2, evidence=list(jobsh = "st", campaign = '(26,38.5]'))
knitr::kable(querygrain(pol2, nodes=c("response_binary"), type="marginal"))

x
0.8820041
0.1179959

pol3 <- setEvidence(bn2, evidence=list(jobsh = "st", campaign = '(38.5,51]'))
knitr::kable(querygrain(pol3, nodes=c("response_binary"), type="marginal"))

x
0.8820041
0.1179959

We see, that if we fix it at level (26,38.5] for students, we reduce the subscription rate, however if we increase it more, the subscription rate increases also.

Now let’s try to manipulate people with high balance, for example, we can make targeted offer just for those who succeed in previous campaign and has a high balance:

pol4 <- setEvidence(bn2, evidence=list(balcat = "High", poutcome = 'success'))
knitr::kable(querygrain(pol4, nodes=c("response_binary"), type="marginal"))

x
0.8387179
0.1612821

Well, it makes sence, and we can do special offers for this group of customers.

Finally, let’s consider people with no default. And by increasing campaign we can try to influence those who did not subscribe during the last campaign.

pol5 <- setEvidence(bn2, evidence=list(default = "no", poutcome = 'failure', campaign = '(26,38.5]'))
knitr::kable(querygrain(pol5, nodes=c("response_binary"), type="marginal"))

x
0.8911966
0.1088034

There are not so much people of this type to check this policy properly, but being based on our data we can assume that this policy works.

Summing up, in order to improve marketing campaign it is essential to define a right target audience and take into account its specificity and preferences. Reaching appropriate customers in case of promoting new offer is possible by unique and targeted actions. For instance, in case of the youth, the bank can create a policy that can be : the more ways of reaching customers (communicational channels) are used, the more propoted company’s offers are, the more successful the company is regarding its activity at the market. Acquision process is the next step. Keeping in touch with potential clients and attract them more is important. As for possible actions, for instance, the bank can suggest their clients to open a deposit account like savngs one (wiht high percent for the youth and the eldest).

Overall, using more targeted procedures and valid decisions leads to increasing of conversion rate that will lead to a successful state of the future marketing campaign.

References

Literature:

De Stefani, E. (2018). Institutional invitations to a meeting: Cold calls to bank clients. Journal of Pragmatics, 125, 180–199. https://doi.org/10.1016/j.pragma.2017.06.011

Heath, S. (2008). Housing choices and issues for young people in the UK. Housing Choices and Issues for Young People in the UK, 1–42.

Koenig-Lewis, N., Palmer, A., & Moll, A. (2010). Predicting young consumers’ take up of mobile banking services. International Journal of Bank Marketing, 28(5), 410–432. https://doi.org/Koenig-Lewis, Nicole http://orca.cf.ac.uk/view/cardiffauthors/A21438337.html,

Palmer, Adrian and Moll, Alexander 2010. Predicting young consumers’ take up of mobile banking services. International Journal of Bank Marketing 28 (5) , pp. 410-432. 10.1108/02652321011064917 http://dx.doi.org/10.1108/02652321011064917 file http://orca.cf.ac.uk/70880/1/16_IJBM_Koenig_Lewis_Moll_Mobile%20Banking_2010_Post-Print.pdf

Nagy, I. K., & Gaspar-Papanek, C. (2009). User Behaviour Analysis Based on Time Spent on Web Pages. In I.-H. Ting & H.-J. Wu (Eds.), Web Mining Applications in E-commerce and E-services (Vol. 172, pp. 117–136). https://doi.org/10.1007/978-3-540-88081-3_7

Servaes, H., & Tamayo, A. (2013). The Impact of Corporate Social Responsibility on Firm Value: The Role of Customer Awareness. Management Science, 59(5), 1045–1061.

Additional materials:

Decision Tree Flavors: Gini Index and Information Gain

Decision Trees in R

How to make a Pareto Chart using ggplot2 (and dplyr)

ISLR8 - Tree-Based Methods - Bagging and Random Forests

Visualizing ML Models with LIME

Portuguese Bank Telemarketing Analytics

Part 2

Slovak Start-Up

This part of the project has been done by Olya Yarygina.

Introduction

In the final part we work with Slovak start-up company. This company produces innovative food and fitness gadgets. To detect the target audience, they conducted a survey among young people to understand, how their ad campaign should look like.

To perform this, as a bysiness analysis, I will perform the following steps:

investigate the audience with the help of some descriptics
analyse, in what is this audience potentially more interested
define the outcome variable among those available in the survey, which would best predict the adoption of my product
examine the data with a Bayesian network, testing hypotheses on who would be the best target audience of the product, and identify the best target group for the promotion of the product
propose an insight for your client on what the target audience is like, and how to frame the ad campaign based on that knowledge

We have such a lot of variables in the dataset. As we are producing food and fitness apps, we do not need all of them, so I leave only health habits, spending and demographics variables, which are related to our products.

library(readr)
library(tidyverse)
library(bnlearn)
library(gridExtra)

survey <- read_csv("D:/data/responses.csv")
surv_filtered = survey[, c(50, 51, 54, 55, 62, 74:76, 134:150)]
surv_filtered = na.omit(surv_filtered)

Exploratory Data Analysis

First of all, I do some exploration to understand, what young people do we actually have.

Socio-demographichs

To begin with, I explore socio-demographics.

g1 = ggplot(surv_filtered) +
  geom_histogram(aes(x = Age), fill = "darkblue") + theme_bw() + ggtitle("Age") + xlab("")

g2 = ggplot(surv_filtered) +
  geom_histogram(aes(x = Weight), fill = "darkblue") + theme_bw() + ggtitle("Weight") + xlab("")

g3 = ggplot(surv_filtered) +
  geom_histogram(aes(x = Height), fill = "darkblue") + theme_bw() + ggtitle("Height") + xlab("")

g4 = ggplot(surv_filtered) +
  geom_bar(aes(x = Gender), fill = "darkblue") + theme_bw() + ggtitle("Gender") + xlab("")

grid.arrange(g1, g2, g3, g4, ncol = 2, name = "soc-dem")

Well, we have people from 15 to 30 years, most of them are near 20. They are of average height of 165-185, and their weight is on average about 55-85. This can be taken as an indicator of a normal healthy physique.

Housing conditions

Than I look at housing conditions:

g5 = ggplot(surv_filtered) +
  geom_bar(aes(x = `House - block of flats`), fill = "darkblue") + theme_bw() + ggtitle("House") + xlab("")

g6 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Village - town`), fill = "darkblue") + theme_bw() + ggtitle("Village") + xlab("")

g7 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Only child`), fill = "darkblue") + theme_bw() + ggtitle("Children") + xlab("")

g8 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Number of siblings`), fill = "darkblue") + theme_bw() + ggtitle("Gender") + xlab("")

grid.arrange(g5, g6, g7, g8, ncol = 2, name = "housing")

Here we see, that mostly we have young people from cities, living in a flat. The vast majority does not have brothers or sisters, and mostly have 1 sibling.

Lifestyle

Also, I want to take a look at the activeness and habits of people in the sample.

g9 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Countryside, outdoors`), fill = "darkblue") + theme_bw() + ggtitle("Outdoors") + xlab("")

g10 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Dancing`), fill = "darkblue") + theme_bw() + ggtitle("Dancing") + xlab("")

g11 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Passive sport`), fill = "darkblue") + theme_bw() + ggtitle("Passive sport") + xlab("")

g12 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Active sport`), fill = "darkblue") + theme_bw() + ggtitle("Active sport") + xlab("")

g13 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Adrenaline sports`), fill = "darkblue") + theme_bw() + ggtitle("Adreline sport") + xlab("")

g14 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Alcohol`), fill = "darkblue") + theme_bw() + ggtitle("Alcohol") + xlab("")

g15 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Smoking`), fill = "darkblue") + theme_bw() + ggtitle("Smoking") + coord_flip() + xlab("")

g16 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Healthy eating`), fill = "darkblue") + theme_bw() + ggtitle("Healthy eating") + xlab("")

grid.arrange(g9, g10, g11, g12, g13, g14, g15, g16, ncol = 3, name = "habits")

From this part we reveal that we have people who are interested in outdoor activities, do sports, rarely smoke and drink alcohol only at occasions, and, prefer just a usual food, not to junky, not too healthy.

Finances

And finally, it is crucial to take a look at the spending tendencies.

g17 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Spending on looks`), fill = "darkblue") + theme_bw() + ggtitle("Looks")

g18 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Spending on gadgets`), fill = "darkblue") + theme_bw() + ggtitle("Gadgets")

g19 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Entertainment spending`), fill = "darkblue") + theme_bw() + ggtitle("Entertainment")

g20 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Spending on healthy eating`), fill = "darkblue") + theme_bw() + ggtitle("Healthy Food")

g21 = ggplot(surv_filtered) +
  geom_bar(aes(x = `Finances`), fill = "darkblue") + theme_bw() + ggtitle("Finances")

grid.arrange(g17, g18, g19, g20, g21, ncol = 2, name = "spendings")

Well, from this graph we can conclude that our audience have enough money to buy new clothes and go out time to time. They are not very interested in gadgets, but pretty interested in good products.

# to perform further analysis, all variables should be transformed to factors.

rm(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15, g16, g17, g18, g19, g20, g21)

# summary(surv_filtered$Age)
# we have age distribution from 15 to 30, so I will split it into 5 categories

surv_filtered$Age = ifelse(surv_filtered$Age <= 18, "15-18",
                           ifelse(surv_filtered$Age <= 21, "19-21",
                                  ifelse(surv_filtered$Age <= 24, "22-24",
                                         ifelse(surv_filtered$Age <= 27, "25-27", "28+"))))
surv_filtered$Age = as.factor(surv_filtered$Age)

# summary(surv_filtered$Weight)
# we have height distribution from 41 to 150, so I will split it into 5 categories

surv_filtered$Weight = ifelse(surv_filtered$Weight <= 50, "<50",
                           ifelse(surv_filtered$Weight <= 60, "51-60",
                                  ifelse(surv_filtered$Weight <= 70, "61-70",
                                         ifelse(surv_filtered$Weight <= 80, "71-80", "80+"))))
surv_filtered$Weight = as.factor(surv_filtered$Weight)

# summary(surv_filtered$Height)
# we have height distribution from 41 to 150, so I will split it into 5 categories

surv_filtered$Height = ifelse(surv_filtered$Height <= 150, "<150",
                           ifelse(surv_filtered$Height <= 165, "151-165",
                                  ifelse(surv_filtered$Height <= 175, "166-175",
                                         ifelse(surv_filtered$Height <= 190, "176-190", "190+"))))
surv_filtered$Height = as.factor(surv_filtered$Height)

# explained later
surv_filtered$`buy_food` = ifelse(surv_filtered$`Spending on healthy eating` >= 3, "1", "0")
surv_filtered$`buy_food` = as.factor(surv_filtered$`buy_food`)

# other variables can be converted to facors automatically
surv_filtered[sapply(surv_filtered, is.character)] <- lapply(surv_filtered[sapply(surv_filtered, is.character)], as.factor)
surv_filtered[sapply(surv_filtered, is.numeric)] <- lapply(surv_filtered[sapply(surv_filtered, is.numeric)], as.factor)

surv_filtered = na.omit(surv_filtered)

Product

I decided to concentrate on those part of campaign, which is devoted to innovative food delivery. I made this choice, because it seems to me, that we have more variables in the dataset which can be possible predictors of buying a new food, than buying of a gadget.

I think that sports does not require fitness apps or trackers if person do it at school or at college, or just to be healthy, and also not everyone can afford it, or do not have appropriate software for a special app. While healthy food is more affordable and included at concept of general healthy lifestyle. Also food is inevitable part of people’s lives, so people probably would be more interested to try something new in this category of products.

All in all, I decided to predict "Spending on healthy eating variable’, as it seems to me an analogue to decision to buy our new product or not. To make it appropriate for business analysis I recategorise it into binary one, where “1” - is buying and “0” is not.

Bayesian Networks

To make my analysis more readable and convenient I rename some variables and drop not needed from the dataset.

surv_filtered$country = surv_filtered$`Countryside, outdoors`
surv_filtered$pass_sport = surv_filtered$`Passive sport`
surv_filtered$act_sport = surv_filtered$`Active sport`
surv_filtered$adr_sport = surv_filtered$`Adrenaline sports`
surv_filtered$health_eat = surv_filtered$`Healthy eating`
surv_filtered$gadg = surv_filtered$`Spending on gadgets`
surv_filtered$village = surv_filtered$`Village - town`
surv_filtered$flat = surv_filtered$`House - block of flats`
surv_filtered$ent = surv_filtered$`Entertainment spending`

surv_filtered = select(surv_filtered, country, pass_sport, act_sport, adr_sport, Smoking, Alcohol, health_eat, Finances, ent, gadg, buy_food, Age, Height, Weight, Gender, Education, village, flat)

I want to try to build several networks, but first of all I will try to explore an automatic one, which is based on data.

srv = as.data.frame(surv_filtered)
hn = hc(srv)
graphviz.plot(hn, shape = "ellipse")

Well :) We have only obe variable connected with buying food :) I will not use this network, but it helped me to reveal some links between other variables which seems to me important.

I also decided to combine all sport variables into one, which describes overall interest in sport.

srv = srv %>% mutate(sport = as.numeric(act_sport)+as.numeric(pass_sport)+as.numeric(adr_sport)/3)
srv$sport = ifelse(srv$sport <= 3, "1",
                   ifelse(srv$sport <= 6, "2",
                          ifelse(srv$sport <= 9, "3",
                                 ifelse(srv$sport <= 12, "4", "5"))))
srv$sport = as.factor(srv$sport)
srv = srv %>% select(-Age, -Education, -act_sport, -pass_sport, -adr_sport)

So, now I will construct my one network. It is very complex!, but all seems quite logical to me.
It is based on the following hypotheses:
- Gender affect height, which in turn affect weight of a person due to biological laws.
- Weight affect sport, as people with excess weight want to ged rid of it, and people who do sports have smaller weight.
- Doing sports affects interest in gadgets, which are sometimes very useful for sport activities.
- Finances afffect interest in gadgets too, as they are usually pretty expensive, and some people cannot afford them.
- Finances affect entertainment, as entertainments are pricey too.
- Sports afects healthy eating, because to be effective in sports activities people should eat energetic and protein food.
- Interest in gadgets can be interpreted as interest in modern and healthy stuff overall, so it can influence buying supefoods.
- Of course, healthy eating influence buying superfoods. It is just obvious.
- Entertainment affect alcohol behavior, as we have a lot of social drinkers.
- Alcohol is opposed to healthy food, so it influences eating behavior.
- So do smoking.
- On entertainments people usually eat pizza, you know…
- Villages are characterized by specific architecture, so this variable affects the type of housing.
- Which both in turn affect spending time outdoors, because people in flat a less likely to spend a nice evening in their own garden…
- Spending a lot of times outdoors is usually active spending times, so it is linked with sport.
- Also, living in a village might mean growing own vegetables, so it is connected with buying superfood. What is more, and even more important, as we have delivery, people from villages that are too far are not our clients.
- Nuts and vegetables, and other healthy food are expensive, so finances influence buying superfoods.
- And of cource healthy eating is strongly connected with interest in our product.

Fuh! Even if I missed explanation of one link, I can explain it, believe me, please.

bn = model2network("[sport|Weight:gadg:country][Weight|Height][Height|Gender][Gender][health_eat|sport:Smoking:ent:Alcohol][Smoking][ent|Finances][Finances][gadg|Finances][country|flat][flat|village][village][Alcohol|ent][buy_food|health_eat:gadg:Finances:village]")
graphviz.plot(bn)

#plot(bn)

So! The most interesting part. Now we connect our network with the data, and explore it.

srv = as.data.frame(srv)
fitted = bn.fit(bn, data = srv)

library(gRain)
net <- as.grain(fitted)
knitr::kable(querygrain(net, nodes=c("buy_food"), type="marginal"))

	x
0	0.1836643
1	0.8163357

Such a wonderful result! According to our network, we have ~80% of people who are potentially interested in our product!

Segments

However, we can make our campaign more targeted, as probably not all people really will buy it. Let’s explore certain segments.

Sports

For instanse, let’s compare people who are interested in sports and who are not interested.

Interested:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (sport == '1')))

x
0.3973214

Not interested:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (sport == '4')))

x
0.7685854

Our hypothesis was right, and people whi do sports are more interested in our food delievery! So, we can target them in our campaign.

Placement

Next, we explore placement:

People living in village:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (village == 'village')))

x
0.6900598

And those who live in city:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (village == 'city')))

x
0.7554491

We see, that, really, according to the hypothesis, people in cities are more willing to use the delivery of superfoods.

Healthy Lifestyle

And what about interest in healthy stuff in general, which is detected by interest in gadgets?

People not interested in gadgets:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (gadg == '1')))

x
0.6360601

Interested people:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (gadg == '5')))

x
0.7384686

Wonderful! One more hypothesis is confirmed. People who buy gadgets probably can afford both gadgets and delievery, and also are interested in interesting healthy stuff.

Healthy Eating

And, of course, let’s explore the interest in healthy food.

People who eat junky food:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (health_eat == '1')))

x
0.5586061

People who eat healthy:

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (health_eat == '5')))

x
0.7799296

knitr::kable(cpquery(fitted, event = (buy_food == "1"), evidence = (health_eat == '4')))

x
0.920307

I think, that’s a great result, which gave us a nice understanding of who our potential customer is!

Conclusion

After the exploration of condition probability tables, we can state, that our potential customer is a person:

who live in the city
and who is interested in healthy life style
he/she do sports regularly
and as a result is interested in healthy eating
what is more, he/she has enough money to use our service

Considering all of the above, we can suggest the following marketing campaign:
(1) Advertising should be placed in cities, in places, which are covered by the delivery area.
(2) Posters and flyers should be placed in places where people do sports. For instance, in sports fields on the streets, in fitness clubs, etc.
(3) It can be effective to advertise at points of sale of sports gadgets or in sports stores, as people visiting such stores are likely to have enough money for our service, and are possibly interested in superfoods.
(4) You can also advertise in grocery stores, in points of sale of low-calorie products, products with sugar substitutes and dietary products, as our potential audience tends to support a healthy lifestyle and try new unusual products.

Business Analytics. Final Project

Belorukova L., Yarygina O., Zakharova V.

December 17, 2019

Part 1

Exploratory Data Analysis

Summing up

Predictive Models

1st tree

2nd tree

Checking dependencies

3rd tree

4th tree

Random forest

Bayesian networks

Exploring BNs

What-If’s

References

Part 2

Introduction

Exploratory Data Analysis

Socio-demographichs

Housing conditions

Lifestyle

Finances

Product

Bayesian Networks

Segments

Sports

Placement

Healthy Lifestyle

Healthy Eating

Conclusion