H2O.ai + GBM

Bank Marketing Data Set - UCI Machine Learning

By: Aaron Stearns

This dataset comes from the UCI Machine Learning repository available at the link below:

https://archive.ics.uci.edu/ml/datasets/bank+marketing

GBM documentation available at H2O.ai:

http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/gbm.html

library(tidyverse)

bank <- read.csv("~/bank_update/bank-full.csv", sep=";")

#inspect dataset
str(bank)

## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

#check for NA values
table(is.na(bank))

## 
##  FALSE 
## 768587

#find # of unique ages to choose histogram bin #
binnum <- length(unique(bank$age))

ggplot(bank, aes(age, fill = y)) + 
  geom_histogram(bins = binnum) + 
  ggtitle("Age of subscribers and non-subscribers") +
  xlab("Age") + 
  ylab("# of subscribers") +
  theme_bw() +
  scale_fill_brewer(palette="Set2") +
  theme(legend.title=element_blank())

In the histogram above we can see that there is one customer age that is significantly more prevalent than others.

Now I will visualize subscriber percentage by education level, marital status, occupation, and age:

### subscriber % by education
yn <- bank %>% group_by(education, y) %>% summarise(n = n())
yed <- bank %>% group_by(education) %>% summarise(n = n())
jed <- left_join(yn, yed, by = "education")
jed <- jed %>% mutate(perc = round((n.x / n.y) * 100, digits = 0))

ggplot(jed, aes(x = education, y = perc, fill = y, label = perc)) + 
  geom_bar(stat = "identity", alpha = 0.7) + 
  geom_text(position = "stack", size = 6) + 
  ggtitle("Percentage of subscribers by education level") + 
  xlab("") +
  ylab("% subscribed") +
  scale_fill_brewer(palette="Set3") +
  theme_bw() +
  theme(legend.title=element_blank()) +
  coord_flip()

### subscriber % by marital status
mar <- bank %>% group_by(marital, y) %>% summarise(n = n())
ymar <- bank %>% group_by(marital) %>% summarise(n = n())
jmar <- left_join(mar, ymar, by = "marital")
jmar <- jmar %>% mutate(perc = round((n.x / n.y) * 100, digits = 0))

ggplot(jmar, aes(x = marital, y = perc, fill = y, label = perc)) + 
  geom_bar(stat = "identity", alpha = 0.7) + 
  geom_text(position = "stack", size = 6)  + 
  scale_fill_brewer(palette="Spectral") +
  ggtitle("Percentage of subscribers by marital status") +
  xlab("") +
  ylab("% subscribed") +
  theme_bw() +
  theme(legend.title=element_blank())+
  coord_flip()

### subscribed by job type
ageout <- data.frame(table(bank$job, bank$y))
colnames(ageout) <- c("job", "y", "Freq")
jobs <- bank %>% group_by(job) %>% summarise(n = n())
aj <- left_join(ageout, jobs, by = "job")
aj <- aj %>% mutate(perc = round((Freq / n) * 100, digits = 0))

ggplot(aj, aes(x = job, y = perc, fill = y, label = perc)) + 
  geom_bar(stat = "identity", alpha = 0.7) + 
  geom_text(position = "stack", size = 6)  + 
  scale_fill_brewer(palette="Set2") +
  ggtitle("Percentage of subscribers by occupation") +
  xlab("") +
  ylab("% subscribed") +
  theme_bw() +
  theme(legend.title=element_blank())+
  coord_flip()

### subscriber by age
age <- bank %>% group_by(age, y) %>% summarise(n = n())
yage <- bank %>% group_by(age) %>% summarise(n = n())
jage <- left_join(age, yage, by = "age")
jage <- jage %>% mutate(perc = round((n.x / n.y) * 100, digits = 1))

ggplot(jage, aes(x = age, y = perc, fill = y, label = perc)) + 
  geom_bar(stat = "identity", position = "dodge", alpha = 0.6) + 
  scale_fill_brewer(palette="Paired") +
  ggtitle("Percentage of subscribers by age") +
  xlab("Age") +
  ylab("% subscribed") +
  theme_bw() +
  theme(legend.title=element_blank())

Upon completing the intial exploratory data analysis, I uploaded the dataset to H2O to run it through a GBM model. The model was trained on eight of the columns in the dataset: age, job, marital, education, default, balance, housing, loan

library(h2o)

h2o.init(nthreads = -1)

train.hex <- h2o.importFile("/Users/aaron/Desktop/bank_munge.csv")

summary(train.hex)

splits <- h2o.splitFrame(train.hex, 0.75, seed=1234)

gbm <- h2o.gbm(x = c(2:9), y = 18,
               training_frame = splits[[1]])

pred <- h2o.predict(gbm, splits[[2]])

The h2o.predict function outputs a data frame with “no” and “yes” prediction columns and the percentage prediction for each observation in the data. I will use an ifelse statement to categorize the predictions into “yes” and “no” from their prediction percentages. My initial split will be if the percentage is greater than 90%

tester <- as.data.frame(splits[[2]])
pred_df <- as.data.frame(pred)

pred_df$yn <- ifelse(pred_df$no > 0.9, "no", "yes")

confusionMatrix(tester$y, pred_df$yn)

The accuracy of this initial split is very low at 62.79 percent, with a very high true negative rate, but a very low true positive rate:

Here I will try again with the split at 60%:

tester <- as.data.frame(splits[[2]])
pred_df <- as.data.frame(pred)

pred_df$yn <- ifelse(pred_df$no > 0.6, "no", "yes")

confusionMatrix(tester$y, pred_df$yn)

Here, the model accuracy has improved greatly, with the true positive rate more than doubling and the overall accuracy rising from 62.79 percent to 88.14 percent

Upon further experimentation, dropping the split value from 60% to 50% did not improve accuracy.