This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Identifying Consumer Segments (R)
# call in R packages for use in this study
library(lattice) # multivariate data visualization
library(vcd) # data visualization for categorical variables
## Loading required package: grid
library(cluster) # cluster analysis methods
# read bank data into R, creating data frame bank
# note that this is a semicolon-delimited file
bank <- read.csv("bank.csv", sep = ";", stringsAsFactors = FALSE)
# examine the structure of the bank data frame
print(str(bank))
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "oct" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ response : chr "no" "no" "no" "no" ...
## NULL
print(head(bank))
## age job marital education default balance housing loan contact day
## 1 30 unemployed married primary no 1787 no no cellular 19
## 2 33 services married secondary no 4789 yes yes cellular 11
## 3 35 management single tertiary no 1350 yes no cellular 16
## 4 30 management married tertiary no 1476 yes yes unknown 3
## 5 59 blue-collar married secondary no 0 yes no unknown 5
## 6 35 management single tertiary no 747 no no cellular 23
## month duration campaign pdays previous poutcome response
## 1 oct 79 1 -1 0 unknown no
## 2 may 220 1 339 4 failure no
## 3 apr 185 1 330 1 failure no
## 4 jun 199 4 -1 0 unknown no
## 5 may 226 1 -1 0 unknown no
## 6 feb 141 2 176 3 failure no
print(table(bank$job , useNA = c("always")))
##
## admin. blue-collar entrepreneur housemaid management
## 478 946 168 112 969
## retired self-employed services student technician
## 230 183 417 84 768
## unemployed unknown <NA>
## 128 38 0
print(table(bank$marital , useNA = c("always")))
##
## divorced married single <NA>
## 528 2797 1196 0
print(table(bank$education , useNA = c("always")))
##
## primary secondary tertiary unknown <NA>
## 678 2306 1350 187 0
print(table(bank$default , useNA = c("always")))
##
## no yes <NA>
## 4445 76 0
print(table(bank$housing , useNA = c("always")))
##
## no yes <NA>
## 1962 2559 0
print(table(bank$loan , useNA = c("always")))
##
## no yes <NA>
## 3830 691 0
# Type of job (admin., unknown, unemployed, management,
# housemaid, entrepreneur, student, blue-collar, self-employed,
# retired, technician, services)
# put job into three major categories defining the factor variable jobtype
# the "unknown" category is how missing data were coded for job...
# include these in "Other/Unknown" category/level
white_collar_list <- c("admin.","entrepreneur","management","self-employed")
blue_collar_list <- c("blue-collar","services","technician")
bank$jobtype <- rep(3, length = nrow(bank))
bank$jobtype <- ifelse((bank$job %in% white_collar_list), 1, bank$jobtype)
bank$jobtype <- ifelse((bank$job %in% blue_collar_list), 2, bank$jobtype)
bank$jobtype <- factor(bank$jobtype, levels = c(1, 2, 3),
labels = c("White Collar", "Blue Collar", "Other/Unknown"))
with(bank, table(job, jobtype, useNA = c("always"))) # check definition
## jobtype
## job White Collar Blue Collar Other/Unknown <NA>
## admin. 478 0 0 0
## blue-collar 0 946 0 0
## entrepreneur 168 0 0 0
## housemaid 0 0 112 0
## management 969 0 0 0
## retired 0 0 230 0
## self-employed 183 0 0 0
## services 0 417 0 0
## student 0 0 84 0
## technician 0 768 0 0
## unemployed 0 0 128 0
## unknown 0 0 38 0
## <NA> 0 0 0 0
# define binary indicator variables as numeric 0/1 variables
bank$whitecollar <- ifelse((bank$jobtype == "White Collar"), 1, 0)
bank$bluecollar <- ifelse((bank$jobtype == "Blue Collar"), 1, 0)
with(bank, print(table(whitecollar, bluecollar))) # check definition
## bluecollar
## whitecollar 0 1
## 0 592 2131
## 1 1798 0
with(bank, print(table(jobtype))) # check definition
## jobtype
## White Collar Blue Collar Other/Unknown
## 1798 2131 592
# define factor variables with labels for plotting and binary factors
bank$marital <- factor(bank$marital,
labels = c("Divorced", "Married", "Single"))
# define binary indicator variables as numeric 0/1 variables
bank$divorced <- ifelse((bank$marital == "Divorced"), 1, 0)
bank$married <- ifelse((bank$marital == "Married"), 1, 0)
with(bank, print(table(divorced, married))) # check definition
## married
## divorced 0 1
## 0 1196 2797
## 1 528 0
with(bank, print(table(marital))) # check definition
## marital
## Divorced Married Single
## 528 2797 1196
bank$education <- factor(bank$education,
labels = c("Primary", "Secondary", "Tertiary", "Unknown"))
# define binary indicator variables as numeric 0/1 variables
bank$primary <- ifelse((bank$education == "Primary"), 1, 0)
bank$secondary <- ifelse((bank$education == "Secondary"), 1, 0)
bank$tertiary <- ifelse((bank$education == "Tertiary"), 1, 0)
with(bank, print(table(primary, secondary, tertiary))) # check definition
## , , tertiary = 0
##
## secondary
## primary 0 1
## 0 187 2306
## 1 678 0
##
## , , tertiary = 1
##
## secondary
## primary 0 1
## 0 1350 0
## 1 0 0
with(bank, print(table(education))) # check definition
## education
## Primary Secondary Tertiary Unknown
## 678 2306 1350 187
# client experience variables will not be useful for segmentation
# but can be referred to after segments have been defined
bank$default <- factor(bank$default, labels = c("No", "Yes"))
bank$housing <- factor(bank$housing, labels = c("No", "Yes"))
bank$loan <- factor(bank$loan, labels = c("No", "Yes"))
bank$response <- factor(bank$response, labels = c("No", "Yes"))
# select subset of cases never perviously contacted by sales
# keeping variables needed for cluster analysis and post-analysis
bankfull <- subset(bank, subset = (previous == 0),
select = c("response", "age", "jobtype", "marital", "education",
"default", "balance", "housing", "loan",
"whitecollar", "bluecollar", "divorced", "married",
"primary", "secondary", "tertiary"))
# examine the structure of the full bank data frame
print(str(bankfull))
## 'data.frame': 3705 obs. of 16 variables:
## $ response : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ age : int 30 30 59 39 41 39 43 36 20 40 ...
## $ jobtype : Factor w/ 3 levels "White Collar",..: 3 1 2 2 1 2 1 2 3 1 ...
## $ marital : Factor w/ 3 levels "Divorced","Married",..: 2 2 2 2 2 2 2 2 3 2 ...
## $ education : Factor w/ 4 levels "Primary","Secondary",..: 1 3 2 2 3 2 2 3 2 3 ...
## $ default : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 1787 1476 0 147 221 9374 264 1109 502 194 ...
## $ housing : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 1 1 1 ...
## $ loan : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ whitecollar: num 0 1 0 0 1 0 1 0 0 1 ...
## $ bluecollar : num 0 0 1 1 0 1 0 1 0 0 ...
## $ divorced : num 0 0 0 0 0 0 0 0 0 0 ...
## $ married : num 1 1 1 1 1 1 1 1 0 1 ...
## $ primary : num 1 0 0 0 0 0 0 0 0 0 ...
## $ secondary : num 0 0 1 1 0 1 1 0 1 0 ...
## $ tertiary : num 0 1 0 0 1 0 0 1 0 1 ...
## NULL
print(head(bankfull))
## response age jobtype marital education default balance housing loan
## 1 No 30 Other/Unknown Married Primary No 1787 No No
## 4 No 30 White Collar Married Tertiary No 1476 Yes Yes
## 5 No 59 Blue Collar Married Secondary No 0 Yes No
## 8 No 39 Blue Collar Married Secondary No 147 Yes No
## 9 No 41 White Collar Married Tertiary No 221 Yes No
## 11 No 39 Blue Collar Married Secondary No 9374 Yes No
## whitecollar bluecollar divorced married primary secondary tertiary
## 1 0 0 0 1 1 0 0
## 4 1 0 0 1 0 0 1
## 5 0 1 0 1 0 1 0
## 8 0 1 0 1 0 1 0
## 9 1 0 0 1 0 0 1
## 11 0 1 0 1 0 1 0
# select subset of variables for input to cluster analysis
data_for_clustering <- subset(bankfull,
select = c("age",
"whitecollar", "bluecollar",
"divorced", "married",
"primary", "secondary", "tertiary"))
# -----------------------------------------------------
# clustering solutions (min_clusters to max_clusters)
# this step may take 10 minutes or more to complete
# -----------------------------------------------------
# set file for graphical output from the clustering solutions
pdf(file = "fig_finding_new_customers_cluster_search.pdf",
width = 8.5, height = 11)
min_clusters <- 2
max_clusters <- 20
# evaluate alternative numbers of clusters/segments
# we use the average silhouette width as a statistical criterion
evaluation_vector <- NULL # initialize evaluation vector
# selected algorithm is pam (partitioning around medoids)
# with so many binary variables, manhattan distances seemed
# to work better than Euclidean distances
for (number_of_clusters in min_clusters:max_clusters) {
try_clustering <- pam(data_for_clustering, k = number_of_clusters,
metric = "manhattan", stand = TRUE)
evaluation_vector <- rbind(evaluation_vector,
data.frame(number_of_clusters,
average_silhouette_width =
try_clustering$silinfo$avg.width))
# show plot for this clustering solution
plot(try_clustering) # add this clustering solution to results file
}
dev.off() # close the pdf results file for the clustering solution
## png
## 2
# examine the cluster solution results,
# look for average silhouette width > 0.5
# look for last big jump in average silhoutte width
print(evaluation_vector)
## number_of_clusters average_silhouette_width
## 1 2 0.3619243
## 2 3 0.4418062
## 3 4 0.4563305
## 4 5 0.4787168
## 5 6 0.4750381
## 6 7 0.4798570
## 7 8 0.5018247
## 8 9 0.5218733
## 9 10 0.5069539
## 10 11 0.5008017
## 11 12 0.5111868
## 12 13 0.5350198
## 13 14 0.5497882
## 14 15 0.5595420
## 15 16 0.5518343
## 16 17 0.5595772
## 17 18 0.5627061
## 18 19 0.5496746
## 19 20 0.5667843
# provide a single summary plot for the clustering solutions
pdf(file = "fig_finding_new_customers_cluster_summary.pdf",
width = 8.5, height = 8.5)
with(evaluation_vector, plot(number_of_clusters,
average_silhouette_width))
png(file = "fig_cluster_summary.png", width = 800, height = 800)
with(evaluation_vector, plot(number_of_clusters,
average_silhouette_width))
dev.off() # close summary results file
## png
## 2
# -----------------------------------------------------
# select clustering solution and examine it
# -----------------------------------------------------
# examine the seven-cluster solution in more detail
seven_cluster_solution <- pam(data_for_clustering, k = 8,
metric = "manhattan", stand = TRUE)
pdf(file = "fig_finding_new_customers_seven_cluster_solution.pdf",
width = 8.5, height = 8.5)
plot(seven_cluster_solution)
dev.off()
## png
## 2
# from the silhouette plot, the first five of the seven
# clusters appear to be large and well-defined
# add the cluster membership information and select first five
bankfull$cluster <- seven_cluster_solution$clustering
bankpart <- subset(bankfull, subset = (cluster < 6))
bankpart$cluster <- factor(bankpart$cluster,
labels = c("A", "B", "C", "D", "E"))
# look at demographics across the clusters/segments
# -----------------
# age Age in years
# -----------------
# examine relationship between age and response to promotion
with(bankpart, print(by(age, cluster, mean)))
## cluster: A
## [1] 45.9057
## ------------------------------------------------------------
## cluster: B
## [1] 41.6633
## ------------------------------------------------------------
## cluster: C
## [1] 43.04649
## ------------------------------------------------------------
## cluster: D
## [1] 40.03083
## ------------------------------------------------------------
## cluster: E
## [1] 32.29765
pdf(file = "fig_finding_new_customers_age_lattice.pdf",
width = 8.5, height = 11)
lattice_plot_object <- histogram(~age | cluster, data = bankpart,
type = "density",
xlab = "Age of Bank Client", layout = c(1,5))
print(lattice_plot_object) # responders tend to be older
dev.off()
## png
## 2
# -----------------------------------------------------------
# education
# Level of education (unknown, secondary, primary, tertiary)
# -----------------------------------------------------------
with(bankpart, print(table(cluster, education)))
## education
## cluster Primary Secondary Tertiary Unknown
## A 509 0 0 0
## B 0 0 594 0
## C 0 826 0 56
## D 0 485 0 34
## E 0 354 0 29
# ---------------------------------------------------------------
# job status using jobtype
# White Collar: admin., entrepreneur, management, self-employed
# Blue Collar: blue-collar, services, technician
# Other/Unknown
# ---------------------------------------------------------------
with(bankpart, print(table(cluster, jobtype)))
## jobtype
## cluster White Collar Blue Collar Other/Unknown
## A 70 313 126
## B 470 81 43
## C 0 768 114
## D 488 0 31
## E 0 314 69
# ----------------------------------------------
# marital status
# Marital status (married, divorced, single)
# [Note: "divorced" means divorced or widowed]
# ----------------------------------------------
with(bankpart, print(table(cluster, marital)))
## marital
## cluster Divorced Married Single
## A 0 449 60
## B 0 594 0
## C 0 882 0
## D 0 380 139
## E 0 0 383
# look at bank client history across the clusters/segments
# -----------------------------------------
# default Has credit in default? (yes, no)
# -----------------------------------------
with(bankpart, print(table(cluster, default)))
## default
## cluster No Yes
## A 502 7
## B 590 4
## C 868 14
## D 508 11
## E 372 11
# ------------------------------------------
# balance Average yearly balance (in Euros)
# ------------------------------------------
with(bankpart, print(by(balance, cluster, mean)))
## cluster: A
## [1] 1446.106
## ------------------------------------------------------------
## cluster: B
## [1] 1882.47
## ------------------------------------------------------------
## cluster: C
## [1] 1273.861
## ------------------------------------------------------------
## cluster: D
## [1] 1331.609
## ------------------------------------------------------------
## cluster: E
## [1] 1018.661
pdf(file = "fig_finding_new_customers_blance_lattice.pdf",
width = 8.5, height = 11)
lattice_plot_object <- histogram(~balance | cluster, data = bankpart,
type = "density", xlab = "Age of Bank Client",
layout = c(1,5))
print(lattice_plot_object) # responders tend to be older
dev.off()
## png
## 2
# ------------------------------------
# housing Has housing loan? (yes, no)
# ------------------------------------
with(bankpart, print(table(cluster, housing)))
## housing
## cluster No Yes
## A 225 284
## B 310 284
## C 322 560
## D 208 311
## E 194 189
# ----------------------------------
# loan Has personal loan? (yes, no)
# ----------------------------------
with(bankpart, print(table(cluster, loan)))
## loan
## cluster No Yes
## A 435 74
## B 509 85
## C 719 163
## D 428 91
## E 328 55
# ----------------------------------------------------
# response Response to term deposit offer (yes, no)
# ----------------------------------------------------
with(bankpart, print(table(cluster, response)))
## response
## cluster No Yes
## A 473 36
## B 535 59
## C 824 58
## D 489 30
## E 333 50
pdf(file = "fig_finding_new_customers_response_mosaic.pdf",
width = 8.5, height = 8.5)
mosaic( ~ response + cluster, data = bankpart,
labeling_args = list(set_varnames = c(response = "Response to Term Deposit Offer",
cluster = "Segment Membership")),
highlighting = "response",
highlighting_fill = c("cornsilk","violet"),
rot_labels = c(left = 0, top = 0),
pos_labels = c("center","center"),
offset_labels = c(0.0,0.6))
dev.off()
## png
## 2
# compute percentage of yes responses to term deposit offer
response_table <- table(bankpart$cluster, bankpart$response)
cat("\nPercentage Responses\n")
##
## Percentage Responses
for (i in 1:5)
cat("\n", toupper(letters[i]),
round(100 * response_table[i,2] /
sum(response_table[i,]), digits = 1))
##
## A 7.1
## B 9.9
## C 6.6
## D 5.8
## E 13.1
# note the percentage of the customers receiving offers
# for the first time falling into each of the clusters/segments
# A = 1 ... E = 5 ...
print(round(100 * table(bankfull$cluster) / nrow(bankfull), digits = 1))
##
## 1 2 3 4 5 6 7 8
## 13.7 16.0 23.8 14.0 10.3 3.9 8.1 10.1
# Suggestions for the student:
# Try alternative clustering methods, using various
# distance measures and clustering algorithms.
# Try your hand at slecting a best clustering solution
# and interpreting the solution for bank managers.
# Could your clustering solution be useful in
# target marketing?
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.