R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Identifying Consumer Segments (R)



# call in R packages for use in this study

library(lattice)  # multivariate data visualization

library(vcd)  # data visualization for categorical variables
## Loading required package: grid
library(cluster)  # cluster analysis methods



# read bank data into R, creating data frame bank

# note that this is a semicolon-delimited file

bank <- read.csv("bank.csv", sep = ";", stringsAsFactors = FALSE)



# examine the structure of the bank data frame

print(str(bank))
## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "oct" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ response : chr  "no" "no" "no" "no" ...
## NULL
print(head(bank))
##   age         job marital education default balance housing loan  contact day
## 1  30  unemployed married   primary      no    1787      no   no cellular  19
## 2  33    services married secondary      no    4789     yes  yes cellular  11
## 3  35  management  single  tertiary      no    1350     yes   no cellular  16
## 4  30  management married  tertiary      no    1476     yes  yes  unknown   3
## 5  59 blue-collar married secondary      no       0     yes   no  unknown   5
## 6  35  management  single  tertiary      no     747      no   no cellular  23
##   month duration campaign pdays previous poutcome response
## 1   oct       79        1    -1        0  unknown       no
## 2   may      220        1   339        4  failure       no
## 3   apr      185        1   330        1  failure       no
## 4   jun      199        4    -1        0  unknown       no
## 5   may      226        1    -1        0  unknown       no
## 6   feb      141        2   176        3  failure       no
print(table(bank$job , useNA = c("always")))
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##           478           946           168           112           969 
##       retired self-employed      services       student    technician 
##           230           183           417            84           768 
##    unemployed       unknown          <NA> 
##           128            38             0
print(table(bank$marital , useNA = c("always")))
## 
## divorced  married   single     <NA> 
##      528     2797     1196        0
print(table(bank$education , useNA = c("always")))
## 
##   primary secondary  tertiary   unknown      <NA> 
##       678      2306      1350       187         0
print(table(bank$default , useNA = c("always")))
## 
##   no  yes <NA> 
## 4445   76    0
print(table(bank$housing , useNA = c("always")))
## 
##   no  yes <NA> 
## 1962 2559    0
print(table(bank$loan , useNA = c("always")))
## 
##   no  yes <NA> 
## 3830  691    0
# Type of job (admin., unknown, unemployed, management,

# housemaid, entrepreneur, student, blue-collar, self-employed,

# retired, technician, services)

# put job into three major categories defining the factor variable jobtype

# the "unknown" category is how missing data were coded for job...

# include these in "Other/Unknown" category/level

white_collar_list <- c("admin.","entrepreneur","management","self-employed")

blue_collar_list <- c("blue-collar","services","technician")

bank$jobtype <- rep(3, length = nrow(bank))

bank$jobtype <- ifelse((bank$job %in% white_collar_list), 1, bank$jobtype)

bank$jobtype <- ifelse((bank$job %in% blue_collar_list), 2, bank$jobtype)

bank$jobtype <- factor(bank$jobtype, levels = c(1, 2, 3),

    labels = c("White Collar", "Blue Collar", "Other/Unknown"))

with(bank, table(job, jobtype, useNA = c("always")))  # check definition
##                jobtype
## job             White Collar Blue Collar Other/Unknown <NA>
##   admin.                 478           0             0    0
##   blue-collar              0         946             0    0
##   entrepreneur           168           0             0    0
##   housemaid                0           0           112    0
##   management             969           0             0    0
##   retired                  0           0           230    0
##   self-employed          183           0             0    0
##   services                 0         417             0    0
##   student                  0           0            84    0
##   technician               0         768             0    0
##   unemployed               0           0           128    0
##   unknown                  0           0            38    0
##   <NA>                     0           0             0    0
# define binary indicator variables as numeric 0/1 variables

bank$whitecollar <- ifelse((bank$jobtype == "White Collar"), 1, 0)

bank$bluecollar <- ifelse((bank$jobtype == "Blue Collar"), 1, 0)

with(bank, print(table(whitecollar, bluecollar)))  # check definition
##            bluecollar
## whitecollar    0    1
##           0  592 2131
##           1 1798    0
with(bank, print(table(jobtype)))  # check definition
## jobtype
##  White Collar   Blue Collar Other/Unknown 
##          1798          2131           592
# define factor variables with labels for plotting and binary factors

bank$marital <- factor(bank$marital,

    labels = c("Divorced", "Married", "Single"))



# define binary indicator variables as numeric 0/1 variables

bank$divorced <- ifelse((bank$marital == "Divorced"), 1, 0)

bank$married <- ifelse((bank$marital == "Married"), 1, 0)

with(bank, print(table(divorced, married)))  # check definition
##         married
## divorced    0    1
##        0 1196 2797
##        1  528    0
with(bank, print(table(marital)))  # check definition
## marital
## Divorced  Married   Single 
##      528     2797     1196
bank$education <- factor(bank$education,

    labels = c("Primary", "Secondary", "Tertiary", "Unknown"))

# define binary indicator variables as numeric 0/1 variables

bank$primary <- ifelse((bank$education == "Primary"), 1, 0)

bank$secondary <- ifelse((bank$education == "Secondary"), 1, 0)

bank$tertiary <- ifelse((bank$education == "Tertiary"), 1, 0)

with(bank, print(table(primary, secondary, tertiary)))  # check definition
## , , tertiary = 0
## 
##        secondary
## primary    0    1
##       0  187 2306
##       1  678    0
## 
## , , tertiary = 1
## 
##        secondary
## primary    0    1
##       0 1350    0
##       1    0    0
with(bank, print(table(education)))  # check definition
## education
##   Primary Secondary  Tertiary   Unknown 
##       678      2306      1350       187
# client experience variables will not be useful for segmentation

# but can be referred to after segments have been defined

bank$default <- factor(bank$default, labels = c("No", "Yes"))

bank$housing <- factor(bank$housing, labels = c("No", "Yes"))

bank$loan <- factor(bank$loan, labels = c("No", "Yes"))

bank$response <- factor(bank$response, labels = c("No", "Yes"))



# select subset of cases never perviously contacted by sales

# keeping variables needed for cluster analysis and post-analysis

bankfull <- subset(bank, subset = (previous == 0),

    select = c("response", "age", "jobtype", "marital", "education",

               "default", "balance", "housing", "loan",

               "whitecollar", "bluecollar", "divorced", "married",

               "primary", "secondary", "tertiary"))



# examine the structure of the full bank data frame

print(str(bankfull))
## 'data.frame':    3705 obs. of  16 variables:
##  $ response   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ age        : int  30 30 59 39 41 39 43 36 20 40 ...
##  $ jobtype    : Factor w/ 3 levels "White Collar",..: 3 1 2 2 1 2 1 2 3 1 ...
##  $ marital    : Factor w/ 3 levels "Divorced","Married",..: 2 2 2 2 2 2 2 2 3 2 ...
##  $ education  : Factor w/ 4 levels "Primary","Secondary",..: 1 3 2 2 3 2 2 3 2 3 ...
##  $ default    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance    : int  1787 1476 0 147 221 9374 264 1109 502 194 ...
##  $ housing    : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 1 1 1 ...
##  $ loan       : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
##  $ whitecollar: num  0 1 0 0 1 0 1 0 0 1 ...
##  $ bluecollar : num  0 0 1 1 0 1 0 1 0 0 ...
##  $ divorced   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ married    : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ primary    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ secondary  : num  0 0 1 1 0 1 1 0 1 0 ...
##  $ tertiary   : num  0 1 0 0 1 0 0 1 0 1 ...
## NULL
print(head(bankfull))
##    response age       jobtype marital education default balance housing loan
## 1        No  30 Other/Unknown Married   Primary      No    1787      No   No
## 4        No  30  White Collar Married  Tertiary      No    1476     Yes  Yes
## 5        No  59   Blue Collar Married Secondary      No       0     Yes   No
## 8        No  39   Blue Collar Married Secondary      No     147     Yes   No
## 9        No  41  White Collar Married  Tertiary      No     221     Yes   No
## 11       No  39   Blue Collar Married Secondary      No    9374     Yes   No
##    whitecollar bluecollar divorced married primary secondary tertiary
## 1            0          0        0       1       1         0        0
## 4            1          0        0       1       0         0        1
## 5            0          1        0       1       0         1        0
## 8            0          1        0       1       0         1        0
## 9            1          0        0       1       0         0        1
## 11           0          1        0       1       0         1        0
# select subset of variables for input to cluster analysis

data_for_clustering <- subset(bankfull,

    select = c("age",

               "whitecollar", "bluecollar",

               "divorced", "married",

               "primary", "secondary", "tertiary"))



# -----------------------------------------------------

# clustering solutions (min_clusters to max_clusters)

# this step may take 10 minutes or more to complete

# -----------------------------------------------------

# set file for graphical output from the clustering solutions

pdf(file = "fig_finding_new_customers_cluster_search.pdf",

        width = 8.5, height = 11)

min_clusters <- 2

max_clusters <- 20

# evaluate alternative numbers of clusters/segments

# we use the average silhouette width as a statistical criterion

evaluation_vector <- NULL  # initialize evaluation vector

# selected algorithm is pam (partitioning around medoids)

# with so many binary variables, manhattan distances seemed

# to work better than Euclidean distances

for (number_of_clusters in min_clusters:max_clusters) {

    try_clustering <- pam(data_for_clustering, k = number_of_clusters,

       metric = "manhattan", stand = TRUE)

    evaluation_vector <- rbind(evaluation_vector,

        data.frame(number_of_clusters,

            average_silhouette_width =

                try_clustering$silinfo$avg.width))

    # show plot for this clustering solution

    plot(try_clustering)  # add this clustering solution to results file

    }

dev.off()  # close the pdf results file for the clustering solution
## png 
##   2
# examine the cluster solution results,

# look for average silhouette width > 0.5

# look for last big jump in average silhoutte width



print(evaluation_vector)
##    number_of_clusters average_silhouette_width
## 1                   2                0.3619243
## 2                   3                0.4418062
## 3                   4                0.4563305
## 4                   5                0.4787168
## 5                   6                0.4750381
## 6                   7                0.4798570
## 7                   8                0.5018247
## 8                   9                0.5218733
## 9                  10                0.5069539
## 10                 11                0.5008017
## 11                 12                0.5111868
## 12                 13                0.5350198
## 13                 14                0.5497882
## 14                 15                0.5595420
## 15                 16                0.5518343
## 16                 17                0.5595772
## 17                 18                0.5627061
## 18                 19                0.5496746
## 19                 20                0.5667843
# provide a single summary plot for the clustering solutions

pdf(file = "fig_finding_new_customers_cluster_summary.pdf",

        width = 8.5, height = 8.5)

with(evaluation_vector, plot(number_of_clusters,

    average_silhouette_width))

png(file = "fig_cluster_summary.png", width = 800, height = 800)
with(evaluation_vector, plot(number_of_clusters,
    average_silhouette_width))

dev.off()  # close summary results file
## png 
##   2
# -----------------------------------------------------

# select clustering solution and examine it

# -----------------------------------------------------

# examine the seven-cluster solution in more detail

seven_cluster_solution <- pam(data_for_clustering, k = 8,

        metric = "manhattan", stand = TRUE)

pdf(file = "fig_finding_new_customers_seven_cluster_solution.pdf",

    width = 8.5, height = 8.5)

plot(seven_cluster_solution)

dev.off()
## png 
##   2
# from the silhouette plot, the first five of the seven

# clusters appear to be large and well-defined



# add the cluster membership information and select first five

bankfull$cluster <- seven_cluster_solution$clustering

bankpart <- subset(bankfull, subset = (cluster < 6))

bankpart$cluster <- factor(bankpart$cluster,

    labels = c("A", "B", "C", "D", "E"))



# look at demographics across the clusters/segments

# -----------------

# age   Age in years

# -----------------

# examine relationship between age and response to promotion

with(bankpart, print(by(age, cluster, mean)))
## cluster: A
## [1] 45.9057
## ------------------------------------------------------------ 
## cluster: B
## [1] 41.6633
## ------------------------------------------------------------ 
## cluster: C
## [1] 43.04649
## ------------------------------------------------------------ 
## cluster: D
## [1] 40.03083
## ------------------------------------------------------------ 
## cluster: E
## [1] 32.29765
pdf(file = "fig_finding_new_customers_age_lattice.pdf",

    width = 8.5, height = 11)

lattice_plot_object <- histogram(~age | cluster, data = bankpart,

    type = "density",

    xlab = "Age of Bank Client", layout = c(1,5))

print(lattice_plot_object)  # responders tend to be older

dev.off()
## png 
##   2
# -----------------------------------------------------------

# education

# Level of education (unknown, secondary, primary, tertiary)

# -----------------------------------------------------------

with(bankpart, print(table(cluster, education)))
##        education
## cluster Primary Secondary Tertiary Unknown
##       A     509         0        0       0
##       B       0         0      594       0
##       C       0       826        0      56
##       D       0       485        0      34
##       E       0       354        0      29
# ---------------------------------------------------------------

# job status using jobtype

# White Collar: admin., entrepreneur, management, self-employed

# Blue Collar: blue-collar, services, technician

# Other/Unknown

# ---------------------------------------------------------------

with(bankpart, print(table(cluster, jobtype)))
##        jobtype
## cluster White Collar Blue Collar Other/Unknown
##       A           70         313           126
##       B          470          81            43
##       C            0         768           114
##       D          488           0            31
##       E            0         314            69
# ----------------------------------------------

# marital status

# Marital status (married, divorced, single)

# [Note: "divorced" means divorced or widowed]

# ----------------------------------------------

with(bankpart, print(table(cluster, marital)))
##        marital
## cluster Divorced Married Single
##       A        0     449     60
##       B        0     594      0
##       C        0     882      0
##       D        0     380    139
##       E        0       0    383
# look at bank client history across the clusters/segments



# -----------------------------------------

# default  Has credit in default? (yes, no)

# -----------------------------------------

with(bankpart, print(table(cluster, default)))
##        default
## cluster  No Yes
##       A 502   7
##       B 590   4
##       C 868  14
##       D 508  11
##       E 372  11
# ------------------------------------------

# balance  Average yearly balance (in Euros)

# ------------------------------------------

with(bankpart, print(by(balance, cluster, mean)))
## cluster: A
## [1] 1446.106
## ------------------------------------------------------------ 
## cluster: B
## [1] 1882.47
## ------------------------------------------------------------ 
## cluster: C
## [1] 1273.861
## ------------------------------------------------------------ 
## cluster: D
## [1] 1331.609
## ------------------------------------------------------------ 
## cluster: E
## [1] 1018.661
pdf(file = "fig_finding_new_customers_blance_lattice.pdf",

    width = 8.5, height = 11)

lattice_plot_object <- histogram(~balance | cluster, data = bankpart,

    type = "density", xlab = "Age of Bank Client",

    layout = c(1,5))

print(lattice_plot_object)  # responders tend to be older

dev.off()
## png 
##   2
# ------------------------------------

# housing  Has housing loan? (yes, no)

# ------------------------------------

with(bankpart, print(table(cluster, housing)))
##        housing
## cluster  No Yes
##       A 225 284
##       B 310 284
##       C 322 560
##       D 208 311
##       E 194 189
# ----------------------------------

# loan  Has personal loan? (yes, no)

# ----------------------------------

with(bankpart, print(table(cluster, loan)))
##        loan
## cluster  No Yes
##       A 435  74
##       B 509  85
##       C 719 163
##       D 428  91
##       E 328  55
# ----------------------------------------------------

# response  Response to term deposit offer (yes, no)

# ----------------------------------------------------

with(bankpart, print(table(cluster, response)))
##        response
## cluster  No Yes
##       A 473  36
##       B 535  59
##       C 824  58
##       D 489  30
##       E 333  50
pdf(file = "fig_finding_new_customers_response_mosaic.pdf",

    width = 8.5, height = 8.5)

mosaic( ~ response + cluster, data = bankpart,

  labeling_args = list(set_varnames = c(response = "Response to Term Deposit Offer",

  cluster = "Segment Membership")),

  highlighting = "response",

  highlighting_fill = c("cornsilk","violet"),

  rot_labels = c(left = 0, top = 0),

  pos_labels = c("center","center"),

  offset_labels = c(0.0,0.6))

dev.off()
## png 
##   2
# compute percentage of yes responses to term deposit offer

response_table <- table(bankpart$cluster, bankpart$response)

cat("\nPercentage Responses\n")
## 
## Percentage Responses
for (i in 1:5)

     cat("\n", toupper(letters[i]),

         round(100 * response_table[i,2] /

             sum(response_table[i,]), digits = 1))
## 
##  A 7.1
##  B 9.9
##  C 6.6
##  D 5.8
##  E 13.1
# note the percentage of the customers receiving offers

# for the first time falling into each of the clusters/segments

# A = 1 ... E = 5 ...

print(round(100 * table(bankfull$cluster) / nrow(bankfull), digits = 1))
## 
##    1    2    3    4    5    6    7    8 
## 13.7 16.0 23.8 14.0 10.3  3.9  8.1 10.1
# Suggestions for the student:

# Try alternative clustering methods, using various

# distance measures and clustering algorithms.

# Try your hand at slecting a best clustering solution

# and interpreting the solution for bank managers.

# Could your clustering solution be useful in

# target marketing?

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.