R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Identifying Consumer Segments (R)



# call in R packages for use in this study

library(lattice)  # multivariate data visualization

library(vcd)  # data visualization for categorical variables
## Loading required package: grid
library(cluster)  # cluster analysis methods



# read bank data into R, creating data frame bank

# note that this is a semicolon-delimited file

bank <- read.csv("bank.csv", sep = ";", stringsAsFactors = FALSE)



# examine the structure of the bank data frame

print(str(bank))
## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "oct" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ response : chr  "no" "no" "no" "no" ...
## NULL
print(head(bank))
##   age         job marital education default balance housing loan  contact day
## 1  30  unemployed married   primary      no    1787      no   no cellular  19
## 2  33    services married secondary      no    4789     yes  yes cellular  11
## 3  35  management  single  tertiary      no    1350     yes   no cellular  16
## 4  30  management married  tertiary      no    1476     yes  yes  unknown   3
## 5  59 blue-collar married secondary      no       0     yes   no  unknown   5
## 6  35  management  single  tertiary      no     747      no   no cellular  23
##   month duration campaign pdays previous poutcome response
## 1   oct       79        1    -1        0  unknown       no
## 2   may      220        1   339        4  failure       no
## 3   apr      185        1   330        1  failure       no
## 4   jun      199        4    -1        0  unknown       no
## 5   may      226        1    -1        0  unknown       no
## 6   feb      141        2   176        3  failure       no
print(table(bank$job , useNA = c("always")))
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##           478           946           168           112           969 
##       retired self-employed      services       student    technician 
##           230           183           417            84           768 
##    unemployed       unknown          <NA> 
##           128            38             0
print(table(bank$marital , useNA = c("always")))
## 
## divorced  married   single     <NA> 
##      528     2797     1196        0
print(table(bank$education , useNA = c("always")))
## 
##   primary secondary  tertiary   unknown      <NA> 
##       678      2306      1350       187         0
print(table(bank$default , useNA = c("always")))
## 
##   no  yes <NA> 
## 4445   76    0
print(table(bank$housing , useNA = c("always")))
## 
##   no  yes <NA> 
## 1962 2559    0
print(table(bank$loan , useNA = c("always")))
## 
##   no  yes <NA> 
## 3830  691    0
# Type of job (admin., unknown, unemployed, management,

# housemaid, entrepreneur, student, blue-collar, self-employed,

# retired, technician, services)

# put job into three major categories defining the factor variable jobtype

# the "unknown" category is how missing data were coded for job...

# include these in "Other/Unknown" category/level

white_collar_list <- c("admin.","entrepreneur","management","self-employed")

blue_collar_list <- c("blue-collar","services","technician")

bank$jobtype <- rep(3, length = nrow(bank))

bank$jobtype <- ifelse((bank$job %in% white_collar_list), 1, bank$jobtype)

bank$jobtype <- ifelse((bank$job %in% blue_collar_list), 2, bank$jobtype)

bank$jobtype <- factor(bank$jobtype, levels = c(1, 2, 3),

    labels = c("White Collar", "Blue Collar", "Other/Unknown"))

with(bank, table(job, jobtype, useNA = c("always")))  # check definition
##                jobtype
## job             White Collar Blue Collar Other/Unknown <NA>
##   admin.                 478           0             0    0
##   blue-collar              0         946             0    0
##   entrepreneur           168           0             0    0
##   housemaid                0           0           112    0
##   management             969           0             0    0
##   retired                  0           0           230    0
##   self-employed          183           0             0    0
##   services                 0         417             0    0
##   student                  0           0            84    0
##   technician               0         768             0    0
##   unemployed               0           0           128    0
##   unknown                  0           0            38    0
##   <NA>                     0           0             0    0
# define binary indicator variables as numeric 0/1 variables

bank$whitecollar <- ifelse((bank$jobtype == "White Collar"), 1, 0)

bank$bluecollar <- ifelse((bank$jobtype == "Blue Collar"), 1, 0)

with(bank, print(table(whitecollar, bluecollar)))  # check definition
##            bluecollar
## whitecollar    0    1
##           0  592 2131
##           1 1798    0
with(bank, print(table(jobtype)))  # check definition
## jobtype
##  White Collar   Blue Collar Other/Unknown 
##          1798          2131           592
# define factor variables with labels for plotting and binary factors

bank$marital <- factor(bank$marital,

    labels = c("Divorced", "Married", "Single"))



# define binary indicator variables as numeric 0/1 variables

bank$divorced <- ifelse((bank$marital == "Divorced"), 1, 0)

bank$married <- ifelse((bank$marital == "Married"), 1, 0)

with(bank, print(table(divorced, married)))  # check definition
##         married
## divorced    0    1
##        0 1196 2797
##        1  528    0
with(bank, print(table(marital)))  # check definition
## marital
## Divorced  Married   Single 
##      528     2797     1196
bank$education <- factor(bank$education,

    labels = c("Primary", "Secondary", "Tertiary", "Unknown"))

# define binary indicator variables as numeric 0/1 variables

bank$primary <- ifelse((bank$education == "Primary"), 1, 0)

bank$secondary <- ifelse((bank$education == "Secondary"), 1, 0)

bank$tertiary <- ifelse((bank$education == "Tertiary"), 1, 0)

with(bank, print(table(primary, secondary, tertiary)))  # check definition
## , , tertiary = 0
## 
##        secondary
## primary    0    1
##       0  187 2306
##       1  678    0
## 
## , , tertiary = 1
## 
##        secondary
## primary    0    1
##       0 1350    0
##       1    0    0
with(bank, print(table(education)))  # check definition
## education
##   Primary Secondary  Tertiary   Unknown 
##       678      2306      1350       187
# client experience variables will not be useful for segmentation

# but can be referred to after segments have been defined

bank$default <- factor(bank$default, labels = c("No", "Yes"))

bank$housing <- factor(bank$housing, labels = c("No", "Yes"))

bank$loan <- factor(bank$loan, labels = c("No", "Yes"))

bank$response <- factor(bank$response, labels = c("No", "Yes"))



# select subset of cases never perviously contacted by sales

# keeping variables needed for cluster analysis and post-analysis

bankfull <- subset(bank, subset = (previous == 0),

    select = c("response", "age", "jobtype", "marital", "education",

               "default", "balance", "housing", "loan",

               "whitecollar", "bluecollar", "divorced", "married",

               "primary", "secondary", "tertiary"))



# examine the structure of the full bank data frame

print(str(bankfull))
## 'data.frame':    3705 obs. of  16 variables:
##  $ response   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ age        : int  30 30 59 39 41 39 43 36 20 40 ...
##  $ jobtype    : Factor w/ 3 levels "White Collar",..: 3 1 2 2 1 2 1 2 3 1 ...
##  $ marital    : Factor w/ 3 levels "Divorced","Married",..: 2 2 2 2 2 2 2 2 3 2 ...
##  $ education  : Factor w/ 4 levels "Primary","Secondary",..: 1 3 2 2 3 2 2 3 2 3 ...
##  $ default    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance    : int  1787 1476 0 147 221 9374 264 1109 502 194 ...
##  $ housing    : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 1 1 1 ...
##  $ loan       : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
##  $ whitecollar: num  0 1 0 0 1 0 1 0 0 1 ...
##  $ bluecollar : num  0 0 1 1 0 1 0 1 0 0 ...
##  $ divorced   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ married    : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ primary    : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ secondary  : num  0 0 1 1 0 1 1 0 1 0 ...
##  $ tertiary   : num  0 1 0 0 1 0 0 1 0 1 ...
## NULL
print(head(bankfull))
##    response age       jobtype marital education default balance housing loan
## 1        No  30 Other/Unknown Married   Primary      No    1787      No   No
## 4        No  30  White Collar Married  Tertiary      No    1476     Yes  Yes
## 5        No  59   Blue Collar Married Secondary      No       0     Yes   No
## 8        No  39   Blue Collar Married Secondary      No     147     Yes   No
## 9        No  41  White Collar Married  Tertiary      No     221     Yes   No
## 11       No  39   Blue Collar Married Secondary      No    9374     Yes   No
##    whitecollar bluecollar divorced married primary secondary tertiary
## 1            0          0        0       1       1         0        0
## 4            1          0        0       1       0         0        1
## 5            0          1        0       1       0         1        0
## 8            0          1        0       1       0         1        0
## 9            1          0        0       1       0         0        1
## 11           0          1        0       1       0         1        0
# select subset of variables for input to cluster analysis

data_for_clustering <- subset(bankfull,

    select = c("age",

               "whitecollar", "bluecollar",

               "divorced", "married",

               "primary", "secondary", "tertiary"))