This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Identifying Consumer Segments (R)
# call in R packages for use in this study
library(lattice) # multivariate data visualization
library(vcd) # data visualization for categorical variables
## Loading required package: grid
library(cluster) # cluster analysis methods
# load the bank data from bank.csv into the data frame bank
# fields in this file are separated by semicolons rather than commas
# text columns are kept as character vectors, not converted to factors
bank <- read.csv(file = "bank.csv", stringsAsFactors = FALSE, sep = ";")
# examine the structure of the bank data frame
# str() prints its summary itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() would add a stray "NULL" line
str(bank)
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "oct" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ response : chr "no" "no" "no" "no" ...
# show the first rows of the bank data frame
print(head(bank))
## age job marital education default balance housing loan contact day
## 1 30 unemployed married primary no 1787 no no cellular 19
## 2 33 services married secondary no 4789 yes yes cellular 11
## 3 35 management single tertiary no 1350 yes no cellular 16
## 4 30 management married tertiary no 1476 yes yes unknown 3
## 5 59 blue-collar married secondary no 0 yes no unknown 5
## 6 35 management single tertiary no 747 no no cellular 23
## month duration campaign pdays previous poutcome response
## 1 oct 79 1 -1 0 unknown no
## 2 may 220 1 339 4 failure no
## 3 apr 185 1 330 1 failure no
## 4 jun 199 4 -1 0 unknown no
## 5 may 226 1 -1 0 unknown no
## 6 feb 141 2 176 3 failure no
# frequency counts for the categorical client variables
# useNA = "always" adds an <NA> column so missing values would show up
print(table(bank[["job"]], useNA = "always"))
##
## admin. blue-collar entrepreneur housemaid management
## 478 946 168 112 969
## retired self-employed services student technician
## 230 183 417 84 768
## unemployed unknown <NA>
## 128 38 0
print(table(bank[["marital"]], useNA = "always"))
##
## divorced married single <NA>
## 528 2797 1196 0
print(table(bank[["education"]], useNA = "always"))
##
## primary secondary tertiary unknown <NA>
## 678 2306 1350 187 0
print(table(bank[["default"]], useNA = "always"))
##
## no yes <NA>
## 4445 76 0
print(table(bank[["housing"]], useNA = "always"))
##
## no yes <NA>
## 1962 2559 0
print(table(bank[["loan"]], useNA = "always"))
##
## no yes <NA>
## 3830 691 0
# job takes the values admin., unknown, unemployed, management,
# housemaid, entrepreneur, student, blue-collar, self-employed,
# retired, technician, services
# collapse job into the three-level factor jobtype
# "unknown" is how missing job data were coded; it falls into the
# "Other/Unknown" level together with every job not listed below
white_collar_list <- c("admin.", "entrepreneur", "management", "self-employed")
blue_collar_list <- c("blue-collar", "services", "technician")
# code 1 = white collar, 2 = blue collar, 3 = everything else
bank$jobtype <- factor(
  ifelse(bank$job %in% white_collar_list, 1,
    ifelse(bank$job %in% blue_collar_list, 2, 3)),
  levels = c(1, 2, 3),
  labels = c("White Collar", "Blue Collar", "Other/Unknown"))
with(bank, table(job, jobtype, useNA = "always")) # check definition
## jobtype
## job White Collar Blue Collar Other/Unknown <NA>
## admin. 478 0 0 0
## blue-collar 0 946 0 0
## entrepreneur 168 0 0 0
## housemaid 0 0 112 0
## management 969 0 0 0
## retired 0 0 230 0
## self-employed 183 0 0 0
## services 0 417 0 0
## student 0 0 84 0
## technician 0 768 0 0
## unemployed 0 0 128 0
## unknown 0 0 38 0
## <NA> 0 0 0 0
# numeric 0/1 indicators for the two named job categories
# rows in the "Other/Unknown" level get 0 on both indicators
bank$whitecollar <- as.numeric(bank$jobtype == "White Collar")
bank$bluecollar <- as.numeric(bank$jobtype == "Blue Collar")
# cross-tabulate the two indicators to check the definition
print(with(bank, table(whitecollar, bluecollar)))
## bluecollar
## whitecollar 0 1
## 0 592 2131
## 1 1798 0
# counts per jobtype level, as a second check
print(with(bank, table(jobtype)))
## jobtype
## White Collar Blue Collar Other/Unknown
## 1798 2131 592
# define factor variables with labels for plotting and binary factors
# levels are listed explicitly (as is done for jobtype) so each label is
# tied to its raw code rather than to the sorted order of the values that
# happen to occur in the data
# the guard stops on an unexpected code instead of letting it become NA
stopifnot(all(bank$marital %in% c("divorced", "married", "single", NA)))
bank$marital <- factor(bank$marital,
  levels = c("divorced", "married", "single"),
  labels = c("Divorced", "Married", "Single"))
# define binary indicator variables as numeric 0/1 variables
# "Single" rows get 0 on both indicators
bank$divorced <- ifelse((bank$marital == "Divorced"), 1, 0)
bank$married <- ifelse((bank$marital == "Married"), 1, 0)
with(bank, print(table(divorced, married))) # check definition
## married
## divorced 0 1
## 0 1196 2797
## 1 528 0
with(bank, print(table(marital))) # check definition
## marital
## Divorced Married Single
## 528 2797 1196
# convert education to a labelled factor
# levels are listed explicitly (as is done for jobtype) so each label is
# tied to its raw code rather than to the sorted order of the values that
# happen to occur in the data
# the guard stops on an unexpected code instead of letting it become NA
stopifnot(all(bank$education %in%
  c("primary", "secondary", "tertiary", "unknown", NA)))
bank$education <- factor(bank$education,
  levels = c("primary", "secondary", "tertiary", "unknown"),
  labels = c("Primary", "Secondary", "Tertiary", "Unknown"))
# define binary indicator variables as numeric 0/1 variables
# "Unknown" rows get 0 on all three indicators
bank$primary <- ifelse((bank$education == "Primary"), 1, 0)
bank$secondary <- ifelse((bank$education == "Secondary"), 1, 0)
bank$tertiary <- ifelse((bank$education == "Tertiary"), 1, 0)
with(bank, print(table(primary, secondary, tertiary))) # check definition
## , , tertiary = 0
##
## secondary
## primary 0 1
## 0 187 2306
## 1 678 0
##
## , , tertiary = 1
##
## secondary
## primary 0 1
## 0 1350 0
## 1 0 0
with(bank, print(table(education))) # check definition
## education
## Primary Secondary Tertiary Unknown
## 678 2306 1350 187
# client experience variables will not be useful for segmentation
# but can be referred to after segments have been defined
# levels are given explicitly so "no"/"yes" always map to "No"/"Yes",
# even when one of the two answers never occurs in a column
# the guard stops on an unexpected code instead of letting it become NA
stopifnot(all(
  unlist(bank[c("default", "housing", "loan", "response")]) %in%
    c("no", "yes", NA)))
bank$default <- factor(bank$default,
  levels = c("no", "yes"), labels = c("No", "Yes"))
bank$housing <- factor(bank$housing,
  levels = c("no", "yes"), labels = c("No", "Yes"))
bank$loan <- factor(bank$loan,
  levels = c("no", "yes"), labels = c("No", "Yes"))
bank$response <- factor(bank$response,
  levels = c("no", "yes"), labels = c("No", "Yes"))
# keep only the cases never previously contacted by sales (previous == 0)
# and only the columns needed for cluster analysis and post-analysis
# which() drops rows where the comparison is NA, as subset() would
bankfull <- bank[
  which(bank$previous == 0),
  c("response", "age", "jobtype", "marital", "education",
    "default", "balance", "housing", "loan",
    "whitecollar", "bluecollar", "divorced", "married",
    "primary", "secondary", "tertiary"),
  drop = FALSE]
# examine the structure of the full bank data frame
# str() prints its summary itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() would add a stray "NULL" line
str(bankfull)
## 'data.frame': 3705 obs. of 16 variables:
## $ response : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ age : int 30 30 59 39 41 39 43 36 20 40 ...
## $ jobtype : Factor w/ 3 levels "White Collar",..: 3 1 2 2 1 2 1 2 3 1 ...
## $ marital : Factor w/ 3 levels "Divorced","Married",..: 2 2 2 2 2 2 2 2 3 2 ...
## $ education : Factor w/ 4 levels "Primary","Secondary",..: 1 3 2 2 3 2 2 3 2 3 ...
## $ default : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 1787 1476 0 147 221 9374 264 1109 502 194 ...
## $ housing : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 1 1 1 ...
## $ loan : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ whitecollar: num 0 1 0 0 1 0 1 0 0 1 ...
## $ bluecollar : num 0 0 1 1 0 1 0 1 0 0 ...
## $ divorced : num 0 0 0 0 0 0 0 0 0 0 ...
## $ married : num 1 1 1 1 1 1 1 1 0 1 ...
## $ primary : num 1 0 0 0 0 0 0 0 0 0 ...
## $ secondary : num 0 0 1 1 0 1 1 0 1 0 ...
## $ tertiary : num 0 1 0 0 1 0 0 1 0 1 ...
# show the first rows of the full bank data frame
print(head(bankfull))
## response age jobtype marital education default balance housing loan
## 1 No 30 Other/Unknown Married Primary No 1787 No No
## 4 No 30 White Collar Married Tertiary No 1476 Yes Yes
## 5 No 59 Blue Collar Married Secondary No 0 Yes No
## 8 No 39 Blue Collar Married Secondary No 147 Yes No
## 9 No 41 White Collar Married Tertiary No 221 Yes No
## 11 No 39 Blue Collar Married Secondary No 9374 Yes No
## whitecollar bluecollar divorced married primary secondary tertiary
## 1 0 0 0 1 1 0 0
## 4 1 0 0 1 0 0 1
## 5 0 1 0 1 0 1 0
## 8 0 1 0 1 0 1 0
## 9 1 0 0 1 0 0 1
## 11 0 1 0 1 0 1 0
# columns that feed the cluster analysis: age plus the 0/1 indicators
# for job type, marital status and education
data_for_clustering <- bankfull[c(
  "age",
  "whitecollar", "bluecollar",
  "divorced", "married",
  "primary", "secondary", "tertiary")]