# Identifying Customer Targets (R)
# call in R packages for use in this study
library(lattice) # multivariate data visualization
library(vcd) # data visualization for categorical variables
## Loading required package: grid
library(ROCR) # evaluation of binary classifiers
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
# load the bank marketing data into the data frame `bank`
# fields in this file are separated by semicolons; text columns are kept
# as character vectors at this point
# NOTE(review): absolute, user-specific path -- adjust for your machine
bank <- read.csv(
  file = "/Users/jyothi/Downloads/MDS_Chapter_3/bank.csv",
  sep = ";",
  stringsAsFactors = FALSE
)
# examine the structure of the bank data frame
# str() prints its summary itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() would emit a stray "NULL" line
str(bank)
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "oct" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ response : chr "no" "no" "no" "no" ...
# preview the leading six records of the bank data frame
print(head(bank, n = 6L))
## age job marital education default balance housing loan contact
## 1 30 unemployed married primary no 1787 no no cellular
## 2 33 services married secondary no 4789 yes yes cellular
## 3 35 management single tertiary no 1350 yes no cellular
## 4 30 management married tertiary no 1476 yes yes unknown
## 5 59 blue-collar married secondary no 0 yes no unknown
## 6 35 management single tertiary no 747 no no cellular
## day month duration campaign pdays previous poutcome response
## 1 19 oct 79 1 -1 0 unknown no
## 2 11 may 220 1 339 4 failure no
## 3 16 apr 185 1 330 1 failure no
## 4 3 jun 199 4 -1 0 unknown no
## 5 5 may 226 1 -1 0 unknown no
## 6 23 feb 141 2 176 3 failure no
# list the variable (column) names of the data frame
print(colnames(bank))
## [1] "age" "job" "marital" "education" "default"
## [6] "balance" "housing" "loan" "contact" "day"
## [11] "month" "duration" "campaign" "pdays" "previous"
## [16] "poutcome" "response"
# inspect the class and the attributes of a single variable (age)
print(class(bank[["age"]]))
## [1] "integer"
print(attributes(bank[["age"]])) # NULL: no special attributes are set
## NULL
# draw a histogram of age; title and axis label are spelled out so they
# read "age" rather than the extraction expression
hist(bank$age, main = "Histogram of age", xlab = "age")

# frequency tables for the categorical variables, one after another;
# useNA = "always" adds an <NA> column so missing values (if any) show up
invisible(lapply(
  c("job", "marital", "education", "default", "housing", "loan"),
  function(column) print(table(bank[[column]], useNA = "always"))
))
# job
##
## admin. blue-collar entrepreneur housemaid management
## 478 946 168 112 969
## retired self-employed services student technician
## 230 183 417 84 768
## unemployed unknown <NA>
## 128 38 0
# marital
##
## divorced married single <NA>
## 528 2797 1196 0
# education
##
## primary secondary tertiary unknown <NA>
## 678 2306 1350 187 0
# default
##
## no yes <NA>
## 4445 76 0
# housing
##
## no yes <NA>
## 1962 2559 0
# loan
##
## no yes <NA>
## 3830 691 0
# Type of job (admin., unknown, unemployed, management,
# housemaid, entrepreneur, student, blue-collar, self-employed,
# retired, technician, services)
# put job into three major categories defining the factor variable jobtype
# the "unknown" category is how missing data were coded for job...
# include these in "Other/Unknown" category/level
# job titles that make up the white-collar and blue-collar groups;
# every other title (including "unknown") falls into "Other/Unknown"
white_collar_list <- c("admin.", "entrepreneur", "management", "self-employed")
blue_collar_list <- c("blue-collar", "services", "technician")
# numeric codes: 1 = white collar, 2 = blue collar, 3 = anything else
# %in% never returns NA, so a missing job value also lands in code 3
bank$jobtype <- factor(
  ifelse(
    bank$job %in% blue_collar_list, 2,
    ifelse(bank$job %in% white_collar_list, 1, 3)
  ),
  levels = c(1, 2, 3),
  labels = c("White Collar", "Blue Collar", "Other/Unknown")
)
# check the definition: cross-tabulate original job against jobtype
# printed explicitly so the table also appears when the script is source()'d
print(with(bank, table(job, jobtype, useNA = "always")))
## jobtype
## job White Collar Blue Collar Other/Unknown <NA>
## admin. 478 0 0 0
## blue-collar 0 946 0 0
## entrepreneur 168 0 0 0
## housemaid 0 0 112 0
## management 969 0 0 0
## retired 0 0 230 0
## self-employed 183 0 0 0
## services 0 417 0 0
## student 0 0 84 0
## technician 0 768 0 0
## unemployed 0 0 128 0
## unknown 0 0 38 0
## <NA> 0 0 0 0
# convert the categorical columns to factors with capitalized labels
# for plotting; labels are matched to the sorted distinct values of each
# column, so their order below must follow that sort order
bank[["marital"]] <- factor(
  bank[["marital"]],
  labels = c("Divorced", "Married", "Single")
)
bank[["education"]] <- factor(
  bank[["education"]],
  labels = c("Primary", "Secondary", "Tertiary", "Unknown")
)
# the four binary columns share the same No/Yes labelling
bank[c("default", "housing", "loan", "response")] <- lapply(
  bank[c("default", "housing", "loan", "response")],
  factor,
  labels = c("No", "Yes")
)
# working data: only customers never previously contacted by sales
# (previous == 0), restricted to the variables needed for modeling
# which() drops rows where the comparison is NA, as subset() does
bankwork <- bank[
  which(bank$previous == 0),
  c(
    "response", "age", "jobtype", "marital", "education",
    "default", "balance", "housing", "loan"
  ),
  drop = FALSE
]
# model specification: response as a function of the customer attributes
# ----------------------------------
bank_spec <- response ~
  age + jobtype + marital + education + default + balance + housing + loan
# ----------------------------------
# split the modeling data into a training set (80%) and a test set (20%)
# ----------------------------------
# fix the random seed so the split, and any later step that draws random
# numbers, gives the same result on every run; the seed value is arbitrary
set.seed(1234)
bank_smp_size <- floor(0.80 * nrow(bankwork))
# row positions drawn at random (without replacement) for training
train_bank <- sample(seq_len(nrow(bankwork)), size = bank_smp_size)
train_set <- bankwork[train_bank, ]
# setdiff() rather than negative indexing: bankwork[-integer(0), ] would
# return zero rows instead of all rows if the training sample were empty
test_set <- bankwork[setdiff(seq_len(nrow(bankwork)), train_bank), ]
# plain glm fit on the full working data, kept commented out for reference
# bank_fit <- glm(bank_spec, family = binomial, data = bankwork)
# print(summary(bank_fit))
# print(anova(bank_fit, test = "Chisq"))
# fit the logistic regression with k-fold cross-validation (caret) and
# evaluate it on the held-out test set
library(caret)
## Loading required package: ggplot2
# NOTE(review): train_control is not used by the train() call below;
# it is kept only in case later code relies on it
train_control <- trainControl(method = "cv", number = 5)
# 10-fold cross-validation; repeats is spelled out (1 is presumably what
# caret uses when it is omitted -- confirm) -- raise it for genuinely
# repeated CV
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 1,
  savePredictions = TRUE
)
# method = "glm" has no tuning parameters, so no tuneLength is passed
mod_fit <- train(
  bank_spec,
  data = train_set,
  method = "glm",
  family = "binomial",
  trControl = ctrl
)
# predicted classes for the test set
pred <- predict(mod_fit, newdata = test_set)
# "Yes" (customer responded) is the class of interest, so it is declared
# as the positive class; otherwise the first factor level ("No") is used
# for sensitivity, specificity and related statistics
x <- confusionMatrix(
  data = pred,
  reference = test_set$response,
  positive = "Yes"
)
# printed explicitly so the table also appears when the script is source()'d
print(x$table)
## Reference
## Prediction No Yes
## No 670 71
## Yes 0 0
# overall statistics of the confusion matrix; report test-set accuracy
# NOTE(review): in the recorded output every case is predicted "No", so the
# accuracy shown equals the share of "No" in the test set (670 / 741);
# accuracy alone does not show that the model identifies any responders
y <- x$overall
# printed explicitly so the value also appears when the script is source()'d
print(y["Accuracy"])
## Accuracy
## 0.9041835