# Identifying Customer Targets (R)

# call in R packages for use in this study
library(lattice)  # multivariate data visualization
library(vcd)  # data visualization for categorical variables
## Loading required package: grid
library(ROCR)  # evaluation of binary classifiers
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
# read bank data into R, creating data frame bank
# note that this is a semicolon-delimited file
# NOTE(review): the path below is an absolute, machine-specific location;
# adjust it before running the script on another machine
bank <- read.csv(
  "/Users/jyothi/Downloads/MDS_Chapter_3/bank.csv",
  sep = ";",
  stringsAsFactors = FALSE
)

# examine the structure of the bank data frame
# str() prints the structure itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() only adds a stray "NULL" line
str(bank)
## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "oct" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ response : chr  "no" "no" "no" "no" ...
# preview the leading rows of the bank data frame
print(head(bank, n = 6L))
##   age         job marital education default balance housing loan  contact
## 1  30  unemployed married   primary      no    1787      no   no cellular
## 2  33    services married secondary      no    4789     yes  yes cellular
## 3  35  management  single  tertiary      no    1350     yes   no cellular
## 4  30  management married  tertiary      no    1476     yes  yes  unknown
## 5  59 blue-collar married secondary      no       0     yes   no  unknown
## 6  35  management  single  tertiary      no     747      no   no cellular
##   day month duration campaign pdays previous poutcome response
## 1  19   oct       79        1    -1        0  unknown       no
## 2  11   may      220        1   339        4  failure       no
## 3  16   apr      185        1   330        1  failure       no
## 4   3   jun      199        4    -1        0  unknown       no
## 5   5   may      226        1    -1        0  unknown       no
## 6  23   feb      141        2   176        3  failure       no
# list the column names of the data frame
print(colnames(bank))
##  [1] "age"       "job"       "marital"   "education" "default"  
##  [6] "balance"   "housing"   "loan"      "contact"   "day"      
## [11] "month"     "duration"  "campaign"  "pdays"     "previous" 
## [16] "poutcome"  "response"
# inspect the class and attributes of a single column
print(class(bank[["age"]]))
## [1] "integer"
print(attributes(bank[["age"]]))  # NULL: the column carries no attributes
## NULL
# draw a histogram of the age column; title and axis label are spelled
# out so the plot reads "age" rather than the extraction expression
hist(bank[["age"]], main = "Histogram of age", xlab = "age")

# frequency tables for the categorical variables
# useNA = "always" adds an <NA> column, so the number of missing
# observations (if any) is shown even when it is zero

print(table(bank[["job"]], useNA = "always"))
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##           478           946           168           112           969 
##       retired self-employed      services       student    technician 
##           230           183           417            84           768 
##    unemployed       unknown          <NA> 
##           128            38             0
print(table(bank[["marital"]], useNA = "always"))
## 
## divorced  married   single     <NA> 
##      528     2797     1196        0
print(table(bank[["education"]], useNA = "always"))
## 
##   primary secondary  tertiary   unknown      <NA> 
##       678      2306      1350       187         0
print(table(bank[["default"]], useNA = "always"))
## 
##   no  yes <NA> 
## 4445   76    0
print(table(bank[["housing"]], useNA = "always"))
## 
##   no  yes <NA> 
## 1962 2559    0
print(table(bank[["loan"]], useNA = "always"))
## 
##   no  yes <NA> 
## 3830  691    0
# The job column takes the values admin., unknown, unemployed, management,
# housemaid, entrepreneur, student, blue-collar, self-employed, retired,
# technician and services.
# Collapse these into the three-level factor jobtype. Any job not named in
# one of the two lists below (including "unknown", which is how missing
# job data were coded) falls into the "Other/Unknown" level.
white_collar_list <- c("admin.", "entrepreneur", "management", "self-employed")
blue_collar_list <- c("blue-collar", "services", "technician")
bank$jobtype <- factor(
  ifelse(
    bank$job %in% white_collar_list,
    "White Collar",
    ifelse(bank$job %in% blue_collar_list, "Blue Collar", "Other/Unknown")
  ),
  levels = c("White Collar", "Blue Collar", "Other/Unknown")
)
# cross-tabulate the original and collapsed codings to check the mapping
table(job = bank$job, jobtype = bank$jobtype, useNA = "always")
##                jobtype
## job             White Collar Blue Collar Other/Unknown <NA>
##   admin.                 478           0             0    0
##   blue-collar              0         946             0    0
##   entrepreneur           168           0             0    0
##   housemaid                0           0           112    0
##   management             969           0             0    0
##   retired                  0           0           230    0
##   self-employed          183           0             0    0
##   services                 0         417             0    0
##   student                  0           0            84    0
##   technician               0         768             0    0
##   unemployed               0           0           128    0
##   unknown                  0           0            38    0
##   <NA>                     0           0             0    0
# define factor variables with labels for plotting
# The raw codes are listed explicitly in `levels` so that each label is tied
# to a specific value. Without `levels`, factor() pairs the labels with the
# sorted unique values that happen to be present, so a missing or unexpected
# value would shift the labels or raise an error. Here any value not listed
# becomes NA instead.
bank$marital <- factor(bank$marital,
  levels = c("divorced", "married", "single"),
  labels = c("Divorced", "Married", "Single"))
bank$education <- factor(bank$education,
  levels = c("primary", "secondary", "tertiary", "unknown"),
  labels = c("Primary", "Secondary", "Tertiary", "Unknown"))
bank$default <- factor(bank$default,
  levels = c("no", "yes"), labels = c("No", "Yes"))
bank$housing <- factor(bank$housing,
  levels = c("no", "yes"), labels = c("No", "Yes"))
bank$loan <- factor(bank$loan,
  levels = c("no", "yes"), labels = c("No", "Yes"))
bank$response <- factor(bank$response,
  levels = c("no", "yes"), labels = c("No", "Yes"))
    
# keep only the cases never previously contacted by sales (previous == 0),
# retaining the response and the predictors needed for modeling
# which() drops rows where the comparison is NA, as subset() does
bankwork <- bank[
  which(bank$previous == 0),
  c("response", "age", "jobtype", "marital", "education",
    "default", "balance", "housing", "loan"),
  drop = FALSE
]

# ----------------------------------
# model specification: response explained by all retained predictors
bank_spec <- response ~ age + jobtype + marital + education +
  default + balance + housing + loan

# ----------------------------------
# split the working data into training (80%) and test (20%) sets
# ----------------------------------
# a plain glm() fit on all of bankwork, kept for reference:
# bank_fit <- glm(bank_spec, family = binomial, data = bankwork)
# print(summary(bank_fit))
# print(anova(bank_fit, test = "Chisq"))

# fix the random seed so the random split below, and everything computed
# from it, is the same on every run
set.seed(1234)

# number of rows assigned to the training set
bank_smp_size <- floor(0.80 * nrow(bankwork))

# draw the training row positions at random without replacement;
# every row not drawn goes to the test set
train_bank <- sample(seq_len(nrow(bankwork)), size = bank_smp_size)
train_set <- bankwork[train_bank, ]
test_set <- bankwork[-train_bank, ]


# utilizing multifold cross-validation.

library(caret)
## Loading required package: ggplot2
# NOTE(review): train_control is not used anywhere in the visible code;
# it is kept in case later code relies on it
train_control <- trainControl(method = "cv", number = 5)

# resampling scheme actually passed to train(): 10 folds, keeping the
# held-out predictions
# NOTE(review): "repeatedcv" is requested without `repeats` - confirm the
# intended number of repeats
ctrl <- trainControl(method = "repeatedcv", number = 10,
                     savePredictions = TRUE)

# logistic regression fitted through caret on the training set
# tuneLength is not passed: method "glm" has no tuning parameters
mod_fit <- train(bank_spec, data = train_set, method = "glm",
                 family = "binomial", trControl = ctrl)

# predicted classes for the held-out test set
pred <- predict(mod_fit, newdata = test_set)

# positive = "Yes" makes the responders the class of interest; by default
# caret treats the first factor level ("No") as positive, so sensitivity,
# specificity, etc. would describe non-responders instead
x <- confusionMatrix(data = pred, reference = test_set$response,
                     positive = "Yes")
# explicit print() so the results also appear when the script is source()d
print(x$table)
##           Reference
## Prediction  No Yes
##        No  670  71
##        Yes   0   0
y <- x$overall
print(y["Accuracy"])
##  Accuracy 
## 0.9041835
# The output shown above comes from one particular random split. In that
# run every test case is predicted "No", so the accuracy (670 / 741) is
# simply the share of "No" cases in the test set.