R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Identifying Customer Targets (R)



# call in R packages for use in this study

library(lattice)  # multivariate data visualization

library(vcd)  # data visualization for categorical variables
## Loading required package: grid
library(ROCR)  # evaluation of binary classifiers



# load the bank data from disk into the data frame bank

# the file separates fields with semicolons rather than commas, and

# stringsAsFactors = FALSE keeps text columns as character vectors

bank <- read.csv(
  file = "bank.csv",
  sep = ";",
  stringsAsFactors = FALSE
)

# examine the structure of the bank data frame

# str() writes its summary to the console itself and returns NULL
# invisibly, so it is called directly; wrapping it in print() would
# only add a stray "NULL" line after the summary

str(bank)
## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "oct" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ response : chr  "no" "no" "no" "no" ...
# preview the leading rows of the bank data frame
# (six rows, which is also the default for head())

print(head(bank, n = 6L))
##   age         job marital education default balance housing loan  contact day
## 1  30  unemployed married   primary      no    1787      no   no cellular  19
## 2  33    services married secondary      no    4789     yes  yes cellular  11
## 3  35  management  single  tertiary      no    1350     yes   no cellular  16
## 4  30  management married  tertiary      no    1476     yes  yes  unknown   3
## 5  59 blue-collar married secondary      no       0     yes   no  unknown   5
## 6  35  management  single  tertiary      no     747      no   no cellular  23
##   month duration campaign pdays previous poutcome response
## 1   oct       79        1    -1        0  unknown       no
## 2   may      220        1   339        4  failure       no
## 3   apr      185        1   330        1  failure       no
## 4   jun      199        4    -1        0  unknown       no
## 5   may      226        1    -1        0  unknown       no
## 6   feb      141        2   176        3  failure       no
# list the column names of the variables in the bank data frame

print(colnames(bank))
##  [1] "age"       "job"       "marital"   "education" "default"   "balance"  
##  [7] "housing"   "loan"      "contact"   "day"       "month"     "duration" 
## [13] "campaign"  "pdays"     "previous"  "poutcome"  "response"
# inspect the class and the attributes of a single variable (age)

print(class(bank[["age"]]))
## [1] "integer"
print(attributes(bank[["age"]]))  # NULL means no special attributes defined
## NULL
# draw a histogram of that same variable; age is evaluated inside
# with() so the plot is labelled with the bare column name

with(data = bank, expr = hist(age))

# frequency tables for the categorical variables, one table per column

# useNA = "always" adds an <NA> count to every table, so the number of
# observations with missing data (if any) is shown explicitly

# the tables are printed in the order the column names are listed;
# invisible() suppresses the list that lapply() itself returns

invisible(lapply(
  c("job", "marital", "education", "default", "housing", "loan"),
  function(column) print(table(bank[[column]], useNA = "always"))
))
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##           478           946           168           112           969 
##       retired self-employed      services       student    technician 
##           230           183           417            84           768 
##    unemployed       unknown          <NA> 
##           128            38             0
## 
## divorced  married   single     <NA> 
##      528     2797     1196        0
## 
##   primary secondary  tertiary   unknown      <NA> 
##       678      2306      1350       187         0
## 
##   no  yes <NA> 
## 4445   76    0
## 
##   no  yes <NA> 
## 1962 2559    0
## 
##   no  yes <NA> 
## 3830  691    0