This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Identifying Customer Targets (R)
# call in R packages for use in this study
library(lattice) # multivariate data visualization
library(vcd) # data visualization for categorical variables
## Loading required package: grid
library(ROCR) # evaluation of binary classifiers
# read bank data into R, creating data frame bank
# note that this is a semicolon-delimited file
bank <- read.csv("bank.csv", sep = ";", stringsAsFactors = FALSE)
# examine the structure of the bank data frame
print(str(bank))
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "oct" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ response : chr "no" "no" "no" "no" ...
## NULL
# look at the first few rows of the bank data frame
print(head(bank))
## age job marital education default balance housing loan contact day
## 1 30 unemployed married primary no 1787 no no cellular 19
## 2 33 services married secondary no 4789 yes yes cellular 11
## 3 35 management single tertiary no 1350 yes no cellular 16
## 4 30 management married tertiary no 1476 yes yes unknown 3
## 5 59 blue-collar married secondary no 0 yes no unknown 5
## 6 35 management single tertiary no 747 no no cellular 23
## month duration campaign pdays previous poutcome response
## 1 oct 79 1 -1 0 unknown no
## 2 may 220 1 339 4 failure no
## 3 apr 185 1 330 1 failure no
## 4 jun 199 4 -1 0 unknown no
## 5 may 226 1 -1 0 unknown no
## 6 feb 141 2 176 3 failure no
# look at the list of column names for the variables
print(names(bank))
## [1] "age" "job" "marital" "education" "default" "balance"
## [7] "housing" "loan" "contact" "day" "month" "duration"
## [13] "campaign" "pdays" "previous" "poutcome" "response"
# look at class and attributes of one of the variables
print(class(bank$age))
## [1] "integer"
print(attributes(bank$age)) # NULL means no special attributes defined
## NULL
# plot a histogram for this variable
with(bank, hist(age))
# examine the frequency tables for categorical/factor variables
# showing the number of observations with missing data (if any)
print(table(bank$job , useNA = c("always")))
##
## admin. blue-collar entrepreneur housemaid management
## 478 946 168 112 969
## retired self-employed services student technician
## 230 183 417 84 768
## unemployed unknown <NA>
## 128 38 0
print(table(bank$marital , useNA = c("always")))
##
## divorced married single <NA>
## 528 2797 1196 0
print(table(bank$education , useNA = c("always")))
##
## primary secondary tertiary unknown <NA>
## 678 2306 1350 187 0
print(table(bank$default , useNA = c("always")))
##
## no yes <NA>
## 4445 76 0
print(table(bank$housing , useNA = c("always")))
##
## no yes <NA>
## 1962 2559 0
print(table(bank$loan , useNA = c("always")))
##
## no yes <NA>
## 3830 691 0