This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
## the following syntax is adapted from Dr. Miller's chapter named "Retaining customers"
data <- read.csv('att.csv') #read data
library(lattice) # lattice plot
library(vcd) # mosaic plots
## Loading required package: grid
library(gam) # generalized additive models for probability smooth
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-7
library(rpart) # tree-structured modeling
library(e1071) # support vector machines
library(randomForest) # random forests
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(nnet) # neural networks
library(rpart.plot) # plot tree-structured model information
library(ROCR) # ROC curve objects for binary classification
# user-defined function for plotting ROC curve using ROC objects from ROCR
# read them as character fields initially
att <- read.csv("att.csv", stringsAsFactors = FALSE)
print(str(att))
## 'data.frame': 1000 obs. of 10 variables:
## $ pick : chr "OCC" "ATT" "OCC" "OCC" ...
## $ income : chr "<7.5" "45-75" "" "" ...
## $ moves : chr "0" "2" "0" "2" ...
## $ age : chr "35-44" "25-34" "" "65+" ...
## $ education : chr "HS" "HS" "" "<HS" ...
## $ employment: chr "F" "F" "" "R" ...
## $ usage : int 9 2 6 7 0 0 3 1 0 2 ...
## $ nonpub : chr "YES" "YES" "NO" "NO" ...
## $ reachout : chr "NO" "NO" "NO" "NO" ...
## $ card : chr "NO" "NO" "YES" "NO" ...
## NULL
# convert blank character fields to missing data codes
att[att == ""] <- NA
# convert character fields to factor fields
att$pick <- factor(att$pick)
att$income <- factor(att$income)
att$moves <- factor(att$moves)
att$age <- factor(att$age)
att$education <- factor(att$education)
att$employment <- factor(att$employment)
att$nonpub <- factor(att$nonpub)
att$reachout <- factor(att$reachout)
att$card <- factor(att$card)
# check revised structure of att data frame
print(str(att))
## 'data.frame': 1000 obs. of 10 variables:
## $ pick : Factor w/ 2 levels "ATT","OCC": 2 1 2 2 2 2 2 2 2 2 ...
## $ income : Factor w/ 7 levels "<7.5",">75","15-25",..: 1 6 NA NA NA NA 4 3 NA 4 ...
## $ moves : Factor w/ 9 levels ">10","0","1",..: 2 4 2 4 2 2 2 2 2 2 ...
## $ age : Factor w/ 6 levels "18-24","25-34",..: 3 2 NA 6 6 6 4 5 5 4 ...
## $ education : Factor w/ 6 levels "<HS",">BA","BA",..: 5 5 NA 1 5 NA 1 5 1 1 ...
## $ employment: Factor w/ 7 levels "D","F","H","P",..: 2 2 NA 5 3 NA 2 5 2 3 ...
## $ usage : int 9 2 6 7 0 0 3 1 0 2 ...
## $ nonpub : Factor w/ 2 levels "NO","YES": 2 2 1 1 1 1 1 1 1 1 ...
## $ reachout : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
## $ card : Factor w/ 2 levels "NO","YES": 1 1 2 1 1 1 1 2 1 1 ...
## NULL
# select usage and AT&T marketing plan factors
attwork <- subset(att, select = c("pick", "usage", "reachout", "card"))
attwork <- na.omit(attwork)
# listwise case deletion for usage and marketing factors
attwork <- na.omit(attwork)
print(summary(attwork))
## pick usage reachout card
## ATT:502 Min. : 0.00 NO :919 NO :701
## OCC:479 1st Qu.: 1.00 YES: 62 YES:280
## Median : 6.00
## Mean : 16.32
## 3rd Qu.: 23.00
## Max. :291.00
# provide overview of data
print(summary(att))
## pick income moves age education employment
## ATT:504 15-25 :185 0 :597 18-24: 61 <HS :153 F :548
## OCC:496 25-35 :171 1 :221 25-34:214 >BA : 60 R :215
## 7.5-15 :114 2 : 88 35-44:203 BA :150 H : 93
## 35-45 :107 3 : 38 45-54:152 Coll:187 P : 67
## <7.5 : 96 4 : 16 55-64:153 HS :361 U : 26
## (Other):112 (Other): 23 65+ :184 Voc : 54 (Other): 25
## NA's :215 NA's : 17 NA's : 33 NA's: 35 NA's : 26
## usage nonpub reachout card
## Min. : 0.00 NO :808 NO :919 NO :702
## 1st Qu.: 1.00 YES :188 YES : 62 YES :281
## Median : 6.00 NA's: 4 NA's: 19 NA's: 17
## Mean : 16.34
## 3rd Qu.: 23.00
## Max. :291.00
##
# -----------------
# usage and pick
# -----------------
# examine relationship between age and response to promotion
lattice_plot_object <- histogram(~usage | pick, data = att,
type = "density", xlab = "Telephone Usage (Minutes per Month)",
layout = c(1,2))
print(lattice_plot_object) # switchers tend to have lower usage
att_gam_model <- gam(pick == "OCC" ~ s(usage), family=binomial,data=att)
# probability smooth for usage and switching
plot(att$usage, att$pick == "OCC", type="n",
ylim=c(-0.1,1.1), yaxt="n",
ylab="Estimated Probability of Switching",
xlab="Telephone Usage (Minutes per Month)")
axis(2, at=c(0,.5,1))
points(jitter(att$usage),
att$pick=="OCC",pch="|")
o <- order(att$usage)
lines(att$usage[o],fitted(att_gam_model)[o])