R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## the following syntax is adapted from Dr. Miller's chapter named "Retaining customers"

data <- read.csv('att.csv') #read data
library(lattice)  # lattice plot

library(vcd)  # mosaic plots
## Loading required package: grid
library(gam)  # generalized additive models for probability smooth
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-7
library(rpart)  # tree-structured modeling

library(e1071)  # support vector machines

library(randomForest)  # random forests
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(nnet)  # neural networks

library(rpart.plot)  # plot tree-structured model information

library(ROCR)  # ROC curve objects for binary classification



# user-defined function for plotting ROC curve using ROC objects from ROCR


# read them as character fields initially

att <- read.csv("att.csv", stringsAsFactors = FALSE)

print(str(att))
## 'data.frame':    1000 obs. of  10 variables:
##  $ pick      : chr  "OCC" "ATT" "OCC" "OCC" ...
##  $ income    : chr  "<7.5" "45-75" "" "" ...
##  $ moves     : chr  "0" "2" "0" "2" ...
##  $ age       : chr  "35-44" "25-34" "" "65+" ...
##  $ education : chr  "HS" "HS" "" "<HS" ...
##  $ employment: chr  "F" "F" "" "R" ...
##  $ usage     : int  9 2 6 7 0 0 3 1 0 2 ...
##  $ nonpub    : chr  "YES" "YES" "NO" "NO" ...
##  $ reachout  : chr  "NO" "NO" "NO" "NO" ...
##  $ card      : chr  "NO" "NO" "YES" "NO" ...
## NULL
# convert blank character fields to missing data codes

att[att == ""] <- NA



# convert character fields to factor fields

att$pick <- factor(att$pick)

att$income <- factor(att$income)

att$moves <- factor(att$moves)

att$age <- factor(att$age)

att$education <- factor(att$education)

att$employment <- factor(att$employment)

att$nonpub <- factor(att$nonpub)

att$reachout <- factor(att$reachout)

att$card <- factor(att$card)



# check revised structure of att data frame

print(str(att))
## 'data.frame':    1000 obs. of  10 variables:
##  $ pick      : Factor w/ 2 levels "ATT","OCC": 2 1 2 2 2 2 2 2 2 2 ...
##  $ income    : Factor w/ 7 levels "<7.5",">75","15-25",..: 1 6 NA NA NA NA 4 3 NA 4 ...
##  $ moves     : Factor w/ 9 levels ">10","0","1",..: 2 4 2 4 2 2 2 2 2 2 ...
##  $ age       : Factor w/ 6 levels "18-24","25-34",..: 3 2 NA 6 6 6 4 5 5 4 ...
##  $ education : Factor w/ 6 levels "<HS",">BA","BA",..: 5 5 NA 1 5 NA 1 5 1 1 ...
##  $ employment: Factor w/ 7 levels "D","F","H","P",..: 2 2 NA 5 3 NA 2 5 2 3 ...
##  $ usage     : int  9 2 6 7 0 0 3 1 0 2 ...
##  $ nonpub    : Factor w/ 2 levels "NO","YES": 2 2 1 1 1 1 1 1 1 1 ...
##  $ reachout  : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
##  $ card      : Factor w/ 2 levels "NO","YES": 1 1 2 1 1 1 1 2 1 1 ...
## NULL
# select usage and AT&T marketing plan factors

attwork <- subset(att, select = c("pick", "usage", "reachout", "card"))

attwork <- na.omit(attwork)

# listwise case deletion for usage and marketing factors

attwork <- na.omit(attwork)

print(summary(attwork))
##   pick         usage        reachout   card    
##  ATT:502   Min.   :  0.00   NO :919   NO :701  
##  OCC:479   1st Qu.:  1.00   YES: 62   YES:280  
##            Median :  6.00                      
##            Mean   : 16.32                      
##            3rd Qu.: 23.00                      
##            Max.   :291.00
# provide overview of data

print(summary(att))
##   pick         income        moves        age      education    employment 
##  ATT:504   15-25  :185   0      :597   18-24: 61   <HS :153   F      :548  
##  OCC:496   25-35  :171   1      :221   25-34:214   >BA : 60   R      :215  
##            7.5-15 :114   2      : 88   35-44:203   BA  :150   H      : 93  
##            35-45  :107   3      : 38   45-54:152   Coll:187   P      : 67  
##            <7.5   : 96   4      : 16   55-64:153   HS  :361   U      : 26  
##            (Other):112   (Other): 23   65+  :184   Voc : 54   (Other): 25  
##            NA's   :215   NA's   : 17   NA's : 33   NA's: 35   NA's   : 26  
##      usage         nonpub    reachout     card    
##  Min.   :  0.00   NO  :808   NO  :919   NO  :702  
##  1st Qu.:  1.00   YES :188   YES : 62   YES :281  
##  Median :  6.00   NA's:  4   NA's: 19   NA's: 17  
##  Mean   : 16.34                                   
##  3rd Qu.: 23.00                                   
##  Max.   :291.00                                   
## 
# -----------------

# usage and pick

# -----------------

# examine relationship between age and response to promotion


lattice_plot_object <- histogram(~usage | pick, data = att,
                                 
                                 type = "density", xlab = "Telephone Usage (Minutes per Month)",
                                 
                                 layout = c(1,2))

print(lattice_plot_object)  # switchers tend to have lower usage

att_gam_model <- gam(pick == "OCC"  ~ s(usage), family=binomial,data=att)

# probability smooth for usage and switching


plot(att$usage, att$pick == "OCC", type="n",
     
     ylim=c(-0.1,1.1), yaxt="n",
     
     ylab="Estimated Probability of Switching",
     
     xlab="Telephone Usage (Minutes per Month)")

axis(2, at=c(0,.5,1))

points(jitter(att$usage),
       
       att$pick=="OCC",pch="|")

o <- order(att$usage)

lines(att$usage[o],fitted(att_gam_model)[o])