Background

Tayko is a firm that sells games and educational software through mail-order catalogs. To expand its customer base, it joined a consortium of similar catalog firms, which gives it access to a shared pool from which it can pull 200,000 names. Our goal is to determine which of those names have the best chance of performing well. To do this we will develop two predictive models: one that classifies customers as purchasers or non-purchasers, and one that predicts how much the purchasers would spend.

When estimating profit before any modeling: purchasers spent $205.04 on average.

With the case's 0.053 response rate and $2 mailing cost, the gross profit per catalog mailed was 0.053 * $205.04 - $2 ≈ $8.87.

Applied to the 180,000 names remaining after the 20,000-name test mailing, the overall expected profit is 180,000 * $8.87 ≈ $1,596,082.
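As a quick sanity check, this arithmetic can be reproduced in R (a sketch; the response rate, mailing cost, and name counts below come from the case description, not from the data file):

# Baseline profit estimate, before any modeling
avg.spending  = 205.04    # average spending per purchaser
response.rate = 0.053     # population purchase rate
mailing.cost  = 2         # cost per catalog mailed
n.remaining   = 180000    # names left after the 20,000-name test mailing

profit.per.name = response.rate * avg.spending - mailing.cost  # ~ $8.87
total.profit    = n.remaining * profit.per.name                # ~ $1,596,082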

Now let us consider the other variables in the data set.

D = read.csv("C:/Users/marsh/Downloads/Tayko.csv")
names(D)
##  [1] "sequence_number"      "US"                   "source_a"            
##  [4] "source_c"             "source_b"             "source_d"            
##  [7] "source_e"             "source_m"             "source_o"            
## [10] "source_h"             "source_r"             "source_s"            
## [13] "source_t"             "source_u"             "source_p"            
## [16] "source_x"             "source_w"             "Freq"                
## [19] "last_update_days_ago" "X1st_update_days_ago" "Web.order"           
## [22] "Gender.male"          "Address_is_res"       "Purchase"            
## [25] "Spending"
D = D[, -1]  # drop sequence_number, an ID with no predictive value

Partitioning the data

library(caret)  # provides preProcess()

n = nrow(D)       # 2000 records in total
n1 = 800          # training
n2 = 700          # validation
n3 = n - n1 - n2  # holdout (500)
permuted.index = sample(1:n)  # shuffle the rows before splitting

D = D[permuted.index, ]

train = D[1:n1, ]
valid = D[(n1+1):(n1+n2), ]
holdout = D[(n1+n2+1):n, ]

# Rescale every column to [0, 1], using ranges estimated from the training set only
recipe = preProcess(train, method = "range")
train.norm = predict(recipe, train)
valid.norm = predict(recipe, valid)
holdout.norm = predict(recipe, holdout)

Model 1: Logistic Regression

# Full logistic model for Purchase; Spending is excluded because it is only
# observed after a purchase and is the target of the second model
logistic.model = glm(Purchase ~ . - Spending, data = train.norm, family = binomial)

# Backward elimination by AIC
selected.logistic.model = step(logistic.model, direction = "backward", trace = 0)

# Predicted purchase probabilities for the holdout set
pred.prob = predict(selected.logistic.model, newdata = holdout.norm, type = "response")

summary(logistic.model)
## 
## Call:
## glm(formula = Purchase ~ . - Spending, family = binomial, data = train.norm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5313  -0.5720  -0.1152   0.5773   2.2940  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -3.1686     0.7301  -4.340 1.43e-05 ***
## US                     0.1013     0.2877   0.352 0.724638    
## source_a               1.9975     0.6791   2.941 0.003269 ** 
## source_c              -0.7572     0.7744  -0.978 0.328194    
## source_b               0.3253     0.8216   0.396 0.692111    
## source_d               0.2749     0.7412   0.371 0.710733    
## source_e               0.7322     0.6450   1.135 0.256346    
## source_m               1.7570     1.0409   1.688 0.091416 .  
## source_o               0.4733     1.1051   0.428 0.668480    
## source_h              -4.8871     1.3004  -3.758 0.000171 ***
## source_r               0.6169     0.6986   0.883 0.377175    
## source_s               0.1926     0.7835   0.246 0.805815    
## source_t               0.5956     0.8501   0.701 0.483588    
## source_u               1.9163     0.6643   2.885 0.003916 ** 
## source_p              15.4375   501.6727   0.031 0.975451    
## source_x               1.0338     0.8831   1.171 0.241736    
## source_w               0.9319     0.6655   1.400 0.161415    
## Freq                  26.6206     2.5663  10.373  < 2e-16 ***
## last_update_days_ago  -1.8602     1.2046  -1.544 0.122526    
## X1st_update_days_ago   1.0678     1.2537   0.852 0.394374    
## Web.order              0.8398     0.2058   4.081 4.49e-05 ***
## Gender.male           -0.3479     0.2054  -1.693 0.090368 .  
## Address_is_res        -0.9572     0.2871  -3.334 0.000857 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1107.91  on 799  degrees of freedom
## Residual deviance:  605.84  on 777  degrees of freedom
## AIC: 651.84
## 
## Number of Fisher Scoring iterations: 14
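To see how well the selected model separates purchasers from non-purchasers, we can tabulate its holdout predictions against the actual outcomes (a quick sketch; the 0.5 cutoff is our choice, not part of the case):

# Confusion matrix for the holdout set at a 0.5 probability cutoff
pred.class = ifelse(pred.prob > 0.5, 1, 0)
table(Predicted = pred.class, Actual = holdout.norm$Purchase)
mean(pred.class == holdout.norm$Purchase)  # holdout accuracy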

Model 2: Multiple linear regression

# Spending is modeled on purchasers only
train.subset = subset(train.norm, Purchase == 1)

valid.subset = subset(valid.norm, Purchase == 1)

# Full model, then stepwise selection by AIC
multiple.reg = lm(Spending ~ . - Purchase, data = train.subset)
selected.multiple.reg = step(multiple.reg, direction = "both", trace = 0)

# Predicted spending for the validation purchasers and for every holdout record
pred.spending = predict(selected.multiple.reg, newdata = valid.subset)
pred.spending.forHoldout = predict(selected.multiple.reg, newdata = holdout.norm)


plot(pred.spending, valid.subset$Spending,
     xlab = "Predicted spending (normalized)", ylab = "Actual spending (normalized)")

forecast::accuracy(pred.spending, valid.subset$Spending)
##                   ME      RMSE        MAE      MPE     MAPE
## Test set 0.002937854 0.1243622 0.07654571 -101.552 132.2501
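Keep in mind that preProcess rescaled Spending to [0, 1] using the training data, so the errors above are on that normalized scale. A small sketch to translate the RMSE back into dollars (assuming the raw train data frame is still in memory):

# Express the validation RMSE in dollars by undoing the range normalization
spend.range = max(train$Spending) - min(train$Spending)
err.dollars = (pred.spending - valid.subset$Spending) * spend.range
sqrt(mean(err.dollars^2))  # RMSE in dollars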

Model 3: Classification tree

library(rpart)
library(rpart.plot)

tree_model <- rpart(Purchase ~ ., data = train, method = "class")
tree_model
## n= 800 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 800 385 0 (0.5187500 0.4812500)  
##   2) Spending< 1.5 415   0 0 (1.0000000 0.0000000) *
##   3) Spending>=1.5 385   0 1 (0.0000000 1.0000000) *
rpart.plot(tree_model, type = 3, extra = 101, nn = TRUE)

The perfect split on Spending is target leakage: any record with positive Spending is by definition a purchaser, so the tree separates the training data flawlessly without learning anything useful for scoring new names.
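For a tree that could actually score new names, Spending must be dropped from the predictors. A sketch of the refit (output not shown):

# Refit without the leaking Spending variable
tree_model2 <- rpart(Purchase ~ . - Spending, data = train, method = "class")
rpart.plot(tree_model2, type = 3, extra = 101, nn = TRUE)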

Lift chart of the expected spending

holdout.norm$pred.prob = pred.prob
holdout.norm$pred.spending.forHoldout = pred.spending.forHoldout
# Adjust for oversampling: purchasers make up about half of this sample but only
# about 5.3% of the population, hence the case's 0.107 scaling factor
holdout.norm$adj.pred.prob = pred.prob * 0.107
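Combining the two models gives an expected spending score for each holdout name: adjusted purchase probability times predicted spending. A sketch of a cumulative gains chart built by sorting the holdout on that score (base R; a package such as gains could draw the same chart):

# Expected spending per name = P(purchase, adjusted) x predicted spending
holdout.norm$expected.spending =
  holdout.norm$adj.pred.prob * holdout.norm$pred.spending.forHoldout

# Sort by the score and accumulate actual (normalized) spending
ord = order(holdout.norm$expected.spending, decreasing = TRUE)
cum.actual = cumsum(holdout.norm$Spending[ord])
plot(seq_along(cum.actual), cum.actual, type = "l",
     xlab = "Number of names targeted", ylab = "Cumulative actual spending",
     main = "Gains chart for expected spending")
abline(0, sum(holdout.norm$Spending) / nrow(holdout.norm), lty = 2)  # random-targeting baseline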