Preliminaries

Determine notebook defaults:

Load packages:

Read in the data:

load("intuit_online.RData")

# Create new variables on FULL dataset before splitting
intuit <- intuit %>%
  mutate(
    has_orders        = ifelse(numords > 0, 1, 0),
    dollars_per_order = ifelse(numords > 0, dollars / numords, 0),
    sex_male          = ifelse(sex == 1, 1, 0),
    sex_female        = ifelse(sex == 2, 1, 0)
  )

# Same mutations on wave2
intuit.wave2 <- intuit.wave2 %>%
  mutate(
    has_orders        = ifelse(numords > 0, 1, 0),
    dollars_per_order = ifelse(numords > 0, dollars / numords, 0),
    sex_male          = ifelse(sex == 1, 1, 0),
    sex_female        = ifelse(sex == 2, 1, 0)
  )

# Split AFTER mutations
intuit.train <- intuit %>% filter(training == 1)
intuit.test  <- intuit %>% filter(training == 0)

# Make res a factor
intuit.train <- intuit.train %>% mutate(res = as.factor(res))
intuit.test  <- intuit.test  %>% mutate(res = as.factor(res))

Assignment answers

Question 1

Type your answer text here, intervowen with blocks of R code

# LOGISTIC REGRESSION
logit_model <- glm(res ~ speeddown + speedup + last + numords + dollars +
                     sincepurch + version2013 + upgraded + payroll +
                     bizflag + income + medhvalue + sex_male + sex_female +
                     has_orders + dollars_per_order,
                   data = intuit.train,
                   family = binomial(link = "logit"))

# Score test set
intuit.test <- intuit.test %>%
  mutate(logit_score = predict(logit_model, newdata = intuit.test, type = "response"))

# NEURAL NETWORK
set.seed(1234)
nn_model <- nnet(res ~ speeddown + speedup + last + numords + dollars +
                   sincepurch + version2013 + upgraded + payroll +
                   bizflag + income + medhvalue + sex_male + sex_female +
                   has_orders + dollars_per_order,
                 data = intuit.train,
                 size = 5,
                 decay = 0.01,
                 maxit = 500)
# weights:  91
initial  value 27656.002491 
iter  10 value 5729.813308
iter  20 value 5540.332826
iter  30 value 5446.734078
iter  40 value 5316.409534
iter  50 value 5187.616413
iter  60 value 5106.272062
iter  70 value 5020.821044
iter  80 value 4911.480330
iter  90 value 4857.000186
iter 100 value 4801.781516
iter 110 value 4769.192332
iter 120 value 4748.104794
iter 130 value 4739.896958
iter 140 value 4736.335078
iter 150 value 4733.674338
iter 160 value 4732.263168
iter 170 value 4730.708654
iter 180 value 4727.935918
iter 190 value 4726.188365
iter 200 value 4725.141060
iter 210 value 4724.531912
iter 220 value 4723.555278
iter 230 value 4722.891555
iter 240 value 4722.842140
iter 250 value 4722.816144
iter 260 value 4722.748847
iter 270 value 4722.651608
iter 280 value 4722.347087
iter 290 value 4722.289794
iter 300 value 4722.286867
iter 300 value 4722.286832
iter 300 value 4722.286832
final  value 4722.286832 
converged
# Score test set
intuit.test <- intuit.test %>%
  mutate(nn_score = predict(nn_model, newdata = intuit.test, type = "raw")[,1])

# GAINS CURVE comparing both models on test set
gainsplot(intuit.test$logit_score, intuit.test$nn_score,
          outcome.var = intuit.test$res)

                   Score  AUGC
 intuit.test$logit_score 0.745
    intuit.test$nn_score 0.812
# VARIABLE IMPORTANCE for both models on test set
varimpplot(logit_model, target = "res", data = intuit.test)

varimpplot(nn_model,    target = "res", data = intuit.test)

Question 2

Type your answer text here, intervowen with blocks of R code

# Targeting threshold
# Need: P(respond) x $180 - $1.60 > $5.60
# So:   P(respond) > $7.20 / $180 = 0.04
threshold <- (5.60 + 1.60) / 180
cat("Targeting threshold:", round(threshold, 4), "\n")
Targeting threshold: 0.04 
# Score wave2 with best model (swap logit_model for nn_model if nn won)
intuit.wave2 <- intuit.wave2 %>%
  mutate(
    best.score   = predict(logit_model, newdata = intuit.wave2, type = "response"),
    target.wave2 = ifelse(best.score > threshold, 1, 0)
  )

table(intuit.wave2$target.wave2)

    0     1 
16657  8343 
cat("Targeting", sum(intuit.wave2$target.wave2), "out of 25,000 customers\n")
Targeting 8343 out of 25,000 customers

Question 3

Type your answer text here, intervowen with blocks of R code

n_targeted      <- sum(intuit.wave2$target.wave2)
expected_conv   <- sum(intuit.wave2$best.score[intuit.wave2$target.wave2 == 1])
expected_profit <- (expected_conv * 180) - (n_targeted * 1.60)
profit_per_cust <- expected_profit / n_targeted

cat("Customers targeted:            ", n_targeted, "\n")
Customers targeted:             8343 
cat("Expected conversions:          ", round(expected_conv, 0), "\n")
Expected conversions:           654 
cat("Expected total profit:        $", round(expected_profit, 0), "\n")
Expected total profit:        $ 104328 
cat("Expected profit per customer: $", round(profit_per_cust, 2), "\n")
Expected profit per customer: $ 12.5 

Question 4

Type your answer text here, intervowen with blocks of R code

# Significant predictors from logistic regression
coef(summary(logit_model)) %>%
  as.data.frame() %>%
  rownames_to_column("variable") %>%
  arrange(`Pr(>|z|)`) %>%
  filter(`Pr(>|z|)` < 0.05)
# Variable importance plots on test set
varimpplot(logit_model, target = "res", data = intuit.test)

varimpplot(nn_model,    target = "res", data = intuit.test)

Submission

Type your answer text here, intervowen with blocks of R code

intuit.wave2.submit <- intuit.wave2 %>%
  select(id, best.score, target.wave2) %>%
  mutate(
    group   = "Kathryn_GG_Emma",  # your actual first names
    section = 1                   # your actual section number
  )

saveRDS(intuit.wave2.submit, "Kathryn_GG_Emma.RDS")