Tayko is a software catalog firm that sells games and educational software. To expand its customer base, it joined a consortium of similar catalog firms, which gives it the right to pull 200,000 names from the consortium's pool. Our goal is to determine which of those names are most likely to be worth mailing. To do this we build one predictive model that classifies customers as purchasers or non-purchasers, and a second model that predicts how much those purchasers would spend.
To estimate the potential profit: purchasers spent $205.04 on average. With the case's test-mailing response rate of 5.3% and a $2 cost per catalog mailed, the expected gross profit per name mailed is 0.053 * 205.04 - 2 ≈ $8.87. Mailing the 180,000 names remaining in the pool therefore gives an estimated overall profit of 8.87 * 180,000 ≈ $1,596,082.
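The same calculation in R; the response rate, mailing cost, and remaining-pool size are taken from the case description, and the variable names are introduced here just for this calculation.
avg.spend = 205.04        # average spending per purchaser
response.rate = 0.053     # response rate of the test mailing (from the case)
mailing.cost = 2          # cost per catalog mailed (from the case)
remaining.names = 180000  # names left in the pool (from the case)
profit.per.name = response.rate * avg.spend - mailing.cost   # about $8.87
total.profit = profit.per.name * remaining.names             # about $1,596,082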
library(caret)        # preProcess() for normalization
library(rpart)        # classification tree
library(rpart.plot)   # tree plotting
D = read.csv("C:/Users/marsh/Downloads/Tayko.csv")
names(D)
## [1] "sequence_number" "US" "source_a"
## [4] "source_c" "source_b" "source_d"
## [7] "source_e" "source_m" "source_o"
## [10] "source_h" "source_r" "source_s"
## [13] "source_t" "source_u" "source_p"
## [16] "source_x" "source_w" "Freq"
## [19] "last_update_days_ago" "X1st_update_days_ago" "Web.order"
## [22] "Gender.male" "Address_is_res" "Purchase"
## [25] "Spending"
D = D[, -1]   # drop sequence_number, which is just a record ID
n = nrow(D)
n1 = 800      # training set size
n2 = 700      # validation set size
n3 = 500      # holdout set size
permuted.index = sample(1:n)   # shuffle the rows before partitioning
D = D[permuted.index, ]
train = D[1:n1, ]
valid = D[(n1+1):(n1+n2), ]
holdout = D[(n1+n2+1):n, ]
recipe = preProcess(train, method = "range")   # rescale each variable to [0, 1] using the training data
train.norm = predict(recipe, train)
valid.norm = predict(recipe, valid)
holdout.norm = predict(recipe, holdout)
# Logistic regression for Purchase; Spending is excluded since it is known only after a purchase
logistic.model = glm(Purchase ~ . - Spending, data = train.norm, family = binomial)
selected.logistic.model = step(logistic.model, direction = "backward", trace = 0)
pred.prob = predict(selected.logistic.model, newdata = holdout.norm, type = "response")
summary(logistic.model)
##
## Call:
## glm(formula = Purchase ~ . - Spending, family = binomial, data = train.norm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5313 -0.5720 -0.1152 0.5773 2.2940
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1686 0.7301 -4.340 1.43e-05 ***
## US 0.1013 0.2877 0.352 0.724638
## source_a 1.9975 0.6791 2.941 0.003269 **
## source_c -0.7572 0.7744 -0.978 0.328194
## source_b 0.3253 0.8216 0.396 0.692111
## source_d 0.2749 0.7412 0.371 0.710733
## source_e 0.7322 0.6450 1.135 0.256346
## source_m 1.7570 1.0409 1.688 0.091416 .
## source_o 0.4733 1.1051 0.428 0.668480
## source_h -4.8871 1.3004 -3.758 0.000171 ***
## source_r 0.6169 0.6986 0.883 0.377175
## source_s 0.1926 0.7835 0.246 0.805815
## source_t 0.5956 0.8501 0.701 0.483588
## source_u 1.9163 0.6643 2.885 0.003916 **
## source_p 15.4375 501.6727 0.031 0.975451
## source_x 1.0338 0.8831 1.171 0.241736
## source_w 0.9319 0.6655 1.400 0.161415
## Freq 26.6206 2.5663 10.373 < 2e-16 ***
## last_update_days_ago -1.8602 1.2046 -1.544 0.122526
## X1st_update_days_ago 1.0678 1.2537 0.852 0.394374
## Web.order 0.8398 0.2058 4.081 4.49e-05 ***
## Gender.male -0.3479 0.2054 -1.693 0.090368 .
## Address_is_res -0.9572 0.2871 -3.334 0.000857 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1107.91 on 799 degrees of freedom
## Residual deviance: 605.84 on 777 degrees of freedom
## AIC: 651.84
##
## Number of Fisher Scoring iterations: 14
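Before scoring the holdout names, the selected model can be sanity-checked on the validation set, for example with a confusion matrix at a 0.5 cutoff. A minimal sketch, not part of the original output; valid.prob and valid.class are names introduced here.
valid.prob = predict(selected.logistic.model, newdata = valid.norm, type = "response")
valid.class = ifelse(valid.prob > 0.5, 1, 0)
table(predicted = valid.class, actual = valid.norm$Purchase)   # confusion matrix
mean(valid.class == valid.norm$Purchase)                       # overall accuracy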
# Fit the spending model only to purchasers, since Spending is 0 for everyone else
train.subset = subset(train.norm, Purchase==1)
valid.subset = subset(valid.norm, Purchase==1)
multiple.reg = lm(Spending ~ . - Purchase, data = train.subset)
selected.multiple.reg = step(multiple.reg, direction = "both", trace = 0)
pred.spending = predict(selected.multiple.reg, newdata = valid.subset)
pred.spending.forHoldout = predict(selected.multiple.reg, newdata = holdout.norm)   # score all holdout records
plot(pred.spending, valid.subset$Spending,
     xlab = "Predicted spending (normalized)", ylab = "Actual spending (normalized)")
abline(0, 1)   # 45-degree reference line
forecast::accuracy(pred.spending, valid.subset$Spending)
## ME RMSE MAE MPE MAPE
## Test set 0.002937854 0.1243622 0.07654571 -101.552 132.2501
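These error measures are on the normalized [0, 1] Spending scale, because the regression was fit to train.norm. A sketch of converting the RMSE and the predictions back to dollars, assuming the range scaling above used the training-set minimum and maximum of Spending:
spend.min = min(train$Spending)
spend.max = max(train$Spending)
rmse.dollars = 0.1243622 * (spend.max - spend.min)                            # RMSE in dollars
pred.spending.dollars = pred.spending * (spend.max - spend.min) + spend.min   # predictions in dollars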
tree_model <- rpart(Purchase ~., data = train, method = "class")
tree_model
## n= 800
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 385 0 (0.5187500 0.4812500)
## 2) Spending< 1.5 415 0 0 (1.0000000 0.0000000) *
## 3) Spending>=1.5 385 0 1 (0.0000000 1.0000000) *
rpart.plot(tree_model, type = 3, extra = 101, nn = TRUE)
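The printed tree splits only on Spending, which separates the classes perfectly because Spending is positive exactly when Purchase = 1; Spending is known only after a purchase, so it should be excluded when the tree is meant to predict Purchase. A corrected call would look like the following sketch (tree_model2 is a new name):
# Refit the tree without Spending, which leaks the outcome
tree_model2 <- rpart(Purchase ~ . - Spending, data = train, method = "class")
tree_model2
rpart.plot(tree_model2, type = 3, extra = 101, nn = TRUE)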
holdout.norm$pred.prob = pred.prob
holdout.norm$pred.spending.forHoldout = pred.spending.forHoldout
# Scale the predicted probabilities down by 0.107 to correct for oversampling:
# the sample is roughly half purchasers, while the pool's actual response rate
# is about 5.3% (the adjustment factor given in the case)
holdout.norm$adj.pred.prob = pred.prob*0.107
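The adjusted purchase probability and the predicted spending can then be combined into an expected-spending score for ranking the holdout names. A sketch; exp.spending is a new column name, and spending here is still on the normalized scale.
holdout.norm$exp.spending = holdout.norm$adj.pred.prob * holdout.norm$pred.spending.forHoldout
ranked.holdout = holdout.norm[order(-holdout.norm$exp.spending), ]   # most promising names first
head(ranked.holdout$exp.spending)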