Decision Summary
Based on the outcome of the analysis, we have resolved to maintain/adopt Google-US, Google-Global, Yahoo, as preferred publishers since it has been evidently proven that, they significantly contribute to profit and volume of booking. Kayak can be considered as a publisher to be invested in as it has a very good ROA and Transaction conversion.
Also, these publishers have a good CTR, Transaction conversion and ROA.
Overture-US and Overture Global can be dropped since it has been evidently proven that, they do not have a good ROA nor do they contribute highly towards net revenue and volume of booking. Even though MSN US and MSN global posses an above average CTR and Transaction conversion yet it isn’t significantly contributing towards profit and bookings.
The cost saved on dropping MSN-US, MSN-Global, Overture-US and Overture Global can be redirected to the adopted publishers to increase profit and ROA.
Appendix R-Code for the Regression analysis, ROC v/s AUC and Decision Tree
library(readxl)
library(lmtest)
#library(dplyr)
library(caret)
library(plotrix)
library(ROCR)
library(rpart)
library(rpart.plot)
index <- sample(1:nrow(air_france), size=0.8*nrow(air_france))
air_france_train <- air_france[index,]
air_france_test <- air_france[-index,]
air_france_train$booking_success <- gsub("TRUE","1",air_france_train$booking_success)
air_france_train$booking_success <- gsub("FALSE","0",air_france_train$booking_success)
air_france_test$booking_success <- gsub("TRUE","1",air_france_test$booking_success)
air_france_test$booking_success <- gsub("FALSE","0",air_france_test$booking_success)
air_france_train$booking_success <- as.numeric(air_france_train$booking_success)
air_france_test$booking_success <- as.numeric(air_france_test$booking_success)
#Create linear models to try
my_logit_bookings_success <- glm(booking_success ~ campaign, data=air_france_train)
summary(my_logit_bookings_success)
##
## Call:
## glm(formula = booking_success ~ campaign, data = air_france_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.58621 -0.07841 -0.04687 -0.02899 0.98795
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18595 0.01685 11.038 < 2e-16 ***
## campaignAir France Branded 0.40026 0.03831 10.447 < 2e-16 ***
## campaignAir France Global Campaign -0.02077 0.02430 -0.855 0.392662
## campaignBusiness Class -0.14595 0.05505 -2.651 0.008058 **
## campaignFrench Destinations -0.12002 0.03222 -3.724 0.000199 ***
## campaignGeo Targeted Atlanta -0.18595 0.05072 -3.666 0.000250 ***
## campaignGeo Targeted Boston -0.15205 0.02942 -5.168 2.50e-07 ***
## campaignGeo Targeted Chicago -0.15696 0.03576 -4.389 1.17e-05 ***
## campaignGeo Targeted Cincinnati -0.18595 0.10830 -1.717 0.086075 .
## campaignGeo Targeted DC -0.15892 0.03004 -5.290 1.29e-07 ***
## campaignGeo Targeted Detroit -0.17390 0.03333 -5.217 1.92e-07 ***
## campaignGeo Targeted Houston -0.15437 0.03173 -4.865 1.19e-06 ***
## campaignGeo Targeted Los Angeles -0.16982 0.02894 -5.868 4.82e-09 ***
## campaignGeo Targeted Miami -0.16214 0.04381 -3.701 0.000218 ***
## campaignGeo Targeted New York -0.10376 0.02444 -4.245 2.24e-05 ***
## campaignGeo Targeted Philadelphia -0.17296 0.03429 -5.044 4.77e-07 ***
## campaignGeo Targeted San Francisco -0.15791 0.03042 -5.190 2.21e-07 ***
## campaignGeo Targeted Seattle -0.11928 0.05072 -2.352 0.018745 *
## campaignGoogle_Yearlong 2006 -0.15661 0.02125 -7.369 2.12e-13 ***
## campaignOutside Western Europe -0.18595 0.07750 -2.399 0.016478 *
## campaignParis & France Terms 0.03793 0.02822 1.344 0.178979
## campaignUnassigned -0.10754 0.01881 -5.718 1.17e-08 ***
## campaignWestern Europe Destinations -0.13908 0.02232 -6.230 5.21e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.06867428)
##
## Null deviance: 272.55 on 3607 degrees of freedom
## Residual deviance: 246.20 on 3585 degrees of freedom
## AIC: 600.39
##
## Number of Fisher Scoring iterations: 2
my_logit_bookings_success_6 <- glm(booking_success_6 ~ clicks + impressions + campaign, data=air_france_train)
summary(my_logit_bookings_success_6)
##
## Call:
## glm(formula = booking_success_6 ~ clicks + impressions + campaign,
## data = air_france_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.21460 -0.00836 -0.00343 0.00016 0.99641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.000e-02 6.440e-03 6.211 5.86e-10 ***
## clicks 6.843e-05 1.987e-06 34.441 < 2e-16 ***
## impressions -3.370e-08 1.170e-08 -2.881 0.003982 **
## campaignAir France Branded 9.957e-02 1.493e-02 6.667 3.00e-11 ***
## campaignAir France Global Campaign -3.117e-02 9.273e-03 -3.362 0.000783 ***
## campaignBusiness Class -4.171e-02 2.101e-02 -1.985 0.047184 *
## campaignFrench Destinations -4.250e-02 1.230e-02 -3.455 0.000557 ***
## campaignGeo Targeted Atlanta -4.026e-02 1.936e-02 -2.080 0.037633 *
## campaignGeo Targeted Boston -4.027e-02 1.123e-02 -3.584 0.000342 ***
## campaignGeo Targeted Chicago -4.037e-02 1.365e-02 -2.957 0.003125 **
## campaignGeo Targeted Cincinnati -4.014e-02 4.133e-02 -0.971 0.331544
## campaignGeo Targeted DC -4.026e-02 1.147e-02 -3.510 0.000454 ***
## campaignGeo Targeted Detroit -4.019e-02 1.273e-02 -3.158 0.001603 **
## campaignGeo Targeted Houston -4.038e-02 1.211e-02 -3.333 0.000867 ***
## campaignGeo Targeted Los Angeles -4.029e-02 1.105e-02 -3.646 0.000270 ***
## campaignGeo Targeted Miami -4.032e-02 1.672e-02 -2.411 0.015945 *
## campaignGeo Targeted New York -3.185e-02 9.334e-03 -3.412 0.000652 ***
## campaignGeo Targeted Philadelphia -4.019e-02 1.309e-02 -3.070 0.002154 **
## campaignGeo Targeted San Francisco -4.044e-02 1.162e-02 -3.481 0.000505 ***
## campaignGeo Targeted Seattle -4.088e-02 1.936e-02 -2.112 0.034783 *
## campaignGoogle_Yearlong 2006 -4.324e-02 8.116e-03 -5.328 1.05e-07 ***
## campaignOutside Western Europe -4.093e-02 2.958e-02 -1.384 0.166477
## campaignParis & France Terms -2.281e-03 1.077e-02 -0.212 0.832261
## campaignUnassigned -3.684e-02 7.184e-03 -5.128 3.08e-07 ***
## campaignWestern Europe Destinations -3.649e-02 8.525e-03 -4.280 1.92e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.01000003)
##
## Null deviance: 52.221 on 3607 degrees of freedom
## Residual deviance: 35.830 on 3583 degrees of freedom
## AIC: -6349.5
##
## Number of Fisher Scoring iterations: 2
my_logit_bookings_success_1 <- glm(booking_success ~ clicks + impressions, data=air_france_train, family = "binomial")
summary(my_logit_bookings_success_1)
##
## Call:
## glm(formula = booking_success ~ clicks + impressions, family = "binomial",
## data = air_france_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.005e+14 1.125e+06 -89373799 <2e-16 ***
## clicks 7.728e+11 1.290e+03 598857054 <2e-16 ***
## impressions 3.237e+08 7.829e+00 41349641 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2052.2 on 3607 degrees of freedom
## Residual deviance: 18310.2 on 3605 degrees of freedom
## AIC: 18316
##
## Number of Fisher Scoring iterations: 23
my_logit_bookings_success_6 <- glm(booking_success_6 ~ clicks + impressions, data=air_france_train, family = "binomial")
summary(my_logit_bookings_success_6)
##
## Call:
## glm(formula = booking_success_6 ~ clicks + impressions, family = "binomial",
## data = air_france_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8090 -0.0830 -0.0813 -0.0809 3.3832
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.724e+00 2.801e-01 -20.440 < 2e-16 ***
## clicks 5.170e-03 4.984e-04 10.375 < 2e-16 ***
## impressions -2.854e-05 4.074e-06 -7.006 2.45e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 552.60 on 3607 degrees of freedom
## Residual deviance: 215.43 on 3605 degrees of freedom
## AIC: 221.43
##
## Number of Fisher Scoring iterations: 8
test_data <- data.frame(
clicks = c(500),
impressions = c(50020)
)
#Test best regressions found
my_prediction_testing_1 <- predict(my_logit_bookings_success_1, air_france_test, type="response")
my_prediction_testing_6 <- predict(my_logit_bookings_success_6, air_france_test, type="response")
#Create a confussion matrix to see the performance of our model
confusionMatrix(data=as.factor(as.numeric(my_prediction_testing_6 > 0.5)),
reference = as.factor(as.numeric(air_france_test$booking_success_6))) #99% accuracy
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 878 7
## 1 5 12
##
## Accuracy : 0.9867
## 95% CI : (0.9769, 0.9931)
## No Information Rate : 0.9789
## P-Value [Acc > NIR] : 0.05865
##
## Kappa : 0.6599
##
## Mcnemar's Test P-Value : 0.77283
##
## Sensitivity : 0.9943
## Specificity : 0.6316
## Pos Pred Value : 0.9921
## Neg Pred Value : 0.7059
## Prevalence : 0.9789
## Detection Rate : 0.9734
## Detection Prevalence : 0.9812
## Balanced Accuracy : 0.8130
##
## 'Positive' Class : 0
##
#Regression not very useful because most of the values in business succes is 0
my_logit <- glm(total_cost ~ campaign , data=air_france)
summary(my_logit, options=options(max.print=100000))
##
## Call:
## glm(formula = total_cost ~ campaign, data = air_france)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1960 -167 -88 -7 44228
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 286.23 75.71 3.781 0.000158 ***
## campaignAir France Branded 1673.74 172.05 9.728 < 2e-16 ***
## campaignAir France Global Campaign -66.16 109.14 -0.606 0.544395
## campaignBusiness Class -180.78 248.88 -0.726 0.467659
## campaignFrench Destinations -205.93 149.43 -1.378 0.168227
## campaignGeneral Terms 320.24 1322.23 0.242 0.808637
## campaignGeo Targeted Atlanta -279.06 229.84 -1.214 0.224757
## campaignGeo Targeted Boston -276.67 132.92 -2.081 0.037446 *
## campaignGeo Targeted Chicago -269.58 155.16 -1.737 0.082377 .
## campaignGeo Targeted Cincinnati -282.48 446.49 -0.633 0.526976
## campaignGeo Targeted DC -277.58 132.61 -2.093 0.036389 *
## campaignGeo Targeted Detroit -279.38 149.43 -1.870 0.061587 .
## campaignGeo Targeted Houston -269.05 144.06 -1.868 0.061882 .
## campaignGeo Targeted Los Angeles -274.62 130.29 -2.108 0.035102 *
## campaignGeo Targeted Miami -273.95 198.10 -1.383 0.166759
## campaignGeo Targeted New York -253.11 108.55 -2.332 0.019757 *
## campaignGeo Targeted Philadelphia -278.12 157.08 -1.771 0.076690 .
## campaignGeo Targeted San Francisco -269.52 140.26 -1.922 0.054724 .
## campaignGeo Targeted Seattle -252.68 235.63 -1.072 0.283603
## campaignGoogle_Yearlong 2006 -128.62 95.31 -1.350 0.177235
## campaignOutside Western Europe -243.53 360.83 -0.675 0.499768
## campaignParis & France Terms 320.85 126.66 2.533 0.011338 *
## campaignUnassigned -116.32 84.66 -1.374 0.169522
## campaignWestern Europe Destinations -194.63 100.72 -1.932 0.053363 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1742555)
##
## Null deviance: 8125095270 on 4509 degrees of freedom
## Residual deviance: 7817102775 on 4486 degrees of freedom
## AIC: 77637
##
## Number of Fisher Scoring iterations: 2
my_logit <- glm(amount ~ campaign , data=air_france)
summary(my_logit, options=options(max.print=100000))
##
## Call:
## glm(formula = amount ~ campaign, data = air_france)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -32190 -640 -107 -35 535273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2594.2 826.8 3.138 0.00171 **
## campaignAir France Branded 29595.8 1878.9 15.752 < 2e-16 ***
## campaignAir France Global Campaign -934.7 1191.8 -0.784 0.43293
## campaignBusiness Class -2589.6 2717.9 -0.953 0.34075
## campaignFrench Destinations -2534.9 1631.8 -1.554 0.12037
## campaignGeneral Terms -616.3 14439.1 -0.043 0.96596
## campaignGeo Targeted Atlanta -2589.6 2510.0 -1.032 0.30225
## campaignGeo Targeted Boston -2574.5 1451.5 -1.774 0.07619 .
## campaignGeo Targeted Chicago -2519.0 1694.4 -1.487 0.13717
## campaignGeo Targeted Cincinnati -2594.2 4875.7 -0.532 0.59471
## campaignGeo Targeted DC -2558.9 1448.2 -1.767 0.07730 .
## campaignGeo Targeted Detroit -2585.4 1631.8 -1.584 0.11317
## campaignGeo Targeted Houston -2533.3 1573.2 -1.610 0.10741
## campaignGeo Targeted Los Angeles -2580.1 1422.8 -1.813 0.06983 .
## campaignGeo Targeted Miami -2585.2 2163.3 -1.195 0.23214
## campaignGeo Targeted New York -2470.7 1185.4 -2.084 0.03719 *
## campaignGeo Targeted Philadelphia -2589.5 1715.3 -1.510 0.13121
## campaignGeo Targeted San Francisco -2563.6 1531.7 -1.674 0.09425 .
## campaignGeo Targeted Seattle -2513.7 2573.1 -0.977 0.32866
## campaignGoogle_Yearlong 2006 -2551.2 1040.8 -2.451 0.01427 *
## campaignOutside Western Europe -2594.2 3940.4 -0.658 0.51034
## campaignParis & France Terms -1787.2 1383.2 -1.292 0.19640
## campaignUnassigned -1953.8 924.5 -2.113 0.03463 *
## campaignWestern Europe Destinations -2487.6 1099.8 -2.262 0.02376 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 207803256)
##
## Null deviance: 1.0065e+12 on 4509 degrees of freedom
## Residual deviance: 9.3221e+11 on 4486 degrees of freedom
## AIC: 99201
##
## Number of Fisher Scoring iterations: 2
#AUC ROC
my_prediction_training <- predict(my_logit_bookings_success_6, air_france_train, type="response")
my_prediction <- my_prediction_training
pred_val_logit <- prediction(my_prediction, air_france_train$booking_success_6)
perf_logit <- performance(pred_val_logit, "tpr", "fpr")
plot(perf_logit)

##Building a gini decision tree as a challenger model
my_tree <- rpart(booking_success_6 ~ clicks+impressions,
data=air_france_train, method = "class", cp=0.015)
rpart.plot(my_tree, type=1, extra=1)

## Building ROC AUC for the tree:
my_tree_predict_test <- predict(my_tree, air_france_test, type="prob")
my_tree_predict_train <- predict(my_tree, air_france_train, type="prob")
my_tree_train_prediction <- prediction(my_tree_predict_train[ , 2],
air_france_train$booking_success_6)
my_tree_performance <- performance(my_tree_train_prediction, "tpr", "fpr")
plot(my_tree_performance, col="black")
plot(perf_logit, col="green4", add=TRUE)
