##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'tidyr' was built under R version 3.3.2
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'readr' was built under R version 3.3.3
## Warning: package 'pander' was built under R version 3.3.3
## Warning: package 'gRain' was built under R version 3.3.3
## Loading required package: gRbase
## Warning: package 'gRbase' was built under R version 3.3.3
## Loading required package: graph
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: grid
ds <- read_csv("C:/ThoughtWorks/conversion_data.csv",
col_types = cols(gender = col_factor(levels = c("M","F"))))
glimpse(tbl_df(ds))
## Observations: 1,143
## Variables: 11
## $ ad_id <int> 708746, 708749, 708771, 708815, 708818, 70...
## $ xyz_campaign_id <int> 916, 916, 916, 916, 916, 916, 916, 916, 91...
## $ fb_campaign_id <int> 103916, 103917, 103920, 103928, 103928, 10...
## $ age <chr> "30-34", "30-34", "30-34", "30-34", "30-34...
## $ gender <fctr> M, M, M, M, M, M, M, M, M, M, M, M, M, M,...
## $ interest <int> 15, 16, 20, 28, 28, 29, 15, 16, 27, 28, 31...
## $ Impressions <int> 7350, 17861, 693, 4259, 4133, 1915, 15615,...
## $ Clicks <int> 1, 2, 0, 1, 1, 0, 3, 1, 1, 3, 0, 0, 0, 0, ...
## $ Spent <dbl> 1.43, 1.82, 0.00, 1.25, 1.29, 0.00, 4.77, ...
## $ Total_Conversion <int> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Approved_Conversion <int> 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...
(ids <- which(sapply(ds, function(x) length(unique(x))) == nrow(ds)))
## ad_id
## 1
ignore<-names(ids)
vars<-names(ds)
mvc <- sapply(ds[vars], function(x) sum(is.na(x)))
(mvn <- names(which(mvc == nrow(ds))))
## character(0)
##### Ignore missing values
ignore <- union(ignore,mvn)
(constants <- names(which(sapply(ds[vars], function(x) all(x == x[1L])))))
## character(0)
ignore <- union(ignore, constants)
###### Removing columns
ds<-ds[ , !(names(ds) %in% ignore)]
ds<- ds %>%
mutate(click_rate = as.factor(ifelse(ds$Impressions != 0 , round(ds$Clicks/ds$Impressions*10000) , 0)))
ds<- ds %>%
mutate(conv_rate = ifelse(ds$Total_Conversion != 0 , round(ds$Approved_Conversion/ds$Total_Conversion*100) , 0))
ds$conv_rate <-cut(ds$conv_rate, seq(0,100,10), right=TRUE, labels=c("0-10%","10-20%","20-30%","30-40%","40-50%","50-60%","60-70%","70-80%","80-90%","90-100%"))
ds$conv_rate[is.na(ds$conv_rate)] <- "0-10%"
ds<- ds %>%
mutate(loss_amount = ifelse(ds$Clicks !=0, ds$Spent*(1 - ds$Approved_Conversion / ds$Clicks),0))
ds<- ds %>%
mutate(loss_rate = ifelse(ds$Spent>0, round(ds$loss_amount/ds$Spent)*100,0))
ds$loss_rate <-cut(ds$loss_rate, seq(0,100,10), right=TRUE, labels=c("0-10%","10-20%","20-30%","30-40%","40-50%","50-60%","60-70%","70-80%","80-90%","90-100%"))
ds$loss_rate[is.na(ds$loss_rate)] <- "0-10%"
write.table(ds, file = "C:/ThoughtWorks/ds_final.csv", col.names = TRUE, row.names = FALSE, sep = ",")
table(ds$xyz_campaign_id)
##
## 916 936 1178
## 54 464 625
ds_916<-ds %>% filter(xyz_campaign_id %in% c("916"))
ds_916 %>%
group_by(fb_campaign_id) %>%
summarize(frequency = n()) %>%
arrange(desc(frequency)) %>%
slice(1:10) %>%
pander(split.table = 120)
| fb_campaign_id | frequency |
|---|---|
| 104013 | 3 |
| 103928 | 2 |
| 103965 | 2 |
| 104012 | 2 |
| 104085 | 2 |
| 104205 | 2 |
| 103916 | 1 |
| 103917 | 1 |
| 103920 | 1 |
| 103929 | 1 |
ds_916 %>%
filter(fb_campaign_id %in% c("104013")) %>%
select(age, gender, interest,Impressions, Clicks,Total_Conversion,Approved_Conversion, click_rate) %>%
pander(split.table = 120)
| age | gender | interest | Impressions | Clicks | Total_Conversion | Approved_Conversion | click_rate |
|---|---|---|---|---|---|---|---|
| 35-39 | M | 16 | 12190 | 2 | 1 | 0 | 2 |
| 35-39 | M | 16 | 12193 | 2 | 1 | 1 | 2 |
| 35-39 | M | 16 | 3332 | 0 | 1 | 1 | 0 |
ds_916 %>%
group_by(interest) %>%
summarize(frequency = n()) %>%
arrange(desc(frequency)) %>%
slice(1:15) %>%
pander(split.table = 120)
| interest | frequency |
|---|---|
| 16 | 16 |
| 15 | 9 |
| 28 | 4 |
| 20 | 3 |
| 27 | 3 |
| 29 | 3 |
| 7 | 2 |
| 10 | 2 |
| 31 | 2 |
| 63 | 2 |
| 18 | 1 |
| 19 | 1 |
| 21 | 1 |
| 24 | 1 |
| 25 | 1 |
Conversion rate is relatively high (more than 50%) for women with two agr group - “30-34” and “45-49” ##### Three segments can be identified for women and men groups with respect to conversion rate - “0-10”, “40-50”, “90-100%”. The first and the third segments include all age group for women and men. ##### ##### What segments company #916 has with high and low cost per acquisition (based on “loss_rate”" feature, “0-10%” means low cost, “90-100%” - high cost“)
For women all age segments are distributed approximately equally between high and low cost. “40-44” group exists only in low cost group , but since it’s just one record, no any relaible conclusion can be done about it
rpivotTable::rpivotTable(ds_916,rows = "gender",cols="age", vals = "Freq",width="800", height="600px")
ds_936<-ds %>% filter(xyz_campaign_id %in% c("936"))
ds_936 %>%
group_by(fb_campaign_id) %>%
summarize(frequency = n()) %>%
arrange(desc(frequency)) %>%
slice(1:20) %>%
pander(split.table = 120)
| fb_campaign_id | frequency |
|---|---|
| 110982 | 4 |
| 111006 | 4 |
| 115510 | 4 |
| 115541 | 4 |
| 115547 | 4 |
| 115485 | 3 |
| 115487 | 3 |
| 115523 | 3 |
| 115545 | 3 |
| 115631 | 3 |
| 116501 | 3 |
| 123457 | 3 |
| 123556 | 3 |
| 123601 | 3 |
| 123671 | 3 |
| 123736 | 3 |
| 108654 | 2 |
| 108672 | 2 |
| 108752 | 2 |
| 108753 | 2 |
ds_936_fb_campaign <- ds_936 %>%
filter(fb_campaign_id %in% c("110982","111006","115510","115541","115547"))
write.table(ds_936_fb_campaign, file = "C:/ThoughtWorks/ds_936_fb_campaign.csv", col.names = TRUE, row.names = FALSE, sep = ",")
ds_936 %>%
group_by(interest) %>%
summarize(frequency = n()) %>%
arrange(desc(frequency)) %>%
slice(1:10) %>%
pander(split.table = 120)
| interest | frequency |
|---|---|
| 16 | 86 |
| 10 | 48 |
| 29 | 39 |
| 27 | 31 |
| 64 | 24 |
| 26 | 23 |
| 15 | 22 |
| 18 | 22 |
| 20 | 22 |
| 28 | 21 |
ds_936_interest <- ds_936 %>%
filter(interest %in% c("16","10","29","27","64","26","15","18","20","28"))
write.table(ds_936_interest, file = "C:/ThoughtWorks/ds_936_interest.csv", col.names = TRUE, row.names = FALSE, sep = ",")
rpivotTable::rpivotTable(ds_936,rows = "gender",cols="age", vals = "Freq",width="800", height="600px")
ds_1178<-ds %>% filter(xyz_campaign_id %in% c("1178"))
ds_1178 %>%
group_by(fb_campaign_id) %>%
summarize(frequency = n()) %>%
arrange(desc(frequency)) %>%
slice(1:20) %>%
pander(split.table = 120)
| fb_campaign_id | frequency |
|---|---|
| 144536 | 6 |
| 144562 | 6 |
| 144599 | 6 |
| 144611 | 6 |
| 144636 | 6 |
| 144674 | 6 |
| 144531 | 5 |
| 144532 | 5 |
| 144537 | 5 |
| 144549 | 5 |
| 144554 | 5 |
| 144565 | 5 |
| 144572 | 5 |
| 144580 | 5 |
| 144585 | 5 |
| 144586 | 5 |
| 144592 | 5 |
| 144597 | 5 |
| 144602 | 5 |
| 144622 | 5 |
ds_1178_fb_campaign <- ds_1178 %>%
filter(fb_campaign_id %in% c("144536","144562","144599","144611","144636","144674"))
write.table(ds_1178_fb_campaign, file = "C:/ThoughtWorks/ds_1178_fb_campaign.csv", col.names = TRUE, row.names = FALSE, sep = ",")
ds_1178 %>%
group_by(interest) %>%
summarize(frequency = n()) %>%
arrange(desc(frequency)) %>%
slice(1:10) %>%
pander(split.table = 120)
| interest | frequency |
|---|---|
| 16 | 38 |
| 10 | 35 |
| 29 | 35 |
| 27 | 26 |
| 28 | 26 |
| 63 | 26 |
| 20 | 24 |
| 64 | 24 |
| 19 | 23 |
| 21 | 22 |
ds_1178_interest <- ds_1178 %>%
filter(interest %in% c("16","10","29","27","28","63","20","64","19","21"))
write.table(ds_1178_interest, file = "C:/ThoughtWorks/ds_1178_interest.csv", col.names = TRUE, row.names = FALSE, sep = ",")
rpivotTable::rpivotTable(ds_1178,rows = "gender",cols="age", vals = "Freq",width="800", height="600px")
#### Predictive modeling
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
ds_1178<-ds %>% filter(xyz_campaign_id %in% c("1178"))
ds_1178<- ds_1178 %>%
mutate(response = ifelse(ds_1178$conv_rate == "0-10%" , 0,1))
### remove xyz_campagn_id, fb_campaign_id
ds_1178_predict<-ds_1178 %>% select( age, gender,interest, Impressions,Clicks,Spent,response)
prop.table(table(ds_1178_predict$response))
##
## 0 1
## 0.3872 0.6128
#### split to training and testing
set.seed(1234)
ds_1178_predict_Dummy <- dummyVars("~.",data=ds_1178_predict, fullRank=T)
ds_1178_predict_final <- as.data.frame(predict(ds_1178_predict_Dummy,ds_1178_predict))
print(names(ds_1178_predict_final))
## [1] "age35-39" "age40-44" "age45-49" "gender.F" "interest"
## [6] "Impressions" "Clicks" "Spent" "response"
ds_1178_predict_final$response <- ifelse(ds_1178_predict_final$response==1,'Success','Failure')
outcomeName <- 'response'
predictorsNames <- names(ds_1178_predict_final)[names(ds_1178_predict_final) != outcomeName]
set.seed(1234)
splitIndex <- createDataPartition(ds_1178_predict_final[,outcomeName], p = .75, list = FALSE, times = 1)
trainDF <- ds_1178_predict_final[ splitIndex,]
testDF <- ds_1178_predict_final[-splitIndex,]
set.seed(42)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
fit <- rpart(response ~ .,
data = trainDF,
method = "class",
control = rpart.control(xval = 12,
minbucket = 10,
cp = 0),
parms = list(split = "information"))
rpart.plot(fit,tweak=1.5)
library(C50)
## Warning: package 'C50' was built under R version 3.3.2
outcome_name<-"response"
predictorsNames <- names(ds_1178_predict)[names(ds_1178_predict) != outcomeName]
ds_1178_predict$response<-as.factor(ds_1178_predict$response)
c50_model <- C5.0(x=ds_1178_predict[,-c(1,2,7)], y=ds_1178_predict$response)
summary(c50_model)
##
## Call:
## C5.0.default(x = ds_1178_predict[, -c(1, 2, 7)], y
## = ds_1178_predict$response)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue May 23 12:30:26 2017
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 625 cases (5 attributes) from undefined.data
##
## Decision tree:
##
## Impressions > 410310: 1 (166/25)
## Impressions <= 410310:
## :...Impressions <= 73222: 0 (133/48)
## Impressions > 73222:
## :...interest <= 21: 1 (104/28)
## interest > 21:
## :...interest <= 28: 0 (84/33)
## interest > 28: 1 (138/53)
##
##
## Evaluation on training data (625 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 5 187(29.9%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 136 106 (a): class 0
## 81 302 (b): class 1
##
##
## Attribute usage:
##
## 100.00% Impressions
## 52.16% interest
##
##
## Time: 0.0 secs
set.seed(42)
model_rf <- caret::train(response ~ .,
data = trainDF,
method = "rf",
preProcess = c("scale", "center"),
trControl = trainControl(method = "repeatedcv",
number = 5,
repeats = 5,
savePredictions = TRUE,
verboseIter = FALSE))
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.3.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
model_rf$finalModel$confusion
## Failure Success class.error
## Failure 68 114 0.6263736
## Success 68 220 0.2361111
imp <- model_rf$finalModel$importance
imp[order(imp, decreasing = TRUE), ]
## Impressions Spent Clicks interest gender.F `age45-49`
## 49.009356 44.910180 36.447911 36.164429 4.467851 4.245633
## `age40-44` `age35-39`
## 4.100636 3.803721
# estimate variable importance
importance <- varImp(model_rf, scale = TRUE)
plot(importance)
##### Predicting test data
confusionMatrix(predict(model_rf, testDF), testDF$response)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Failure Success
## Failure 22 28
## Success 38 67
##
## Accuracy : 0.5742
## 95% CI : (0.4923, 0.6532)
## No Information Rate : 0.6129
## P-Value [Acc > NIR] : 0.8579
##
## Kappa : 0.0742
## Mcnemar's Test P-Value : 0.2679
##
## Sensitivity : 0.3667
## Specificity : 0.7053
## Pos Pred Value : 0.4400
## Neg Pred Value : 0.6381
## Prevalence : 0.3871
## Detection Rate : 0.1419
## Detection Prevalence : 0.3226
## Balanced Accuracy : 0.5360
##
## 'Positive' Class : Failure
##
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.2
# calculate correlation matrix
corMatMy <- cor(trainDF[, -9])
corrplot(corMatMy, order = "hclust")
There are three highly correlated features - Impressions, Clicks and Spent. We can filter out the features with a correlation higher than 0.7, keeping the feature with the lower mean
#Apply correlation filter at 0.70,
highlyCor <- colnames(testDF[, -9])[findCorrelation(corMatMy, cutoff = 0.7, verbose = TRUE)]
## Compare row 7 and column 8 with corr 0.99
## Means: 0.381 vs 0.193 so flagging column 7
## Compare row 8 and column 6 with corr 0.962
## Means: 0.27 vs 0.149 so flagging column 8
## All correlations <= 0.7
highlyCor
## [1] "Clicks" "Spent"
Based on our analysis we can remove “Clicks” and “Spent” features and keep everything else,
FB <- dag(~Int:Age+Int:Gender+Clicks:Int+Clicks:Impres+Clicks:Fb_camp+Conv_rate:Clicks+Conv_rate:Total_conv)
plot( FB )
addmargins(prop.table(table(ds_916$age)))
##
## 30-34 35-39 40-44 45-49 Sum
## 0.5370370 0.2222222 0.1111111 0.1296296 1.0000000
addmargins(prop.table(table(ds_916$gender)))
##
## M F Sum
## 0.6481481 0.3518519 1.0000000
Condition probability for interest according to our graphical model is:
prop.table((ftable(ds_916$interest, ds_916$gender,ds_916$age)))
## 30-34 35-39 40-44 45-49
##
## 7 M 0.01851852 0.01851852 0.00000000 0.00000000
## F 0.00000000 0.00000000 0.00000000 0.00000000
## 10 M 0.00000000 0.00000000 0.00000000 0.00000000
## F 0.00000000 0.00000000 0.00000000 0.03703704
## 15 M 0.03703704 0.05555556 0.01851852 0.00000000
## F 0.03703704 0.00000000 0.01851852 0.00000000
## 16 M 0.07407407 0.07407407 0.07407407 0.01851852
## F 0.05555556 0.00000000 0.00000000 0.00000000
## 18 M 0.00000000 0.00000000 0.00000000 0.00000000
## F 0.01851852 0.00000000 0.00000000 0.00000000
## 19 M 0.00000000 0.00000000 0.00000000 0.00000000
## F 0.00000000 0.00000000 0.00000000 0.01851852
## 20 M 0.03703704 0.00000000 0.00000000 0.00000000
## F 0.00000000 0.01851852 0.00000000 0.00000000
## 21 M 0.00000000 0.00000000 0.00000000 0.01851852
## F 0.00000000 0.00000000 0.00000000 0.00000000
## 24 M 0.00000000 0.00000000 0.00000000 0.01851852
## F 0.00000000 0.00000000 0.00000000 0.00000000
## 25 M 0.00000000 0.00000000 0.00000000 0.00000000
## F 0.00000000 0.01851852 0.00000000 0.00000000
## 27 M 0.01851852 0.00000000 0.00000000 0.00000000
## F 0.01851852 0.01851852 0.00000000 0.00000000
## 28 M 0.07407407 0.00000000 0.00000000 0.00000000
## F 0.00000000 0.00000000 0.00000000 0.00000000
## 29 M 0.01851852 0.01851852 0.00000000 0.00000000
## F 0.01851852 0.00000000 0.00000000 0.00000000
## 30 M 0.01851852 0.00000000 0.00000000 0.00000000
## F 0.00000000 0.00000000 0.00000000 0.00000000
## 31 M 0.03703704 0.00000000 0.00000000 0.00000000
## F 0.00000000 0.00000000 0.00000000 0.00000000
## 32 M 0.00000000 0.00000000 0.00000000 0.00000000
## F 0.01851852 0.00000000 0.00000000 0.00000000
## 63 M 0.00000000 0.00000000 0.00000000 0.00000000
## F 0.01851852 0.00000000 0.00000000 0.01851852
## 65 M 0.00000000 0.00000000 0.00000000 0.00000000
## F 0.01851852 0.00000000 0.00000000 0.00000000