################################################################################
##      Analyzing Facebook ads campaigns using machine learning techniques
##        by Dr. Jimmy (Zhenning) Xu, 
##        follow me on Twitter https://twitter.com/MKTJimmyxu
################################################################################

Introduction

The goals of the analysis is to identify which factors have the direct impact on the ad campaign, compare how the various ad campaigns are performing with respect to different companies, quantify how conversion rate varies with age, gender or interest, identify segments with high and low cost per acquisition and how segment the audience based on click through rates conversion rates, and predict conversion rates for each possible age/gender/interest segment.

setwd("C:/Users/xzhenning/Documents/R/fall 2018/Markdown")
library(tidyverse)
## -- Attaching packages -------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#install.packages("heatmaply")
library(heatmaply)
## Warning: package 'heatmaply' was built under R version 3.5.3
## Loading required package: plotly
## Warning: package 'plotly' was built under R version 3.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## 
## ======================
## Welcome to heatmaply version 0.15.2
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
library(tidyverse)

#import data
data <- read_csv("conversion_data.csv")
## Parsed with column specification:
## cols(
##   ad_id = col_double(),
##   xyz_campaign_id = col_double(),
##   fb_campaign_id = col_double(),
##   age = col_character(),
##   gender = col_character(),
##   interest = col_double(),
##   Impressions = col_double(),
##   Clicks = col_double(),
##   Spent = col_double(),
##   Total_Conversion = col_double(),
##   Approved_Conversion = col_double()
## )
# look for unique values in 'age' column
unique(data$age)
## [1] "30-34" "35-39" "40-44" "45-49"
# create copy of data for editing
dataTf <- data

head(dataTf)
## # A tibble: 6 x 11
##    ad_id xyz_campaign_id fb_campaign_id age   gender interest Impressions
##    <dbl>           <dbl>          <dbl> <chr> <chr>     <dbl>       <dbl>
## 1 708746             916         103916 30-34 M            15        7350
## 2 708749             916         103917 30-34 M            16       17861
## 3 708771             916         103920 30-34 M            20         693
## 4 708815             916         103928 30-34 M            28        4259
## 5 708818             916         103928 30-34 M            28        4133
## 6 708820             916         103929 30-34 M            29        1915
## # ... with 4 more variables: Clicks <dbl>, Spent <dbl>,
## #   Total_Conversion <dbl>, Approved_Conversion <dbl>

Data cleanup and exploratory analysis

Evaluate the Structure of the data convert variable types Handling the missing values Re-arranging columns

Identifying column types and data type conversion

# replace character string age ranges with number
dataTf$age[dataTf$age == '30-34'] <- 32
dataTf$age[dataTf$age == '35-39'] <- 37
dataTf$age[dataTf$age == '40-44'] <- 42
dataTf$age[dataTf$age == '45-49'] <- 47

# convert variable to integer
dataTf$age <- as.integer(dataTf$age)

# let's just check that age variable now
unique(dataTf$age)
## [1] 32 37 42 47
str(dataTf$age)
##  int [1:1143] 32 32 32 32 32 32 32 32 32 32 ...
# convert gender variable to integer
dataTf$gender[dataTf$gender == 'M'] <- 0
dataTf$gender[dataTf$gender == 'F'] <- 1
dataTf$gender <- as.integer(dataTf$gender)

# abbreviate some variable names
dataTf <- dataTf %>%
  rename(xyzCampId = xyz_campaign_id, fbCampId = fb_campaign_id, impr = Impressions,
         conv = Total_Conversion, appConv = Approved_Conversion)

Exploratory data Analysis - heatmap analysis

Using hierarchically clustered heatmaps, we will be able to see which variables are grouped together.

# create a heatmap
dataMatNorm <- as.matrix(normalize(dataTf, method = "standardize"))
heatmap(dataMatNorm)

Evaluate variables and KPIs

While we have the main ‘building blocks’ of our KPIs in the original dataset, there are some standard metrics missing, so let’s take the opportunity to add them here.

# use error = true option to let Markdown skip an error
#https://stackoverflow.com/questions/34280043/how-to-skip-error-checking-at-rmarkdown-compiling
#install.packages("DataExplorer")
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 3.5.3
data1178 <- data %>%
  rename(xyzCampId = xyz_campaign_id, fbCampId = fb_campaign_id, impr = Impressions,
         conv = Total_Conversion, appConv = Approved_Conversion) %>%
  filter(xyzCampId == 1178)
glimpse(data1178)
## Observations: 625
## Variables: 11
## $ ad_id     <dbl> 1121091, 1121092, 1121094, 1121095, 1121096, 1121097...
## $ xyzCampId <dbl> 1178, 1178, 1178, 1178, 1178, 1178, 1178, 1178, 1178...
## $ fbCampId  <dbl> 144531, 144531, 144531, 144531, 144531, 144532, 1445...
## $ age       <chr> "30-34", "30-34", "30-34", "30-34", "30-34", "30-34"...
## $ gender    <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M...
## $ interest  <dbl> 10, 10, 10, 10, 10, 15, 15, 15, 15, 15, 16, 16, 16, ...
## $ impr      <dbl> 1194718, 637648, 24362, 459690, 750060, 30068, 12675...
## $ Clicks    <dbl> 141, 67, 0, 50, 86, 1, 123, 340, 1, 30, 202, 9, 1, 9...
## $ Spent     <dbl> 254.05, 122.40, 0.00, 86.33, 161.91, 1.82, 236.77, 6...
## $ conv      <dbl> 28, 13, 1, 5, 11, 1, 24, 60, 2, 7, 40, 5, 2, 26, 6, ...
## $ appConv   <dbl> 14, 5, 1, 2, 2, 0, 10, 17, 1, 3, 21, 2, 0, 14, 2, 1,...
data1178 <- data1178 %>%
  mutate(totConv = conv + appConv,
         conVal = conv * 5,
         appConVal = appConv * 100) %>%
  mutate(totConVal = conVal + appConVal) %>%
  mutate(costPerCon = round(Spent / totConv, 2),
         ROAS = round(totConVal / Spent, 2))
data1178 <- data1178 %>%
  mutate(CPM = round((Spent / impr) * 1000, 2))
# take a look at our new variables
head(data1178)
## # A tibble: 6 x 18
##    ad_id xyzCampId fbCampId age   gender interest   impr Clicks  Spent
##    <dbl>     <dbl>    <dbl> <chr> <chr>     <dbl>  <dbl>  <dbl>  <dbl>
## 1 1.12e6      1178   144531 30-34 M            10 1.19e6    141 254.  
## 2 1.12e6      1178   144531 30-34 M            10 6.38e5     67 122.  
## 3 1.12e6      1178   144531 30-34 M            10 2.44e4      0   0   
## 4 1.12e6      1178   144531 30-34 M            10 4.60e5     50  86.3 
## 5 1.12e6      1178   144531 30-34 M            10 7.50e5     86 162.  
## 6 1.12e6      1178   144532 30-34 M            15 3.01e4      1   1.82
## # ... with 9 more variables: conv <dbl>, appConv <dbl>, totConv <dbl>,
## #   conVal <dbl>, appConVal <dbl>, totConVal <dbl>, costPerCon <dbl>,
## #   ROAS <dbl>, CPM <dbl>
options(repr.plot.width=8, repr.plot.height=4)
plot_histogram(data1178)
## Warning: Computation failed in `stat_bin()`:
## `binwidth` must be positive

Analysis by gender & age

options(repr.plot.width=8, repr.plot.height=3)
data1178 %>%
  filter(interest == 101 | interest == 15 | interest == 21) %>%
  ggplot(aes(x = as.factor(interest), y = ROAS, fill = gender)) + geom_boxplot() + scale_y_log10() +
  labs(x = 'Interest ID', y = 'ROAS')
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

options(repr.plot.width=8, repr.plot.height=4)
data1178 %>%
  filter(interest == 21 | interest == 15 & gender == 'M') %>%
  group_by(age, interest) %>% 
  ggplot(aes(x = as.factor(age), y = ROAS, fill = as.factor(interest))) + geom_boxplot() + scale_y_log10() +
  labs(x = 'Age group', y = 'ROAS') + scale_fill_discrete(name="Interest\nID")
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

Understanding conversition tracking and ROAS

For an e-commerce site, we could implement conversion tracking to tie-up the value of specific transactions to particular campaigns, which would allow us to assign the actual amount of revenue generated by each campaign / ad creative.

Knowing the conversion value would allow us to calculate other KPIs such as the Return on Advertising Spend (ROAS). While advertising campaigns have other benefits (such as increased brand awareness and future purchases based on customer lifetime value) that may factor into the over return on investment (ROI), ROAS can quickly tell us how a campaign is paying for itself. It is simply the revenue as a percentage of the advertising spend. If a campaign costs $100 and leads to $400 sales, the ROAS is 400% (or 4).

options(repr.plot.width=8, repr.plot.height=3)
data1178 %>%
  ggplot(aes(as.factor(interest), ROAS)) + geom_boxplot() + scale_y_log10() +
  labs(x = "Interest Identifier", y = "ROAS")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 16 rows containing non-finite values (stat_boxplot).

data1178<- data1178 %>% 
  mutate(CTR = ifelse(data1178$totConv != 0 , 
                            round(data1178$appConv/data1178$totConv*100) , 0))

Relabel conversion rate as a factor

We will implement a decision tree analysis for the campaign ‘1178.’

data1178<- data1178 %>% 
  mutate(CTR = ifelse(data1178$totConv != 0 , 
                            round(data1178$appConv/data1178$totConv*100) , 0)) 

#Conversion rate as a factor

data1178$CTR <-cut(data1178$CTR, seq(0,100,10), right=TRUE, labels=c("0-10%",
                                                                     "10-20%","20-30%","30-40%","40-50%","50-60%","60-70%","70-80%","80-90%","90-100%"))
data1178$CTR[is.na(data1178$CTR)] <- "0-10%"

Using decision tree analysis to predict campaign success (response rate >10%)

A decision tree structure consists of a root node, test nodes, and decision nodes (leaves). The root node is the main node in a decision tree. Our goal is to find which features (variables) are of importance to get “Success” as an outcome.

library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## Warning: package 'caret' was built under R version 3.3.3

head(data1178)
## # A tibble: 6 x 19
##    ad_id xyzCampId fbCampId age   gender interest   impr Clicks  Spent
##    <dbl>     <dbl>    <dbl> <chr> <chr>     <dbl>  <dbl>  <dbl>  <dbl>
## 1 1.12e6      1178   144531 30-34 M            10 1.19e6    141 254.  
## 2 1.12e6      1178   144531 30-34 M            10 6.38e5     67 122.  
## 3 1.12e6      1178   144531 30-34 M            10 2.44e4      0   0   
## 4 1.12e6      1178   144531 30-34 M            10 4.60e5     50  86.3 
## 5 1.12e6      1178   144531 30-34 M            10 7.50e5     86 162.  
## 6 1.12e6      1178   144532 30-34 M            15 3.01e4      1   1.82
## # ... with 10 more variables: conv <dbl>, appConv <dbl>, totConv <dbl>,
## #   conVal <dbl>, appConVal <dbl>, totConVal <dbl>, costPerCon <dbl>,
## #   ROAS <dbl>, CPM <dbl>, CTR <fct>
## Loading required package: lattice 
data1178<- data1178 %>% 
  mutate(response = ifelse(data1178$CTR == "0-10%" , 0,1))

#since our model is predictive, we can remove all "id" columns from the dataser

### remove xyz_campagn_id, fb_campaign_id 
data1178_predict<-data1178 %>% select(age, gender,interest, impr,Clicks,Spent,response)
prop.table(table(data1178_predict$response))
## 
##      0      1 
## 0.3888 0.6112
#### split to training and testing
set.seed(1234)
data1178_predict_Dummy <- dummyVars("~.",data=data1178_predict, fullRank=T)
data1178_predict_final <- as.data.frame(predict(data1178_predict_Dummy,data1178_predict))
print(names(data1178_predict_final))
## [1] "age35-39" "age40-44" "age45-49" "genderM"  "interest" "impr"    
## [7] "Clicks"   "Spent"    "response"
#install.packages("rpart.plot")
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.3
## Loading required package: rpart
data1178_predict_final$response <- ifelse(data1178_predict_final$response==1,'Success','Failure')

outcomeName <- 'response'
predictorsNames <- names(data1178_predict_final)[names(data1178_predict_final) != outcomeName]

set.seed(1234)
splitIndex <- createDataPartition(data1178_predict_final[,outcomeName], p = .75, list = FALSE, times = 1)
trainDF <- data1178_predict_final[ splitIndex,]
testDF  <- data1178_predict_final[-splitIndex,]
#As the first model we can use decision tree (rpart library)
set.seed(42)

library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.3.3
fit <- rpart(response ~ .,
             data = trainDF,
             method = "class",
             control = rpart.control(xval = 12, 
                                     minbucket = 10, 
                                     cp = 0), 
             parms = list(split = "information"))

rpart.plot(fit,tweak=1.5)

Reference

The data for this case study is originally from a Kaggle competition. Enjoy learning and have fun!