Package Loading

rm(list=ls())
installIfAbsentAndLoad <- function(neededVector) {
for(thispackage in neededVector) {
if( ! require(thispackage, character.only = T) )
{ install.packages(thispackage)}
require(thispackage, character.only = T)}}
needed <- c('tidyverse', 'reshape2', 'ggthemes', 'ggrepel', 'RColorBrewer', 'ChannelAttribution', 'markovchain', 'visNetwork', 'expm', 'stringr', 'purrrlyr', 'GameTheoryAllocation')

installIfAbsentAndLoad(needed)

Data Inspect

campaign_data <- read.csv('campaign_data.csv')
dim(campaign_data)
## [1] 650000      6
str(campaign_data)
## 'data.frame':    650000 obs. of  6 variables:
##  $ cookie          : Factor w/ 240108 levels "00000FkCnDfDDf0iC97iC703B",..: 48777 104167 169977 169977 48083 48083 4671 4671 4671 4671 ...
##  $ time            : Factor w/ 529083 levels "2018-07-01T13:13:16Z",..: 376244 216394 106646 106650 495198 495231 244905 258829 258837 512054 ...
##  $ interaction     : Factor w/ 2 levels "conversion","impression": 2 2 2 2 2 2 2 2 2 2 ...
##  $ conversion      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ conversion_value: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ channel         : Factor w/ 5 levels "Facebook","Instagram",..: 2 1 1 1 5 5 2 2 1 2 ...
summary(campaign_data)
##                        cookie                         time       
##  FBAfi0nkAB7oAiiACfBC7non0:   773   2018-07-14T20:50:09Z:    12  
##  F3h7D7kE7C9Dhk3C9oBiEDC0o:   524   2018-07-01T15:15:17Z:    11  
##  FCDiE9Cii37EFA0D9fFioooiD:   325   2018-07-01T15:18:17Z:    11  
##  9BnAAFfn3Ah3Akof3Ci0DDB7B:   302   2018-07-01T15:14:46Z:     9  
##  k0nk030FAECCDoAkiC7koEk3f:   245   2018-07-01T15:15:15Z:     9  
##  hkFDFB9f3Bhin93E9Cfi70DCh:   243   2018-07-01T15:16:53Z:     9  
##  (Other)                  :647588   (Other)             :649939  
##      interaction       conversion      conversion_value
##  conversion: 19613   Min.   :0.00000   Min.   :0.0000  
##  impression:630387   1st Qu.:0.00000   1st Qu.:0.0000  
##                      Median :0.00000   Median :0.0000  
##                      Mean   :0.03017   Mean   :0.1885  
##                      3rd Qu.:0.00000   3rd Qu.:0.0000  
##                      Max.   :1.00000   Max.   :8.5000  
##                                                        
##            channel      
##  Facebook      :199312  
##  Instagram     : 85281  
##  Online Display: 74677  
##  Online Video  :131768  
##  Paid Search   :158962  
##                         
## 

Check missing values

sapply(campaign_data, function(x) sum(is.na(x)))
##           cookie             time      interaction       conversion 
##                0                0                0                0 
## conversion_value          channel 
##                0                0

Check duplicates

sum(duplicated(campaign_data))
## [1] 5006

Data Preprocessing

Remove duplicates

campaign_data <- unique(campaign_data)
dim(campaign_data)
## [1] 644994      6

Transform data

# transform time
campaign_data$time <- lapply(strsplit(as.character(campaign_data$time),'Z|T'), paste, collapse=' ')
campaign_data$time <- strptime(campaign_data$time, "%Y-%m-%d %H:%M:%S")
campaign_data$time <- as.POSIXct(campaign_data$time, "%Y-%m-%d %H:%M:%S", tz=Sys.timezone())

Split paths

df_paths <- campaign_data %>%
  group_by(cookie) %>%
  arrange(time) %>%
  mutate(path_no = ifelse(is.na(lag(cumsum(conversion))), 0, lag(cumsum(conversion))) + 1) %>%
  ungroup()

Modeling

First Touch

# Get all the converted path
conversion_path = df_paths %>%
  group_by(cookie, path_no) %>%
  mutate(new_conversion=sum(conversion)) %>%
  filter(new_conversion==1) %>%
  ungroup()
first_touch = conversion_path %>%
  group_by(cookie, path_no) %>%
  top_n(-1, time) %>%
  mutate(
    order_participation = 1/n()
  ) %>%
  ungroup() %>%
  group_by(channel) %>%
  summarize(total_conversions = sum(order_participation)) %>% collect()

# check
# sum(first_touch$total_conversions)

Last touch

last_touch = conversion_path %>%
  group_by(cookie, path_no) %>%
  top_n(1, time) %>%
  mutate(
    order_participation = 1/n()
  ) %>%
  ungroup() %>%
  group_by(channel) %>%
  summarize(total_conversions = sum(order_participation)) %>% collect()

Even touch

even_touch = conversion_path %>%
  group_by(cookie, path_no) %>%
  mutate(
    order_participation = 1/n()
  ) %>%
  ungroup() %>%
  group_by(channel) %>%
  summarize(total_conversions = sum(order_participation)) %>% collect()

Time Decay

half_life <- 7*24*60*60 # 7 days times 24 hours times 60 minutes time 60 seconds

timeDecay_touch = conversion_path %>%
  group_by(cookie, path_no) %>%
  mutate(
    # channel credit decays by half every seven days
    order_participation = 0.5^(as.double(time - min(time), unit='secs')*1/half_life),
    order_participation = order_participation/sum(order_participation)
  ) %>%
  ungroup() %>%
  group_by(channel) %>%
  summarize(total_conversions = sum(order_participation)) %>% collect()
model_compare <- cbind(first_touch, last_touch$total_conversions, even_touch$total_conversions, timeDecay_touch$total_conversions)
colnames(model_compare) = c('channel', 'first_touch', 'last_touch', 'even_touch', 'time_decay')
model_compare

Sharpley Model

df_seq <- df_paths %>%
  group_by(cookie, path_no) %>%
  summarize(
    fb_touches = max(ifelse(channel == 'Facebook', 1, 0)),
    ins_touches = max(ifelse(channel == 'Instagram', 1, 0)),
    vid_touches = max(ifelse(channel == 'Online Video', 1, 0)),
    dis_touches = max(ifelse(channel == 'Online Display', 1, 0)),
    sear_touches = max(ifelse(channel == 'Paid Search', 1, 0)),
    conversions = sum(conversion)
  ) %>% ungroup()
conv_rates = df_seq %>%
 group_by(fb_touches,
   ins_touches,
   vid_touches,
   dis_touches,
   sear_touches) %>%
 summarize(
   conversions = sum(conversions),
   total_sequences = n()
 ) %>% collect()
number_of_channels = 5

# The coalitions function is a handy function from the GameTheoryALlocation
# library that creates a binary matrix to which can fit characteristic function 
touch_combos = as.data.frame(coalitions(number_of_channels)$Binary)
names(touch_combos) = c("Facebook","Instagram","Online Display", "Online Video", "Paid Search")

# join my previous summary results with the binary matrix the GameTheoryAllocation library built for me.
touch_combo_conv_rate = left_join(touch_combos, conv_rates, 
 by = c(
 "Facebook"="fb_touches",
 "Instagram" = "ins_touches",
 "Online Display" = "dis_touches",
 "Online Video" = "vid_touches",
 "Paid Search" = "sear_touches"
 )
)

# fill in any NAs with 0
touch_combo_conv_rate = touch_combo_conv_rate %>%
 mutate_all(funs(ifelse(is.na(.),0,.))) %>%
 mutate(
 conv_rate = ifelse(total_sequences > 0, conversions/total_sequences, 0)
 )
# Building Shapley Values for each channel combination

shap_vals = as.data.frame(coalitions(number_of_channels)$Binary)
names(shap_vals) = c("Facebook","Instagram","Online Display", "Online Video",
  "Paid Search")
coalition_mat = shap_vals
shap_vals[2^number_of_channels,] = Shapley_value(touch_combo_conv_rate$conv_rate, game="profit")
## [1] "Shapley Value"
for(i in 2:(2^number_of_channels-1)){
  if(sum(coalition_mat[i,]) == 1){
    shap_vals[i,which(shap_vals[i,]==1)] = touch_combo_conv_rate[i,"conv_rate"]
  }else if(sum(coalition_mat[i,]) > 1){
    if(sum(coalition_mat[i,]) < number_of_channels){
      channels_of_interest = which(coalition_mat[i,] == 1)
      char_func = data.frame(rates = touch_combo_conv_rate[1,"conv_rate"])
      for(j in 2:i){
        if(sum(coalition_mat[j,channels_of_interest])>0 & 
            sum(coalition_mat[j,-channels_of_interest])==0)
          char_func = rbind(char_func,touch_combo_conv_rate[j,"conv_rate"])
      }
      shap_vals[i,channels_of_interest] = 
        Shapley_value(char_func$rates, game="profit")
    }
  }
}
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
## [1] "Shapley Value"
# Apply Shapley Values as attribution weighting
order_distribution = shap_vals * touch_combo_conv_rate$total_sequences
shapley_value = t(t(round(colSums(order_distribution))))
shapley_value = data.frame(mid_campaign = row.names(shapley_value), 
  total_conversions = as.numeric(shapley_value))
model_compare$shapley_value = shapley_value$total_conversions
model_compare

Markov Chain

# attribution model for splitted multi and unique channel paths
df_all_paths_compl <- df_paths %>%
  group_by(cookie, path_no) %>%
  summarise(path = paste(channel, collapse = ' > '),
            conversion = sum(conversion)) %>%
  ungroup() %>%
  mutate(null_conversion = ifelse(conversion == 1, 0, 1))
mod_attrib_complete <- markov_model(
  df_all_paths_compl,
  var_path = 'path',
  var_conv = 'conversion',
  var_null = 'null_conversion',
  out_more = TRUE
)
markov_chain <- mod_attrib_complete$result
colnames(markov_chain) <- c('channel', 'markov_chain')
model_compare = merge(model_compare, markov_chain, by = 'channel')
mod_attrib_complete$removal_effects

Markov Chain Result Visualization

g_channel_performance <- ggplot(model_compare, aes(x = channel, y = markov_chain, fill = channel)) + 
  geom_bar(stat = "identity", width = 0.6) +
  ylim(0, 7000) +
  scale_fill_manual(values = c("#CE2D4F",
                               "#A14DA0",
                               "#9D79BC",
                               "#7F96FF",
                               "#A9CEF4")) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 0.6, face = "bold")) +
  theme(panel.grid.major.x = element_blank()) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(aes(label = round(markov_chain, 0)), fontface = "bold", size = 4, vjust = -1) + 
  labs(x = "", y = "Conversions") +
  ggtitle("Channel Performance") +
  guides(fill=FALSE)

g_channel_performance

Model Comparison

df_compare = melt(model_compare, id='channel')

g_model_comparison <- ggplot(df_compare, aes(x = channel, y = value, fill = variable)) + 
  geom_bar(stat = "identity", width = 0.6, position = position_dodge(width = 0.7)) +
  scale_fill_manual(labels = c("First Touch", "Last Touch", "Linear", 'Time Decay', 'Sharpley', 'Markov'), 
                    values = c("#e65368",
                               "#4e74ff",
                               "#87BFFF",
                               "#3BCEAC",
                               "#CE2D4F",
                               "#A14DA0")) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 0.6, face = "bold")) +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "", y = "Budget $") +
  ggtitle("Markov vs Heuristics") +
  theme(plot.title = element_text(hjust = 0.5))

g_model_comparison

Economic Analysis

campaign_budget_daily = read.csv("budget_sample_daily.csv")
campaign_budget_total = campaign_budget_daily %>%
  group_by(channel) %>%
  summarise(total_cost = round(sum(cost), 1))
campaign_attribution = merge(model_compare, campaign_budget_total, 
                             by.x = "channel")

Calculate ROAS and CPA

campaign_attribution = 
  campaign_attribution %>%
  mutate(chanel_weight = (shapley_value / sum(shapley_value)),
         cost_weight = (total_cost / sum(total_cost)),
         roas = chanel_weight / cost_weight,
         optimal_budget = total_cost * roas,
         CPA = total_cost / shapley_value)

Budget Allocation

df_g2 = campaign_attribution[, c("channel", "total_cost", "optimal_budget")]
df_g2 = melt(df_g2, id = "channel")

# Create double bar chart
g_budget_allocation <- ggplot(df_g2, aes(x = channel, y = value, fill = variable)) + 
  geom_bar(stat = "identity", width = 0.6, position = position_dodge(width = 0.7)) +
  scale_fill_manual(labels = c("Current Budget", "Optimal Budget"), values = c("#FFD166", "#04A777")) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 10, angle = 30, hjust = 0.6, face = "bold")) +
  theme(panel.grid.major.x = element_blank()) +
  geom_text(aes(label = round(value, 0)), 
            fontface = "bold", size = 3.5, 
            vjust = -0.5, position = position_dodge(width = 0.75)) +
  labs(x = "", y = "Budget $") +
  ggtitle("Budget Allocation") +
  theme(plot.title = element_text(hjust = 0.5))

g_budget_allocation

Customer Journey

trans_matrix_prob <- mod_attrib_complete$transition_matrix %>%
  dmap_at(c(1, 2), as.character)
edges <-
  data.frame(
    from = trans_matrix_prob$channel_from,
    to = trans_matrix_prob$channel_to,
    label = round(trans_matrix_prob$transition_probability, 2),
    font.size = trans_matrix_prob$transition_probability * 100,
    width = trans_matrix_prob$transition_probability * 15,
    shadow = TRUE,
    arrows = "to",
    color = list(color = "#95cbee", highlight = "red")
  )

nodes <- data_frame(id = c( c(trans_matrix_prob$channel_from), c(trans_matrix_prob$channel_to) )) %>%
  distinct(id) %>%
  arrange(id) %>%
  mutate(
    label = id,
    color = ifelse(
      label %in% c('(start)', '(conversion)'),
      '#4ab04a',
      ifelse(label == '(null)', '#ce472e', '#ffd73e')
    ),
    shadow = TRUE,
    shape = "box"
  )
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
visNetwork(nodes,
           edges,
           height = "2000px",
           width = "100%",
           main = "Generic Probabilistic model's Transition Matrix") %>%
  visIgraphLayout(randomSeed = 123) %>%
  visNodes(size = 5) %>%
  visOptions(highlightNearest = TRUE)
##### modeling states and conversions #####
# transition matrix preprocessing
df_dummy <- data.frame(channel_from = c('(start)', '(conversion)', '(null)'),
                       channel_to = c('(start)', '(conversion)', '(null)'),
                       n = c(0, 0, 0),
                       tot_n = c(0, 0, 0),
                       perc = c(0, 1, 1))

trans_matrix_complete <- mod_attrib_complete$transition_matrix
trans_matrix_complete <- rbind(trans_matrix_complete, df_dummy %>%
                                 mutate(transition_probability = perc) %>%
                                 select(channel_from, channel_to, transition_probability))
trans_matrix_complete$channel_to <- factor(trans_matrix_complete$channel_to, levels = c(levels(trans_matrix_complete$channel_from)))
trans_matrix_complete <- dcast(trans_matrix_complete, channel_from ~ channel_to, value.var = 'transition_probability')
trans_matrix_complete[is.na(trans_matrix_complete)] <- 0
rownames(trans_matrix_complete) <- trans_matrix_complete$channel_from
trans_matrix_complete <- as.matrix(trans_matrix_complete[, -1])
# transition matrix heatmap for "real" data
df_plot_trans <- mod_attrib_complete$transition_matrix

cols <- c("#e7f0fa", "#c9e2f6", "#95cbee", "#0099dc", "#4ab04a", "#ffd73e", "#eec73a",
          "#e29421", "#e29421", "#f05336", "#ce472e")
t <- max(df_plot_trans$transition_probability)

ggplot(df_plot_trans, aes(y = channel_from, x = channel_to, fill = transition_probability)) +
  theme_minimal() +
  geom_tile(colour = "white", width = .9, height = .9) +
  scale_fill_gradientn(colours = cols, limits = c(0, t),
                       breaks = seq(0, t, by = t/4),
                       labels = c("0", round(t/4*1, 2), round(t/4*2, 2), round(t/4*3, 2), round(t/4*4, 2)),
                       guide = guide_colourbar(ticks = T, nbin = 50, barheight = .5, label = T, barwidth = 10)) +
  geom_text(aes(label = round(transition_probability, 2)), fontface = "bold", size = 4) +
  theme(legend.position = 'bottom',
        legend.direction = "horizontal",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = 20, face = "bold", vjust = 2, color = 'black', lineheight = 0.8),
        axis.title.x = element_text(size = 10, face = "bold"),
        axis.title.y = element_text(size = 10, face = "bold"),
        axis.text.y = element_text(size = 8, face = "bold", color = 'black'),
        axis.text.x = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 0.5, face = "plain")) +
  ggtitle("Transition matrix heatmap")

Customer Journey Duration

# computing time lapses from the first contact to conversion/last contact
df_journey <- df_paths %>%
        group_by(cookie, path_no) %>%
        summarise(path = paste(channel, collapse = ' > '),
                  first_touch_date = min(time),
                  last_touch_date = max(time),
                  tot_time_lapse = round(as.numeric(last_touch_date - first_touch_date)),
                  conversion = sum(conversion)) %>%
        ungroup()
# distribution plot
ggplot(df_journey %>% filter(conversion == 1), aes(x = tot_time_lapse)) + theme_minimal() + geom_histogram(fill = '#4e79a6', binwidth = 1)

# cumulative distribution plot
ggplot(df_journey %>% filter(conversion == 1), aes(x = tot_time_lapse)) +
        theme_minimal() +
        stat_ecdf(geom = 'step', color = '#4e79a7', size = 2, alpha = 0.7) +
        geom_hline(yintercept = 0.95, color = '#e15759', size = 1.5) +
        geom_vline(xintercept = 23, color = '#e15759', size = 1.5, linetype = 2)

### for generic probabilistic model ###
df_journey_2 <- melt(df_journey[c(50:100), ] %>% select(cookie, first_touch_date, last_touch_date, conversion),
                    id.vars = c('cookie', 'conversion'),
                    value.name = 'touch_date') %>%
        arrange(cookie)
 
ggplot(df_journey_2, aes(x = cookie, y = touch_date, color = factor(conversion), group = cookie)) +
        theme_minimal() +
        coord_flip() +
        geom_point(size = 2) +
        geom_line(size = 0.5, color = 'darkgrey') +
        theme(legend.position = 'bottom',
              panel.border = element_blank(),
              panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(),
              axis.ticks.x = element_blank(),
              axis.ticks.y = element_blank()) +
        guides(colour = guide_legend(override.aes = list(size = 5)))