Dataset

The dataset comes from an article on Medium. It tracks customer journeys from July 1, 2018, to July 31, 2018, recording in detail when each ad was shown, which customer it reached, and whether the customer converted.

For example, for the first cookie, its journey includes four states: Instagram > Online Display > Online Display > Online Display. Unfortunately, the user did not convert during the observed period, and thus both conversion and conversion value are 0.

# Read the raw attribution log into a data.table.
dt <- fread("attribution data.csv")

# Show the first rows of the raw log as a striped, centred table.
dt %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
cookie time interaction conversion conversion_value channel
00000FkCnDfDDf0iC97iC703B 2018-07-03T13:02:11Z impression 0 0 Instagram
00000FkCnDfDDf0iC97iC703B 2018-07-17T19:15:07Z impression 0 0 Online Display
00000FkCnDfDDf0iC97iC703B 2018-07-24T15:51:46Z impression 0 0 Online Display
00000FkCnDfDDf0iC97iC703B 2018-07-29T07:44:51Z impression 0 0 Online Display
0000nACkD9nFkBBDECD3ki00E 2018-07-03T09:44:57Z impression 0 0 Paid Search
0000nACkD9nFkBBDECD3ki00E 2018-07-03T23:36:49Z impression 0 0 Paid Search

Preprocessing

To apply the Markov chain model, we have to transform the data and create a path variable, as in the previous example. Here I use data.table because it processes large tables much faster than a plain data.frame.

library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)

# Parse the ISO-8601 timestamps, then physically sort the table so that each
# cookie's touchpoints are in chronological order. Writing
# `dt[order(cookie, time), time := ...]` would only use the ordering to select
# rows for the assignment; it does not reorder `dt`, so the touchpoint index
# below would follow the file's row order rather than time.
dt[, time := ymd_hms(time)]
setorder(dt, cookie, time)

# Number each cookie's touchpoints 1..N in chronological order.
dt[, id := seq_len(.N), by = cookie]

# One row per cookie, one column per touchpoint position (NA past the end of
# shorter journeys).
dt_wide <- dcast(data = dt, formula = cookie ~ id, value.var = "channel")

# Build the "A > B > C" path directly from the long table and attach it by
# cookie. Pasting the padded wide columns and cutting at " > NA" would
# truncate a journey at any channel whose name starts with "NA".
dt_wide[
  dt[, .(path = paste(channel, collapse = " > ")), by = cookie],
  path := i.path,
  on = "cookie"
]

# Per-cookie conversion count and conversion value.
conversion <- dt[
  ,
  .(
    conversion = sum(conversion),
    conversion_value = sum(conversion_value)
  ),
  by = cookie
]

setkey(conversion, cookie)
setkey(dt_wide, cookie)

# Keyed merge: adds the conversion totals to each cookie's path row.
dt_wide <- merge(dt_wide, conversion)

# Preview the modelling inputs: journey path and conversion totals per cookie.
dt_wide[, .(path, conversion, conversion_value)] %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
path conversion conversion_value
Instagram > Online Display > Online Display > Online Display 0 0
Paid Search > Paid Search > Paid Search > Paid Search > Paid Search > Paid Search 0 0
Paid Search > Paid Search > Paid Search > Paid Search > Paid Search 0 0
Instagram 0 0
Paid Search 0 0
Instagram > Facebook > Facebook > Instagram 0 0

Markov Chain Modeling

R has a great package designed for channel attribution, called ChannelAttribution. It can build models based on both heuristic and Markov approaches. To evaluate the Markov chain results, I also run the heuristic models and use them as baselines.

# Fit the attribution models on the per-cookie paths.

# Heuristic baselines (first-touch, last-touch and linear-touch).
H <- heuristic_models(
  Data = dt_wide,
  var_path = "path",
  var_conv = "conversion",
  var_value = "conversion_value",
  sep = ">"
)

# First-order Markov chain model. out_more = TRUE is requested because the
# transition matrix and removal effects are reported further down.
M <- markov_model(
  Data = dt_wide,
  var_path = "path",
  var_conv = "conversion",
  var_value = "conversion_value",
  sep = ">",
  order = 1,
  out_more = TRUE
)
## Warning in markov_model(Data = dt_wide, var_path = "path", var_conv =
## "conversion", : This function is deprecated and it could be removed from
## future versions. Use markov_model_mp instead. Disable this warning setting
## vebose=FALSE.
# Markov attribution: conversions and conversion value credited per channel.
M$result %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
channel_name total_conversions total_conversion_value
Instagram 3490.969 21831.79
Online Display 2028.840 12634.91
Paid Search 3997.732 24937.06
Facebook 5270.212 32948.91
Online Video 2851.247 17878.33
# Heuristic baselines per channel, for comparison with the Markov result.
H %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
channel_name first_touch_conversions first_touch_value last_touch_conversions last_touch_value linear_touch_conversions linear_touch_value
Instagram 2329 14579.5 2244 14039.5 2265.179 14171.72
Online Display 2160 13419.0 2139 13298.5 2124.315 13205.19
Paid Search 4757 29724.0 4547 28331.5 4681.199 29194.46
Facebook 5177 32283.0 5301 33143.5 5218.903 32614.30
Online Video 3216 20225.5 3408 21418.0 3349.403 21045.33
# Estimated transition probabilities between states of the Markov chain.
M$transition_matrix %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
channel_from channel_to transition_probability
(start) Paid Search 0.2696865
(start) Facebook 0.2934974
(start) Online Video 0.1823233
(start) Instagram 0.1320370
(start) Online Display 0.1224559
Instagram Online Video 0.0342033
Instagram Facebook 0.5980769
Instagram (conversion) 0.3082418
Instagram Paid Search 0.0379121
Instagram Online Display 0.0215659
Online Display (conversion) 0.6551302
Online Display Instagram 0.0462481
Online Display Paid Search 0.1601838
Online Display Online Video 0.0398162
Online Display Facebook 0.0986217
Paid Search (conversion) 0.6954726
Paid Search Online Video 0.0620985
Paid Search Online Display 0.0819823
Paid Search Instagram 0.0455797
Paid Search Facebook 0.1148669
Facebook (conversion) 0.4766229
Facebook Instagram 0.3845531
Facebook Online Video 0.0524186
Facebook Online Display 0.0272433
Facebook Paid Search 0.0591620
Online Video Instagram 0.0490838
Online Video (conversion) 0.7434555
Online Video Paid Search 0.0706806
Online Video Online Display 0.0237784
Online Video Facebook 0.1130017
# NOTE(review): this chunk renders the same M$transition_matrix table a second
# time; it looks redundant and is probably safe to drop.
M$transition_matrix %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
channel_from channel_to transition_probability
(start) Paid Search 0.2696865
(start) Facebook 0.2934974
(start) Online Video 0.1823233
(start) Instagram 0.1320370
(start) Online Display 0.1224559
Instagram Online Video 0.0342033
Instagram Facebook 0.5980769
Instagram (conversion) 0.3082418
Instagram Paid Search 0.0379121
Instagram Online Display 0.0215659
Online Display (conversion) 0.6551302
Online Display Instagram 0.0462481
Online Display Paid Search 0.1601838
Online Display Online Video 0.0398162
Online Display Facebook 0.0986217
Paid Search (conversion) 0.6954726
Paid Search Online Video 0.0620985
Paid Search Online Display 0.0819823
Paid Search Instagram 0.0455797
Paid Search Facebook 0.1148669
Facebook (conversion) 0.4766229
Facebook Instagram 0.3845531
Facebook Online Video 0.0524186
Facebook Online Display 0.0272433
Facebook Paid Search 0.0591620
Online Video Instagram 0.0490838
Online Video (conversion) 0.7434555
Online Video Paid Search 0.0706806
Online Video Online Display 0.0237784
Online Video Facebook 0.1130017
# Removal effects per channel, for conversions and for conversion value.
M$removal_effects %>%
  kable() %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "center"
  )
channel_name removal_effects_conversion removal_effects_conversion_value
Instagram 0.310793 0.3110501
Online Display 0.180623 0.1800167
Paid Search 0.355909 0.3552926
Facebook 0.469195 0.4694419
Online Video 0.253840 0.2547228