rm(list=ls())
installIfAbsentAndLoad <- function(neededVector) {
for(thispackage in neededVector) {
if( ! require(thispackage, character.only = T) )
{ install.packages(thispackage)}
require(thispackage, character.only = T)}}
needed <- c('tidyverse', 'reshape2', 'ggthemes', 'ggrepel', 'RColorBrewer', 'ChannelAttribution', 'markovchain', 'visNetwork', 'expm', 'stringr', 'purrrlyr', 'GameTheoryAllocation')
installIfAbsentAndLoad(needed)
campaign_data <- read.csv('campaign_data.csv')
dim(campaign_data)
## [1] 650000 6
str(campaign_data)
## 'data.frame': 650000 obs. of 6 variables:
## $ cookie : Factor w/ 240108 levels "00000FkCnDfDDf0iC97iC703B",..: 48777 104167 169977 169977 48083 48083 4671 4671 4671 4671 ...
## $ time : Factor w/ 529083 levels "2018-07-01T13:13:16Z",..: 376244 216394 106646 106650 495198 495231 244905 258829 258837 512054 ...
## $ interaction : Factor w/ 2 levels "conversion","impression": 2 2 2 2 2 2 2 2 2 2 ...
## $ conversion : int 0 0 0 0 0 0 0 0 0 0 ...
## $ conversion_value: num 0 0 0 0 0 0 0 0 0 0 ...
## $ channel : Factor w/ 5 levels "Facebook","Instagram",..: 2 1 1 1 5 5 2 2 1 2 ...
summary(campaign_data)
## cookie time
## FBAfi0nkAB7oAiiACfBC7non0: 773 2018-07-14T20:50:09Z: 12
## F3h7D7kE7C9Dhk3C9oBiEDC0o: 524 2018-07-01T15:15:17Z: 11
## FCDiE9Cii37EFA0D9fFioooiD: 325 2018-07-01T15:18:17Z: 11
## 9BnAAFfn3Ah3Akof3Ci0DDB7B: 302 2018-07-01T15:14:46Z: 9
## k0nk030FAECCDoAkiC7koEk3f: 245 2018-07-01T15:15:15Z: 9
## hkFDFB9f3Bhin93E9Cfi70DCh: 243 2018-07-01T15:16:53Z: 9
## (Other) :647588 (Other) :649939
## interaction conversion conversion_value
## conversion: 19613 Min. :0.00000 Min. :0.0000
## impression:630387 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.03017 Mean :0.1885
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :8.5000
##
## channel
## Facebook :199312
## Instagram : 85281
## Online Display: 74677
## Online Video :131768
## Paid Search :158962
##
##
Check missing values
sapply(campaign_data, function(x) sum(is.na(x)))
## cookie time interaction conversion
## 0 0 0 0
## conversion_value channel
## 0 0
Check duplicates
sum(duplicated(campaign_data))
## [1] 5006
Remove duplicates
campaign_data <- unique(campaign_data)
dim(campaign_data)
## [1] 644994 6
Transform data
# transform time
campaign_data$time <- lapply(strsplit(as.character(campaign_data$time),'Z|T'), paste, collapse=' ')
campaign_data$time <- strptime(campaign_data$time, "%Y-%m-%d %H:%M:%S")
campaign_data$time <- as.POSIXct(campaign_data$time, "%Y-%m-%d %H:%M:%S", tz=Sys.timezone())
Split paths
df_paths <- campaign_data %>%
group_by(cookie) %>%
arrange(time) %>%
mutate(path_no = ifelse(is.na(lag(cumsum(conversion))), 0, lag(cumsum(conversion))) + 1) %>%
ungroup()
# Get all the converted path
conversion_path = df_paths %>%
group_by(cookie, path_no) %>%
mutate(new_conversion=sum(conversion)) %>%
filter(new_conversion==1) %>%
ungroup()
first_touch = conversion_path %>%
group_by(cookie, path_no) %>%
top_n(-1, time) %>%
mutate(
order_participation = 1/n()
) %>%
ungroup() %>%
group_by(channel) %>%
summarize(total_conversions = sum(order_participation)) %>% collect()
# check
# sum(first_touch$total_conversions)
last_touch = conversion_path %>%
group_by(cookie, path_no) %>%
top_n(1, time) %>%
mutate(
order_participation = 1/n()
) %>%
ungroup() %>%
group_by(channel) %>%
summarize(total_conversions = sum(order_participation)) %>% collect()
even_touch = conversion_path %>%
group_by(cookie, path_no) %>%
mutate(
order_participation = 1/n()
) %>%
ungroup() %>%
group_by(channel) %>%
summarize(total_conversions = sum(order_participation)) %>% collect()
half_life <- 7*24*60*60 # 7 days times 24 hours times 60 minutes time 60 seconds
timeDecay_touch = conversion_path %>%
group_by(cookie, path_no) %>%
mutate(
# channel credit decays by half every seven days
order_participation = 0.5^(as.double(time - min(time), unit='secs')*1/half_life),
order_participation = order_participation/sum(order_participation)
) %>%
ungroup() %>%
group_by(channel) %>%
summarize(total_conversions = sum(order_participation)) %>% collect()
model_compare <- cbind(first_touch, last_touch$total_conversions, even_touch$total_conversions, timeDecay_touch$total_conversions)
colnames(model_compare) = c('channel', 'first_touch', 'last_touch', 'even_touch', 'time_decay')
model_compare
# attribution model for splitted multi and unique channel paths
df_all_paths_compl <- df_paths %>%
group_by(cookie, path_no) %>%
summarise(path = paste(channel, collapse = ' > '),
conversion = sum(conversion)) %>%
ungroup() %>%
mutate(null_conversion = ifelse(conversion == 1, 0, 1))
mod_attrib_complete <- markov_model(
df_all_paths_compl,
var_path = 'path',
var_conv = 'conversion',
var_null = 'null_conversion',
out_more = TRUE
)
markov_chain <- mod_attrib_complete$result
colnames(markov_chain) <- c('channel', 'markov_chain')
model_compare = merge(model_compare, markov_chain, by = 'channel')
mod_attrib_complete$removal_effects
g_channel_performance <- ggplot(model_compare, aes(x = channel, y = markov_chain, fill = channel)) +
geom_bar(stat = "identity", width = 0.6) +
ylim(0, 7000) +
scale_fill_manual(values = c("#CE2D4F",
"#A14DA0",
"#9D79BC",
"#7F96FF",
"#A9CEF4")) +
theme_minimal() +
theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 0.6, face = "bold")) +
theme(panel.grid.major.x = element_blank()) +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label = round(markov_chain, 0)), fontface = "bold", size = 4, vjust = -1) +
labs(x = "", y = "Conversions") +
ggtitle("Channel Performance") +
guides(fill=FALSE)
g_channel_performance
df_compare = melt(model_compare, id='channel')
g_model_comparison <- ggplot(df_compare, aes(x = channel, y = value, fill = variable)) +
geom_bar(stat = "identity", width = 0.6, position = position_dodge(width = 0.7)) +
scale_fill_manual(labels = c("First Touch", "Last Touch", "Linear", 'Time Decay', 'Sharpley', 'Markov'),
values = c("#e65368",
"#4e74ff",
"#87BFFF",
"#3BCEAC",
"#CE2D4F",
"#A14DA0")) +
theme_minimal() +
theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 0.6, face = "bold")) +
theme(panel.grid.major.x = element_blank()) +
labs(x = "", y = "Budget $") +
ggtitle("Markov vs Heuristics") +
theme(plot.title = element_text(hjust = 0.5))
g_model_comparison
campaign_budget_daily = read.csv("budget_sample_daily.csv")
campaign_budget_total = campaign_budget_daily %>%
group_by(channel) %>%
summarise(total_cost = round(sum(cost), 1))
campaign_attribution = merge(model_compare, campaign_budget_total,
by.x = "channel")
campaign_attribution =
campaign_attribution %>%
mutate(chanel_weight = (shapley_value / sum(shapley_value)),
cost_weight = (total_cost / sum(total_cost)),
roas = chanel_weight / cost_weight,
optimal_budget = total_cost * roas,
CPA = total_cost / shapley_value)
df_g2 = campaign_attribution[, c("channel", "total_cost", "optimal_budget")]
df_g2 = melt(df_g2, id = "channel")
# Create double bar chart
g_budget_allocation <- ggplot(df_g2, aes(x = channel, y = value, fill = variable)) +
geom_bar(stat = "identity", width = 0.6, position = position_dodge(width = 0.7)) +
scale_fill_manual(labels = c("Current Budget", "Optimal Budget"), values = c("#FFD166", "#04A777")) +
theme_minimal() +
theme(axis.text.x = element_text(size = 10, angle = 30, hjust = 0.6, face = "bold")) +
theme(panel.grid.major.x = element_blank()) +
geom_text(aes(label = round(value, 0)),
fontface = "bold", size = 3.5,
vjust = -0.5, position = position_dodge(width = 0.75)) +
labs(x = "", y = "Budget $") +
ggtitle("Budget Allocation") +
theme(plot.title = element_text(hjust = 0.5))
g_budget_allocation
trans_matrix_prob <- mod_attrib_complete$transition_matrix %>%
dmap_at(c(1, 2), as.character)
edges <-
data.frame(
from = trans_matrix_prob$channel_from,
to = trans_matrix_prob$channel_to,
label = round(trans_matrix_prob$transition_probability, 2),
font.size = trans_matrix_prob$transition_probability * 100,
width = trans_matrix_prob$transition_probability * 15,
shadow = TRUE,
arrows = "to",
color = list(color = "#95cbee", highlight = "red")
)
nodes <- data_frame(id = c( c(trans_matrix_prob$channel_from), c(trans_matrix_prob$channel_to) )) %>%
distinct(id) %>%
arrange(id) %>%
mutate(
label = id,
color = ifelse(
label %in% c('(start)', '(conversion)'),
'#4ab04a',
ifelse(label == '(null)', '#ce472e', '#ffd73e')
),
shadow = TRUE,
shape = "box"
)
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
visNetwork(nodes,
edges,
height = "2000px",
width = "100%",
main = "Generic Probabilistic model's Transition Matrix") %>%
visIgraphLayout(randomSeed = 123) %>%
visNodes(size = 5) %>%
visOptions(highlightNearest = TRUE)
##### modeling states and conversions #####
# transition matrix preprocessing
df_dummy <- data.frame(channel_from = c('(start)', '(conversion)', '(null)'),
channel_to = c('(start)', '(conversion)', '(null)'),
n = c(0, 0, 0),
tot_n = c(0, 0, 0),
perc = c(0, 1, 1))
trans_matrix_complete <- mod_attrib_complete$transition_matrix
trans_matrix_complete <- rbind(trans_matrix_complete, df_dummy %>%
mutate(transition_probability = perc) %>%
select(channel_from, channel_to, transition_probability))
trans_matrix_complete$channel_to <- factor(trans_matrix_complete$channel_to, levels = c(levels(trans_matrix_complete$channel_from)))
trans_matrix_complete <- dcast(trans_matrix_complete, channel_from ~ channel_to, value.var = 'transition_probability')
trans_matrix_complete[is.na(trans_matrix_complete)] <- 0
rownames(trans_matrix_complete) <- trans_matrix_complete$channel_from
trans_matrix_complete <- as.matrix(trans_matrix_complete[, -1])
# transition matrix heatmap for "real" data
df_plot_trans <- mod_attrib_complete$transition_matrix
cols <- c("#e7f0fa", "#c9e2f6", "#95cbee", "#0099dc", "#4ab04a", "#ffd73e", "#eec73a",
"#e29421", "#e29421", "#f05336", "#ce472e")
t <- max(df_plot_trans$transition_probability)
ggplot(df_plot_trans, aes(y = channel_from, x = channel_to, fill = transition_probability)) +
theme_minimal() +
geom_tile(colour = "white", width = .9, height = .9) +
scale_fill_gradientn(colours = cols, limits = c(0, t),
breaks = seq(0, t, by = t/4),
labels = c("0", round(t/4*1, 2), round(t/4*2, 2), round(t/4*3, 2), round(t/4*4, 2)),
guide = guide_colourbar(ticks = T, nbin = 50, barheight = .5, label = T, barwidth = 10)) +
geom_text(aes(label = round(transition_probability, 2)), fontface = "bold", size = 4) +
theme(legend.position = 'bottom',
legend.direction = "horizontal",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(size = 20, face = "bold", vjust = 2, color = 'black', lineheight = 0.8),
axis.title.x = element_text(size = 10, face = "bold"),
axis.title.y = element_text(size = 10, face = "bold"),
axis.text.y = element_text(size = 8, face = "bold", color = 'black'),
axis.text.x = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 0.5, face = "plain")) +
ggtitle("Transition matrix heatmap")
# computing time lapses from the first contact to conversion/last contact
df_journey <- df_paths %>%
group_by(cookie, path_no) %>%
summarise(path = paste(channel, collapse = ' > '),
first_touch_date = min(time),
last_touch_date = max(time),
tot_time_lapse = round(as.numeric(last_touch_date - first_touch_date)),
conversion = sum(conversion)) %>%
ungroup()
# distribution plot
ggplot(df_journey %>% filter(conversion == 1), aes(x = tot_time_lapse)) + theme_minimal() + geom_histogram(fill = '#4e79a6', binwidth = 1)
# cumulative distribution plot
ggplot(df_journey %>% filter(conversion == 1), aes(x = tot_time_lapse)) +
theme_minimal() +
stat_ecdf(geom = 'step', color = '#4e79a7', size = 2, alpha = 0.7) +
geom_hline(yintercept = 0.95, color = '#e15759', size = 1.5) +
geom_vline(xintercept = 23, color = '#e15759', size = 1.5, linetype = 2)
### for generic probabilistic model ###
df_journey_2 <- melt(df_journey[c(50:100), ] %>% select(cookie, first_touch_date, last_touch_date, conversion),
id.vars = c('cookie', 'conversion'),
value.name = 'touch_date') %>%
arrange(cookie)
ggplot(df_journey_2, aes(x = cookie, y = touch_date, color = factor(conversion), group = cookie)) +
theme_minimal() +
coord_flip() +
geom_point(size = 2) +
geom_line(size = 0.5, color = 'darkgrey') +
theme(legend.position = 'bottom',
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank()) +
guides(colour = guide_legend(override.aes = list(size = 5)))