#Reading the data
conversions<- read.csv("Conversions.csv")
str(conversions)
## 'data.frame': 10324 obs. of 6 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ source : chr "organic" "none" "google" "lt" ...
## $ channel : chr "direct" "none" "search" "affiliate" ...
## $ indicator1: int 4 9 1 8 8 8 7 7 3 1 ...
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ indicator2: int 4 9 1 8 8 8 7 7 3 1 ...
summary(conversions)
## X source channel indicator1
## Min. : 1 Length:10324 Length:10324 Min. :1.00
## 1st Qu.: 2582 Class :character Class :character 1st Qu.:3.00
## Median : 5162 Mode :character Mode :character Median :5.00
## Mean : 5162 Mean :4.96
## 3rd Qu.: 7743 3rd Qu.:7.00
## Max. :10324 Max. :9.00
## id indicator2
## Min. : 1 Min. :1.00
## 1st Qu.: 2582 1st Qu.:3.00
## Median : 5162 Median :5.00
## Mean : 5162 Mean :4.96
## 3rd Qu.: 7743 3rd Qu.:7.00
## Max. :10324 Max. :9.00
head(conversions,10)
spend<- read.csv("TotalSpend.csv")
str(spend)
## 'data.frame': 9 obs. of 7 variables:
## $ X : int 1 2 3 4 5 6 7 8 9
## $ source : chr "google" "facebook" "instagram" "organic" ...
## $ channel : chr "search" "social" "social" "direct" ...
## $ totalspend : num 25000 80000 36000 10000 75000 100000 25000 100000 0
## $ totaltraffic: int 15000 10000 15000 30000 5000 25000 12000 20000 5000
## $ indicator1 : int 1 2 3 4 5 6 7 8 9
## $ indicator2 : int 1 2 3 4 5 6 7 8 9
summary(spend)
## X source channel totalspend
## Min. :1 Length:9 Length:9 Min. : 0
## 1st Qu.:3 Class :character Class :character 1st Qu.: 25000
## Median :5 Mode :character Mode :character Median : 36000
## Mean :5 Mean : 50111
## 3rd Qu.:7 3rd Qu.: 80000
## Max. :9 Max. :100000
## totaltraffic indicator1 indicator2
## Min. : 5000 Min. :1 Min. :1
## 1st Qu.:10000 1st Qu.:3 1st Qu.:3
## Median :15000 Median :5 Median :5
## Mean :15222 Mean :5 Mean :5
## 3rd Qu.:20000 3rd Qu.:7 3rd Qu.:7
## Max. :30000 Max. :9 Max. :9
head(spend,10)
#View(spend)
#View(conversions)
#Please Note: Indicator2 is the same variable as indicator1. There were readability issues with indicator1, so created a new column and named it indicator2.
#install.packages("sqldf")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
data<- sqldf("select s.source,
s.channel,
s.totalspend,
s.totaltraffic,
s.indicator2,
count (distinct c.id) tot_conversions
from spend s
left join conversions c on s.indicator2 = c.indicator2
group by 1,2,3,4
order by 5 asc")
head(data,50)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ purrr 0.3.4
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data2<- data%>%
group_by(source,channel,indicator2)%>%
mutate(cost_per_visit = round(totalspend/totaltraffic,2),
cost_per_conversion = round(totalspend/tot_conversions,2),
conversion_rate = round(tot_conversions/totaltraffic*100,2),
traffic_conversion_ratio= round(totaltraffic/tot_conversions,2),
source_channel = paste(source,channel,sep = "-"))
head(data2,10)
#Cleanup code for the Graphs and Plots
cleanup = theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line.x = element_line(color = 'black'),
axis.line.y = element_line(color = 'black'),
legend.key = element_rect(fill = 'white'),
text = element_text(size = 15))
#Plot1- Spend Vs Traffic
##Calculating mid-point of x and y axis
x_mid <- mean(c(max(data2$totalspend, na.rm = TRUE),
min(data2$totalspend, na.rm = TRUE)))
y_mid <- mean(c(max(data2$totaltraffic, na.rm = TRUE),
min(data2$totaltraffic, na.rm = TRUE)))
##Creating plot
data2 %>%
group_by(channel)%>%
summarise(totalspend=sum(totalspend),
totaltraffic=sum(totaltraffic))%>%
ggplot(aes(totalspend, totaltraffic)) + geom_point(aes(colour = channel, size=3)) +
geom_vline(xintercept = x_mid)+
geom_hline(yintercept = y_mid)+
guides(alpha=F,size=F) +
labs(title = "Spend vs Traffic by channel ") +
labs(x = "Total Spend", y = "Total Traffic")+
cleanup
## `summarise()` ungrouping output (override with `.groups` argument)
#We can see that we are spending highest in affiliate channel and its bringing in highest traffic.
#The spend is second highest on Social channels which is bringing traffic similar to
#direct channel where we arent even spending that much. This is something to look further into and drill down.
#Lets dig into Source to figure out the spend effectiveness of each source.
#Plot2- Spend Vs Cost per Visit
##Calculating mid-point of x and y axis
x0_mid <- mean(c(max(data2$totalspend, na.rm = TRUE),
min(data2$totalspend, na.rm = TRUE)))
y0_mid <- mean(c(max(data2$cost_per_visit, na.rm = TRUE),
min(data2$cost_per_visit, na.rm = TRUE)))
##Creating Plot
ggplot(data2, aes(totalspend, cost_per_visit)) + geom_point(aes(colour = source, size=3))+
geom_vline(xintercept = x0_mid)+
geom_hline(yintercept = y0_mid)+
guides(alpha=F,size=F) +
labs(title = "Spend vs Cost per Visit by Source ") +
labs(x = "Total Spend", y = "Cost per Visit")+
cleanup
#Plot2: Spend Vs CPL(Cost per Conversion)
##Calculating mid-point of x and y axis
x1_mid <- mean(c(max(data2$totalspend, na.rm = TRUE),
min(data2$totalspend, na.rm = TRUE)))
y1_mid <- mean(c(max(data2$cost_per_conversion, na.rm = TRUE),
min(data2$cost_per_conversion, na.rm = TRUE)))
##Creating Plot
ggplot(data2, aes(totalspend, cost_per_conversion)) + geom_point(aes(colour = source, size=3))+
geom_vline(xintercept = x1_mid)+
geom_hline(yintercept = y1_mid)+
guides(alpha=F,size=F) +
labs(title = "Spend vs Cost per Conversion by Source ") +
labs(x = "Total Spend", y = "Cost per Conversion")+
cleanup
#Digging deeper I noticed that lt and nyt have high spend but cost per conversion
#is lower which is a good thing.
#Facebook, youtube is high spend and cost per conversion is highest. This is the source
#that should be focused on to lower cost per conversion.
#Spend Vs Conversion Rate
##Calculating mid-point of x and y axis
x2_mid <- mean(c(max(data2$totalspend, na.rm = TRUE),
min(data2$totalspend, na.rm = TRUE)))
y2_mid <- mean(c(max(data2$conversion_rate, na.rm = TRUE),
min(data2$conversion_rate, na.rm = TRUE)))
##Creating Plot
ggplot(data2, aes(totalspend, conversion_rate)) + geom_point(aes(colour = source, size=3))+
geom_vline(xintercept = x2_mid)+
geom_hline(yintercept = y2_mid)+
guides(alpha=F,size=F) +
labs(title = "Spend vs Conversion Rate by Source ") +
labs(x = "Total Spend", y = "Conversion Rate")+
cleanup
#channels/sources are most and least likely to lead to a conversion
#CONVERSION INDEX
##For this analysis I will use the CI (Conversion Index) methodology.
#This allows us to see relative conversion efficiency of a particular source or channel.
#It is a technique to normalize the data to one so that it can be compared.
f<-data2 %>% group_by(source) %>%
summarise(totalspend=sum(totalspend),
totaltraffic = sum(totaltraffic),
tot_conversions = sum(tot_conversions))%>%
mutate(pc_totconv= round(tot_conversions/sum(tot_conversions)*100,2),
pc_tottraffic= round(totaltraffic/sum(totaltraffic)*100,2),
CI= round(pc_totconv/pc_tottraffic,2))
## `summarise()` ungrouping output (override with `.groups` argument)
f
f %>% select(source, CI)
##Index value greater than 1 suggest that traffic from that channel is more likely to convert
##than the population at large. Similarly, value lesser than 1 suggest that traffic from that
##channel is less likely to convert.
##For instance, Source- Lending tree with a CI of 1.17 suggests that it is 1-2 times more
##efficient than average channel at converting visitors into customers and will likely convert more customers.
##Thus, Overall, sources like google(1.05), instagram(1.03), nyt(1.01), lt(1.18)
#STATISTICAL MODEL
mean(data2$tot_conversions)
## [1] 1147.111
##Creating a binary variable indicating whether or not the total conversions is greater than 1147
data2$value.g1147<- data2$tot_conversions>1147
##Conducting logistic regression on the new binary variable
data2.glm<- glm(formula = value.g1147~ totaltraffic + source,
data = data2,
family = binomial)
summary(data2.glm)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.456607e+01 243305.58171 -1.009680e-04 0.9999194
## totaltraffic 7.586915e-18 26.46816 2.866431e-19 1.0000000
## sourcefacebook -1.741299e-14 165293.59347 -1.053459e-19 1.0000000
## sourcegoogle 4.913214e+01 235254.17897 2.088470e-04 0.9998334
## sourceinstagram 4.913214e+01 235254.17895 2.088470e-04 0.9998334
## sourcelt 4.913214e+01 344086.05550 1.427903e-04 0.9998861
## sourcenone 5.873391e-14 185277.10571 3.170058e-19 1.0000000
## sourcenyt 4.913214e+01 465267.78034 1.055997e-04 0.9999157
## sourceorganic 4.913214e+01 591253.87933 8.309821e-05 0.9999337
##adding logistic fitted values back to dataframe as new column pred.g1147
options(scipen = 999, digits = 1)
data2$pred.g1147<- data2.glm$fitted.values
##Looking at the data (of the named columns)
head(data2[c("totaltraffic", "source", "tot_conversions", "pred.g1147")],10)
##The probabilities match our data and the previous Conversion Index(CI) analysis pretty well.
##The column pred.g1147 shows that Google, Instagram, Organic, nyt and Lending tree has higher probability of conversion.
#The first takeaway from the analysis for the marketing team, looking at the CI (Conversion Index) is spend more time, energy and money on driving traffic to sources like google(1.05), instagram(1.03), nyt(1.01), lt(1.18) because traffic from these sources convert really well.
#The source and channel none/none issue is due to inbound links that drive traffic to the website lacking referral data or parameters to see where it came from. This ususally happens due to lack of utm parameters or tagging. The way Google identifies traffic is: GA checks for auto tagging, utm campaign tagging parameters, http referrer field (in that order). If none of these is set and there isnt any prior data associated with user's browsing id(cookie) then GA marks the traffic is (none). It just means unknown/undefined.
#The best solution is to ensure that all inbound links from any of the campaigns are properly utm tagged consistently with consistent naming conventions so it could be identified and tracked by GA and the analysis is easier too. UTM parameters are case sensitive so make sure all letters are lower case, else "Google" and "google" shows as different sources.
#Other factors that may affect marketing spend efficiency and conversion rates
#Number of returning users is an important metrics as it tells us how many repeat visitors we are getting from a particular source/channel. Source driving higher repeat visitor number should be focused more.
#Sources driving higher Page Views per session tells us how much engaged the user is with the website. For example- How many page views a user has viewed in a session is an indicator that user is actively viewing and comparing quotes. This is an important indicator that tells us the quality of the traffic coming from a particular source and higher spend can be allocated there.