## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The data set for this analysis has been downloaded and placed on a local drive within R working directory
Loading of data set
df<- read.csv(".//Datasets/Facebook/conversion_data.csv", header = TRUE, sep=",")
Data set dimensions
dim(df)
## [1] 1143 11
Summary of data set
summary(df)
## ad_id xyz_campaign_id fb_campaign_id age gender
## Min. : 708746 Min. : 916 Min. :103916 30-34:426 F:551
## 1st Qu.: 777632 1st Qu.: 936 1st Qu.:115716 35-39:248 M:592
## Median :1121185 Median :1178 Median :144549 40-44:210
## Mean : 987261 Mean :1067 Mean :133784 45-49:259
## 3rd Qu.:1121804 3rd Qu.:1178 3rd Qu.:144658
## Max. :1314415 Max. :1178 Max. :179982
## interest Impressions Clicks Spent
## Min. : 2.00 Min. : 87 Min. : 0.00 Min. : 0.00
## 1st Qu.: 16.00 1st Qu.: 6504 1st Qu.: 1.00 1st Qu.: 1.48
## Median : 25.00 Median : 51509 Median : 8.00 Median : 12.37
## Mean : 32.77 Mean : 186732 Mean : 33.39 Mean : 51.36
## 3rd Qu.: 31.00 3rd Qu.: 221769 3rd Qu.: 37.50 3rd Qu.: 60.02
## Max. :114.00 Max. :3052003 Max. :421.00 Max. :639.95
## Total_Conversion Approved_Conversion
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 0.000
## Median : 1.000 Median : 1.000
## Mean : 2.856 Mean : 0.944
## 3rd Qu.: 3.000 3rd Qu.: 1.000
## Max. :60.000 Max. :21.000
Structure of variables
str(df)
## 'data.frame': 1143 obs. of 11 variables:
## $ ad_id : int 708746 708749 708771 708815 708818 708820 708889 708895 708953 708958 ...
## $ xyz_campaign_id : int 916 916 916 916 916 916 916 916 916 916 ...
## $ fb_campaign_id : int 103916 103917 103920 103928 103928 103929 103940 103941 103951 103952 ...
## $ age : Factor w/ 4 levels "30-34","35-39",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ interest : int 15 16 20 28 28 29 15 16 27 28 ...
## $ Impressions : int 7350 17861 693 4259 4133 1915 15615 10951 2355 9502 ...
## $ Clicks : int 1 2 0 1 1 0 3 1 1 3 ...
## $ Spent : num 1.43 1.82 0 1.25 1.29 ...
## $ Total_Conversion : int 2 2 1 1 1 1 1 1 1 1 ...
## $ Approved_Conversion: int 1 0 0 0 1 1 0 1 0 0 ...
As an example of clustering technique which can be used to explore the data set we can use Hierarchical Clustering method. Let’s try this for the “age” feature.
Removing all the non-numeric variables but “age” and scaling the numeric matrix
df_cl_age <- dplyr::select(df, age, Impressions, Clicks, Spent, Total_Conversion, Approved_Conversion )
variables.to.use <- colnames(df_cl_age)[-1]
pmatrix <- scale(df_cl_age[,variables.to.use])
Create the distance matrix using Euclidean method and do the clustering
d <- dist(pmatrix, method="euclidean")
pfit <- hclust(d, method="ward.D2")
Plotting the dendogram
plot(pfit, labels=df_cl_age$age) # plot the cluster
rect.hclust(pfit, k=3)
Let’s print the clusters
groups <- cutree(pfit, k=3)
print_clusters <- function(labels, k) {
for(i in 1:k) {
print(paste("cluster", i))
print(df_cl_age[labels==i,c("age","Impressions", "Clicks","Spent","Total_Conversion","Approved_Conversion")])
}
}
The following command would display the clusters in detail. Knitr parameter eval=FALSE has been used to prevent long print in this document.
print_clusters(groups, 3)
Let’s see and analyse the cumulative numbers characterizing the clusters
group1 <- as.data.frame(df_cl_age[groups==1,])
group2 <- as.data.frame(df_cl_age[groups==2,])
group3 <- as.data.frame(df_cl_age[groups==3,])
sapply(group1[2:6], sum)
## Impressions Clicks Spent
## 34045429.0 5649.0 8628.5
## Total_Conversion Approved_Conversion
## 1123.0 402.0
sapply(group2[2:6], sum)
## Impressions Clicks Spent
## 101619966.00 18048.00 27914.67
## Total_Conversion Approved_Conversion
## 1152.00 340.00
sapply(group3[2:6], sum)
## Impressions Clicks Spent
## 77769433.00 14468.00 22162.06
## Total_Conversion Approved_Conversion
## 989.00 337.00
sapply(group1[2:6], mean)
## Impressions Clicks Spent
## 4.167127e+04 6.914321e+00 1.056120e+01
## Total_Conversion Approved_Conversion
## 1.374541e+00 4.920441e-01
sapply(group2[2:6], mean)
## Impressions Clicks Spent
## 3.969530e+05 7.050000e+01 1.090417e+02
## Total_Conversion Approved_Conversion
## 4.500000e+00 1.328125e+00
sapply(group3[2:6], mean)
## Impressions Clicks Spent
## 1.110992e+06 2.066857e+02 3.166009e+02
## Total_Conversion Approved_Conversion
## 1.412857e+01 4.814286e+00
Let’s have a look at the distribution of impressions throughout age groups and gender:
library(ggplot2)
t <- ggplot(df, aes(age, Impressions), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
t <- t + facet_grid(.~ gender)
t + scale_fill_brewer(palette = "Paired") + ggtitle("Impressions Distribution per age group and gender")
Let’s have a look at this from the audience distribution perspective.
b <- ggplot(df, aes(age, Impressions), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
b
c <- ggplot(df, aes(age, Clicks), colour=age) + geom_bar(aes(fill=gender),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
c + ggtitle("Ditribution of Clicks per age group and gender")
d <- ggplot(df, aes(gender, Clicks), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
e <- ggplot(df, aes(gender, Clicks), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
d + ggtitle("Ditribution of Clicks per gender and age group")
e + ggtitle("Bar distribution of Clicks per gender and age group")
f <- ggplot(df, aes(age, Spent), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
f + ggtitle("Distribution of the Spent across age groups")
g <- ggplot(df, aes(age, Spent), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
g <- g + facet_grid(.~ gender) + ggtitle("Cost distribution across age groups and gender")
g + ggtitle("Distribution of the Spent across age groups and gender")
rm(df.mod)
## Warning in rm(df.mod): object 'df.mod' not found
df.mod <- df %>%
mutate(Conversion_Ratio = 100*Clicks/Impressions)
df.mod <- aggregate(Conversion_Ratio ~ gender + interest+ age, data=df.mod, FUN = sum)
g <- ggplot(df.mod, aes(factor(interest), Conversion_Ratio)) + geom_bar(aes(fill=gender),stat = "identity", width = .5)
g + facet_grid(.~ gender) + theme(axis.text.x=element_text(angle=90, hjust=1)) + ggtitle("CTR distribution across age groups and gender")
#### Identify segments with high and low cost per acquisition.
df.mod <- aggregate(Spent ~ gender + interest+ Spent, data=df, FUN = sum)
h <- ggplot(df.mod, aes(factor(interest), Spent)) + geom_bar(aes(fill=gender),stat = "identity", width = .5)
h + facet_grid(.~ gender) + theme(axis.text.x=element_text(angle=90, hjust=1)) + ggtitle("Spent distribution across age groups and gender")
rm(df.mod)
df.mod <- df %>%
mutate(Conversion_Ratio = 100*Clicks/Impressions)
df.mod <- aggregate(Conversion_Ratio ~ age + gender, data=df.mod, FUN = sum)
i <- ggplot(df.mod, aes(age, Conversion_Ratio), colour=age) + geom_bar(aes(fill=gender),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
i + ggtitle("Distribution of the conversion ratio through age groups and gender")
df.mod <- df %>%
mutate(Conversion_Ratio = 100*Clicks/Impressions)
qq(factor(gender) ~ Conversion_Ratio, df.mod, groups = age, auto.key = TRUE, f.value = ppoints(100), type = c("p", "g"), aspect = 1)
The above graph shows that the Conversion Ratio in male group has got the inclination to return slightly lower conversion rates compared to female group but there is a general linearity in this relations
As a final business measurement of a campaign performance I’ve assumed the ratio between the variable Spent(Cost of the campaign) and the Approved Conversion variable.
Let’s calculate this ratio and see how various companies (xyz_campaigns) perform against each other
df.mod<- df %>%
mutate(Cost_per_AprConv=Spent/Approved_Conversion)
df.mod <- na.omit(df.mod)
df.mod <- df.mod[-which(is.infinite(df.mod$Cost_per_AprConv)), ]
f <- aggregate(Cost_per_AprConv ~ xyz_campaign_id, FUN=sum, data=df.mod) %>%
arrange(desc(Cost_per_AprConv))
f_plot <- ggplot(df.mod, aes(as.factor(fb_campaign_id), Cost_per_AprConv)) + geom_bar(aes(fill=xyz_campaign_id),stat = "identity", width = .5)
f_plot + facet_grid(.~ xyz_campaign_id) + ggtitle("Facebook campaigns per Cost per Approved conversion")
We observe significant disproportions between the three xyz_campaign_id companies in terms of the effectivenss of their marketing investments and how it translates into the Approved Convertion figures. It is not a definitive measure and should be used to trigger a separate analysis which would inlclude wider spectrum of data points to properly understand the observed differences.
Top 25 Facebook campaigns in terms of conversion rate
df.mod <- df %>%
mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_25 <- aggregate(Conversion_Ratio ~ fb_campaign_id, FUN=sum, data=df.mod) %>%
arrange(desc(Conversion_Ratio)) %>%
head(25)
df_25$Conversion_Ratio <- round(df_25$Conversion_Ratio,3)
p <- ggplot(df_25, aes(x=as.factor(fb_campaign_id), y=Conversion_Ratio))
p <- p + geom_bar(stat = "identity", colour="red", fill="blue", width = 0.5)
p <- p + geom_text(aes(label = Conversion_Ratio), vjust = - 0.20, size = 3 )
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
Top 25 most expensive Facebook campaigns
df.mod<- df %>%
mutate(Cost_per_AprConv=Spent/Approved_Conversion)
df.mod <- na.omit(df.mod)
df.mod <- df.mod[-which(is.infinite(df.mod$Cost_per_AprConv)), ]
i <- aggregate(Cost_per_AprConv ~ fb_campaign_id +xyz_campaign_id, FUN=sum, data=df.mod) %>%
arrange(desc(Cost_per_AprConv))
i <- head(i, 25)
i$Cost_per_AprConv <- round(i$Cost_per_AprConv,2)
i_plot <- ggplot(i, aes(factor(fb_campaign_id), Cost_per_AprConv)) + geom_bar(stat = "identity", width = .5, fill="lightblue")
i_plot <- i_plot + geom_text(aes(label = Cost_per_AprConv), vjust = - 0.20, size = 3 ) + theme(axis.text.x=element_text(angle=45, hjust=1))
i_plot + facet_grid(.~ xyz_campaign_id) + ggtitle("Top 25 most expensive Facebook campaigns per Cost per Approoved conversion")
Average Conversion rate across age groups
df.mod <- df %>%
mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_age <-
aggregate(Conversion_Ratio ~ age, FUN=mean, data=df.mod) %>%
arrange(desc(Conversion_Ratio))
df_age$Conversion_Ratio <- round(df_age$Conversion_Ratio,3)
r <- ggplot(df_age, aes(x=as.factor(age), y=Conversion_Ratio))
r<- r + geom_bar(aes(fill=age), stat = "identity", width = 0.4)
r <- r + geom_text(aes(label = Conversion_Ratio), vjust = - 0.20, size = 3 ) + scale_fill_brewer(palette = "Paired")
r <- r + theme(axis.text.x=element_text(angle=45, hjust=1)) +ggtitle("Conversion rate across age groups")
r
Average Conversion rate across gender groups
df.mod <- df %>%
mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_gender <-
aggregate(Conversion_Ratio ~ gender, FUN=mean, data=df.mod) %>%
arrange(desc(Conversion_Ratio))
df_gender$Conversion_Ratio <- round(df_gender$Conversion_Ratio,3)
r <- ggplot(df_gender, aes(x=as.factor(gender), y=Conversion_Ratio))
r<- r + geom_bar(aes(fill=gender), stat = "identity", width = 0.4)
r <- r + geom_text(aes(label = Conversion_Ratio), vjust = - 0.20, size = 3 ) + scale_fill_brewer(palette = "Spectral")
r <- r + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Conversion rate across gender groups")
r
Average Conversion rate across interest groups
df.mod <- df %>%
mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_interest <-
aggregate(Conversion_Ratio ~ interest, FUN=mean, data=df.mod) %>%
arrange(desc(Conversion_Ratio))
df_interest$Conversion_Ratio <- round(df_interest$Conversion_Ratio,3)
r <- ggplot(df_interest, aes(x=as.factor(interest), y=Conversion_Ratio))
r <- r + geom_bar(aes(fill=interest), stat = "identity", width = 0.4)
r <- r + ggtitle("Conversion rate across interest groups")
r