Data Analysis scope

  1. Identify which age/gender/interest segments the ad campaign management team should focus on. Why?
  2. Compare how the various xyz_campaigns and fb_campaigns are performing.
  3. Quantify how conversion rate varies with age, gender or interest.
  4. Identify segments with high and low cost per acquisition.
  5. Segment the audience based on
  1. click through rates
  2. conversion rates
  1. Predict conversion rates for each possible age/gender/interest segment (to be provided in a separate document)

Loading required libraries

## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Data set

The data set for this analysis has been downloaded and placed on a local drive within R working directory

Loading of data set

df<- read.csv(".//Datasets/Facebook/conversion_data.csv", header = TRUE, sep=",")

Data set dimensions

dim(df)
## [1] 1143   11

Summary of data set

summary(df)
##      ad_id         xyz_campaign_id fb_campaign_id      age      gender 
##  Min.   : 708746   Min.   : 916    Min.   :103916   30-34:426   F:551  
##  1st Qu.: 777632   1st Qu.: 936    1st Qu.:115716   35-39:248   M:592  
##  Median :1121185   Median :1178    Median :144549   40-44:210          
##  Mean   : 987261   Mean   :1067    Mean   :133784   45-49:259          
##  3rd Qu.:1121804   3rd Qu.:1178    3rd Qu.:144658                      
##  Max.   :1314415   Max.   :1178    Max.   :179982                      
##     interest       Impressions          Clicks           Spent       
##  Min.   :  2.00   Min.   :     87   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 16.00   1st Qu.:   6504   1st Qu.:  1.00   1st Qu.:  1.48  
##  Median : 25.00   Median :  51509   Median :  8.00   Median : 12.37  
##  Mean   : 32.77   Mean   : 186732   Mean   : 33.39   Mean   : 51.36  
##  3rd Qu.: 31.00   3rd Qu.: 221769   3rd Qu.: 37.50   3rd Qu.: 60.02  
##  Max.   :114.00   Max.   :3052003   Max.   :421.00   Max.   :639.95  
##  Total_Conversion Approved_Conversion
##  Min.   : 0.000   Min.   : 0.000     
##  1st Qu.: 1.000   1st Qu.: 0.000     
##  Median : 1.000   Median : 1.000     
##  Mean   : 2.856   Mean   : 0.944     
##  3rd Qu.: 3.000   3rd Qu.: 1.000     
##  Max.   :60.000   Max.   :21.000

Structure of variables

str(df)
## 'data.frame':    1143 obs. of  11 variables:
##  $ ad_id              : int  708746 708749 708771 708815 708818 708820 708889 708895 708953 708958 ...
##  $ xyz_campaign_id    : int  916 916 916 916 916 916 916 916 916 916 ...
##  $ fb_campaign_id     : int  103916 103917 103920 103928 103928 103929 103940 103941 103951 103952 ...
##  $ age                : Factor w/ 4 levels "30-34","35-39",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ gender             : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ interest           : int  15 16 20 28 28 29 15 16 27 28 ...
##  $ Impressions        : int  7350 17861 693 4259 4133 1915 15615 10951 2355 9502 ...
##  $ Clicks             : int  1 2 0 1 1 0 3 1 1 3 ...
##  $ Spent              : num  1.43 1.82 0 1.25 1.29 ...
##  $ Total_Conversion   : int  2 2 1 1 1 1 1 1 1 1 ...
##  $ Approved_Conversion: int  1 0 0 0 1 1 0 1 0 0 ...

Clustering analysis

As an example of clustering technique which can be used to explore the data set we can use Hierarchical Clustering method. Let’s try this for the “age” feature.

Removing all the non-numeric variables but “age” and scaling the numeric matrix

df_cl_age <- dplyr::select(df, age, Impressions, Clicks, Spent, Total_Conversion, Approved_Conversion )
variables.to.use <- colnames(df_cl_age)[-1]
pmatrix <- scale(df_cl_age[,variables.to.use])

Create the distance matrix using Euclidean method and do the clustering

d <- dist(pmatrix, method="euclidean")
pfit <- hclust(d, method="ward.D2")

Plotting the dendogram

plot(pfit, labels=df_cl_age$age) # plot the cluster

rect.hclust(pfit, k=3)

Let’s print the clusters

groups <- cutree(pfit, k=3)
print_clusters <- function(labels, k) {
  for(i in 1:k) {
    print(paste("cluster", i))
    print(df_cl_age[labels==i,c("age","Impressions", "Clicks","Spent","Total_Conversion","Approved_Conversion")])
  }
}

The following command would display the clusters in detail. Knitr parameter eval=FALSE has been used to prevent long print in this document.

print_clusters(groups, 3)

Let’s see and analyse the cumulative numbers characterizing the clusters

group1 <- as.data.frame(df_cl_age[groups==1,])
group2 <- as.data.frame(df_cl_age[groups==2,])
group3 <- as.data.frame(df_cl_age[groups==3,])

sapply(group1[2:6], sum)
##         Impressions              Clicks               Spent 
##          34045429.0              5649.0              8628.5 
##    Total_Conversion Approved_Conversion 
##              1123.0               402.0
sapply(group2[2:6], sum)
##         Impressions              Clicks               Spent 
##        101619966.00            18048.00            27914.67 
##    Total_Conversion Approved_Conversion 
##             1152.00              340.00
sapply(group3[2:6], sum)
##         Impressions              Clicks               Spent 
##         77769433.00            14468.00            22162.06 
##    Total_Conversion Approved_Conversion 
##              989.00              337.00
sapply(group1[2:6], mean)
##         Impressions              Clicks               Spent 
##        4.167127e+04        6.914321e+00        1.056120e+01 
##    Total_Conversion Approved_Conversion 
##        1.374541e+00        4.920441e-01
sapply(group2[2:6], mean)
##         Impressions              Clicks               Spent 
##        3.969530e+05        7.050000e+01        1.090417e+02 
##    Total_Conversion Approved_Conversion 
##        4.500000e+00        1.328125e+00
sapply(group3[2:6], mean)
##         Impressions              Clicks               Spent 
##        1.110992e+06        2.066857e+02        3.166009e+02 
##    Total_Conversion Approved_Conversion 
##        1.412857e+01        4.814286e+00

Identify which age/gender/interest segments the ad campaign management team should focus on.

Let’s have a look at the distribution of impressions throughout age groups and gender:

library(ggplot2)
t <- ggplot(df, aes(age, Impressions), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
t <- t + facet_grid(.~ gender) 
t + scale_fill_brewer(palette = "Paired") + ggtitle("Impressions Distribution per age group and gender")

Let’s have a look at this from the audience distribution perspective.

b <- ggplot(df, aes(age, Impressions), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
b

Let’s visualise the distribution of the Clicks

c <- ggplot(df, aes(age, Clicks), colour=age) + geom_bar(aes(fill=gender),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
c + ggtitle("Ditribution of Clicks per age group and gender")

d <- ggplot(df, aes(gender, Clicks), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
e <- ggplot(df, aes(gender, Clicks), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
d + ggtitle("Ditribution of Clicks per gender and age group")

e + ggtitle("Bar distribution of Clicks per gender and age group")

Let’s have a look at the distribution of the cost across age groups

f <- ggplot(df, aes(age, Spent), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
f + ggtitle("Distribution of the Spent across age groups")

Including gender split

g <- ggplot(df, aes(age, Spent), colour=age) + geom_bar(aes(fill=age),stat = "identity", width = .5)
g <- g + facet_grid(.~ gender) + ggtitle("Cost distribution across age groups and gender")
g + ggtitle("Distribution of the Spent across age groups and gender")

Let’s see the how the situation looks like from the interest group perspective

rm(df.mod)
## Warning in rm(df.mod): object 'df.mod' not found
df.mod <- df %>%
    mutate(Conversion_Ratio = 100*Clicks/Impressions)
df.mod <- aggregate(Conversion_Ratio ~  gender + interest+ age,  data=df.mod, FUN = sum)
g <- ggplot(df.mod, aes(factor(interest), Conversion_Ratio)) + geom_bar(aes(fill=gender),stat = "identity", width = .5)
g + facet_grid(.~ gender) + theme(axis.text.x=element_text(angle=90, hjust=1)) + ggtitle("CTR distribution across age groups and gender")

#### Identify segments with high and low cost per acquisition.

df.mod <- aggregate(Spent ~  gender + interest+ Spent,  data=df, FUN = sum)
h <- ggplot(df.mod, aes(factor(interest), Spent)) + geom_bar(aes(fill=gender),stat = "identity", width = .5)
h + facet_grid(.~ gender) + theme(axis.text.x=element_text(angle=90, hjust=1)) + ggtitle("Spent distribution across age groups and gender")

Distribution of the conversion ratio across the interest groups for male and female

rm(df.mod)
df.mod <- df %>%
    mutate(Conversion_Ratio = 100*Clicks/Impressions)
df.mod <- aggregate(Conversion_Ratio ~  age + gender,  data=df.mod, FUN = sum)
i <- ggplot(df.mod, aes(age, Conversion_Ratio), colour=age) + geom_bar(aes(fill=gender),stat = "identity", width = .5) + coord_polar(theta = "x", direction=1)
i + ggtitle("Distribution of the conversion ratio through age groups and gender")

Let’s illustrate how the Conversion_Ratio differs bewteen Male and Female

df.mod <- df %>%
    mutate(Conversion_Ratio = 100*Clicks/Impressions)
qq(factor(gender) ~ Conversion_Ratio, df.mod, groups = age, auto.key = TRUE, f.value = ppoints(100), type = c("p", "g"), aspect = 1)

The above graph shows that the Conversion Ratio in male group has got the inclination to return slightly lower conversion rates compared to female group but there is a general linearity in this relations

Compare how the various xyz_campaigns and fb_campaigns are performing.

As a final business measurement of a campaign performance I’ve assumed the ratio between the variable Spent(Cost of the campaign) and the Approved Conversion variable.

Let’s calculate this ratio and see how various companies (xyz_campaigns) perform against each other

df.mod<- df %>%
  mutate(Cost_per_AprConv=Spent/Approved_Conversion)
df.mod <- na.omit(df.mod)
df.mod <- df.mod[-which(is.infinite(df.mod$Cost_per_AprConv)), ]
f <- aggregate(Cost_per_AprConv ~ xyz_campaign_id, FUN=sum, data=df.mod) %>%
  arrange(desc(Cost_per_AprConv))

f_plot <- ggplot(df.mod, aes(as.factor(fb_campaign_id), Cost_per_AprConv)) + geom_bar(aes(fill=xyz_campaign_id),stat = "identity", width = .5)
f_plot + facet_grid(.~ xyz_campaign_id) + ggtitle("Facebook campaigns per Cost per Approved conversion")

We observe significant disproportions between the three xyz_campaign_id companies in terms of the effectivenss of their marketing investments and how it translates into the Approved Convertion figures. It is not a definitive measure and should be used to trigger a separate analysis which would inlclude wider spectrum of data points to properly understand the observed differences.

Top 25 Facebook campaigns in terms of conversion rate

df.mod <- df %>%
    mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_25 <- aggregate(Conversion_Ratio ~ fb_campaign_id, FUN=sum, data=df.mod) %>%
  arrange(desc(Conversion_Ratio)) %>% 
  head(25)
df_25$Conversion_Ratio <- round(df_25$Conversion_Ratio,3)
p <- ggplot(df_25, aes(x=as.factor(fb_campaign_id), y=Conversion_Ratio))
p <- p + geom_bar(stat = "identity", colour="red", fill="blue", width = 0.5)
p <- p + geom_text(aes(label = Conversion_Ratio), vjust = - 0.20, size = 3 )
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1)) 
p

Top 25 most expensive Facebook campaigns

df.mod<- df %>%
  mutate(Cost_per_AprConv=Spent/Approved_Conversion)
df.mod <- na.omit(df.mod)
df.mod <- df.mod[-which(is.infinite(df.mod$Cost_per_AprConv)), ]

i <- aggregate(Cost_per_AprConv ~ fb_campaign_id +xyz_campaign_id, FUN=sum, data=df.mod) %>%
  arrange(desc(Cost_per_AprConv)) 
i <- head(i, 25)
i$Cost_per_AprConv <- round(i$Cost_per_AprConv,2)
i_plot <- ggplot(i, aes(factor(fb_campaign_id), Cost_per_AprConv)) + geom_bar(stat = "identity", width = .5, fill="lightblue")
i_plot <- i_plot + geom_text(aes(label = Cost_per_AprConv), vjust = - 0.20, size = 3 ) + theme(axis.text.x=element_text(angle=45, hjust=1)) 
i_plot + facet_grid(.~ xyz_campaign_id) + ggtitle("Top 25 most expensive Facebook campaigns per Cost per Approoved conversion")

Quantify how conversion rate varies with age, gender or interest.

Average Conversion rate across age groups

df.mod <- df %>%
    mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_age <- 
  aggregate(Conversion_Ratio ~ age, FUN=mean, data=df.mod) %>%
  arrange(desc(Conversion_Ratio))

df_age$Conversion_Ratio <- round(df_age$Conversion_Ratio,3)
r <- ggplot(df_age, aes(x=as.factor(age), y=Conversion_Ratio))
r<- r + geom_bar(aes(fill=age), stat = "identity", width = 0.4)
r <- r + geom_text(aes(label = Conversion_Ratio), vjust = - 0.20, size = 3 ) + scale_fill_brewer(palette = "Paired")
r <- r + theme(axis.text.x=element_text(angle=45, hjust=1)) +ggtitle("Conversion rate across age groups")
r

Average Conversion rate across gender groups

df.mod <- df %>%
    mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_gender <- 
  aggregate(Conversion_Ratio ~ gender, FUN=mean, data=df.mod) %>%
  arrange(desc(Conversion_Ratio))

df_gender$Conversion_Ratio <- round(df_gender$Conversion_Ratio,3)
r <- ggplot(df_gender, aes(x=as.factor(gender), y=Conversion_Ratio))
r<- r + geom_bar(aes(fill=gender), stat = "identity", width = 0.4)
r <- r + geom_text(aes(label = Conversion_Ratio), vjust = - 0.20, size = 3 ) + scale_fill_brewer(palette = "Spectral")
r <- r + theme(axis.text.x=element_text(angle=45, hjust=1)) + ggtitle("Conversion rate across gender groups")
r

Average Conversion rate across interest groups

df.mod <- df %>%
    mutate(Conversion_Ratio = 100*Clicks/Impressions)
df_interest <- 
  aggregate(Conversion_Ratio ~ interest, FUN=mean, data=df.mod) %>%
  arrange(desc(Conversion_Ratio))

df_interest$Conversion_Ratio <- round(df_interest$Conversion_Ratio,3)
r <- ggplot(df_interest, aes(x=as.factor(interest), y=Conversion_Ratio))
r <- r + geom_bar(aes(fill=interest), stat = "identity", width = 0.4)
r <- r + ggtitle("Conversion rate across interest groups")
r