Load data wrangling, visualization and modeling libraries.
Load extra visualization libraries.
Load & create personal ggplot2 theme “chewyTheme”.
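The setup chunk is not echoed in this report. A minimal sketch of what it could look like (the exact package list, the bignumber helper, and the internals of chewyTheme are assumptions):
library(tidyverse)   # readr, dplyr, tidyr, ggplot2
library(rcartocolor) # carto_pal() and scale_fill_carto_d() palettes
library(gghalves)    # geom_half_point() used in the boxplots below
library(patchwork)   # combining plots with `+` and `/`
## hypothetical helper used later for number formatting in labels/axes
bignumber <- scales::label_number(big.mark = ",")
## hypothetical skeleton of the personal theme
chewyTheme <- function(base_family = "Lato") {
  theme_minimal(base_family = base_family) +
    theme(plot.title = element_text(face = "bold"),
          panel.grid.major.x = element_blank())
}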
## Rows: 790 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (2): Date, promotion
## dbl (3): installs, spend_facebook, spend_google
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
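The import call itself is not shown; judging by the column specification above, it was presumably something along these lines (the file name is a placeholder):
adcampaign <- read_delim("adcampaign.csv", # hypothetical file name
                         delim = ";")      # semicolon-delimited, per the spec above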
Inspect the beginning and end of the data.
head(adcampaign) # first 6 rows, OK
## # A tibble: 6 × 5
## Date installs spend_facebook spend_google promotion
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 1-1-2019 1667 1406 NA Yes
## 2 1-2-2019 1889 1197 NA Yes
## 3 1-3-2019 1462 1090 NA Yes
## 4 1-4-2019 1218 1434 NA Yes
## 5 1-5-2019 1721 1449 NA Yes
## 6 1-6-2019 1696 1475 NA Yes
tail(adcampaign) # last 6 rows, OK
## # A tibble: 6 × 5
## Date installs spend_facebook spend_google promotion
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 2-23-2021 2060 838 NA <NA>
## 2 2-24-2021 1972 389 NA <NA>
## 3 2-25-2021 1812 484 NA <NA>
## 4 2-26-2021 1734 1044 NA <NA>
## 5 2-27-2021 2001 731 NA <NA>
## 6 2-28-2021 2190 929 NA <NA>
Glimpse data.
100 randomly sampled rows.
We can first transform our Date column into the “date” data type and, in the same step, convert it to “year-month-day” format. Let’s also make sure all column names are lowercase.
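A sketch of that step (assuming the dates are month-day-year, e.g. “1-1-2019”, and using lubridate):
adcampaign <- adcampaign %>%
  rename_with(tolower) %>%            # all column names to lowercase
  mutate(date = lubridate::mdy(date)) # "1-1-2019" -> 2019-01-01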
Instead of keeping the NAs in “spend_facebook”, “spend_google”, and “promotion”, we can set them to 0 for the dollars spent on each platform and to “No” when no promotion was running.
adcampaign <- adcampaign %>% replace_na(list(
spend_facebook = 0,
spend_google = 0,
promotion = "No"
))
set.seed(4)
adcampaign %>% sample_n(10)
## # A tibble: 10 × 5
## date installs spend_facebook spend_google promotion
## <date> <dbl> <dbl> <dbl> <chr>
## 1 2020-05-18 1952 521 0 No
## 2 2020-08-09 1497 0 0 No
## 3 2021-02-09 2326 667 0 No
## 4 2019-03-12 479 0 0 No
## 5 2020-11-14 1892 224 0 No
## 6 2020-01-06 2463 0 1018 Yes
## 7 2021-01-26 3183 297 0 Yes
## 8 2020-11-28 2018 922 0 No
## 9 2019-11-03 467 0 0 No
## 10 2019-11-08 386 0 0 No
Lastly, we can transform “promotion” into a factor variable.
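The chunk itself is not echoed; presumably something like this sketch, with the glimpse() output shown below:
adcampaign <- adcampaign %>%
  mutate(promotion = as.factor(promotion)) # "Yes"/"No" -> factor
glimpse(adcampaign)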
## Rows: 790
## Columns: 5
## $ date <date> 2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-0…
## $ installs <dbl> 1667, 1889, 1462, 1218, 1721, 1696, 1528, 1246, 1248, 1…
## $ spend_facebook <dbl> 1406, 1197, 1090, 1434, 1449, 1475, 1254, 1223, 1414, 1…
## $ spend_google <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ promotion <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, …
Now that the data is cleaned, we can check whether there is anything left to tidy and move on to exploring the data visually and statistically.
Let’s start by comparing the rows that have promotion == Yes with those that have promotion == No.
promotion_count <- adcampaign %>%
group_by(promotion) %>%
count()
promotion_count %>%
ggplot(aes(x = promotion,
y = n)) +
geom_col(aes(fill = promotion),
show.legend = FALSE) +
geom_text(aes(x = promotion,
y = n,
label = n),
vjust = 1.5,
fontface = "bold",
family = "Lato") +
scale_fill_carto_d(palette = "Safe") +
labs(title = "Days with promotion over the past 2 years",
subtitle = "The promotion was only active 20% of the time the campaign was running",
y = "days") +
coord_cartesian(clip = "off",
ylim = c(0, 790)) +
scale_y_continuous(expand = c(0, 0)) +
chewyTheme() +
theme(axis.text.x = element_text(size = rel(1.33),
face = "bold"))
How much was spent in total on ads on each platform (Google/Facebook)?
Using library(dplyr) feels similar to SQL for data wrangling. Ideally I would store this data in a way that keeps these fetches from eating up memory, but since the dataset is small we can disregard that concern here.
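For reference, the aggregation below corresponds roughly to this SQL (a hypothetical equivalent, not executed anywhere in this analysis):
## SELECT SUM(spend_facebook) AS Facebook,
##        SUM(spend_google)   AS Google
## FROM   adcampaign;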
total_spent <- adcampaign %>%
summarise(Facebook = sum(spend_facebook),
Google = sum(spend_google),
Total = sum(spend_facebook, spend_google)) %>%
pivot_longer(cols = 1:3,
names_to = "platform",
values_to = "spent")
Visualise total spending.
total_spent %>% ggplot(aes(x = platform,
y = spent)) +
geom_col(aes(fill = platform),
show.legend = FALSE) +
geom_text(aes(x = platform,
y = spent,
label = bignumber(spent)),
vjust = 1.5,
fontface = "bold",
family = "Lato") +
scale_fill_carto_d(palette = "Safe") +
coord_cartesian(clip = "off") +
scale_y_continuous(expand = c(0,0),
labels = bignumber) +
labs(title = "$ spent on during 2yr ad campaign",
y = "$usd") +
chewyTheme() +
theme(axis.text.x = element_text(size = rel(1.33),
face = "bold"),
axis.title.x = element_blank())
Now we have the total spending across the whole period, which can be useful for decision making, but it’s also important to understand how the spending evolved over time.
Some simple descriptive statistics can be visualised with a boxplot. The median is shown by the line in the middle of the box, the box spans the 1st to 3rd quartile (the interquartile range), and the whiskers mark the statistical minimum and maximum, with outliers plotted beyond them.
spending_data <- adcampaign %>%
select(spend_facebook, spend_google) %>%
#filter(spend_facebook > 0 & spend_google > 0) %>%
rename(Facebook = spend_facebook,
Google = spend_google) %>%
pivot_longer(cols = 1:2,
names_to = "platform",
values_to = "spent")
spending_palette <- c(carto_pal(name = "Safe")[1],
carto_pal(name = "Safe")[2])
spending_boxplot <- spending_data %>%
ggplot(aes(x = platform,
y = spent)) +
geom_boxplot(
outlier.shape = NA,
fill = spending_palette,
colour = spending_palette,
alpha = 0.5,
width = 0.1
) +
gghalves::geom_half_point(
## draw jitter on the left
side = "l",
## control range of jitter
range_scale = .4,
## add some transparency
alpha = 1,
color = spending_palette,
size = 0.5
) +
labs(title = "$ spent on each platform",
subtitle = "Showing how the data varies", y = "$usd") +
chewyTheme() +
theme(axis.title.y = element_blank())
spending_boxplot
Here we can easily see that a huge chunk of the data had $0 spent.
Let’s filter those days out and see what our spending looks like when we only consider days where spending was above $0.
fb_boxplot <- spending_data %>%
filter(platform == "Facebook",
spent > 0) %>%
ggplot(aes(x = platform,
y = spent)) +
geom_boxplot(
outlier.shape = NA,
fill = spending_palette[1],
colour = spending_palette[1],
alpha = 0.5,
width = 0.1
) +
gghalves::geom_half_point(
## draw jitter on the left
side = "l",
## control range of jitter
range_scale = .4,
## add some transparency
alpha = 1,
color = spending_palette[1],
size = 0.5
) +
labs(title = "$ spent on Facebook",
subtitle = "Excluding days below $0 \nspent on ads",
y = "$usd") +
chewyTheme() +
theme(axis.title.y = element_blank())
google_boxplot <- spending_data %>%
filter(platform == "Google",
spent > 0) %>%
ggplot(aes(x = platform,
y = spent)) +
geom_boxplot(
outlier.shape = NA,
fill = spending_palette[2],
colour = spending_palette[2],
alpha = 0.5,
width = 0.1
) +
gghalves::geom_half_point(
## draw jitter on the left
side = "l",
## control range of jitter
range_scale = .4,
## add some transparency
alpha = 1,
color = spending_palette[2],
size = 0.5
) +
labs(title = "$ spent on Google",
subtitle ="Excluding days below $0 \nspent on ads",
y = "$usd") +
chewyTheme() +
theme(axis.title.y = element_blank())
fb_boxplot + google_boxplot
Let’s now see how the money was spent on Google Ads and Facebook Ads over the period.
colors <- c(
"Facebook Ads" = carto_pal(name = "Safe")[11] ,
"Google Ads" = carto_pal(name = "Safe")[9],
"Facebook Smooth" = carto_pal(name = "Safe")[1],
"Google Smooth" = carto_pal(name = "Safe")[2]
)
platform_timeseries_plot <- adcampaign %>%
ggplot() +
geom_line(aes(x = date,
y = spend_facebook,
color = "Facebook Ads")) +
geom_line(aes(x = date,
y = spend_google,
color = "Google Ads")) +
geom_line(
aes(x = date,
y = spend_facebook,
color = "Facebook Smooth"),
stat = "smooth",
method = "loess",
linetype = "dashed",
alpha = 0.6,
show.legend = FALSE
) +
geom_line(
aes(x = date,
y = spend_google,
color = "Google Smooth"),
stat = "smooth",
method = "loess",
linetype = "dotdash",
alpha = 0.6,
show.legend = FALSE
) +
scale_color_manual(values = colors) +
scale_x_date(expand = c(0, 0),
date_breaks = "6 month") +
labs(title = "Dollars spent on ads during the 2yr ad campaign",
color = "",
y = "$usd") +
chewyTheme() +
theme(axis.title.x = element_blank())
suppressMessages(print(platform_timeseries_plot))
This shows that a large chunk of 2019 had no spending on ads at all.
Let’s figure out the exact dates where there was no money spent on marketing.
adcampaign_nospending <- adcampaign %>%
filter(spend_facebook == 0 & spend_google == 0) %>%
mutate(year = lubridate::year(date)) %>%
relocate(year, .after = date)
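A quick tally of those no-spend days by year (a sketch building on adcampaign_nospending; output not shown in the original):
adcampaign_nospending %>%
  group_by(year) %>%
  summarise(first_day = min(date), # earliest $0-spend day that year
            last_day  = max(date), # latest $0-spend day that year
            n_days    = n())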
Plotting the daily installs over the time period.
installs_timeseries_plot <- adcampaign %>%
ggplot() +
geom_line(aes(x = date,
y = installs),
color = carto_pal(name = "Safe")[4],
alpha = 0.9) +
geom_line(
aes(x = date,
y = installs),
color = carto_pal(name = "Safe")[1],
stat = "smooth",
method = "loess",
size = 1.5,
alpha = 0.6,
show.legend = FALSE
) +
# geom_smooth(aes(x = date,
# y = installs),
# method = "loess",
# se = FALSE) +
labs(title = "Daily installs during the 2yr ad campaign ",
y = "Installs") +
chewyTheme() +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank())
suppressMessages(print(installs_timeseries_plot / platform_timeseries_plot))
Let’s explore ideas on how to model this so we can find out:
How many users historically installed the app due to marketing executed by Twigeo.
Whether we can forecast “install volume” for the coming 4 weeks based on the following marketing budget:
Let’s start by loading the libraries we need. We can also create a time series object right away; since we don’t want to lose the “promotion” column, we’ll transform it into a numeric indicator:
“Yes” = 1, “No” = 0.
library(modeltime) # time series modeling
library(forecast) # time series forecasting
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:yardstick':
##
## accuracy
library(timetk) # time series analysis tools
library(sweep) # tidy time series data
library(ggfortify) # let ggplot2 understand timeseries "ts objects"
## Registered S3 methods overwritten by 'ggfortify':
## method from
## autoplot.Arima forecast
## autoplot.acf forecast
## autoplot.ar forecast
## autoplot.bats forecast
## autoplot.decomposed.ts forecast
## autoplot.ets forecast
## autoplot.forecast forecast
## autoplot.stl forecast
## autoplot.ts forecast
## fitted.ar forecast
## fortify.ts forecast
## residuals.ar forecast
## transform 'promotion' column into a 1/0 indicator
adcampaign_transformed <- adcampaign %>%
mutate(promotion = ifelse(promotion == "Yes", 1, 0))
## 80 / 20 split
splits <- initial_time_split(adcampaign_transformed, prop = 0.8)
## 80% Analysis data split
glimpse(training(splits))
## Rows: 632
## Columns: 5
## $ date <date> 2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-0…
## $ installs <dbl> 1667, 1889, 1462, 1218, 1721, 1696, 1528, 1246, 1248, 1…
## $ spend_facebook <dbl> 1406, 1197, 1090, 1434, 1449, 1475, 1254, 1223, 1414, 1…
## $ spend_google <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ promotion <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
trainingstart <- training(splits) %>% head(1) %>% select(date) %>% toString()
trainingend <- training(splits) %>% tail(1) %>% select(date) %>% toString()
## 20% Assessment data split
glimpse(testing(splits))
testingstart <- testing(splits) %>% head(1) %>% select(date) %>% toString()
testingend <- testing(splits) %>% tail(1) %>% select(date) %>% toString()
## create ts object from training data
adcampaign_ts <- tk_ts(training(splits),
start = trainingstart,
end = trainingend,
frequency = 1)
## check that ts object has index
has_timetk_idx(adcampaign_ts)
## [1] TRUE
I’m sorry to report that I was not successful in setting up my model within the time frame of this case. Being involved in multiple recruitment processes, work, and thesis writing, I was unable to complete the project at hand.
What I would do to finish the case and deliver the results:
Moving on…
Fit an ARIMA model with external regressors: “spend_facebook”, “spend_google”, and “promotion”.
predictors <- c("spend_facebook", "spend_google", "promotion")
fit.arima <- auto.arima(adcampaign_ts[, "installs"],
xreg = adcampaign_ts[, predictors])
has_timetk_idx(fit.arima)
## [1] FALSE
sw_tidy(fit.arima)
## # A tibble: 9 × 2
## term estimate
## <chr> <dbl>
## 1 ar1 0.947
## 2 ar2 -0.159
## 3 ar3 -0.0695
## 4 ar4 0.161
## 5 ar5 0.0272
## 6 ma1 -0.981
## 7 spend_facebook 0.381
## 8 spend_google 0.223
## 9 promotion 364.
sw_glance(fit.arima) %>% glimpse()
## Rows: 1
## Columns: 12
## $ model.desc <chr> "Regression with ARIMA(5,1,1) errors"
## $ sigma <dbl> 164.7011
## $ logLik <dbl> -4111.845
## $ AIC <dbl> 8243.691
## $ BIC <dbl> 8288.164
## $ ME <dbl> 6.976343
## $ RMSE <dbl> 163.3929
## $ MAE <dbl> 89.97099
## $ MPE <dbl> -0.4555886
## $ MAPE <dbl> 8.557262
## $ MASE <dbl> 0.9518585
## $ ACF1 <dbl> -0.002730702
sw_augment(fit.arima, timetk_idx = TRUE)
## Warning in sw_augment.Arima(fit.arima, timetk_idx = TRUE): Object has no timetk
## index. Using default index.
## # A tibble: 632 × 4
## index .actual .fitted .resid
## <dbl> <dbl> <dbl> <dbl>
## 1 17897 1667 1666. 0.767
## 2 17898 1889 1602. 287.
## 3 17899 1462 1838. -376.
## 4 17900 1218 1518. -300.
## 5 17901 1721 1223. 498.
## 6 17902 1696 1855. -159.
## 7 17903 1528 1626. -97.5
## 8 17904 1246 1431. -185.
## 9 17905 1248 1328. -79.8
## 10 17906 1150 1303. -153.
## # … with 622 more rows
Let’s plot the residuals for the ARIMA model.
sw_augment(fit.arima) %>%
ggplot(aes(x = as.Date(index, origin = "1970-01-01"), y = .resid)) +
geom_point(size = 0.5) +
geom_hline(yintercept = 0, color = "red") +
labs(title = "Residual diagnostic for ARIMA modell",
subtitle = "Predictors: spend_google, spend_facebook, promotion") +
coord_cartesian(clip = "off") +
labs(x = "date",
y = "resid") +
chewyTheme() +
theme(panel.grid.major.y = element_line(),
panel.grid.minor.y = element_line())
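Beyond eyeballing the residuals, a Ljung-Box test is a common complement; a sketch using the forecast package (not part of the original output):
checkresiduals(fit.arima) # residual plot, ACF and Ljung-Box test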
Forecast
Let’s make a forecast 4 weeks into the future.
testdata_ts <- tk_ts(testing(splits),
start = testingstart,
end = testingend,
freq = 1)
## Warning: Non-numeric columns being dropped: date
forecast_arima <- forecast(fit.arima, h = 4,
xreg = testdata_ts[, predictors])
class(forecast_arima)
## [1] "forecast"
Here are the results of the ARIMA model. The sweep table below shows the last 300 rows, covering both actual values and the forecast.
has_timetk_idx(forecast_arima)
## [1] FALSE
forecast_table <- sw_sweep(forecast_arima)
forecast_table %>% tail(300)
## # A tibble: 300 × 7
## index key value lo.80 lo.95 hi.80 hi.95
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18387 actual 2027 NA NA NA NA
## 2 18388 actual 1962 NA NA NA NA
## 3 18389 actual 1861 NA NA NA NA
## 4 18390 actual 1709 NA NA NA NA
## 5 18391 actual 1752 NA NA NA NA
## 6 18392 actual 1666 NA NA NA NA
## 7 18393 actual 1896 NA NA NA NA
## 8 18394 actual 1849 NA NA NA NA
## 9 18395 actual 1794 NA NA NA NA
## 10 18396 actual 1752 NA NA NA NA
## # … with 290 more rows
forecast_table %>%
ggplot(aes(x = index,
y = value,
color = key)) +
# 95% CI
geom_ribbon(aes(ymin = lo.95, ymax = hi.95),
fill = carto_pal(name = "DarkMint")[4],
color = NA,
size = 0) +
# 80% CI
geom_ribbon(aes(ymin = lo.80, ymax = hi.80),
fill = carto_pal(name = "DarkMint")[1],
color = NA,
size = 0,
alpha = 0.8) +
# Prediction
geom_line() +
geom_point(size = 0.3) +
chewyTheme()
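As a follow-up, the hold-out accuracy of the forecast could be checked against the assessment split (a sketch, not part of the original output):
forecast::accuracy(forecast_arima, testing(splits)$installs)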