1. Loading libraries.

library(tidyverse)   # data wrangling and plotting
library(ggstatsplot) # statistical visualisations (correlation matrix)
library(tidymodels)  # modelling framework
library(themis)      # recipe steps for class imbalance (downsampling)
library(vip)         # variable importance plots
library(vetiver)     # model versioning and deployment



2. Loading the data.

socio <- read_csv("socio_demos.csv", na = c("#NULL!", "NA")) %>% janitor::clean_names()
media <- read_csv("media_contacts.csv") %>% janitor::clean_names()



3. Data cleaning and preprocessing.

soc <- socio %>% 
  mutate(birthday = ymd(birthday),
         number_of_children = parse_number(number_of_children),
         people_in_household = parse_number(people_in_household),
         number_of_children = as.factor(number_of_children),
         people_in_household = as.factor(people_in_household),
         year_of_birth = year(birthday), 
         birth_cat = case_when(year_of_birth >= 1920 & year_of_birth < 1940 ~ "1920-1939",
                               year_of_birth >= 1940 & year_of_birth < 1960 ~ "1940-1959",
                               year_of_birth >= 1960 & year_of_birth < 1980 ~ "1960-1979",
                               year_of_birth >= 1980 & year_of_birth < 2000 ~ "1980-1999",
                               year_of_birth >= 2000 ~ "2000s"), .after = birthday)
soc <- soc %>% mutate(row_num = row_number(), .before = person_id)
media <- media %>% mutate(row_num = row_number(), 
                          purchase = as.factor(purchase),
                          purchase = fct_recode(purchase, "Yes" = "1", "No" = "0"))
df <- left_join(soc, media)



4. EDA. Normality check.

df %>% 
  select(5, 10:22, -year_of_birth) %>% 
  pivot_longer(everything()) %>% 
  ggplot(aes(value)) +
  geom_histogram() +
  theme(text = element_text(size = 20)) +
  labs(x = "Media") +
  facet_wrap(vars(name))

df %>% 
  select(year_of_birth) %>% 
  ggplot(aes(year_of_birth)) +
  theme(text = element_text(size = 20)) +
  labs(x = "Yaer of birth") +
  geom_histogram()

4.1 Correlations.

df %>% 
  select(5, 8:22) %>% 
  mutate(number_of_children = as.numeric(as.character(number_of_children)), 
         people_in_household = as.numeric(as.character(people_in_household))) %>% 
  ggcorrmat(
  type = "n",
  ggtheme = ggstatsplot::theme_ggstatsplot(),
  ggplot.component = theme(text = element_text(size = 20),
                           plot.subtitle = element_text(size = 20)))


Conclusion: Year of birth correlates most positively with all the YouTube channels. Its strongest negative correlation is with the flyers variable, and a weaker one with print total. Number of children and people in household show a similar pattern: both are positively correlated with the YouTube channels and negatively correlated with TV media.



4.2 Which age groups generated the highest number of impressions across all channels?

df %>% 
  pivot_longer(tv_total:pinterest) %>% 
  mutate(name = fct_inorder(name), birth_cat = factor(birth_cat, 
                                                      levels = c("1920-1939", "1940-1959", 
                                                                 "1960-1979", "1980-1999", "2000s"))) %>% 
  summarise(birth = sum(value, na.rm = T), .by = c(birth_cat, name)) %>%
  drop_na(birth_cat) %>%
  ggplot(aes(birth, birth_cat, fill = birth_cat)) +
  geom_col() +
  geom_text(aes(label = birth), hjust = -0.2, size = 5) +
  scale_x_continuous(n.breaks = 4, expand = expansion(mult = c(0, 0.3))) +
  facet_wrap(vars(name)) +
  theme(text = element_text(size = 20)) +
  labs(y = NULL, x = "Advertisement exposition", fill = "Year of birth:") +
  guides(fill = guide_legend(reverse = TRUE))


Conclusion: The age group born between 1960 and 1979 generated the highest number of impressions across almost all channels, followed by the groups born between 1940 and 1959 and between 1980 and 1999.



4.3 Which media channels had the greatest impact on driving purchases?

df %>% 
  select(10:23) %>% 
  pivot_longer(-purchase) %>% 
  mutate(name = fct_inorder(name)) %>% 
  summarise(purch = sum(value, na.rm = T), .by = c(purchase, name)) %>%
  ggplot(aes(purch, purchase, fill = purchase)) +
  geom_col() +
  geom_text(aes(label = purch), hjust = -0.2, size = 5) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
  facet_wrap(vars(name)) +
  theme(text = element_text(size = 20)) +
  labs(y = NULL, x = "Number of purchases/no purchases", fill = "Purchase:") +
  guides(fill = guide_legend(reverse = TRUE))


5. Modelling (Machine Learning).

In both models, the target variable was purchase and the predictors were all media channels plus gender, number of people in the household, year of birth, and number of children from the socio-demographic variables.

df <- df %>% 
  select(5, 7, 8:23) %>% 
  mutate(across(c(number_of_children, people_in_household),
                ~ as.numeric(as.character(.x)))) %>% 
  drop_na()

# Splitting the dataset
set.seed(2019)
df_split <- initial_split(df, strata = purchase)
df_train <- training(df_split)
df_test <- testing(df_split)

# The validation set via K-fold cross validation
set.seed(2020)
folds <- vfold_cv(df_train, strata = purchase)

# Recipe
log_rec <- recipe(purchase ~ ., data = df_train) %>%
  step_zv(all_numeric_predictors()) %>%        # drop zero-variance predictors
  step_nzv(all_numeric_predictors()) %>%       # drop near-zero-variance predictors
  step_normalize(all_numeric_predictors()) %>% # centre and scale
  step_corr(all_numeric_predictors()) %>%      # drop highly correlated predictors
  step_dummy(all_nominal_predictors()) %>%     # dummy-encode factor predictors
  step_downsample(purchase)                    # balance the classes (themis)
log_rec

# Control and metrics
model_control <- control_grid(save_pred = TRUE)
model_metrics <- metric_set(roc_auc, accuracy)

# Specify LOG REG
log_spec <- 
    logistic_reg(mixture = 1, penalty = tune()) %>%  # lasso-penalised logistic regression
    set_engine("glmnet") %>% 
    set_mode("classification")

# Workflow
log_wf <- workflow() %>%
    add_recipe(log_rec) %>%
    add_model(log_spec)

# Grid
log_grid <- grid_space_filling(
    penalty(),
    size = 15)

# Tune
doParallel::registerDoParallel()
set.seed(1234)
log_tune <- log_wf %>%
    tune_grid(folds,
              metrics = model_metrics,
              control = model_control,
              grid = log_grid)

# Select best metric
log_best <- log_tune %>% 
  select_best(metric = "accuracy")
autoplot(log_tune)

log_best

# Train results
log_train_results <- log_tune %>%
    filter_parameters(parameters = log_best) %>%
    collect_metrics()
log_train_results

# Last fit
log_test_results <- log_wf %>% 
    finalize_workflow(log_best) %>%
    last_fit(split = df_split, metrics = model_metrics)

log_results <- log_test_results %>% collect_metrics()
log_results

log_results %>% 
  ggplot(aes(.estimate, .metric, fill = .metric)) +
  geom_col(show.legend = F) +
  geom_text(aes(label = round(.estimate, 3)), hjust = -0.2) +
  scale_x_continuous(expand = expansion(mult = c(0, 0.1))) +
    labs(y = NULL, x = "Estimate",
        title = "Estimated metrics of the model") +
    theme(text = element_text(size = 16))

# Logistic Regression Results
log_results %>%
    select(-.config, -.estimator) %>%
    rename(metric = .metric,
           Test_set = .estimate) %>% 
    arrange(desc(Test_set))

# Coefficients (positive estimates increase the odds of purchase)
log_test_results %>% 
    pluck(".workflow", 1) %>%
    extract_fit_parsnip() %>% 
    tidy() %>%                 # glmnet coefficients at the selected penalty
    filter(estimate > 0) %>%   # keep positively associated predictors
    arrange(-estimate)

# Confusion matrix
collect_predictions(log_test_results) %>%
    conf_mat(purchase, .pred_class) %>%
    pluck(1) %>%
    as_tibble() %>%
    ggplot(aes(Prediction, Truth, alpha = n)) +
    geom_tile(show.legend = FALSE) +
    geom_text(aes(label = n), colour = "white", alpha = 1, size = 8) +
    labs(y = "Actual result", x = "Predicted result", fill = NULL,
        title = "Confusion Matrix - Logistic regression") +
    theme(text = element_text(size = 16))

# ROC AUC
log_test_results %>%
    collect_predictions() %>%
    roc_curve(purchase, .pred_No) %>%
    ggplot(aes(x = 1 - specificity, y = sensitivity)) +
    geom_line(linewidth = 0.5, color = "midnightblue") +
    geom_abline(lty = 2, color = "black", linewidth = 0.5) +
  theme(text = element_text(size = 16)) +
    labs(title = "Logistic Regression - ROC curve", 
         subtitle = paste0("AUC = ", round(log_results$.estimate[4], 3)))

# Variable importance
log_test_results %>% 
    pluck(".workflow", 1) %>%
    extract_fit_parsnip() %>% 
    vip(geom = "col", num_features = 10, horiz = TRUE, aesthetics = list(size = 4)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  geom_text(aes(label = round(Importance, 3)), hjust = -0.2) +
  theme(text = element_text(size = 16)) +
    labs(title = "Variable Importance - Logistic Regression")

# Compare truth vs predicted in a table
v <- log_test_results %>%
    extract_workflow() %>%
    vetiver_model("purchase")
v
augment(v, slice_sample(df_test, n = 10)) %>% 
  select(purchase, everything())

# Predict on new data
new_data <-
  tibble(
    year_of_birth = sample(1924:2007, 100, replace = T),
    gender = factor(sample(c("male", "female"), 100, replace = T)),
    number_of_children = sample(1:4, 100, replace = T),
    people_in_household = sample(1:10, 100, replace = T),
    tv_total = sample(0:113, 100, replace = T),
    flyers = sample(0:3, 100, replace = T),
    print_total = sample(0:23, 100, replace = T),
    online_video = sample(0:18, 100, replace = T),
    online_display = sample(0:13, 100, replace = T),
    online_total = sample(0:24, 100, replace = T),
    tik_tok = sample(0:9, 100, replace = T),
    you_tube_total = sample(0:14, 100, replace = T),
    you_tube_mobile = sample(0:14, 100, replace = T),
    you_tube_desktop = sample(0:8, 100, replace = T),
    you_tube_tablet = sample(0:12, 100, replace = T),
    you_tube_ctv = sample(0:12, 100, replace = T),
    pinterest = sample(0:63, 100, replace = T))

augment(v, new_data = new_data)

predict(v, new_data, type = "class")


6. Results.

6.1 Logistic regression.



Conclusion: Accuracy is the proportion of observations that are predicted correctly, reported as a value between 0 and 1. An accuracy of 0 means the classifier always predicts the wrong label, whereas an accuracy of 1 means it always predicts the correct one; in other words, the higher the estimate, the better. In our case, accuracy is almost 68%, meaning the model is not great but still more reliable than choosing at random.
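To double-check the reported value, the accuracy can be recomputed directly from the collected test-set predictions with yardstick (a minimal sketch, reusing the log_test_results object created above):

# Recompute accuracy from the test-set predictions
log_test_results %>%
  collect_predictions() %>%
  accuracy(truth = purchase, estimate = .pred_class)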



Conclusion: A confusion matrix is a simple graph that shows how well a classification model performs by comparing its predictions to the actual results. In this case, the model identifies non-purchasing clients more accurately than purchasing ones.
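The imbalance in class-wise errors can be quantified with summary(), which derives sensitivity, specificity, and related metrics from the same confusion matrix (a small sketch, again reusing the objects above):

# Class-wise metrics derived from the confusion matrix
collect_predictions(log_test_results) %>%
  conf_mat(purchase, .pred_class) %>%
  summary()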



Conclusion: ROC AUC stands for Receiver Operating Characteristic Area Under the Curve. It is a single number that summarizes how well the classifier distinguishes the positive and negative classes, and it can take values from 0 to 1. A perfect model has an AUC of 1 and a random model an AUC of 0.5, so a higher score indicates better performance. The AUC obtained here shows that this model is considerably more successful at predicting purchases than relying on chance.
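The AUC shown in the plot subtitle can be reproduced directly from the predictions; .pred_No is passed because "No" is the first factor level and therefore yardstick's default event level (a minimal sketch):

# Recompute the AUC from the test-set predictions
log_test_results %>%
  collect_predictions() %>%
  roc_auc(purchase, .pred_No)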



Conclusion: The most important variable is the total TV media channel, which is where the advertising effort should be focused.


6.2 Decision trees.

The results from the second ML model to a large extent repeat those from the previous one, so I will skip commenting on the corresponding graphs. A sketch of how such a model could be fitted is shown below.
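Since the decision-tree code is not reproduced here, this is only a minimal sketch of how such a model could be fitted within the same tidymodels setup, reusing df_split, df_train, folds, model_metrics, and model_control from above; the rpart engine, rpart.plot for the diagram, and all object names (tree_spec, tree_wf, tree_tune, tree_final) are illustrative assumptions:

# Specify a decision tree (rpart engine) with tunable hyperparameters
tree_spec <- decision_tree(cost_complexity = tune(), tree_depth = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")

tree_wf <- workflow() %>%
  add_formula(purchase ~ .) %>%
  add_model(tree_spec)

# Tune on the same cross-validation folds
set.seed(1234)
tree_tune <- tune_grid(tree_wf, folds, grid = 15,
                       metrics = model_metrics, control = model_control)

# Finalize with the best parameters and evaluate on the test set
tree_final <- tree_wf %>%
  finalize_workflow(select_best(tree_tune, metric = "accuracy")) %>%
  last_fit(split = df_split, metrics = model_metrics)

# Draw the fitted tree
tree_final %>%
  extract_workflow() %>%
  extract_fit_engine() %>%
  rpart.plot::rpart.plot(roundint = FALSE)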







Conclusion: The decision tree shows that some of the variables are very good predictors of whether a client will purchase a given product. For example, a purchase is almost certain (90%) when tv_total is >= 4.5 and you_tube_total is >= 0.5; this covers 11% of the whole sample of 15836 observations. For another 8% of the sample a purchase is also almost certain (84%) when print_total is >= 0.5. On the other hand, looking at the leftmost part of the tree, for 17% of people there is a 30% certainty that they will not purchase a product when tv_total is < 4.5, pinterest is < 2.5, you_tube_total is < 0.5, year_of_birth is < 1964, and flyers is < 0.5.
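The splits quoted above can also be read off programmatically with rpart.plot::rpart.rules, which prints one rule per leaf together with its fitted probability and coverage (assuming the illustrative tree_final object from the sketch in 6.2):

# Print the decision rules behind the quoted splits
tree_final %>%
  extract_workflow() %>%
  extract_fit_engine() %>%
  rpart.plot::rpart.rules(roundint = FALSE)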

Summary and recommendations: TV channels had the greatest impact on generating purchases. The advertising effort should be focused on people born between 1940 and 1979, somewhat more on males than on females, although both groups had almost equal media consumption. It should also target households with no children or at most one child, and households with two, one, or at most three family members. Finally, as already mentioned, the greatest effort should go first to TV channels, then to Pinterest, and finally to print media.