library(tidyverse)
library(ggstatsplot)
library(tidymodels)
library(themis)
library(vip)
library(vetiver)
socio <- read_csv("socio_demos.csv", na = c("#NULL!", "NA")) %>% janitor::clean_names()
media <- read_csv("media_contacts.csv") %>% janitor::clean_names()
soc <- socio %>%
mutate(birthday = ymd(birthday),
number_of_children = parse_number(number_of_children),
people_in_household = parse_number(people_in_household),
number_of_children = as.factor(number_of_children),
people_in_household = as.factor(people_in_household),
year_of_birth = year(birthday),
birth_cat = case_when(year_of_birth >= 1920 & year_of_birth < 1940 ~ "1920-1939",
year_of_birth >= 1940 & year_of_birth < 1960 ~ "1940-1959",
year_of_birth >= 1960 & year_of_birth < 1980 ~ "1960-1979",
year_of_birth >= 1980 & year_of_birth < 2000 ~ "1980-1999",
year_of_birth >= 2000 ~ "2000s"), .after = birthday)
soc <- soc %>% mutate(row_num = row_number(), .before = person_id)
media <- media %>% mutate(row_num = row_number(),
purchase = as.factor(purchase),
purchase = fct_recode(purchase, "Yes" = "1", "No" = "0"),)
df <- left_join(soc, media)
df %>%
select(5, 10:22, -year_of_birth) %>%
pivot_longer(everything()) %>%
ggplot(aes(value)) +
geom_histogram() +
theme(text = element_text(size = 20)) +
labs(x = "Media") +
facet_wrap(vars(name))
df %>%
select(year_of_birth) %>%
ggplot(aes(year_of_birth)) +
theme(text = element_text(size = 20)) +
labs(x = "Yaer of birth") +
geom_histogram()
df %>%
select(5, 8:22) %>%
mutate(number_of_children = as.numeric(number_of_children),
people_in_household = as.numeric(people_in_household)) %>%
ggcorrmat(
type = "n",
ggtheme = ggstatsplot::theme_ggstatsplot(),
ggplot.component = theme(text = element_text(size = 20),
plot.subtitle = element_text(size = 20)))
df %>%
pivot_longer(tv_total:pinterest) %>%
mutate(name = fct_inorder(name), birth_cat = factor(birth_cat,
levels = c("1920-1939", "1940-1959",
"1960-1979", "1980-1999", "2000s"))) %>%
summarise(birth = sum(value, na.rm = T), .by = c(birth_cat, name)) %>%
drop_na(birth_cat) %>%
ggplot(aes(birth, birth_cat, fill = birth_cat)) +
geom_col() +
geom_text(aes(label = birth), hjust = -0.2, size = 5) +
scale_x_continuous(n.breaks = 4, expand = expansion(mult = c(0, 0.3))) +
facet_wrap(vars(name)) +
theme(text = element_text(size = 20)) +
labs(y = NULL, x = "Advertisement exposition", fill = "Year of birth:") +
guides(fill = guide_legend(reverse = TRUE))
df %>%
pivot_longer(tv_total:pinterest) %>%
filter(name != "purchase") %>%
mutate(name = fct_inorder(name),
gender = fct_relevel(gender, "male", "female")) %>%
summarise(gen_name = sum(value, na.rm = T), .by = c(gender, name)) %>%
drop_na(gender) %>%
ggplot(aes(gen_name, gender, fill = gender)) +
geom_col() +
geom_text(aes(label = gen_name), hjust = -0.2, size = 5) +
scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
facet_wrap(vars(name)) +
theme(text = element_text(size = 20)) +
labs(y = NULL, x = "Advertisement exposition", fill = "Gender:") +
guides(fill = guide_legend(reverse = TRUE))
df %>%
pivot_longer(tv_total:pinterest) %>%
filter(name != "purchase") %>%
mutate(name = fct_inorder(name)) %>%
summarise(children = sum(value, na.rm = T), .by = c(number_of_children, name)) %>%
drop_na(number_of_children) %>%
ggplot(aes(children, number_of_children, fill = number_of_children)) +
geom_col() +
geom_text(aes(label = children), hjust = -0.2, size = 5) +
scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
facet_wrap(vars(name)) +
theme(text = element_text(size = 20)) +
labs(y = NULL, x = "Advertisement exposition", fill = "Number of\nchildren:") +
guides(fill = guide_legend(reverse = TRUE))
df %>%
pivot_longer(tv_total:pinterest) %>%
mutate(name = fct_inorder(name)) %>%
filter(name != "purchase", value > 0) %>%
summarise(people = sum(value, na.rm = T), .by = c(people_in_household, name)) %>%
drop_na(people_in_household) %>%
ggplot(aes(people, people_in_household, fill = people_in_household)) +
geom_col() +
geom_text(aes(label = people), hjust = -0.2, size = 5) +
scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
facet_wrap(vars(name)) +
theme(text = element_text(size = 20)) +
labs(y = NULL, x = "Advertisement exposition", fill = "People in\nhousehold:") +
guides(fill = guide_legend(reverse = TRUE))
df %>%
select(10:23) %>%
pivot_longer(-purchase) %>%
mutate(name = fct_inorder(name)) %>%
summarise(purch = sum(value, na.rm = T), .by = c(purchase, name)) %>%
ggplot(aes(purch, purchase, fill = purchase)) +
geom_col() +
geom_text(aes(label = purch), hjust = -0.2, size = 5) +
scale_x_continuous(expand = expansion(mult = c(0, 0.3))) +
facet_wrap(vars(name)) +
theme(text = element_text(size = 20)) +
labs(y = NULL, x = "Number of purchases/no purchases", fill = "Purchase:") +
guides(fill = guide_legend(reverse = TRUE))
df <- df %>%
select(5, 7, 8:23) %>%
mutate(across(c(number_of_children, people_in_household), as.numeric)) %>%
drop_na()
# Splitting the dataset
set.seed(2019)
df_split <- initial_split(df, strata = purchase)
df_train <- training(df_split)
df_test <- testing(df_split)
# The validation set via K-fold cross validation
set.seed(2020)
folds <- vfold_cv(df_train, strata = purchase)
# Recipe
log_rec <- recipe(purchase ~ ., data = df_train) %>%
step_zv(all_numeric_predictors()) %>%
step_nzv(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_corr(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_downsample(purchase)
log_rec
# Control and metrics
model_control <- control_grid(save_pred = TRUE)
model_metrics <- metric_set(roc_auc, accuracy)
# Specify LOG REG
log_spec <-
logistic_reg(mixture = 1, penalty = tune()) %>%
set_engine("glmnet") %>%
set_mode("classification")
# Workflow
log_wf <- workflow() %>%
add_recipe(log_rec) %>%
add_model(log_spec)
# Grid
log_grid <- grid_space_filling(
penalty(),
size = 15)
# Tune
doParallel::registerDoParallel()
set.seed(1234)
log_tune <- log_wf %>%
tune_grid(folds,
metrics = model_metrics,
control = model_control,
grid = log_grid)
# Select best metric
log_best <- log_tune %>%
select_best(metric = "accuracy")
autoplot(log_tune)
log_best
# Train results
log_train_results <- log_tune %>%
filter_parameters(parameters = log_best) %>%
collect_metrics()
log_train_results
# Last fit
log_test_results <- log_wf %>%
finalize_workflow(log_best) %>%
last_fit(split = df_split, metrics = model_metrics)
log_results <- log_test_results %>% collect_metrics()
log_results
log_results %>%
ggplot(aes(.estimate, .metric, fill = .metric)) +
geom_col(show.legend = F) +
geom_text(aes(label = round(.estimate, 3)), hjust = -0.2) +
scale_x_continuous(expand = expansion(mult = c(0, 0.1))) +
labs(y = NULL, x = "Estimate",
title = "Estimated metrics of the model") +
theme(text = element_text(size = 16))
# Logistic Regression Results
log_results %>%
select(-.config, -.estimator) %>%
rename(metric = .metric,
Test_set = .estimate) %>%
arrange(desc(Test_set))
# Coefficients
log_test_results %>%
pluck(".workflow", 1) %>%
extract_fit_parsnip() %>%
tidy(exponentiate = TRUE, conf.level = 0.95) %>%
filter(estimate > 0) %>%
arrange(-estimate)
# Confusion matrix
collect_predictions(log_test_results) %>%
conf_mat(purchase, .pred_class) %>%
pluck(1) %>%
as_tibble() %>%
ggplot(aes(Prediction, Truth, alpha = n)) +
geom_tile(show.legend = FALSE) +
geom_text(aes(label = n), colour = "white", alpha = 1, size = 8) +
labs(y = "Actual result", x = "Predicted result", fill = NULL,
title = "Confusion Matrix - Logistic regression") +
theme(text = element_text(size = 16))
# ROC AUC
log_test_results %>%
collect_predictions() %>%
roc_curve(purchase, .pred_No) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_line(linewidth = 0.5, color = "midnightblue") +
geom_abline(lty = 2, color = "black", linewidth = 0.5) +
theme(text = element_text(size = 16)) +
labs(title = "Logistic Regression - ROC curve",
subtitle = paste0("AUC = ", round(log_results$.estimate[4], 3)))
# Variable importance
log_test_results %>%
pluck(".workflow", 1) %>%
extract_fit_parsnip() %>%
vip(geom = "col", num_features = 10, horiz = TRUE, aesthetics = list(size = 4)) +
scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
geom_text(aes(label = round(Importance, 3)), hjust = -0.2) +
theme(text = element_text(size = 16)) +
labs(title = "Variable Importance - Logistic Regression")
# Compare truth vs predicted in a table
v <- log_test_results %>%
extract_workflow() %>%
vetiver_model("purchase")
v
augment(v, slice_sample(df_test, n = 10))%>%
select(purchase, everything())
# Predict on new data
new_data <-
tibble(
year_of_birth = sample(1924:2007, 100, replace = T),
gender = factor(sample(c("male", "female"), 100, replace = T)),
number_of_children = sample(1:4, 100, replace = T),
people_in_household = sample(1:10, 100, replace = T),
tv_total = sample(0:113, 100, replace = T),
flyers = sample(0:3, 100, replace = T),
print_total = sample(0:23, 100, replace = T),
online_video = sample(0:18, 100, replace = T),
online_display = sample(0:13, 100, replace = T),
online_total = sample(0:24, 100, replace = T),
tik_tok = sample(0:9, 100, replace = T),
you_tube_total = sample(0:14, 100, replace = T),
you_tube_mobile = sample(0:14, 100, replace = T),
you_tube_desktop = sample(0:8, 100, replace = T),
you_tube_tablet = sample(0:12, 100, replace = T),
you_tube_ctv = sample(0:12, 100, replace = T),
pinterest = sample(0:63, 100, replace = T))
augment(v, new_data = new_data)
predict(v, new_data, type = "class")