Problem statement
Predicting Coupon Redemption
Setup
library(plyr);library(dplyr)
library(caret)
library(tidyverse)
library(tidymodels)
library(lubridate)
library(skimr)
library(doParallel)
library(cluster)
library(factoextra)
source("functions.R")
ERD

Import
campaign <- read_csv('data/campaign_data.csv') %>%
mutate(start_date = as_date(x=start_date, format="%d/%m/%y", tz=""),
end_date = as_date(x=end_date, format="%d/%m/%y", tz="")) %>%
arrange(start_date) %>%
mutate(camp_duration = as.numeric(end_date - start_date),
camp_days_from_first_campaign = as.numeric(start_date - as_date("2012-08-12")),
dataset = ifelse(between(campaign_id,1,13) | between(campaign_id,26,30),"train","test"),
campaign_type = as_factor(campaign_type))
coupon_item_mapping <- read_csv('data/coupon_item_mapping.csv')
customer_demographics <- read_csv("data/customer_demographics.csv") %>%
mutate_at(c("age_range","marital_status","rented","family_size","no_of_children","income_bracket"), ~as.character(.)) %>%
mutate_if(is.character, ~replace(., is.na(.), 'Na'))
customer_transaction <- read_csv("data/customer_transaction_data.csv")
item <- read_csv("data/item_data.csv")
item <- recipe(~. , item) %>%
step_mutate(category = str_replace_all(category,"([[:punct:]]|\\s)","")) %>%
step_mutate(brand_orig = brand) %>%
step_other(brand, threshold = 0.005) %>%
step_mutate(cat_brand = paste(category, brand, sep="_")) %>%
step_string2factor(brand_type, category, brand) %>%
prep(item) %>%
bake(item)
train <- read_csv("data/train.csv")
test <- read_csv("data/test_QyjYwdj.csv")
profiles
customer_profile <- Fn_customer_profile(customer_transaction, "2012-08-12", "2013-07-03")
customer_profile <- customer_profile %>%
left_join(Fn_customer_tenure(customer_transaction), by="customer_id") %>%
left_join(Fn_custering(customer_profile), by="customer_id")
customer_coupon_fav <- Fn_customer_items_fav(customer_transaction, "2012-08-12", "2013-07-03")
item_profile <- Fn_item_profile(customer_transaction, "2012-08-12", "2013-07-03")
coupon <- Fn_coupon(item_profile)
coupon_items_cat_brand <- Fn_coupon_item_cat_brand(coupon)
coupon_top_items_by_spend <- Fn_coupon_top_items_by_spend(coupon)
coupon_top_items_by_cust <- Fn_coupon_top_items_by_cust(coupon)
coupon_top_items_by_visit <- Fn_coupon_top_items_by_visit(coupon)
coupon_top_items_by_price <- Fn_coupon_top_items_by_price(coupon)
Build train
train <- read_csv("data/train.csv") %>%
mutate(redemption_status = as_factor(if_else(redemption_status==1, "Yes", "No"))) %>%
#..campaign
left_join(campaign, by="campaign_id") %>%
left_join(Fn_campaign_profile(train), by="campaign_id") %>%
#..customer
left_join(customer_profile, by=c("customer_id")) %>%
left_join(customer_demographics, by="customer_id") %>%
left_join(customer_coupon_fav, by=c("customer_id", "coupon_id")) %>%
#..coupon
left_join(coupon_top_items_by_spend, by=c("coupon_id")) %>%
left_join(coupon_top_items_by_cust, by=c("coupon_id")) %>%
left_join(coupon_top_items_by_visit, by=c("coupon_id")) %>%
left_join(coupon_items_cat_brand, by=c("coupon_id")) %>%
left_join(coupon_top_items_by_price, by=c("coupon_id")) %>%
#..impute
mutate_if(is_character, ~replace(., is.na(.), 'Na')) %>%
mutate_if(is_character, ~as_factor(.)) %>%
mutate_if(is.numeric, ~replace(., is.na(.), 0))
Build test
test <- read_csv("data/test_QyjYwdj.csv") %>%
#..campaign
left_join(campaign, by="campaign_id") %>%
left_join(Fn_campaign_profile(test), by="campaign_id") %>%
#..customer
left_join(customer_profile, by=c("customer_id")) %>%
left_join(customer_demographics, by="customer_id") %>%
left_join(customer_coupon_fav, by=c("customer_id", "coupon_id")) %>%
#..coupon
left_join(coupon_top_items_by_spend, by=c("coupon_id")) %>%
left_join(coupon_top_items_by_cust, by=c("coupon_id")) %>%
left_join(coupon_top_items_by_visit, by=c("coupon_id")) %>%
left_join(coupon_items_cat_brand, by=c("coupon_id")) %>%
left_join(coupon_top_items_by_price, by=c("coupon_id")) %>%
#..impute
mutate_if(is_character, ~replace(., is.na(.), 'Na')) %>%
mutate_if(is_character, ~as_factor(.)) %>%
mutate_if(is.numeric, ~replace(., is.na(.), 0))
Outcome distribution
ggplot(train, aes(x=redemption_status)) +
geom_bar() +
geom_text(aes(label=scales::percent(..count../sum(..count..))),
stat="count",position=position_stack(),vjust=1, color="orange")

Preprocessing
train_recipe <- recipe(redemption_status ~ ., train) %>%
update_role(id, campaign_id, coupon_id, customer_id, new_role = "id variable") %>%
step_rm(start_date, end_date, dataset) %>%
step_num2factor(starts_with("cc_")) %>%
step_dummy(age_range, campaign_type, family_size, income_bracket, marital_status, no_of_children, rented, cluster, starts_with("cc_"), one_hot=TRUE) %>%
step_nzv(all_predictors()) %>%
step_center(all_numeric(),-all_predictors()) %>%
step_scale(all_numeric(),-all_predictors()) %>%
step_downsample(redemption_status, ratio = 5)
Modeling
fitControl <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
verboseIter = TRUE,
search = "random")
cl <- makeCluster(detectCores())
registerDoParallel(cl)
Modeling XGB
library(xgboost)
Attaching package: 㤼㸱xgboost㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
slice
parametersGrid <- expand.grid(eta = c(0.1),
colsample_bytree=c(1),
max_depth=c(10),
nrounds=c(500),
gamma=c(1),
min_child_weight=5,
subsample=1)
set.seed(1968)
xgb.fit <- train(train_recipe,
data=train,
method="xgbTree",
# tuneLength = 5,
tuneGrid=parametersGrid,
metric="ROC",
trControl=fitControl)
Preparing recipe
There are new levels in a factor: NA
Aggregating results
Fitting final model on full training set
There are new levels in a factor: NA
xgb.fit
eXtreme Gradient Boosting
78369 samples
137 predictor
2 classes: 'No', 'Yes'
Recipe steps: rm, num2factor, dummy, nzv, center, scale, downsample
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 62695, 62695, 62695, 62695, 62696
Resampling results:
ROC Sens Spec
0.979255 0.9570711 0.8546339
Tuning parameter 'nrounds' was held constant at a value of 500
Tuning parameter
parameter 'min_child_weight' was held constant at a value of 5
Tuning parameter
'subsample' was held constant at a value of 1
confusionMatrix(xgb.fit, norm = "none")
Cross-Validated (5 fold) Confusion Matrix
(entries are un-normalized aggregated counts)
Reference
Prediction No Yes
No 74307 106
Yes 3333 623
Accuracy (average) : 0.9561
Fn_var_imp(xgb.fit)
Modeling RF
library(ranger)
parametersGrid <- expand.grid(mtry = c(43), splitrule = c('extratrees'), min.node.size = c(6))
set.seed(1968)
rf.fit <- train(train_recipe,
data=train,
method="ranger",
importance="impurity",
# tuneLength = 10,
tuneGrid=parametersGrid,
metric="ROC",
trControl=fitControl)
rf.fit
confusionMatrix.train(rf.fit, norm="none")
Fn_var_imp(rf.fit)
Submission
test.pred.prob <- predict(xgb.fit, test, type = "prob")
submission <- bind_cols(id=test$id, redemption_status=test.pred.prob$Yes)
write_csv(submission,'sub7-xgb.csv')
---
title: "Analytics Vidhya | AmExpert 2019 – Machine Learning Hackathon - Coupon redemption"
output: html_notebook
author: Ronen Cohen - http://linkedin.com/in/ronencozen
---

### Problem statement
Predicting Coupon Redemption

### Setup
```{r message=FALSE, warning=FALSE}
library(plyr);library(dplyr)
library(caret)
library(tidyverse)
library(tidymodels)
library(lubridate)
library(skimr)
library(doParallel)
library(cluster)
library(factoextra)

source("functions.R")
```

### ERD
```{r ERD, echo=FALSE, out.width = '100%'}
knitr::include_graphics("erd.png")
```

### Import 
```{r message=FALSE, warning=FALSE}
campaign <- read_csv('data/campaign_data.csv') %>% 
  mutate(start_date = as_date(x=start_date, format="%d/%m/%y", tz=""),
         end_date = as_date(x=end_date, format="%d/%m/%y", tz="")) %>% 
         arrange(start_date) %>% 
  mutate(camp_duration = as.numeric(end_date - start_date),
         camp_days_from_first_campaign = as.numeric(start_date - as_date("2012-08-12")),
         dataset = ifelse(between(campaign_id,1,13) | between(campaign_id,26,30),"train","test"),
         campaign_type = as_factor(campaign_type))

coupon_item_mapping <- read_csv('data/coupon_item_mapping.csv')

customer_demographics <- read_csv("data/customer_demographics.csv") %>% 
  mutate_at(c("age_range","marital_status","rented","family_size","no_of_children","income_bracket"), ~as.character(.)) %>%
  mutate_if(is.character, ~replace(., is.na(.), 'Na'))

customer_transaction <- read_csv("data/customer_transaction_data.csv")

item <- read_csv("data/item_data.csv") 
item <- recipe(~. , item) %>% 
  step_mutate(category = str_replace_all(category,"([[:punct:]]|\\s)","")) %>% 
  step_mutate(brand_orig = brand) %>% 
  step_other(brand, threshold = 0.005) %>%
  step_mutate(cat_brand = paste(category, brand, sep="_")) %>% 
  step_string2factor(brand_type, category, brand) %>% 
  prep(item) %>% 
  bake(item)

train <- read_csv("data/train.csv")
test <- read_csv("data/test_QyjYwdj.csv")
```

### profiles
```{r}
customer_profile <- Fn_customer_profile(customer_transaction, "2012-08-12", "2013-07-03")
customer_profile <- customer_profile %>% 
  left_join(Fn_customer_tenure(customer_transaction), by="customer_id") %>% 
  left_join(Fn_custering(customer_profile), by="customer_id")

customer_coupon_fav <- Fn_customer_items_fav(customer_transaction, "2012-08-12", "2013-07-03")

item_profile <- Fn_item_profile(customer_transaction, "2012-08-12", "2013-07-03")
coupon <- Fn_coupon(item_profile)
coupon_items_cat_brand <- Fn_coupon_item_cat_brand(coupon)
coupon_top_items_by_spend <- Fn_coupon_top_items_by_spend(coupon)
coupon_top_items_by_cust <- Fn_coupon_top_items_by_cust(coupon)
coupon_top_items_by_visit <- Fn_coupon_top_items_by_visit(coupon)
coupon_top_items_by_price <- Fn_coupon_top_items_by_price(coupon)
```

### Build train
```{r message=FALSE}
train <- read_csv("data/train.csv") %>% 
  mutate(redemption_status = as_factor(if_else(redemption_status==1, "Yes", "No"))) %>%  
  #..campaign
  left_join(campaign, by="campaign_id") %>%
  left_join(Fn_campaign_profile(train), by="campaign_id") %>%
  #..customer
  left_join(customer_profile, by=c("customer_id")) %>% 
  left_join(customer_demographics, by="customer_id") %>%
  left_join(customer_coupon_fav, by=c("customer_id", "coupon_id")) %>%
  #..coupon
  left_join(coupon_top_items_by_spend, by=c("coupon_id")) %>% 
  left_join(coupon_top_items_by_cust, by=c("coupon_id")) %>%
  left_join(coupon_top_items_by_visit, by=c("coupon_id")) %>% 
  left_join(coupon_items_cat_brand,  by=c("coupon_id")) %>%
  left_join(coupon_top_items_by_price,  by=c("coupon_id")) %>% 
  #..impute
  mutate_if(is_character, ~replace(., is.na(.), 'Na')) %>% 
  mutate_if(is_character, ~as_factor(.)) %>% 
  mutate_if(is.numeric, ~replace(., is.na(.), 0))
```

### Build test
```{r message=FALSE}
test <- read_csv("data/test_QyjYwdj.csv") %>% 
  #..campaign
  left_join(campaign, by="campaign_id") %>%
  left_join(Fn_campaign_profile(test), by="campaign_id") %>%
  #..customer
  left_join(customer_profile, by=c("customer_id")) %>% 
  left_join(customer_demographics, by="customer_id") %>%
  left_join(customer_coupon_fav, by=c("customer_id", "coupon_id")) %>% 
  #..coupon
  left_join(coupon_top_items_by_spend, by=c("coupon_id")) %>% 
  left_join(coupon_top_items_by_cust, by=c("coupon_id")) %>%
  left_join(coupon_top_items_by_visit, by=c("coupon_id")) %>% 
  left_join(coupon_items_cat_brand,  by=c("coupon_id")) %>%
  left_join(coupon_top_items_by_price,  by=c("coupon_id")) %>% 
  #..impute
  mutate_if(is_character, ~replace(., is.na(.), 'Na')) %>% 
  mutate_if(is_character, ~as_factor(.)) %>% 
  mutate_if(is.numeric, ~replace(., is.na(.), 0))
```

### Outcome distribution
```{r}
ggplot(train, aes(x=redemption_status)) +
  geom_bar() +
  geom_text(aes(label=scales::percent(..count../sum(..count..))),
            stat="count",position=position_stack(),vjust=1, color="orange")
```
### Preprocessing
```{r}
train_recipe <- recipe(redemption_status ~ ., train) %>% 
  update_role(id, campaign_id, coupon_id, customer_id, new_role = "id variable") %>%
  step_rm(start_date, end_date, dataset) %>%
  step_num2factor(starts_with("cc_")) %>% 
  step_dummy(age_range, campaign_type, family_size, income_bracket, marital_status, no_of_children, rented, cluster, starts_with("cc_"), one_hot=TRUE) %>% 
  step_nzv(all_predictors()) %>% 
  step_center(all_numeric(),-all_predictors()) %>%
  step_scale(all_numeric(),-all_predictors()) %>% 
  step_downsample(redemption_status, ratio = 5)
```

### Modeling
```{r}
fitControl <- trainControl(method = "cv",
                           number = 5,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary,
                           verboseIter = TRUE,
                           search = "random")

cl <- makeCluster(detectCores())
registerDoParallel(cl)
```

### Modeling XGB
```{r}
library(xgboost)

parametersGrid <-  expand.grid(eta = c(0.1),
                            colsample_bytree=c(1),
                            max_depth=c(10),
                            nrounds=c(500),
                            gamma=c(1),
                            min_child_weight=5,
                            subsample=1)

set.seed(1968)

xgb.fit <- train(train_recipe,
                data=train,
                method="xgbTree", 
                # tuneLength = 5,
                tuneGrid=parametersGrid,
                metric="ROC",
                trControl=fitControl)

xgb.fit

confusionMatrix(xgb.fit, norm = "none")

Fn_var_imp(xgb.fit)
```

### Modeling RF
```{r}
library(ranger)

parametersGrid <-  expand.grid(mtry = c(43), splitrule = c('extratrees'), min.node.size = c(6))

set.seed(1968)

rf.fit <- train(train_recipe,  
                data=train, 
                method="ranger", 
                importance="impurity",
                # tuneLength = 10,
                tuneGrid=parametersGrid,
                metric="ROC",
                trControl=fitControl)

rf.fit

confusionMatrix.train(rf.fit, norm="none")

Fn_var_imp(rf.fit)
```


### Submission
```{r}
test.pred.prob <- predict(xgb.fit, test, type = "prob")
submission <- bind_cols(id=test$id, redemption_status=test.pred.prob$Yes)
write_csv(submission,'sub7-xgb.csv')

```

