Predict if the user will click on an ad. Using history of clicks and searches for products.
create a user profile based on search information, and store impression profile.
library(caret)
library(tidyverse)
library(tidymodels)
library(lubridate)
library(skimr)
Fn_var_imp <- function(model.fit){
varImp(model.fit)$importance %>%
as.data.frame() %>%
rownames_to_column() %>%
arrange(desc(Overall)) %>%
filter(Overall>0)}
skim(df)
Skim summary statistics
n obs: 237609
n variables: 7
-- Variable type:character -----------------------------------------------------
variable missing complete n min max empty n_unique
impression_id 0 237609 237609 32 32 0 237609
os_version 0 237609 237609 3 12 0 3
-- Variable type:numeric -------------------------------------------------------
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
app_code 0 237609 237609 249.1 135.21 0 163 213 385 522 ▅▂▃▇▂▇▁▂
is_4G 0 237609 237609 0.36 0.48 0 0 0 1 1 ▇▁▁▁▁▁▁▅
is_click 0 237609 237609 0.046 0.21 0 0 0 0 1 ▇▁▁▁▁▁▁▁
user_id 0 237609 237609 46454.53 26802.73 0 23197 46597 69684 92586 ▇▇▇▇▇▇▇▇
-- Variable type:POSIXct -------------------------------------------------------
variable missing complete n min max median n_unique
impression_time 0 237609 237609 2018-11-15 2018-12-13 2018-11-29 36461
viewlogitem <- viewlog %>%
left_join(items, by="item_id")
user_search_profile <- viewlogitem %>%
group_by(user_id) %>%
summarise(n_search = n_distinct(item_id),
n_product_type = n_distinct(product_type),
n_top_cat = n_distinct(category_1),
n_session = n_distinct(session_id),
search_to_session = n_search / n_session,
item_to_cat = n_search / n_top_cat,
first_session = yday(min(as_date(server_time))),
last_session = yday(max(as_date(server_time))),
search_period = (last_session - first_session) + 1,
avg_price = mean(item_price, na.rm=TRUE))
store_imp_profile <- df %>%
group_by(app_code) %>%
summarise(n_store_imp = n_distinct(impression_id),
n_store_user = n_distinct(user_id),
imp_to_user = n_store_imp / n_store_user,
first_store_imp = yday(min(as_date(impression_time))),
last_store_imp = yday(max(as_date(impression_time))),
imp_store_period = (last_store_imp - first_store_imp) + 1,
n_store_yday = n_distinct(yday(impression_time)))
train <- df %>%
left_join(user_search_profile, by="user_id") %>%
left_join(store_imp_profile, by="app_code") %>%
mutate(is_click = factor(if_else(is_click==1,"Yes","No")),
last_session_to_first_impression = first_store_imp - last_session,
last_session_to_last_impression = last_store_imp - last_session,
imp_to_search_period = as.numeric(imp_store_period) / as.numeric(search_period)) %>%
filter(!is.na(avg_price))
ggplot(train, aes(x=is_click)) +
geom_bar() +
geom_text(aes(label=scales::percent(..count../sum(..count..))),
stat="count",position=position_stack(),vjust=1, color="yellow")
preproc_recipe <- recipe(~ ., train) %>%
update_role(is_click, new_role = "outcome") %>%
step_mutate(week = week(impression_time),
yday = yday(impression_time),
wday = wday(impression_time)) %>%
step_dummy(os_version) %>%
step_rm(impression_id, impression_time, user_id) %>%
step_scale(all_numeric())
library(ranger)
fitControl <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
verboseIter = TRUE,
search = "random")
set.seed(1968)
rf.fit <- train(preproc_recipe,
data=train,
method="ranger",
importance="impurity",
# tuneLength = 3,
tuneGrid=expand.grid(mtry=c(15),
splitrule=c('gini'),
min.node.size=9),
metric="ROC",
trControl=fitControl)
Loading required namespace: e1071
Preparing recipe
+ Fold1: mtry=15, splitrule=gini, min.node.size=9
Growing trees.. Progress: 20%. Estimated remaining time: 2 minutes, 5 seconds.
Growing trees.. Progress: 38%. Estimated remaining time: 1 minute, 39 seconds.
Growing trees.. Progress: 57%. Estimated remaining time: 1 minute, 11 seconds.
Growing trees.. Progress: 74%. Estimated remaining time: 43 seconds.
Growing trees.. Progress: 91%. Estimated remaining time: 15 seconds.
- Fold1: mtry=15, splitrule=gini, min.node.size=9
+ Fold2: mtry=15, splitrule=gini, min.node.size=9
Growing trees.. Progress: 16%. Estimated remaining time: 2 minutes, 40 seconds.
Growing trees.. Progress: 34%. Estimated remaining time: 2 minutes, 0 seconds.
Growing trees.. Progress: 53%. Estimated remaining time: 1 minute, 22 seconds.
Growing trees.. Progress: 72%. Estimated remaining time: 49 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 17 seconds.
- Fold2: mtry=15, splitrule=gini, min.node.size=9
+ Fold3: mtry=15, splitrule=gini, min.node.size=9
Growing trees.. Progress: 16%. Estimated remaining time: 2 minutes, 40 seconds.
Growing trees.. Progress: 35%. Estimated remaining time: 1 minute, 59 seconds.
Growing trees.. Progress: 51%. Estimated remaining time: 1 minute, 32 seconds.
Growing trees.. Progress: 66%. Estimated remaining time: 1 minute, 4 seconds.
Growing trees.. Progress: 83%. Estimated remaining time: 33 seconds.
Growing trees.. Progress: 98%. Estimated remaining time: 3 seconds.
- Fold3: mtry=15, splitrule=gini, min.node.size=9
+ Fold4: mtry=15, splitrule=gini, min.node.size=9
Growing trees.. Progress: 17%. Estimated remaining time: 2 minutes, 33 seconds.
Growing trees.. Progress: 33%. Estimated remaining time: 2 minutes, 9 seconds.
Growing trees.. Progress: 50%. Estimated remaining time: 1 minute, 35 seconds.
Growing trees.. Progress: 66%. Estimated remaining time: 1 minute, 6 seconds.
Growing trees.. Progress: 83%. Estimated remaining time: 31 seconds.
- Fold4: mtry=15, splitrule=gini, min.node.size=9
+ Fold5: mtry=15, splitrule=gini, min.node.size=9
Growing trees.. Progress: 16%. Estimated remaining time: 2 minutes, 40 seconds.
Growing trees.. Progress: 33%. Estimated remaining time: 2 minutes, 8 seconds.
Growing trees.. Progress: 49%. Estimated remaining time: 1 minute, 35 seconds.
Growing trees.. Progress: 67%. Estimated remaining time: 1 minute, 0 seconds.
Growing trees.. Progress: 84%. Estimated remaining time: 29 seconds.
- Fold5: mtry=15, splitrule=gini, min.node.size=9
Aggregating results
Fitting final model on full training set
Growing trees.. Progress: 12%. Estimated remaining time: 3 minutes, 43 seconds.
Growing trees.. Progress: 25%. Estimated remaining time: 3 minutes, 9 seconds.
Growing trees.. Progress: 38%. Estimated remaining time: 2 minutes, 36 seconds.
Growing trees.. Progress: 51%. Estimated remaining time: 2 minutes, 2 seconds.
Growing trees.. Progress: 64%. Estimated remaining time: 1 minute, 28 seconds.
Growing trees.. Progress: 77%. Estimated remaining time: 55 seconds.
Growing trees.. Progress: 91%. Estimated remaining time: 20 seconds.
rf.fit
Random Forest
237606 samples
26 predictor
2 classes: 'No', 'Yes'
Recipe steps: mutate, dummy, rm, scale
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 190085, 190085, 190084, 190085, 190085
Resampling results:
ROC Sens Spec
0.7213679 0.9945622 0.0349838
Tuning parameter 'mtry' was held constant at a value of 15
Tuning parameter 'splitrule' was held constant at a value of gini
Tuning
parameter 'min.node.size' was held constant at a value of 9
# Variable importance
Fn_var_imp(rf.fit)
# Confustion metric
confusionMatrix.train(rf.fit, norm="none")
Cross-Validated (5 fold) Confusion Matrix
(entries are un-normalized aggregated counts)
Reference
Prediction No Yes
No 225511 10482
Yes 1233 380
Accuracy (average) : 0.9507
xgb.fit
eXtreme Gradient Boosting
237606 samples
26 predictor
2 classes: 'No', 'Yes'
Recipe steps: mutate, dummy, rm, scale
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 190085, 190085, 190084, 190085, 190085
Resampling results:
ROC Sens Spec
0.7349856 0.9999824 0.0005524014
Tuning parameter 'nrounds' was held constant at a value of 1000
Tuning parameter 'max_depth' was held constant at a value of 6
parameter 'colsample_bytree' was held constant at a value of 0.5
Tuning parameter 'min_child_weight' was held constant at a value of
2
Tuning parameter 'subsample' was held constant at a value of 1
test <- read_csv('data/test.csv') %>%
left_join(user_search_profile, by="user_id") %>%
left_join(store_imp_profile, by="app_code") %>%
mutate(last_session_to_first_impression = first_store_imp - last_session,
last_session_to_last_impression = last_store_imp - last_session,
imp_to_search_period = as.numeric(imp_store_period) / as.numeric(search_period)) %>%
mutate_if(is.numeric, replace_na, replace = 0) %>%
test.pred.prob <- predict(xgb.fit, test, type = "prob")
submission <- bind_cols(impression_id=test$impression_id, is_click=test.pred.prob$Yes)
write_csv(submission,'submission7.csv')