Utilizaremos três bases de dados diferentes no ajuste dos Modelos Uplift.

Esse dataset é relacionado ao artigo “A Large Scale Benchmark for Uplift Modeling” - Eustache Diemert, Artem Betlei, Christophe Renaudin; (Criteo AI Lab), Massih-Reza Amini (LIG, Grenoble INP). Disponível em: https://ailab.criteo.com/criteo-uplift-prediction-dataset/.

Os dados em questão são resultantes de vários testes incrementais, nos quais uma parte aleatória da população é controle (impedida de ser alvo da ação) e a outra é tratamento. O conjunto consiste de 25 milhões de linhas, cada uma representando um usuário com 12 variáveis (f0 a f11), um indicador de tratamento e 2 rótulos (visitas e conversões).

Descricao detalhada dos campos:

- f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11: variáveis descaracterizadas que foram observadas;
- tratamento (`treatment`): grupo de tratamento (1 = tratado, 0 = controle);
- conversão (`conversion`): se ocorreu uma conversão para este usuário (binário);
- visita (`visit`): se uma visita ocorreu para este usuário (binário);
- exposição (`exposure`): efeito do tratamento, se o usuário foi efetivamente exposto (binário).

Esses dados foram disponibilizados em Fevereiro de 2020, em: https://zenodo.org/record/3653141#.X1orrWdKhQJ.

Desafio no Kaggle com dados de uma campanha de marketing, abordado como um problema de Modelagem Uplift. Disponível em: https://www.kaggle.com/davinwijaya/customer-retention.

Tabela data3.csv

library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the Criteo uplift sample. fread() warns that the header has one name
# fewer than the data has columns and auto-names the leading index column.
data <- fread("amostra_criteo_uplift")
## Warning in fread("amostra_criteo_uplift"): Detected 16 column names but the
## data has 17 columns (i.e. invalid file). Added 1 extra default column name for
## the first column which is guessed to be row names or an index. Use setnames()
## afterwards if this guess is not correct, or fix the file write command that
## created the file to create a valid file.
# Per-column descriptive statistics (min, quartiles, mean, max) of the raw table.
summary(object = data)
##        V1               f0                f1              f2       
##  Min.   :     1   Min.   :-1.9463   Min.   :3.264   Min.   :8.272  
##  1st Qu.: 25001   1st Qu.:-1.0118   1st Qu.:3.264   1st Qu.:8.272  
##  Median : 50000   Median : 1.6254   Median :3.264   Median :8.272  
##  Mean   : 50000   Mean   : 0.6148   Mean   :3.266   Mean   :8.505  
##  3rd Qu.: 75000   3rd Qu.: 1.9920   3rd Qu.:3.264   3rd Qu.:8.748  
##  Max.   :100000   Max.   : 1.9920   Max.   :4.503   Max.   :9.337  
##        f3              f4              f5              f6         
##  Min.   :1.682   Min.   :3.507   Min.   :10.16   Min.   :-6.5997  
##  1st Qu.:3.736   1st Qu.:3.507   1st Qu.:10.16   1st Qu.: 0.9452  
##  Median :3.736   Median :3.507   Median :10.16   Median : 2.3861  
##  Mean   :3.656   Mean   :3.522   Mean   :10.20   Mean   : 1.7535  
##  3rd Qu.:3.736   3rd Qu.:3.507   3rd Qu.:10.16   3rd Qu.: 2.9817  
##  Max.   :3.736   Max.   :6.750   Max.   :16.98   Max.   : 2.9817  
##        f7                f8                f9             f10         
##  Min.   :-4.7244   Min.   :-31.747   Min.   : 9.85   Min.   :-13.037  
##  1st Qu.:-0.1667   1st Qu.: -2.716   1st Qu.: 9.85   1st Qu.: -1.861  
##  Median :-0.1667   Median :  1.108   Median : 9.85   Median : -1.861  
##  Mean   :-0.2930   Mean   : -2.045   Mean   :10.67   Mean   : -2.084  
##  3rd Qu.:-0.1667   3rd Qu.:  1.108   3rd Qu.: 9.85   3rd Qu.: -1.861  
##  Max.   :-0.1667   Max.   :  1.108   Max.   :28.10   Max.   : -1.861  
##       f11          treatment        conversion         visit        
##  Min.   :4.158   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:4.158   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :4.158   Median :1.0000   Median :0.0000   Median :0.00000  
##  Mean   :4.159   Mean   :0.8459   Mean   :0.0021   Mean   :0.04204  
##  3rd Qu.:4.158   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :4.795   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##     exposure      
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.03487  
##  3rd Qu.:0.00000  
##  Max.   :1.00000
# Drop the index column (`V1`, the name fread assigned to the unnamed leading
# column) from the data set.
# select() is namespace-qualified on purpose: MASS, which is attached later as
# a dependency of uplift, masks dplyr::select(), so the bare call fails if this
# line is re-run after that point in the session.
datan <- dplyr::select(data, -V1)

Analisando as variáveis da base de dados.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.0     ✓ stringr 1.4.0
## ✓ tidyr   1.0.2     ✓ forcats 0.5.0
## ✓ readr   1.3.1
## ── Conflicts ────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(skimr)

# First look at the data: one line per column with its type and leading values.
datan %>%
  glimpse()
## Rows: 100,000
## Columns: 16
## $ f0         <dbl> 1.9919806, 1.9919806, 1.9919806, 1.9919806, 1.9919806, 1.9…
## $ f1         <dbl> 3.263641, 3.263641, 3.263641, 3.263641, 3.263641, 3.263641…
## $ f2         <dbl> 8.272483, 8.272483, 9.003290, 8.272483, 8.821857, 8.272483…
## $ f3         <dbl> 3.735871, 3.735871, 3.735871, 3.735871, 3.735871, 3.735871…
## $ f4         <dbl> 3.506733, 3.506733, 3.506733, 3.506733, 3.506733, 3.506733…
## $ f5         <dbl> 10.16128, 10.16128, 10.16128, 10.16128, 10.16128, 10.16128…
## $ f6         <dbl> 2.98172089, 2.98172089, 2.98172089, 2.98172089, 2.98172089…
## $ f7         <dbl> -0.1666894, -0.1666894, -0.1666894, -0.1666894, -0.1666894…
## $ f8         <dbl> 1.107571, 1.107571, -7.012504, 1.107571, -1.784858, 1.1075…
## $ f9         <dbl> 9.850093, 9.850093, 9.850093, 9.850093, 11.700187, 9.85009…
## $ f10        <dbl> -1.8609, -1.8609, -1.8609, -1.8609, -1.8609, -1.8609, -1.8…
## $ f11        <dbl> 4.157648, 4.157648, 4.157648, 4.157648, 4.157648, 4.157648…
## $ treatment  <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1…
## $ conversion <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ visit      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ exposure   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Turn any character columns into factors, then print skimr's per-variable
# summary. The table is piped into skim() so the summary is labelled as piped data.
datan %>%
  mutate_if(.predicate = is.character, .funs = as.factor) %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 100000
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
f0 0 1 0.61 1.54 -1.95 -1.01 1.63 1.99 1.99 ▃▂▁▁▇
f1 0 1 3.27 0.02 3.26 3.26 3.26 3.26 4.50 ▇▁▁▁▁
f2 0 1 8.50 0.36 8.27 8.27 8.27 8.75 9.34 ▇▁▁▁▂
f3 0 1 3.66 0.24 1.68 3.74 3.74 3.74 3.74 ▁▁▁▁▇
f4 0 1 3.52 0.11 3.51 3.51 3.51 3.51 6.75 ▇▁▁▁▁
f5 0 1 10.20 0.23 10.16 10.16 10.16 10.16 16.98 ▇▁▁▁▁
f6 0 1 1.75 1.67 -6.60 0.95 2.39 2.98 2.98 ▁▁▁▂▇
f7 0 1 -0.29 0.66 -4.72 -0.17 -0.17 -0.17 -0.17 ▁▁▁▁▇
f8 0 1 -2.05 5.47 -31.75 -2.72 1.11 1.11 1.11 ▁▁▁▁▇
f9 0 1 10.67 2.29 9.85 9.85 9.85 9.85 28.10 ▇▁▁▁▁
f10 0 1 -2.08 1.36 -13.04 -1.86 -1.86 -1.86 -1.86 ▁▁▁▁▇
f11 0 1 4.16 0.02 4.16 4.16 4.16 4.16 4.80 ▇▁▁▁▁
treatment 0 1 0.85 0.36 0.00 1.00 1.00 1.00 1.00 ▂▁▁▁▇
conversion 0 1 0.00 0.05 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
visit 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
exposure 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁

Trabalhando com os dados para depois ajustar um modelo.

#instalando uplift
require(uplift)
## Loading required package: uplift
## Loading required package: RItools
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: coin
## Loading required package: survival
## Loading required package: tables
## Loading required package: penalized
## Welcome to penalized. For extended examples, see vignette("penalized").
#funcao que transforma a variavel resposta - rvtu
require(plyr)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# Build the uplift modelling frame with rvtu(): `visit` is the response and
# `treatment` (coerced to numeric) is flagged as the treatment indicator via trt().
# The returned frame holds the predictors plus the columns `ct`, `y` and `z`.
# method = "none" presumably means no re-sampling/weighting of rows — confirm
# against the uplift documentation.
# NOTE(review): f0 is not among the predictors — confirm this is intentional.
data_rvtu <- rvtu(
  visit ~ f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11 +
    trt(as.numeric(treatment)),
  data = datan,
  method = "none"
)

# Column names of the transformed frame.
names(data_rvtu)
##  [1] "f1"  "f2"  "f3"  "f4"  "f5"  "f6"  "f7"  "f8"  "f9"  "f10" "f11" "ct" 
## [13] "y"   "z"
# Univariate uplift exploration: for each predictor, tabulate by bin the group
# sizes, the mean response in each treatment arm and their difference (uplift).
explore(
  y ~ f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11 + trt(ct),
  data = data_rvtu
)
## $f1
##                      N(Treat=0) N(Treat=1) Mean Resp.(Treat=0)
## [3.2636406,3.263641]      15291      83728              0.0269
## (3.263641,4.502868]         114        867              0.1316
##                      Mean Resp.(Treat=1) Uplift
## [3.2636406,3.263641]              0.0425 0.0156
## (3.263641,4.502868]               0.2491 0.1176
## 
## $f2
##             N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1)
## [8.27,8.75]      11999      63001              0.0307              0.0506
## (8.75,9.34]       3406      21594              0.0173              0.0274
##             Uplift
## [8.27,8.75] 0.0199
## (8.75,9.34] 0.0101
## 
## $f3
##             N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1)
## [1.68,3.74]      15405      84595              0.0277              0.0446
##             Uplift
## [1.68,3.74] 0.0169
## 
## $f4
##             N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1)
## [3.51,6.75]      15405      84595              0.0277              0.0446
##             Uplift
## [3.51,6.75] 0.0169
## 
## $f5
##           N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1) Uplift
## [10.2,17]      15405      84595              0.0277              0.0446 0.0169
## 
## $f6
##              N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1)
## [-6.6,0.945]       4200      21637              0.0405              0.0910
## (0.945,2.39]       5430      21569              0.0131              0.0233
## (2.39,2.98]        5775      41389              0.0322              0.0316
##               Uplift
## [-6.6,0.945]  0.0505
## (0.945,2.39]  0.0102
## (2.39,2.98]  -0.0007
## 
## $f7
##                          N(Treat=0) N(Treat=1) Mean Resp.(Treat=0)
## [-4.724423,-0.1666894]        15259      83785              0.0277
## (-0.1666894,-0.16668936]        146        810              0.0342
##                          Mean Resp.(Treat=1)  Uplift
## [-4.724423,-0.1666894]                0.0448  0.0171
## (-0.1666894,-0.16668936]              0.0309 -0.0034
## 
## $f8
##               N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1)
## [-31.7,-2.72]       3407      24461              0.1071              0.1360
## (-2.72,1.11]       11998      60134              0.0052              0.0075
##               Uplift
## [-31.7,-2.72] 0.0288
## (-2.72,1.11]  0.0023
## 
## $f9
##             N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1)
## [9.85,28.1]      15405      84595              0.0277              0.0446
##             Uplift
## [9.85,28.1] 0.0169
## 
## $f10
##                      N(Treat=0) N(Treat=1) Mean Resp.(Treat=0)
## [-13.03661,-1.8609]       15261      83770              0.0278
## (-1.8609,-1.8608999]        144        825              0.0208
##                      Mean Resp.(Treat=1) Uplift
## [-13.03661,-1.8609]               0.0448 0.0170
## (-1.8609,-1.8608999]              0.0315 0.0107
## 
## $f11
##            N(Treat=0) N(Treat=1) Mean Resp.(Treat=0) Mean Resp.(Treat=1) Uplift
## [4.16,4.8]      15405      84595              0.0277              0.0446 0.0169
# Targeted (treated) group: share of rows with each value of y among ct == 1.
# count() is namespace-qualified because the data-frame-plus-column-name
# signature used here is plyr's; the bare call only works while plyr happens to
# be attached after dplyr and masks dplyr::count().
plyr::count(data_rvtu[data_rvtu$ct == 1, ], "y")$freq / sum(data_rvtu$ct == 1)
## [1] 0.95535197 0.04464803
# Control group: share of rows with each value of y among ct == 0.
# count() is namespace-qualified because the data-frame-plus-column-name
# signature used here is plyr's; the bare call only works while plyr happens to
# be attached after dplyr and masks dplyr::count().
plyr::count(data_rvtu[data_rvtu$ct == 0, ], "y")$freq / sum(data_rvtu$ct == 0)
## [1] 0.97228173 0.02771827

Modelagem considerando Regressão Logística

# One-sided formula for a plain logistic model: main effects of f1..f11 only.
# NOTE(review): f0 is not among the predictors — confirm this is intentional.
# TODO: test interaction terms.
logit.formula <- reformulate(termlabels = paste0("f", seq_len(11)))

# Fix the random-number seed so later random draws in this script are reproducible.
set.seed(seed = 123L)
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 3.0-2
# Inputs for glmnet: the two response columns of the rvtu frame (y and z) and
# the numeric design matrix generated from logit.formula.
logit.y <- data_rvtu[["y"]]
logit.z <- data_rvtu[["z"]]
logit.x.interactions <- model.matrix(object = logit.formula, data = data_rvtu)

# Traditional classifier: cross-validated lasso (alpha = 1) logistic regression
# with y as the response.
logit.cv.lasso.y.interactions <- cv.glmnet(
  x = logit.x.interactions,
  y = logit.y,
  family = "binomial",
  alpha = 1
)
# Plot the fitted cross-validation object.
plot(logit.cv.lasso.y.interactions)

# Uplift classifier: the same cross-validated lasso logistic regression, but
# with the transformed response z.
logit.cv.lasso.z.interactions <- cv.glmnet(
  x = logit.x.interactions,
  y = logit.z,
  family = "binomial",
  alpha = 1
)
# Plot the fitted cross-validation object.
plot(logit.cv.lasso.z.interactions)

# Non-zero coefficients of the uplift (z) model at coef()'s default penalty.
# The coefficient matrix is extracted once and kept local to this expression.
local({
  coefs <- coef(logit.cv.lasso.z.interactions)
  coefs[which(coefs != 0), ]
})
##  (Intercept)           f2           f4           f6           f9          f10 
## -2.604446556 -0.047059251  0.135638594 -0.062440901  0.106193447 -0.005877698
# Non-zero coefficients of the uplift (z) model at the penalty stored in
# lambda.min. The coefficient matrix is extracted once and kept local to this
# expression.
local({
  coefs <- coef(logit.cv.lasso.z.interactions,
                s = logit.cv.lasso.z.interactions$lambda.min)
  coefs[which(coefs != 0), ]
})
## (Intercept)          f1          f2          f6          f8          f9 
## -3.60777825 -0.22454262 -0.41308015 -0.08267585 -0.02384136  0.07569372 
##         f10         f11 
## -0.02682158  1.34202229