En este documento resumiré los pasos seguidos para la generación del score generado con el Funnel L2S Model de X empresa, en el cual logramos calcular cuál es la probabilidad de que un lead o una oportunidad nueva vaya a convertir (comprar una póliza todo riesgo) y poder tomar mejores decisiones basadas en Machine Learning.
NOTA:Hay que tomar en cuenta que este score no varía con el tiempo ya que sólo tiene información referente a la persona como tal y no a su comportamiento.
El algorítmo definitivo empleado para generar los scores fue XGBoost, utilizando R (sampling, limpieza de data y modelo) y Python (implementación en el CRM).
Las librerías empleadas fueron las siguientes:
library(RPostgreSQL)
library(dplyr)
library(stringr)
library(Amelia)
library(ggplot2)
library(gridExtra)
Luego, se trajo la data del servidor a R directamente usando PostgreSQL (se omite el código por motivos de seguridad). Las consultas realizadas de una tabla compuesta por las siguientes dos categorías:
## EMITIDAS
q <- "
SELECT
DISTINCT(o.id),
random(),
o.created,
app.vehicle_body,
app.sex,
EXTRACT(year from app.date_of_birth) as year_of_birth,
app.vehicle_model,
app.vehicle_city,
app.current_situation,
app.vehicle_is_mine,
app.form,
app.already_insured_soat,
app.when_need_policy,
app.vehicle_financed,
app.vehicle_commercial_value,
app.identification,
app.vehicle_is_zero_km,
app.vehicle_has_registration,
app.client_type,
app.vehicle_service_type,
CASE WHEN app.already_insured_with_company IS NOT NULL THEN 'SI'
ELSE 'NO' END as already_insured,
m.medium,
app.vehicle_brand,
o.quoted_policies_count
FROM applications_carinsuranceapplication as app
LEFT JOIN opportunities_opportunity as o ON (app.id = o.application_object_id)
LEFT JOIN opportunities_userjourney as uj ON (o.id = uj.opportunity_id)
LEFT JOIN opportunities_userjourneystepdone as sd ON (uj.id = sd.user_journey_id)
LEFT JOIN marketing_visitor as m ON (o.marketing_id = m.id)
WHERE
sd.name IN ('issue')
AND o.created BETWEEN '2016-01-01' AND (CURRENT_DATE - INTERVAL '1 month')
AND o.fake = FALSE
AND m.medium != 'api'
AND app.form NOT IN ('default','ux31')
AND o.status != 'descartada'
ORDER BY random()
LIMIT 10000"
q <- dbSendQuery(con, q)
emitidos <- fetch(q, n = -1)
emitidos$emitido <- c(1)
emitidos <- head(emitidos,6000) #Cuántos registros máximos queremos para "1s"
## NO EMITIDAS
q <- "
SELECT
DISTINCT(o.id),
random(),
o.created,
app.vehicle_body,
app.sex,
EXTRACT(year from app.date_of_birth) as year_of_birth,
app.vehicle_model,
app.vehicle_city,
app.current_situation,
app.vehicle_is_mine,
app.form,
app.already_insured_soat,
app.when_need_policy,
app.vehicle_financed,
app.vehicle_commercial_value,
app.identification,
app.vehicle_is_zero_km,
app.vehicle_has_registration,
app.client_type,
app.vehicle_service_type,
CASE WHEN app.already_insured_with_company IS NOT NULL THEN 'SI'
ELSE 'NO' END as already_insured,
m.medium,
app.vehicle_brand,
o.quoted_policies_count
FROM applications_carinsuranceapplication as app
LEFT JOIN opportunities_opportunity as o ON (app.id = o.application_object_id)
LEFT JOIN opportunities_userjourney as uj ON (o.id = uj.opportunity_id)
LEFT JOIN opportunities_userjourneystepdone as sd ON (uj.id = sd.user_journey_id)
LEFT JOIN marketing_visitor as m ON (o.marketing_id = m.id)
WHERE
sd.name NOT IN ('issue','terms','payment','docs-physics','acquired','fin')
AND o.created BETWEEN '2016-01-01' AND (CURRENT_DATE - INTERVAL '1 month')
AND o.fake = FALSE
AND m.medium != 'api'
AND app.form NOT IN ('default','ux31')
AND o.status != 'descartada'
ORDER BY random()
LIMIT 8000"
q <- dbSendQuery(con, q)
no.emitidos <- fetch(q, n = -1)
no.emitidos <- filter(no.emitidos,!id %in% emitidos$id)
no.emitidos$emitido <- c(0)
no.emitidos <- head(no.emitidos,5000) #Cuántos registros máximos queremos para "0s"
## JOIN BOTH
sample <- rbind(emitidos,no.emitidos)
## Delete random()
sample$random <- NULL
#Reordenar
sample <- cbind(select(sample,emitido,id,created),select(sample,-emitido,-id,-created))
## EXPORT RAW
write.csv2(sample,"Data.16.raw.csv")
Ahora vemos la data que exportamos del servidor a un archivo CSV:
str(sample)
## 'data.frame': 8545 obs. of 24 variables:
## $ emitido : num 1 1 1 1 1 1 1 1 1 1 ...
## $ id : int 409930 524091 433491 479206 415992 314664 437186 551997 552896 517335 ...
## $ created : POSIXct, format: "2016-08-10 08:13:29" "2017-01-05 15:39:17" ...
## $ vehicle_body : chr "AUTOMOVIL" "AUTOMOVIL" "AUTOMOVIL" "MOTOCICLETA" ...
## $ sex : chr "M" "M" "M" "M" ...
## $ year_of_birth : num 1976 1982 1969 1983 1986 ...
## $ vehicle_model : int 2013 2017 2008 2015 2007 2010 2013 2016 2016 2017 ...
## $ vehicle_city : chr "Bogotá, Bogota D.C., Colombia" "Medellín, Antioquia, Colombia" "Bogotá, Bogota D.C., Colombia" "Cali, Valle del Cauca, Colombia" ...
## $ current_situation : chr "" "" "" "" ...
## $ vehicle_is_mine : chr "yes" NA "yes" "yes" ...
## $ form : chr "uj40" "uj40" "shorty" "shorty" ...
## $ already_insured_soat : chr "only-have-soat" NA "only-have-soat" "only-have-soat" ...
## $ when_need_policy : chr "inmediately" "inmediately" "inmediately" "inmediately" ...
## $ vehicle_financed : chr NA "yes-bank-or-financial" NA NA ...
## $ vehicle_commercial_value: num 33000000 65240000 19300000 4100000 10700000 ...
## $ identification : chr "79840127" "98694582" "79635065" "94072489" ...
## $ vehicle_is_zero_km : int 0 1 0 0 0 0 0 0 0 1 ...
## $ vehicle_has_registration: int 1 0 1 1 0 1 0 1 1 1 ...
## $ client_type : chr "natural" "natural" "natural" "natural" ...
## $ vehicle_service_type : chr "particular" "particular" "particular" "particular" ...
## $ already_insured : chr "SI" "NO" "NO" "NO" ...
## $ medium : chr "ET" "cpc" "direct" "direct" ...
## $ vehicle_brand : chr "VOLKSWAGEN" "KIA" "CHEVROLET" "SUZUKI" ...
## $ quoted_policies_count : int 10 9 10 1 13 16 1 13 10 12 ...
head(sample,1)
## emitido id created vehicle_body sex year_of_birth
## 1 1 409930 2016-08-10 08:13:29 AUTOMOVIL M 1976
## vehicle_model vehicle_city current_situation
## 1 2013 Bogotá, Bogota D.C., Colombia
## vehicle_is_mine form already_insured_soat when_need_policy
## 1 yes uj40 only-have-soat inmediately
## vehicle_financed vehicle_commercial_value identification
## 1 <NA> 3.3e+07 79840127
## vehicle_is_zero_km vehicle_has_registration client_type
## 1 0 1 natural
## vehicle_service_type already_insured medium vehicle_brand
## 1 particular SI ET VOLKSWAGEN
## quoted_policies_count
## 1 10
sample %>% group_by(emitido) %>% tally()
## # A tibble: 2 × 2
## emitido n
## <dbl> <int>
## 1 0 5000
## 2 1 3545
Ahora que tenemos ya el extracto de la data que usaremos para el modelo, preparamos los campos que requieran organizar, limpiar, mejorar…
df.train <- read.csv2('Data.16.raw.csv',stringsAsFactors=TRUE)
df.train$X <- NULL #No nos sirve
df.train$civil_status <- NULL #Mala data
df.train$email_address <- NULL #Mal predictor
df.train$domain <- NULL #Mal predictor
df.train$mobile_phone <- NULL #Mal predictor
df.train$phone <- NULL #Mal predictor
#Missing values
missmap(df.train,legend=FALSE,rank.order=TRUE)
#CIUDADES
#Fix data
df.train$vehicle_city <- sub(", Colombia","",df.train$vehicle_city)
df.train$vehicle_city <- sub(", .*","",df.train$vehicle_city)
#View data - Ciudad
fix <- as.data.frame(df.train$vehicle_city)
fix <- fix %>% group_by(Campo=fix[,1]) %>% tally(sort = TRUE) %>%
mutate(Perc=round(n/nrow(fix),2)) %>% top_n(10,n)
print(fix)
## # A tibble: 10 × 3
## Campo n Perc
## <fctr> <int> <dbl>
## 1 Bogotá 4193 0.49
## 2 Medellín 1711 0.20
## 3 Cali 933 0.11
## 4 Barranquilla 313 0.04
## 5 Bucaramanga 129 0.02
## 6 Pereira 117 0.01
## 7 Cartagena 97 0.01
## 8 Manizales 91 0.01
## 9 Ibagué 85 0.01
## 10 Armenia 52 0.01
#Reescribir las campos por nueva agrupación
df.train <- df.train %>%
mutate(vehicle_city = ifelse(vehicle_city=="Bogotá","BOG",
ifelse(vehicle_city=="Medellín","MED",
ifelse(vehicle_city=="Cali","CAL",
ifelse(vehicle_city=="Barranquilla","BAR","OTRA")))))
# NACIMIENTO - EDAD
df.train$year_of_birth[is.na(df.train$year_of_birth)] <- as.integer(format(Sys.time(), "%Y")) #NAs
df.train$edad <- as.integer(format(Sys.time(), "%Y")) - df.train$year_of_birth #Edades
df.train$year_of_birth <- NULL
df.train <- df.train %>%
mutate(edad = ifelse(edad>=80,">80",
ifelse(edad>=55,"55-79",
ifelse(edad>=40,"40-54",
ifelse(edad>=30,"30-39",
ifelse(edad>=18,"18-29",
ifelse(edad>=1,"MENOR","SIN")))))))
# MODELO DEL VEHÍCULO
fix <- as.data.frame(df.train$vehicle_model)
fix <- fix %>% group_by(Campo=fix[,1]) %>% tally(sort = TRUE) %>%
mutate(Perc=round(n/nrow(fix),2)) %>% top_n(10,n)
print(fix)
## # A tibble: 10 × 3
## Campo n Perc
## <int> <int> <dbl>
## 1 2016 1019 0.12
## 2 2015 976 0.11
## 3 2013 841 0.10
## 4 2012 829 0.10
## 5 2011 804 0.09
## 6 2017 794 0.09
## 7 2014 774 0.09
## 8 2008 460 0.05
## 9 2010 421 0.05
## 10 2009 419 0.05
#Reescribir las campos por nueva agrupación
año <- as.integer(format(Sys.Date(), "%Y"))
df.train <- df.train %>%
mutate(vehicle_model = ifelse(vehicle_model>=año,"DEL.AÑO",
ifelse(vehicle_model>=(año-1),"AÑO.PASADO",
ifelse(vehicle_model>=(año-2),"AÑO.ANTEPASADO",
ifelse(vehicle_model>=(año-5),"5.AÑOS",
ifelse(vehicle_model>=(año-10),"10.AÑOS","MAS.10.AÑOS"))))))
# MEDIUM
fix <- as.data.frame(df.train$medium)
fix <- fix %>% group_by(Campo=fix[,1]) %>% tally(sort = TRUE) %>%
mutate(Perc=round(n/nrow(fix),2)) %>% top_n(10,n)
print(fix)
## # A tibble: 12 × 3
## Campo n Perc
## <fctr> <int> <dbl>
## 1 cpc 3826 0.45
## 2 direct 1751 0.20
## 3 seo 1719 0.20
## 4 ET 837 0.10
## 5 referral 217 0.03
## 6 link-calculadora 85 0.01
## 7 autolab 15 0.00
## 8 EXACTTARGET 15 0.00
## 9 DISPLAY 10 0.00
## 10 facilidades de pago 9 0.00
## 11 widget 9 0.00
## 12 WP 9 0.00
#Reescribir las campos por nueva agrupación
df.train <- df.train %>%
mutate(medium = ifelse(medium=="cpc","SEM",
ifelse(medium=="direct","DIRECT",
ifelse(medium=="seo","SEO",
ifelse(medium=="et","ET",
ifelse(medium=="referral","REFERRAL",
ifelse(medium=="INBOXLABS","INBOXLABS","OTRO")))))))
# BODY
fix <- as.data.frame(df.train$vehicle_body)
fix <- fix %>% group_by(Campo=fix[,1]) %>% tally(sort = TRUE) %>%
mutate(Perc=round(n/nrow(fix),2)) %>% top_n(10,n)
print(fix)
## # A tibble: 10 × 3
## Campo n Perc
## <fctr> <int> <dbl>
## 1 AUTOMOVIL 5915 0.69
## 2 CAMIONETA 1465 0.17
## 3 MOTO 289 0.03
## 4 CAMIONETA PASAJ. 211 0.02
## 5 CAMPERO 211 0.02
## 6 PESADO 132 0.02
## 7 PICKUP 115 0.01
## 8 PICKUP DOBLE CAB 77 0.01
## 9 MOTOCICLETA 41 0.00
## 10 BUS 25 0.00
#Reescribir las campos por nueva agrupación
df.train <- df.train %>%
mutate(vehicle_body = ifelse(vehicle_body=="AUTOMOVIL","AUTOMOVIL",
ifelse(vehicle_body=="CAMIONETA","CAMIONETA",
ifelse(vehicle_body=="MOTO","MOTO",
ifelse(vehicle_body=="CAMPERO","CAMPERO",
ifelse(vehicle_body=="CAMIONETA PASAJ.","CAMIONETA PASAJ.",
ifelse(vehicle_body=="PICKUP","PICKUP","OTRO")))))))
# IDENTIFICACIÓN
fix <- as.data.frame(df.train$identification)
fix <- fix %>% group_by(Campo=fix[,1]) %>% tally(sort = TRUE) %>%
mutate(Perc=round(n/nrow(fix),2)) %>% top_n(5,n)
print(fix)
## # A tibble: 24 × 3
## Campo n Perc
## <fctr> <int> <dbl>
## 1 0000000001 1443 0.17
## 2 000000001 5 0.00
## 3 900720376 5 0.00
## 4 1022407594 3 0.00
## 5 1024558143 3 0.00
## 6 1026132655 3 0.00
## 7 1036644733 3 0.00
## 8 1113619217 3 0.00
## 9 14997738 3 0.00
## 10 18514751 3 0.00
## # ... with 14 more rows
#Reescribir las campos por nueva agrupación
df.train <- df.train %>% mutate(identification = ifelse(as.integer(identification)==1,0,1))
df.train$identification[is.na(df.train$identification)] <- 0
# CREATED (Día de semana)
df.train$weekday <- weekdays(as.Date(df.train$created,format='%Y-%m-%d',tz="BO"))
#Clases de los campos
for(i in c(1,1:ncol(df.train))) {
df.train[,i] <- as.factor(df.train[,i])
}
df.train[,2] <- as.integer(as.character(df.train[,2]))
## EXPORT CLEAN
write.csv2(df.train,"Data.16.clean.csv")
str(df.train)
## 'data.frame': 8545 obs. of 25 variables:
## $ emitido : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ id : int 409930 524091 433491 479206 415992 314664 437186 551997 552896 517335 ...
## $ created : Factor w/ 8363 levels "2016-01-04 12:11:00",..: 1436 6426 2383 4229 1614 86 2555 7980 8054 6049 ...
## $ vehicle_body : Factor w/ 7 levels "AUTOMOVIL","CAMIONETA",..: 1 1 1 6 1 1 5 2 2 2 ...
## $ sex : Factor w/ 3 levels "","F","M": 3 3 3 3 2 3 3 3 2 3 ...
## $ vehicle_model : Factor w/ 6 levels "10.AÑOS","5.AÑOS",..: 2 5 1 3 1 1 2 4 4 5 ...
## $ vehicle_city : Factor w/ 5 levels "BAR","BOG","CAL",..: 2 4 2 3 3 4 5 3 1 5 ...
## $ current_situation : Factor w/ 7 levels "","asociados",..: 1 1 1 1 1 5 1 1 1 1 ...
## $ vehicle_is_mine : Factor w/ 2 levels "no","yes": 2 NA 2 2 NA NA NA NA NA NA ...
## $ form : Factor w/ 5 levels "rastreator-v2",..: 3 3 2 2 3 1 2 3 3 3 ...
## $ already_insured_soat : Factor w/ 4 levels "also-i-need-soat",..: 4 NA 4 4 1 NA 4 4 2 NA ...
## $ when_need_policy : Factor w/ 5 levels "asociados","between_one_and_two_weeks",..: 5 5 5 5 5 NA 4 5 5 5 ...
## $ vehicle_financed : Factor w/ 3 levels "no-use-savings",..: NA 2 NA NA NA NA NA NA NA 2 ...
## $ vehicle_commercial_value: Factor w/ 1120 levels "410000","503000",..: 355 740 194 36 104 386 291 635 936 612 ...
## $ identification : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ vehicle_is_zero_km : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 2 ...
## $ vehicle_has_registration: Factor w/ 2 levels "0","1": 2 1 2 2 1 2 1 2 2 2 ...
## $ client_type : Factor w/ 2 levels "juridica","natural": 2 2 2 2 2 2 2 2 2 2 ...
## $ vehicle_service_type : Factor w/ 4 levels "","particular",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ already_insured : Factor w/ 2 levels "NO","SI": 2 1 1 1 1 1 1 1 1 1 ...
## $ medium : Factor w/ 6 levels "DIRECT","INBOXLABS",..: 3 5 1 1 5 5 1 5 5 1 ...
## $ vehicle_brand : Factor w/ 82 levels "ACURA","AGRALE",..: 77 46 15 70 15 46 32 15 15 15 ...
## $ quoted_policies_count : Factor w/ 30 levels "0","1","2","3",..: 11 10 11 2 14 17 2 14 11 13 ...
## $ edad : Factor w/ 5 levels "18-29","30-39",..: 3 2 3 2 2 2 3 4 1 2 ...
## $ weekday : Factor w/ 7 levels "Friday","Monday",..: 7 5 3 5 7 3 5 6 7 7 ...
Una vez tenemos la data lista para nuestro modelo, podemos entrenarlo, ajustarlo y conseguir los mejores resultados.
Las librerías empleadas son las siguientes:
library(dplyr)
library(xgboost)
library(data.table)
library(caTools)
library(pROC)
library(gridExtra)
library(caret)
library(ggplot2)
Importamos la data limpia para el modelo:
df.train <- read.csv2('Data.16.clean.csv',stringsAsFactors=TRUE,na=as.factor("NULO"))
fechas <- paste(min(as.Date(df.train$created)),"-",max(as.Date(df.train$created)))
df.train$X <- NULL
df.train$vehicle_commercial_value <- as.numeric(as.character(df.train$vehicle_commercial_value))
df.train$vehicle_brand <- NULL #Noice
Dividimos la data en train (para entrenamiento) y test (para las pruebas), con una relación de 30/70.
set.seed(1)
split <- sample.split(df.train$emitido, SplitRatio = 0.7)
train <- subset(df.train, split == TRUE) #Training
test <- subset(df.train, split == FALSE) #Testing
n <- nrow(test)
emitidos <- nrow(filter(test,emitido==1))
Una vez tengamos nuestra data segmentada, la preparamos para XGBoost:
setDT(train)
setDT(test)
#One hot encoding
labels <- train$emitido #Target train
ts_label <- test$emitido #Target test
new_tr <- model.matrix(~.+0,data = train[,-c("emitido","id","created"),with=F])
new_ts <- model.matrix(~.+0,data = test[,-c("emitido","id","created"),with=F])
#Convert data table into a matrix (xgb.DMatrix):
dtrain <- xgb.DMatrix(data = new_tr,label = labels)
dtest <- xgb.DMatrix(data = new_ts,label = ts_label)
Definimos los valores de los parámetros para iniciar el modelo:
params <- list(
booster = "gbtree", # 'gbtree' / 'gblinear'
objective = "binary:logistic", # 'binary:logistic' / 'reg:linear'
eta=0.1, #Step size shrinkage (prevents overfitting) - default=0.3
gamma=0, #Minimum loss reduction required to split
max_depth=5, #Default=6 <- Complexity (ver xgb.plot.deepness)
min_child_weight=1,
subsample=1,#Robust to noise
colsample_bytree=1 #Robust to noise
)
Hacemos cross-validation buscando la mejor iteración para este modelo. Además, podemos calcular el Accuracy del cross-validation.
xgbcv <- xgb.cv(params = params,
data = dtrain,
nrounds = 150, #n iteraciones
nfold = 5, #folds cross validation
showsd = T,
stratified = T,
print_every_n = 1, #Intervalos a mostrar
early_stopping_rounds = 20, #20
maximize = F,
prediction = F)
## [1] train-error:0.328318+0.003051 test-error:0.337179+0.003116
## Multiple eval metrics are present. Will use test_error for early stopping.
## Will train until test_error hasn't improved in 20 rounds.
##
## [2] train-error:0.324933+0.003052 test-error:0.330657+0.007450
## [3] train-error:0.325059+0.004019 test-error:0.329320+0.006796
## [4] train-error:0.323637+0.005058 test-error:0.329655+0.007128
## [5] train-error:0.319541+0.006556 test-error:0.329489+0.007270
## [6] train-error:0.318330+0.006015 test-error:0.325143+0.006348
## [7] train-error:0.316867+0.006064 test-error:0.325477+0.006745
## [8] train-error:0.315989+0.006950 test-error:0.323805+0.004943
## [9] train-error:0.315195+0.006889 test-error:0.322467+0.003769
## [10] train-error:0.313482+0.007000 test-error:0.321632+0.004167
## [11] train-error:0.311016+0.004278 test-error:0.318956+0.007306
## [12] train-error:0.309929+0.003531 test-error:0.318455+0.006756
## [13] train-error:0.308341+0.002678 test-error:0.318454+0.007146
## [14] train-error:0.306962+0.002371 test-error:0.317451+0.006749
## [15] train-error:0.305625+0.001556 test-error:0.316615+0.005501
## [16] train-error:0.304037+0.002503 test-error:0.314108+0.006027
## [17] train-error:0.303452+0.003529 test-error:0.314777+0.006410
## [18] train-error:0.300736+0.004365 test-error:0.312269+0.005136
## [19] train-error:0.298270+0.003512 test-error:0.311100+0.003550
## [20] train-error:0.298228+0.003966 test-error:0.310931+0.006340
## [21] train-error:0.298186+0.003985 test-error:0.310932+0.005289
## [22] train-error:0.294885+0.002634 test-error:0.309261+0.005259
## [23] train-error:0.293338+0.003444 test-error:0.308759+0.006530
## [24] train-error:0.292210+0.002011 test-error:0.308425+0.005800
## [25] train-error:0.291750+0.002021 test-error:0.306585+0.004609
## [26] train-error:0.290664+0.002975 test-error:0.307087+0.005814
## [27] train-error:0.288741+0.003232 test-error:0.307254+0.005729
## [28] train-error:0.288950+0.003077 test-error:0.307254+0.006076
## [29] train-error:0.287989+0.002549 test-error:0.306753+0.004600
## [30] train-error:0.284353+0.002633 test-error:0.306251+0.005797
## [31] train-error:0.283726+0.002380 test-error:0.306084+0.006337
## [32] train-error:0.282723+0.002259 test-error:0.305415+0.006219
## [33] train-error:0.282807+0.002709 test-error:0.303910+0.006515
## [34] train-error:0.281260+0.002032 test-error:0.303076+0.005742
## [35] train-error:0.280174+0.002556 test-error:0.303243+0.006801
## [36] train-error:0.279171+0.001710 test-error:0.302407+0.006016
## [37] train-error:0.279213+0.001992 test-error:0.302741+0.006443
## [38] train-error:0.278711+0.001503 test-error:0.302742+0.006695
## [39] train-error:0.277708+0.001568 test-error:0.302742+0.006307
## [40] train-error:0.277165+0.001438 test-error:0.302241+0.006365
## [41] train-error:0.275744+0.002711 test-error:0.301070+0.005205
## [42] train-error:0.275326+0.001925 test-error:0.300569+0.006023
## [43] train-error:0.274073+0.002675 test-error:0.299732+0.007171
## [44] train-error:0.273320+0.002694 test-error:0.300067+0.007636
## [45] train-error:0.272442+0.002397 test-error:0.300233+0.007363
## [46] train-error:0.272359+0.002732 test-error:0.299899+0.006605
## [47] train-error:0.271063+0.002851 test-error:0.299563+0.005723
## [48] train-error:0.270353+0.003394 test-error:0.300232+0.005335
## [49] train-error:0.269977+0.003281 test-error:0.299061+0.005179
## [50] train-error:0.267929+0.002493 test-error:0.298059+0.004471
## [51] train-error:0.267804+0.002122 test-error:0.297893+0.004735
## [52] train-error:0.267135+0.001910 test-error:0.297726+0.004854
## [53] train-error:0.266341+0.001804 test-error:0.299064+0.004574
## [54] train-error:0.265881+0.001389 test-error:0.300067+0.004839
## [55] train-error:0.265171+0.000868 test-error:0.300401+0.004398
## [56] train-error:0.264544+0.001396 test-error:0.301236+0.004398
## [57] train-error:0.264669+0.001417 test-error:0.301403+0.004934
## [58] train-error:0.264335+0.001539 test-error:0.300735+0.004995
## [59] train-error:0.263708+0.001255 test-error:0.299063+0.005889
## [60] train-error:0.263123+0.001543 test-error:0.300233+0.006504
## [61] train-error:0.261535+0.002149 test-error:0.300735+0.006459
## [62] train-error:0.261158+0.002308 test-error:0.300066+0.006143
## [63] train-error:0.261033+0.002727 test-error:0.298394+0.006197
## [64] train-error:0.260531+0.002740 test-error:0.298561+0.006589
## [65] train-error:0.260657+0.003309 test-error:0.299230+0.004740
## [66] train-error:0.259069+0.002235 test-error:0.299565+0.004869
## [67] train-error:0.258526+0.001995 test-error:0.299231+0.005509
## [68] train-error:0.258609+0.001401 test-error:0.298896+0.004154
## [69] train-error:0.257857+0.001427 test-error:0.300568+0.004969
## [70] train-error:0.257439+0.001892 test-error:0.301403+0.005667
## [71] train-error:0.256311+0.002313 test-error:0.301403+0.005764
## [72] train-error:0.255976+0.003099 test-error:0.301235+0.006014
## Stopping. Best iteration:
## [52] train-error:0.267135+0.001910 test-error:0.297726+0.004854
#The model returned lowest error @:
bestn <- xgbcv$best_iteration #cambia cada vez = 68 empleado en CRM
paste("Mejor iteración:",bestn)
## [1] "Mejor iteración: 52"
paste("CV Accuracy: ",round((1-min(xgbcv$evaluation_log$test_error_mean))*100,2),"%",sep="")
## [1] "CV Accuracy: 70.23%"
Y ahora, entrenamos nuestro modelo de pruebas y calculamos Accuracy:
xgb1 <- xgb.train(
params = params,
data = dtrain,
nrounds = bestn, #default=bestn para no hacer overfitting
watchlist = list(val=dtest,train=dtrain),
print_every_n = 1,
maximize = F,
eval_metric = "error"
)
## [1] val-error:0.305111 train-error:0.323303
## [2] val-error:0.305501 train-error:0.323805
## [3] val-error:0.307062 train-error:0.325978
## [4] val-error:0.309403 train-error:0.325644
## [5] val-error:0.309403 train-error:0.325811
## [6] val-error:0.306282 train-error:0.324139
## [7] val-error:0.300429 train-error:0.315948
## [8] val-error:0.301600 train-error:0.314276
## [9] val-error:0.297698 train-error:0.312772
## [10] val-error:0.298869 train-error:0.313273
## [11] val-error:0.293796 train-error:0.309930
## [12] val-error:0.293406 train-error:0.309930
## [13] val-error:0.289504 train-error:0.305249
## [14] val-error:0.290285 train-error:0.305583
## [15] val-error:0.287554 train-error:0.305249
## [16] val-error:0.289504 train-error:0.302407
## [17] val-error:0.288724 train-error:0.302407
## [18] val-error:0.285993 train-error:0.302240
## [19] val-error:0.283652 train-error:0.302073
## [20] val-error:0.281701 train-error:0.299064
## [21] val-error:0.281701 train-error:0.297392
## [22] val-error:0.282091 train-error:0.294216
## [23] val-error:0.282481 train-error:0.294717
## [24] val-error:0.282091 train-error:0.293882
## [25] val-error:0.279360 train-error:0.295888
## [26] val-error:0.277409 train-error:0.293213
## [27] val-error:0.277409 train-error:0.290873
## [28] val-error:0.276239 train-error:0.289702
## [29] val-error:0.273508 train-error:0.286526
## [30] val-error:0.274678 train-error:0.287028
## [31] val-error:0.273898 train-error:0.286693
## [32] val-error:0.272337 train-error:0.283852
## [33] val-error:0.271167 train-error:0.283517
## [34] val-error:0.273117 train-error:0.283016
## [35] val-error:0.273898 train-error:0.283016
## [36] val-error:0.272727 train-error:0.281846
## [37] val-error:0.271167 train-error:0.280675
## [38] val-error:0.269996 train-error:0.278001
## [39] val-error:0.271557 train-error:0.277332
## [40] val-error:0.272727 train-error:0.277499
## [41] val-error:0.275849 train-error:0.276830
## [42] val-error:0.275068 train-error:0.276162
## [43] val-error:0.274288 train-error:0.274657
## [44] val-error:0.277409 train-error:0.274490
## [45] val-error:0.278970 train-error:0.273989
## [46] val-error:0.277409 train-error:0.274490
## [47] val-error:0.277409 train-error:0.273821
## [48] val-error:0.277409 train-error:0.272986
## [49] val-error:0.276629 train-error:0.273153
## [50] val-error:0.275068 train-error:0.272986
## [51] val-error:0.275458 train-error:0.273989
## [52] val-error:0.277409 train-error:0.273989
paste("Accuracy: ",round((1-min(xgb1$evaluation_log$val_error))*100,2),"%",sep="")
## [1] "Accuracy: 73%"
Una vez satisfechos con el valor obtenido de Accuracy, podemos empezar a evaluar, predecir, estudiar los resultados y exportarlo.
result <- as.data.frame(cbind(id_opp=train$id,date=as.Date(train$created),real=train$emitido,score=predict(xgb1,dtrain)))
result$date <- as.Date(result$date,origin='1970-01-01')
head(result)
## id_opp date real score
## 1 409930 2016-08-10 1 0.5628895
## 2 524091 2017-01-05 1 0.2356761
## 3 433491 2016-09-10 1 0.7169653
## 4 415992 2016-08-17 1 0.2603933
## 5 551997 2017-02-14 1 0.5450000
## 6 552896 2017-02-15 1 0.3885509
threshold <- 0.3 #Definir threshold (0.5 es lo convencional pero depende del caso)
result <- result %>% mutate(predicción=ifelse(score>=threshold,1,0))
xgbpred <- predict(xgb1,dtest)
xgbpred <- ifelse(xgbpred>threshold,1,0)
mat <- xgb.importance(feature_names=colnames(new_tr),model=xgb1) # Importancia variables
MC <- table(test$emitido, xgbpred > threshold)
deciles <- quantile(result$score, probs = seq(0.1, 0.9, length = 9), names = TRUE)
deciles <- data.frame(cbind(
Deciles=row.names(as.data.frame(deciles)),
Threshold=as.data.frame(deciles)),row.names=NULL)
#Entonces, tenemos:
resultados <- list("Mejor iteración + ACC"=
paste(
max(xgb1$evaluation_log$iter),'<-',
round((1-min(xgb1$evaluation_log$val_error))*100,2),"%"),
"Top 10 predictores"=mat[1:10,1:2],
"Rango de fechas"=fechas,
"% Relación Emitidas"=paste(
round(emitidos/n,2),"<-",emitidos,"emitidos"),
"Matriz de Confusión @Threshold"=MC,
"Threshold empleada"=threshold,
"Accuracy (ACC) @Threshold"=round((MC[1,1]+MC[2,2])/n,4),
"% True Positives: emitida & gestionada"=MC[2,2]/emitidos,
"% True: total gestionadas"=(MC[1,2]+MC[2,2])/n,
"Curva ROC"=plot.roc(
x=result$real,
predictor=result$score,
smooth=FALSE,auc=TRUE,ci=TRUE,print.auc=TRUE,percent=TRUE,grid=TRUE),
"Deciles"=deciles)
print(resultados)
## $`Mejor iteración + ACC`
## [1] "52 <- 73 %"
##
## $`Top 10 predictores`
## Feature Gain
## 1: quoted_policies_count 0.35040705
## 2: when_need_policyinmediately 0.12192499
## 3: vehicle_commercial_value 0.07523043
## 4: formuj40 0.06704095
## 5: vehicle_has_registration 0.03987402
## 6: when_need_policyin_a_week_or_less 0.03650641
## 7: mediumSEM 0.03408752
## 8: formshorty 0.02855945
## 9: when_need_policyin_a_month_or_more 0.02371690
## 10: already_insured_soatonly-have-soat 0.02210930
##
## $`Rango de fechas`
## [1] "2016-01-04 - 2017-02-22"
##
## $`% Relación Emitidas`
## [1] "0.41 <- 1063 emitidos"
##
## $`Matriz de Confusión @Threshold`
##
## FALSE TRUE
## 0 624 876
## 1 95 968
##
## $`Threshold empleada`
## [1] 0.3
##
## $`Accuracy (ACC) @Threshold`
## [1] 0.6211
##
## $`% True Positives: emitida & gestionada`
## [1] 0.9106303
##
## $`% True: total gestionadas`
## [1] 0.7194694
##
## $`Curva ROC`
##
## Call:
## plot.roc.default(x = result$real, predictor = result$score, smooth = FALSE, auc = TRUE, ci = TRUE, print.auc = TRUE, percent = TRUE, grid = TRUE)
##
## Data: result$score in 3500 controls (result$real 0) < 2482 cases (result$real 1).
## Area under the curve: 80.87%
## 95% CI: 79.8%-81.95% (DeLong)
##
## $Deciles
## Deciles deciles
## 1 10% 0.1566591
## 2 20% 0.2528037
## 3 30% 0.3112978
## 4 40% 0.3594880
## 5 50% 0.4017697
## 6 60% 0.4435182
## 7 70% 0.5036746
## 8 80% 0.5870904
## 9 90% 0.6987775
Veamos algunos gráficos:
grid.arrange(arrangeGrob(
ggplot(mat[1:20,1:2],
aes(x=reorder(Feature,Gain),
y=Gain,
label=round(Gain,2),fill=as.numeric(Gain))) +
geom_col() + coord_flip() + xlab('') + ylab('Importancia') +
guides(fill=FALSE) + geom_text(hjust=-0.5)))
ggplot(deciles, aes(
x=Deciles,
y=deciles,
label=round(deciles*100,2),fill=as.numeric(deciles))) +
geom_col() +
xlab('Threshold por deciles') + ylab('Corte del score') +
guides(fill=FALSE) + geom_text(vjust=-1)
grid.arrange(ggplot(as.data.frame(result))+
geom_histogram(
aes(x=score),
binwidth = 0.01, fill="black", color="white", alpha=0.5) +
ylab("Contador") + xlab(''),
ggplot(filter(result,real==1)) +
geom_histogram(
aes(x=score),
binwidth = 0.01, fill="blue", color="white", alpha=0.5) +
xlab('') + ylab('Emitidos') +
xlim(0, 1) + ylim(0,120),
ggplot(filter(result,real==0)) +
geom_histogram(
aes(x=score),
binwidth = 0.01, fill="red", color="white", alpha=0.5) +
xlab('Score') + ylab('No emitidos') +
xlim(0, 1) + ylim(0,120), ncol=1)
ggplot(select(result,real,round(score,2)) %>%
group_by(Score=round(score,2),Emisión=real) %>% tally()) +
geom_bar(
aes(x=Score,y=n,fill = as.factor(Emisión)),stat="identity") +
ylab('Frecuencia') +
scale_x_continuous(breaks = round(seq(0, 1, by = 0.1),1)) +
theme(legend.position="bottom", legend.direction="horizontal", legend.title = element_blank())
xgbi <- xgb.train(params = params,data = dtrain,
nrounds = 200, #default=bestn para no hacer overfitting
watchlist = list(val=dtest,train=dtrain),print_every_n = 1,maximize = F,eval_metric = "error")
## [1] val-error:0.305111 train-error:0.323303
## [2] val-error:0.305501 train-error:0.323805
## [3] val-error:0.307062 train-error:0.325978
## [4] val-error:0.309403 train-error:0.325644
## [5] val-error:0.309403 train-error:0.325811
## [6] val-error:0.306282 train-error:0.324139
## [7] val-error:0.300429 train-error:0.315948
## [8] val-error:0.301600 train-error:0.314276
## [9] val-error:0.297698 train-error:0.312772
## [10] val-error:0.298869 train-error:0.313273
## [11] val-error:0.293796 train-error:0.309930
## [12] val-error:0.293406 train-error:0.309930
## [13] val-error:0.289504 train-error:0.305249
## [14] val-error:0.290285 train-error:0.305583
## [15] val-error:0.287554 train-error:0.305249
## [16] val-error:0.289504 train-error:0.302407
## [17] val-error:0.288724 train-error:0.302407
## [18] val-error:0.285993 train-error:0.302240
## [19] val-error:0.283652 train-error:0.302073
## [20] val-error:0.281701 train-error:0.299064
## [21] val-error:0.281701 train-error:0.297392
## [22] val-error:0.282091 train-error:0.294216
## [23] val-error:0.282481 train-error:0.294717
## [24] val-error:0.282091 train-error:0.293882
## [25] val-error:0.279360 train-error:0.295888
## [26] val-error:0.277409 train-error:0.293213
## [27] val-error:0.277409 train-error:0.290873
## [28] val-error:0.276239 train-error:0.289702
## [29] val-error:0.273508 train-error:0.286526
## [30] val-error:0.274678 train-error:0.287028
## [31] val-error:0.273898 train-error:0.286693
## [32] val-error:0.272337 train-error:0.283852
## [33] val-error:0.271167 train-error:0.283517
## [34] val-error:0.273117 train-error:0.283016
## [35] val-error:0.273898 train-error:0.283016
## [36] val-error:0.272727 train-error:0.281846
## [37] val-error:0.271167 train-error:0.280675
## [38] val-error:0.269996 train-error:0.278001
## [39] val-error:0.271557 train-error:0.277332
## [40] val-error:0.272727 train-error:0.277499
## [41] val-error:0.275849 train-error:0.276830
## [42] val-error:0.275068 train-error:0.276162
## [43] val-error:0.274288 train-error:0.274657
## [44] val-error:0.277409 train-error:0.274490
## [45] val-error:0.278970 train-error:0.273989
## [46] val-error:0.277409 train-error:0.274490
## [47] val-error:0.277409 train-error:0.273821
## [48] val-error:0.277409 train-error:0.272986
## [49] val-error:0.276629 train-error:0.273153
## [50] val-error:0.275068 train-error:0.272986
## [51] val-error:0.275458 train-error:0.273989
## [52] val-error:0.277409 train-error:0.273989
## [53] val-error:0.276239 train-error:0.272484
## [54] val-error:0.276239 train-error:0.272317
## [55] val-error:0.277799 train-error:0.272317
## [56] val-error:0.277019 train-error:0.272484
## [57] val-error:0.278190 train-error:0.271815
## [58] val-error:0.275458 train-error:0.270478
## [59] val-error:0.275849 train-error:0.269642
## [60] val-error:0.276239 train-error:0.269141
## [61] val-error:0.275458 train-error:0.268472
## [62] val-error:0.275068 train-error:0.268639
## [63] val-error:0.277409 train-error:0.268138
## [64] val-error:0.275849 train-error:0.267636
## [65] val-error:0.275458 train-error:0.268472
## [66] val-error:0.275068 train-error:0.268305
## [67] val-error:0.274288 train-error:0.268138
## [68] val-error:0.274678 train-error:0.267636
## [69] val-error:0.274288 train-error:0.266633
## [70] val-error:0.274288 train-error:0.266633
## [71] val-error:0.274678 train-error:0.265965
## [72] val-error:0.273508 train-error:0.265797
## [73] val-error:0.272727 train-error:0.265463
## [74] val-error:0.270776 train-error:0.264293
## [75] val-error:0.271557 train-error:0.263791
## [76] val-error:0.273117 train-error:0.261785
## [77] val-error:0.272727 train-error:0.261284
## [78] val-error:0.272337 train-error:0.259278
## [79] val-error:0.274288 train-error:0.257272
## [80] val-error:0.274288 train-error:0.257606
## [81] val-error:0.274288 train-error:0.256603
## [82] val-error:0.274288 train-error:0.256603
## [83] val-error:0.273117 train-error:0.256770
## [84] val-error:0.273117 train-error:0.256937
## [85] val-error:0.273898 train-error:0.255767
## [86] val-error:0.273898 train-error:0.255266
## [87] val-error:0.276239 train-error:0.253594
## [88] val-error:0.276239 train-error:0.253093
## [89] val-error:0.277409 train-error:0.253427
## [90] val-error:0.277799 train-error:0.253260
## [91] val-error:0.277409 train-error:0.253260
## [92] val-error:0.276239 train-error:0.252591
## [93] val-error:0.276629 train-error:0.252591
## [94] val-error:0.277019 train-error:0.252591
## [95] val-error:0.278190 train-error:0.252257
## [96] val-error:0.278580 train-error:0.251254
## [97] val-error:0.277799 train-error:0.251421
## [98] val-error:0.277799 train-error:0.250084
## [99] val-error:0.277019 train-error:0.249749
## [100] val-error:0.273898 train-error:0.248913
## [101] val-error:0.275458 train-error:0.249248
## [102] val-error:0.277019 train-error:0.248078
## [103] val-error:0.276239 train-error:0.248078
## [104] val-error:0.277019 train-error:0.248078
## [105] val-error:0.277019 train-error:0.247910
## [106] val-error:0.277409 train-error:0.247075
## [107] val-error:0.275849 train-error:0.246573
## [108] val-error:0.275849 train-error:0.246406
## [109] val-error:0.276239 train-error:0.246406
## [110] val-error:0.276239 train-error:0.246406
## [111] val-error:0.275458 train-error:0.245069
## [112] val-error:0.274288 train-error:0.244066
## [113] val-error:0.273898 train-error:0.243898
## [114] val-error:0.273508 train-error:0.243898
## [115] val-error:0.275068 train-error:0.244066
## [116] val-error:0.275458 train-error:0.243898
## [117] val-error:0.275849 train-error:0.243230
## [118] val-error:0.276239 train-error:0.243397
## [119] val-error:0.276239 train-error:0.243230
## [120] val-error:0.275849 train-error:0.243230
## [121] val-error:0.275068 train-error:0.243063
## [122] val-error:0.275068 train-error:0.243230
## [123] val-error:0.275849 train-error:0.241224
## [124] val-error:0.275458 train-error:0.241224
## [125] val-error:0.275068 train-error:0.239218
## [126] val-error:0.275068 train-error:0.239385
## [127] val-error:0.275068 train-error:0.239385
## [128] val-error:0.275458 train-error:0.239385
## [129] val-error:0.274678 train-error:0.238716
## [130] val-error:0.274678 train-error:0.238883
## [131] val-error:0.274288 train-error:0.238883
## [132] val-error:0.275458 train-error:0.238549
## [133] val-error:0.275849 train-error:0.237546
## [134] val-error:0.274678 train-error:0.236710
## [135] val-error:0.275068 train-error:0.235874
## [136] val-error:0.274678 train-error:0.235038
## [137] val-error:0.274678 train-error:0.235206
## [138] val-error:0.274678 train-error:0.235206
## [139] val-error:0.275458 train-error:0.234537
## [140] val-error:0.275458 train-error:0.233032
## [141] val-error:0.275068 train-error:0.232197
## [142] val-error:0.275068 train-error:0.232364
## [143] val-error:0.275068 train-error:0.231361
## [144] val-error:0.275458 train-error:0.230525
## [145] val-error:0.277799 train-error:0.229856
## [146] val-error:0.278580 train-error:0.228686
## [147] val-error:0.278190 train-error:0.228519
## [148] val-error:0.281701 train-error:0.227182
## [149] val-error:0.281311 train-error:0.225677
## [150] val-error:0.279360 train-error:0.224674
## [151] val-error:0.279750 train-error:0.224340
## [152] val-error:0.279750 train-error:0.223838
## [153] val-error:0.281311 train-error:0.223504
## [154] val-error:0.280531 train-error:0.223838
## [155] val-error:0.280531 train-error:0.224005
## [156] val-error:0.280921 train-error:0.223337
## [157] val-error:0.280531 train-error:0.224340
## [158] val-error:0.280921 train-error:0.224173
## [159] val-error:0.280140 train-error:0.221832
## [160] val-error:0.279750 train-error:0.221331
## [161] val-error:0.279750 train-error:0.220996
## [162] val-error:0.279750 train-error:0.221163
## [163] val-error:0.279360 train-error:0.219659
## [164] val-error:0.279360 train-error:0.218656
## [165] val-error:0.279360 train-error:0.217987
## [166] val-error:0.278970 train-error:0.217987
## [167] val-error:0.278970 train-error:0.217486
## [168] val-error:0.278190 train-error:0.216650
## [169] val-error:0.278580 train-error:0.216817
## [170] val-error:0.280921 train-error:0.214811
## [171] val-error:0.280921 train-error:0.214477
## [172] val-error:0.278580 train-error:0.214142
## [173] val-error:0.279360 train-error:0.214644
## [174] val-error:0.279360 train-error:0.214477
## [175] val-error:0.279360 train-error:0.214310
## [176] val-error:0.280140 train-error:0.213808
## [177] val-error:0.282481 train-error:0.214142
## [178] val-error:0.282872 train-error:0.214142
## [179] val-error:0.281701 train-error:0.212972
## [180] val-error:0.281701 train-error:0.213139
## [181] val-error:0.281311 train-error:0.213139
## [182] val-error:0.280921 train-error:0.212805
## [183] val-error:0.280921 train-error:0.212805
## [184] val-error:0.280921 train-error:0.212805
## [185] val-error:0.280140 train-error:0.212805
## [186] val-error:0.279750 train-error:0.212805
## [187] val-error:0.282091 train-error:0.211301
## [188] val-error:0.280531 train-error:0.211468
## [189] val-error:0.279360 train-error:0.209963
## [190] val-error:0.280140 train-error:0.210130
## [191] val-error:0.280531 train-error:0.209462
## [192] val-error:0.279750 train-error:0.209295
## [193] val-error:0.280921 train-error:0.208626
## [194] val-error:0.281701 train-error:0.207121
## [195] val-error:0.282091 train-error:0.206286
## [196] val-error:0.281701 train-error:0.205617
## [197] val-error:0.282091 train-error:0.205115
## [198] val-error:0.282481 train-error:0.205450
## [199] val-error:0.281311 train-error:0.204948
## [200] val-error:0.281311 train-error:0.203945
val_error <- as.data.frame(xgbi$evaluation_log$val_error)
train_error <- as.data.frame(xgbi$evaluation_log$train_error)
ggplot(val_error, aes(row(val_error))) +
geom_line(aes(y = abs(val_error),color='red')) +
geom_line(aes(y = abs(train_error),color='blue')) +
xlab('Iteraciones') + ylab('Error') +
ggtitle('Delta Train & Test Error') + guides(colour=FALSE) +
geom_vline(xintercept=bestn) #bestn used in model
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.
Exportemos ahora nuestro modelo en formato binario para luego ser implementado en nuestro CRM usando Python y XGBoost.
xgb.save(xgb1, fname="xgb1.model")
## [1] TRUE
# Chequeo si se exportó bien:
pred <- predict(xgb1,dtrain)
# Cargamos el modelo binario
xgb2 <- xgb.load("xgb1.model")
pred2 <- predict(xgb2, dtrain, ntreelimit = bestn)
# pred2 = pred ? Perfecto:
sum(abs(pred2-pred))
## [1] 0