Se presenta información de los clientes de manera individual, sin tener en cuenta el tipo de licencia que poseen. Se han calculado los siguientes atributos:
Se filtran los clientes que tienen menos de 12 meses en la empresa y que no hayan realizado al menos una renovacion de servicio.
# Se calcula el tiempo entre facturaciones en meses
client_info_elapsed_time=invoice_data %>% group_by(client_id) %>% summarise(firstdate=min(timestamp),lastdate=max(timestamp)) %>% mutate(elapsed_time=(lastdate-firstdate)/60/60/24/30) %>% select(client_id,elapsed_time)
# Aquellos clientes con una operacion se los considera con un mes como tiempo minimo de permanencia
client_info_elapsed_time$elapsed_time=ifelse(client_info_elapsed_time$elapsed_time<1,1,client_info_elapsed_time$elapsed_time)
# Se calcula el numero de operaciones que se facturaron
client_info_operations=invoice_data %>% group_by(client_id) %>%
summarise(total_operation_count_per_client=n_distinct(invoice_id))
# Numero de veces que un cliente realizo un increase o upgrade
number_of_increase_server=invoice_data %>% filter(invoice_item_kind=='increase'& invoice_item_article=='server') %>%
group_by(client_id) %>%
summarise(n_server_increase=n())
number_of_renew_server=invoice_data %>% filter(invoice_item_kind=='renew'& invoice_item_article=='server') %>%
group_by(client_id) %>%
summarise(n_server_renew=n())
number_of_renew_sp=invoice_data %>% filter(invoice_item_kind=='renew'& invoice_item_article=='supercache') %>%
group_by(client_id) %>%
summarise(n_sp_renew=n())
number_of_increase_sp=invoice_data %>% filter(invoice_item_kind=='increase'& invoice_item_article=='supercache') %>%
group_by(client_id) %>%
summarise(n_sp_increase=n())
number_of_remove=invoice_data %>% filter(invoice_item_kind=='remove') %>%
group_by(client_id) %>%
summarise(n_remove=n())
server_type_by_client=invoice_data %>% group_by(client_id,server_id) %>% select(client_id,server_number_of_clients) %>% ungroup() %>% distinct() %>% select(client_id,server_id,server_number_of_clients)
# Numero total de clientes por cliente
server_type_by_client=server_type_by_client %>% filter(!is.na(server_id))
total_number_of_clients=server_type_by_client %>% group_by(client_id) %>% summarise(total_number_of_clients=sum(server_number_of_clients))
# number total de clientes por licencia
server_type_by_client=reshape2::dcast(server_type_by_client,client_id~server_number_of_clients)
#server_type_by_client=server_type_by_client[,1:ncol(server_type_by_client)-1]
number_of_servers_per_client = invoice_data %>% group_by(client_id) %>% summarise(n_servers=n_distinct(server_id,na.rm = TRUE))
# Extrayendo informacion de facturacion por cliente
client_invoice_amount= invoice_data %>% group_by(client_id) %>%
summarise(total_invoice_amount=sum(invoice_item_amount)/1000)
# Extrayendo informacion personal del cliente
client_personal_info = invoice_data %>%
select(client_id,client_name,client_company_name,client_country_name,client_geo_provice) %>%
distinct()
client_info=
full_join(client_info_operations,client_info_elapsed_time,by="client_id") %>%
full_join(client_invoice_amount,by="client_id") %>%
full_join(client_personal_info,by="client_id") %>%
full_join(number_of_increase_server,by="client_id") %>%
full_join(number_of_renew_server,by="client_id") %>%
full_join(number_of_increase_sp,by="client_id") %>%
full_join(number_of_renew_sp,by="client_id") %>%
full_join(number_of_servers_per_client,by="client_id") %>%
full_join(number_of_remove,by="client_id") %>%
full_join(total_number_of_clients,by="client_id") %>%
full_join(server_type_by_client,by="client_id")
client_info = client_info %>% mutate( n_server_increase = replace(n_server_increase, is.na(n_server_increase), 0)) %>%
mutate( n_server_renew = replace(n_server_renew, is.na(n_server_renew), 0)) %>%
mutate( n_sp_increase = replace(n_sp_increase, is.na(n_sp_increase), 0)) %>%
mutate( n_sp_renew = replace(n_sp_renew, is.na(n_sp_renew), 0)) %>%
mutate( n_remove = replace(n_remove, is.na(n_remove), 0))
# Se calcula la taza de renew de licencias de servidores por cliente
client_info=client_info %>% mutate(renew_server_rate=(n_server_renew+1)/((as.integer(elapsed_time/12)*n_servers)+2))
# Se calcula la taza de renew de licencias sp por cliente
client_info=client_info %>% mutate(renew_sp_rate=(n_sp_renew+1)/(as.integer(elapsed_time/12)+2))
# Se calcula promedio mensual de facturacion por cliente
client_info$avg_monthly_invoice=(client_info$total_invoice_amount/client_info$elapsed_time)
client_info=client_info %>% rowwise() %>% mutate(n_top_operations=sum(n_server_increase,n_server_renew,n_sp_increase,n_sp_renew,na.rm=T))
# Solo consideramos aquellos que han estado por lo menos 12 meses y hayan hecho al menos un renew
client_info=client_info %>% filter(elapsed_time >12 & (n_sp_renew >=1 | n_server_renew>=1))
# & avg_monthly_invoice <=0.22 & n_server_increase<=1)
client_info$client_id=factor(client_info$client_id)
library(DT)
datatable(client_info[,-c(5,6,7,8)], filter="bottom", options = list(
search = list(regex = TRUE, caseInsensitive = FALSE),
pageLength = 5
), class = 'cell-border stripe', extensions = 'Buttons'
)
El siguiente grafico muestra la relación entre el tiempo transcurrido en la empresa y el monto total de facturación por cada cliente. Cada punto representa un cliente. Ademas, el color indica el promedio de facturacion mensual de cada cliente y el tamaño, la cantidad de operaciones de alta importancia (server o superache increase o renew) . Finalmente, la recta de color cyan, muestra la tendencia observada.
gg=ggplot(client_info,aes(x=elapsed_time,y=total_invoice_amount))+
geom_jitter(aes(colour=avg_monthly_invoice,size=n_top_operations,
text = paste0("client_name:", client_name, "<BR>number_of_users:",total_number_of_clients,
"<BR>n_servers:",n_servers,"<BR>n_server_increase:",n_server_increase,"<BR>n_top_operations:",n_top_operations,"<BR>n_total_operations:",total_operation_count_per_client)),alpha=0.8)+
scale_x_continuous('Tiempo de permanencia en la empresa [meses]', expand=c(0.01,0.3 ) )+ylab("monto total de facturacion [Ku$S]")+
labs(colour = 'Facturacion promedio mensual', size = 'numero de renovaciones y upgrades (increase|renew)')+
scale_color_gradient2(low = "cyan", mid='blue',high = "red")+
ggtitle("Información por cliente (sin considerar tipo de server)")+
theme(plot.title = element_text(lineheight=3.5, face="bold"))+
scale_size(range = c(1, 10))+
geom_smooth(method='lm',colour='skyblue',se = F)
# facet_wrap(~client_country_name)
#gg
ggplotly(gg,layerData=1)
El indicador de facturacion mensual parece ser un buen discriminador, se pueden diferenciar al menos 4 tipos de clientes sin importar el tiempo transcurrido.
Por otro lado dado que los casos que superan los 10K de facturacion total y un 0.22K de facturacion mensual son escasos, estos deberian ser removidos del conjunto de dato.
par(mfrow=c(3,1))
bwplot(~avg_monthly_invoice,data=client_info,horizontal = T,xlab="promedio de facturacion mensual")
client_info=client_info %>% filter(avg_monthly_invoice <=0.22 & n_server_increase<=1)
readr::write_csv(client_info,"wptree/data/client_info.csv")
save(client_info,file="wptree/data/client_info.RData")
Los montos de facturacion mensual estan influenciados por diversas variables que deben analizarse en conjunto. Los arboles de decision son una herramienta que permite establecer diferentes combinaciones de variables/atributos utilizando un enfoque Divide & Conquer. Se seleccionan aquellos atributos y valores que tienen mayor impacto a la hora de discriminar entre las posibles categorias. Cada una de la hojas del arbol termina siendo una regla que explica una categoria con una probabilidad determininada.
Una version dinamica del arbol de decision se encuentra en (http://harpomaxx.shinyapps.io/wptree)
client_info_tree=client_info[,c(15,30,31,32)]
formula <- as.formula(avg_monthly_invoice~.)
tree=rpart(formula,data=client_info_tree,control = rpart.control(minsplit=10, cp=0.005,xval=10))
prp(tree,cex=0.8,varlen=0,extra="auto" )
printcp(tree)
##
## Regression tree:
## rpart(formula = formula, data = client_info_tree, control = rpart.control(minsplit = 10,
## cp = 0.005, xval = 10))
##
## Variables actually used in tree construction:
## [1] renew_server_rate renew_sp_rate total_number_of_clients
##
## Root node error: 0.40929/214 = 0.0019126
##
## n= 214
##
## CP nsplit rel error xerror xstd
## 1 0.4205658 0 1.00000 1.00449 0.111571
## 2 0.0670980 1 0.57943 0.67338 0.078178
## 3 0.0571566 2 0.51234 0.62227 0.072024
## 4 0.0438877 3 0.45518 0.55605 0.067311
## 5 0.0417167 4 0.41129 0.56718 0.068132
## 6 0.0305019 5 0.36958 0.55546 0.067394
## 7 0.0178284 7 0.30857 0.49712 0.067013
## 8 0.0112245 8 0.29074 0.45252 0.062844
## 9 0.0070784 9 0.27952 0.44182 0.061153
## 10 0.0069847 10 0.27244 0.47338 0.064476
## 11 0.0056917 11 0.26546 0.45826 0.061591
## 12 0.0050228 13 0.25407 0.46286 0.062660
## 13 0.0050000 14 0.24905 0.46648 0.063041
#plot(tree,uniform=T,main="Regression Tree for Customer categories")
#text(tree,cex=0.7,minlength=20,all=T,use.n=T,xpd=T)