This is my first visual exploratory analysis of the Santander data set used for the company’s Kaggle competition, which can be found at https://www.kaggle.com/c/santander-product-recommendation/data. The purpose of this competition is to predict which product an individual consumer will purchase next. Thus, I’m identifying each instance of a new product being purchased, then looking at different variables by product to see if there is anything obvious that drives these decisions.

First we’ll read and reformat the data:

#load required packages
options(stringsAsFactors=F,scipen=99)
require(xgboost)
require(data.table)
require(ggplot2)


setwd(wd)

#load data
tr1 <- suppressWarnings(fread("train_ver2.csv"))
## 
Read 0.0% of 13647309 rows
Read 3.1% of 13647309 rows
Read 7.8% of 13647309 rows
Read 12.7% of 13647309 rows
Read 17.4% of 13647309 rows
Read 22.3% of 13647309 rows
Read 27.1% of 13647309 rows
Read 27.8% of 13647309 rows
Read 32.9% of 13647309 rows
Read 38.0% of 13647309 rows
Read 43.1% of 13647309 rows
Read 48.2% of 13647309 rows
Read 53.3% of 13647309 rows
Read 58.5% of 13647309 rows
Read 63.5% of 13647309 rows
Read 68.7% of 13647309 rows
Read 73.7% of 13647309 rows
Read 78.8% of 13647309 rows
Read 83.9% of 13647309 rows
Read 88.9% of 13647309 rows
Read 93.9% of 13647309 rows
Read 98.8% of 13647309 rows
Read 13647309 rows and 48 (of 48) columns from 2.135 GB file in 00:00:33
te1 <- suppressWarnings(fread("test_ver2.csv"))

#add new variable to each data set
tr1[, entry := "train"]
te1[, entry := "test"]

#combine the data set
t <- rbind(tr1, te1, fill = TRUE)
rm(tr1, te1); gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   1318249   70.5    2164898  115.7   1768079   94.5
## Vcells 482905144 3684.3  968689606 7390.6 952821113 7269.5
#format the date fields
t[, ':='(fecha_dato = as.Date(fecha_dato), fecha_alta = as.Date(fecha_alta))]

#keep only the report date, the user id, and the binary fields
t2_cols <- c("fecha_dato", "ncodpers", colnames(t)[grepl("ult1", colnames(t)) == TRUE])
t2 <- t[, t2_cols, with = FALSE]
#reformat the data from wide to long and retain the structure of all fields that do are not product fields
t2 <- data.table(melt(t2, 
                      id.vars = colnames(t2)[grepl("ult1", colnames(t2)) == FALSE],
                      measure.vars = colnames(t2)[grepl("ult1", colnames(t2)) == TRUE]))
#reorder the 't' by user id and then report date
setorder(t2, ncodpers, variable, fecha_dato)

#keep only those products that have value of 1 (i.e., the product is purchased)
t2 <- t2[value == 1]
#keep only the first instance of each used id/product combination that is not '2015-01-28'
#because the data table has been ordered by report date, this is the transaction date for all new products purchased after '2015-01-28'
setkey(t2, ncodpers, variable)
uniqdat <- subset(unique(t2))
uniqdat[, new_pur := "y"]
uniqdat[, new_pur := ifelse(fecha_dato == "2015-01-28", NA, new_pur)]
uniqdat$value <- NULL
setorder(uniqdat, ncodpers, variable, fecha_dato)
#merge back with the t2 data table
t2 <- merge(t2, uniqdat, by = c("ncodpers", "variable", "fecha_dato"), all.x = TRUE)

#keep only the new purchase report date, user id, product combindations
t3 <- t2[new_pur == "y"]

#calculate the total number of orders per product
volume_order <- t3[, list(orders = sum(value)), by = "variable"]
volume_order <- volume_order[order(-orders)]
setorder(t3, ncodpers, fecha_dato)

#create a new data table that merges specific demographics with the new purchase data table
demo_cols <- colnames(t)[grepl("ult1", colnames(t)) == FALSE]
demos <- t[, demo_cols, with = FALSE]
setorder(demos, ncodpers, fecha_dato)
t4 <- merge(t3, demos, by = c("ncodpers", "fecha_dato"))
rm(demos); gc()
##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells   1323949   70.8    2164898   115.7    1768079    94.5
## Vcells 577210202 4403.8 1865579436 14233.3 2914926018 22239.2
#calculate the total number of orders per country
country_order <- t4[, list(orders = sum(value)), by = "pais_residencia"]
country_order <- country_order[order(orders)]

#calculate the number of months between the client established month and the product purchase month
min_month <- as.Date(paste0(as.character(substr(min(t4[!is.na(fecha_alta)]$fecha_alta), 1, 7)), "-01"))
max_month <- as.Date(paste0(as.character(substr(max(t4[!is.na(fecha_dato)]$fecha_dato), 1, 7)), "-01"))
est_months_table <- data.table(est_month = substr(seq.Date(min_month, max_month, by = "month"), 1, 7),
                           est_counter = c(1:length(seq.Date(min_month, max_month, by = "month"))))
pur_months_table <- data.table(pur_month = substr(seq.Date(min_month, max_month, by = "month"), 1, 7),
                           pur_counter = c(1:length(seq.Date(min_month, max_month, by = "month"))))
t4[, ':='(est_month = substr(fecha_alta, 1, 7), pur_month = substr(fecha_dato, 1, 7))]
t4 <- merge(t4, est_months_table, by = "est_month", all.x = TRUE)
t4 <- merge(t4, pur_months_table, by = "pur_month", all.x = TRUE)
t4[, account_age := pur_counter - est_counter + 1]

Bar chart of the volume of each product purchased since 2015-02

p1 <- ggplot(data = t3, aes(x = variable)) + geom_bar(fill = "blue", color = "orange") +
  scale_x_discrete(limits = volume_order$variable) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p1

Box plot of the user id age ranges for each product purchased since 2015-02. The x-axis is ordered by volume per product.

p2 <- ggplot(data = t4, aes(x = variable, y = age)) + geom_boxplot(fill = "orange", color = "blue") + 
  scale_x_discrete(limits = volume_order$variable) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p2

Box plot of the age of the account (in number of months) ranges for each product purchased since 2015-02. The x-axis is ordered by volume per product.

p3 <- ggplot(data = t4, aes(x = variable, y = account_age)) + geom_boxplot(fill = "green", color = "blue") + 
  scale_x_discrete(limits = volume_order$variable) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p3

Plot of the resident country for all products purchased since 2015-02. The x-axis is ordered by volume per product.

p4 <- ggplot(data = t4, aes(x = variable, y = pais_residencia)) + geom_count(color = "red") + scale_size_area() +
  scale_x_discrete(limits = volume_order$variable) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
p4