In this markdown document, we provide summary findings from exploratory data analysis of the four datasets provided by Kaggle on the Acquire Valued Shoppers project. Project and dataset descriptions are available at https://www.kaggle.com/c/acquire-valued-shoppers-challenge

First download necessary libraries and set work directory

library(ggplot2)
library(dplyr)
setwd("/Users/z001pt4/Documents/DeepLearning/Project/Kaggle_AcquireValuedShoppers/Datasets")

Now read in datasets

offers <- read.csv("offers.csv")
trainHistory <- read.csv("trainHistory.csv")
testHistory <- read.csv("testHistory.csv")
#reducedTrans <- read.csv("reduced_trans_to_offer_cat_comp_subset.csv") 
#save(reducedTrans, file = "reducedTrans.Rdata")
load("reducedTrans.Rdata")

Offers dataset:

str(offers)
## 'data.frame':    37 obs. of  6 variables:
##  $ offer     : int  1190530 1194044 1197502 1198271 1198272 1198273 1198274 1198275 1199256 1199258 ...
##  $ category  : int  9115 9909 3203 5558 5558 5558 5558 5558 4401 4401 ...
##  $ quantity  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ company   : int  108500080 107127979 106414464 107120272 107120272 107120272 107120272 107120272 105100050 105100050 ...
##  $ offervalue: num  5 1 0.75 1.5 1.5 1.5 1.5 1.5 2 2 ...
##  $ brand     : int  93904 6732 13474 5072 5072 5072 5072 5072 13791 13791 ...

Offers dataset has 37 rows and following columns: offer, category, quantity, company, offervalue, brand

Let’s convert offer, company, category and brand to a categorical field

offers$offer <- factor(offers$offer) #37 offers
offers$category <- factor(offers$category) #20 categories
offers$company <- factor(offers$company) #18 companies
offers$brand <- factor(offers$brand) #19 brands

There are 37 distinct offers from 20 categories, 18 companies and 19 brands.

Number of offers by company (the manufacturer of the item):

  sort(table(offers$company), decreasing=TRUE)
## 
##  104460040  103700030  107120272  105100050  105450050  107717272 
##          7          5          5          2          2          2 
## 1076211171 1087744888  103320030  104127141  104610040  105190050 
##          2          2          1          1          1          1 
##  106414464  107106878  107127979  108079383  108500080 1089520383 
##          1          1          1          1          1          1
  offers <- within(offers, company <- factor(company, levels=names(sort(table(company), decreasing=TRUE))))
  qplot(company, data = offers, geom = "bar", fill=I("pink"))+
    theme(axis.text.x=element_text(angle=90,hjust=1))

Company 104460040 has 7 offers.

Number of offers by category

  sort(table(offers$category), decreasing=TRUE) 
## 
## 5558 7205  799 1703 1726 2202 4401 4517 5619 6202 9909  706 2119 3203 3504 
##    5    5    2    2    2    2    2    2    2    2    2    1    1    1    1 
## 3509 5122 5616 5824 9115 
##    1    1    1    1    1
  offers <- within(offers, category <- factor(category, levels=names(sort(table(category), decreasing=TRUE))))
  qplot(category, data = offers, geom = "bar", fill = I("lightgreen"))+
    theme(axis.text.x=element_text(angle=90,hjust=1))

Categories 5558 and 7205 has 5 offers as maximum number of offers whereas the next top offer count is 2 in a category

Number of offers by brand

  sort(table(offers$brand), decreasing=TRUE)
## 
##   4294   5072   7668   1322   3718  13791  17286  64486 102504    875 
##      5      5      5      2      2      2      2      2      2      1 
##   6732   6926  13474  15889  17311  26189  26456  28840  93904 
##      1      1      1      1      1      1      1      1      1
  offers <- within(offers, brand <- factor(brand, levels=names(sort(table(brand), decreasing=TRUE))))
  qplot(brand, data = offers, geom = "bar", fill = I("darkblue"))+
    theme(axis.text.x=element_text(angle=90,hjust=1))

Top 3 brands have 5 offers, next from top brand has 2 offers

Offer value summary

  summary(offers$offervalue) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.750   1.500   1.500   1.649   1.500   5.000

Offer values are distributed from 75 cents to $5 with a mean value of $1.65 and median $1.5

Offer quantity summary

  summary(offers$quantity) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   1.027   1.000   2.000
  offers %>% filter(quantity > 1)
##     offer category quantity   company offervalue brand
## 1 1221658     7205        2 103700030          3  4294

Number of units one must purchase to get the discount has a max of 2, and a min of 1, with a mean 1.027. Only one offer requires to get 2 units for the discount to apply.

Train History dataset

str(trainHistory) 
## 'data.frame':    160057 obs. of  7 variables:
##  $ id         : num  86246 86252 12682470 12996040 13089312 ...
##  $ chain      : int  205 205 18 15 15 14 15 14 4 4 ...
##  $ offer      : int  1208251 1197502 1197502 1197502 1204821 1197502 1200581 1200581 1204576 1197502 ...
##  $ market     : int  34 34 11 9 9 8 9 8 1 1 ...
##  $ repeattrips: int  5 16 0 0 0 0 0 0 0 0 ...
##  $ repeater   : Factor w/ 2 levels "f","t": 2 2 1 1 1 1 1 1 1 1 ...
##  $ offerdate  : Factor w/ 56 levels "2013-03-01","2013-03-02",..: 50 22 23 20 27 24 25 25 31 21 ...

There are 160,057 rows in the dataset, with columns id, chain, offer, market, repeattrips, repeater, offerdate.

Let’s convert id, chain, offer, and market to categorical fields.

trainHistory$id <- factor(trainHistory$id) #160,057 customers
trainHistory$chain <- factor(trainHistory$chain) #130 store chains (the store where customer shopped)
trainHistory$offer <- factor(trainHistory$offer) #24 offers among 37 original offers show up in train history file
trainHistory$market <- factor(trainHistory$market) #34 markets
trainHistory$offerdate <- as.Date(trainHistory$offerdate)

Since number of distinct customer ids in the file matches number of rows, each customer in training set has only one offer.

Date range of offers

min(trainHistory$offerdate) #Minimum offer date in train History = '2013-03-01'
## [1] "2013-03-01"
max(trainHistory$offerdate) #Maximum offer date in train History = '2013-04-30'
## [1] "2013-04-30"

Number of customers for each offer

trainHistory <- within(trainHistory, offer <- factor(offer, levels=names(sort(table(offer), decreasing=TRUE))))
qplot(offer, data = trainHistory, geom = "bar", fill=I("darkblue"))+
  theme(axis.text.x=element_text(angle=90,hjust=1))

sort(table(trainHistory$offer), decreasing = TRUE)
## 
## 1197502 1208329 1203052 1208251 1199256 1204576 1199258 1194044 1208501 
##   45652   18767   15337   15028    7971    7293    7037    6810    6643 
## 1200581 1208252 1200578 1208503 1204821 1204822 1198275 1198272 1200988 
##    5032    3393    2922    2413    2326    2290    1977    1924    1418 
## 1200582 1200579 1198274 1198273 1198271 1200584 
##    1291    1238    1124     992     957     222

Maximum customer count per offer = 45,652 (for offer 1197502),and minimum customer count = 222 in trainHistory.

Repeaters

table(trainHistory$repeater) #43,438 repeaters
## 
##      f      t 
## 116619  43438
nrow(trainHistory %>% filter(repeater == "t"))/nrow(trainHistory) #27.1% of all customers are repeaters
## [1] 0.2713908

Repeat percentage within a category

trainHistory_OfferDetails <- merge(trainHistory, offers)
trainHistory_OfferDetails$category <- factor(trainHistory_OfferDetails$category, levels=names(sort(table(trainHistory_OfferDetails$category), decreasing=TRUE)))

trainHistory_OfferDetails %>%
  ggplot(aes(x=category, fill=repeater))+
  geom_bar()+
  theme(axis.text.x=element_text(angle=90,hjust=1))

Visually, categories 9909 and 2119 seem to have the largest percentage of repeaters. Let’s print the percentage of repeaters within a category in a table format:

a <- trainHistory_OfferDetails %>% filter(repeater == "t") %>% group_by(category) %>% summarize(RepeatCustomerCnt = n())
b <- trainHistory_OfferDetails %>% group_by(category) %>% summarize(TotalCustomerCnt = n())
c <- merge(a, b, all = T)
c$PercentRepeat <- c$RepeatCustomerCnt*100.0/c$TotalCustomerCnt
c %>% arrange(-PercentRepeat)
##    category RepeatCustomerCnt TotalCustomerCnt PercentRepeat
## 1      9909              9950            22147     44.927078
## 2      2119              8152            18767     43.437950
## 3      6202              3333             9056     36.804329
## 4      2202              5857            18421     31.795234
## 5      5616              2075             7293     28.451940
## 6      3504                51              222     22.972973
## 7      5558              1385             6974     19.859478
## 8      3203              8940            45652     19.582932
## 9      3509               264             1418     18.617772
## 10     5619               792             4616     17.157712
## 11     1726               928             6323     14.676578
## 12     1703               379             4160      9.110577
## 13     4401              1332            15008      8.875267

Repeat percentage within a brand

trainHistory_OfferDetails$brand <- factor(trainHistory_OfferDetails$brand, levels=names(sort(table(trainHistory_OfferDetails$brand), decreasing=TRUE)))

trainHistory_OfferDetails %>%
  ggplot(aes(x=brand, fill=repeater))+
  geom_bar()+
  theme(axis.text.x=element_text(angle=90,hjust=1))

Visually, brands 6926, 6732 and 28840 seem to have the largest percentage of repeaters. Let’s print the percentage of repeaters within a brand in a table format:

trainHistory_OfferDetails <- merge(trainHistory, offers)
a <- trainHistory_OfferDetails %>% filter(repeater == "t") %>% group_by(brand) %>% summarize(RepeatCustomerCnt = n())
b <- trainHistory_OfferDetails %>% group_by(brand) %>% summarize(TotalCustomerCnt = n())
c <- merge(a, b, all = T)
c$PercentRepeat <- c$RepeatCustomerCnt*100.0/c$TotalCustomerCnt
c %>% arrange(-PercentRepeat)
##     brand RepeatCustomerCnt TotalCustomerCnt PercentRepeat
## 1    6732              3453             6810     50.704846
## 2    6926              8152            18767     43.437950
## 3   28840              6497            15337     42.361609
## 4   64486              3333             9056     36.804329
## 5    3718              5857            18421     31.795234
## 6   15889              2075             7293     28.451940
## 7    5072              1385             6974     19.859478
## 8   13474              8940            45652     19.582932
## 9     875               264             1418     18.617772
## 10 102504               792             4616     17.157712
## 11   7668              1358            10705     12.685661
## 12  13791              1332            15008      8.875267

Repeat Trip Count Distribution

Let’s take a look at the summary distribution of repeat trip counts of repeaters in the train history dataset.

summary(trainHistory_OfferDetails$repeattrips[trainHistory_OfferDetails$repeater=="t"])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    1.000    1.000    2.418    2.000 2124.000

Median is 1 repeat trip and mean is 2.418 repeat trips. Strangely, the maximum repeat trip count is too high: 2124 trips for one customer for a specific offer product (data error?). Let’s take a look at all the offers that generated more than 100 repeat trips after using the offer.

trainHistory_OfferDetails %>% filter(repeattrips > 100)
##     offer         id chain market repeattrips repeater  offerdate category
## 1 1197502 3465135195   306      6        2124        t 2013-03-25     3203
## 2 1197502 3450535153   306      6        1418        t 2013-04-18     3203
## 3 1203052 4176323168   166     14         104        t 2013-04-01     9909
## 4 1203052 4427711419   152     10         549        t 2013-04-02     9909
##   quantity    company offervalue brand
## 1        1  106414464       0.75 13474
## 2        1  106414464       0.75 13474
## 3        1 1089520383       1.00 28840
## 4        1 1089520383       1.00 28840

What is the repeat trip count distribution by offer?

trainHistory_OfferDetails %>%
  filter(repeater == "t") %>%
  ggplot(aes(x=offer, y=repeattrips))+
  geom_boxplot()+
  theme(axis.text.x=element_text(angle=90,hjust=1))

There are very big outliers as we saw in the table summary as well. Let’s take a look at a zoomed in version.

trainHistory_OfferDetails %>%
  filter(repeater == "t") %>%
  ggplot(aes(x=offer, y=repeattrips))+
  geom_boxplot()+
  ylim(0,20)+
  theme(axis.text.x=element_text(angle=90,hjust=1))

The above chart removed 154 rows of train history dataset where number of repeat trips were greater than 20. Median repeat trips is 1 for 20 out of 24 offers, and 2 for the remaining 4 offers in train history dataset.

Let’s take a look at the same summary by pooling in offers by category. So here is the distribution of repeat trips of repeaters by category:

trainHistory_OfferDetails %>%
  filter(repeater == "t") %>%
  ggplot(aes(x=category, y=repeattrips))+
  geom_boxplot()+
  ylim(0,20)+
  theme(axis.text.x=element_text(angle=90,hjust=1))

And repeat trip counts of repeaters by brand:

trainHistory_OfferDetails %>%
  filter(repeater == "t") %>%
  ggplot(aes(x=brand, y=repeattrips))+
  geom_boxplot()+
  ylim(0,20)+
  theme(axis.text.x=element_text(angle=90,hjust=1))

Test History file

Let’s take a look at the data in test history file.

str(testHistory)
## 'data.frame':    151484 obs. of  5 variables:
##  $ id       : num  12262064 12277270 12332190 12524696 13074629 ...
##  $ chain    : int  95 95 95 4 14 14 4 15 14 14 ...
##  $ offer    : int  1219903 1221658 1213242 1221665 1221658 1221658 1203439 1203439 1221665 1221665 ...
##  $ market   : int  39 39 39 1 8 8 1 9 8 8 ...
##  $ offerdate: Factor w/ 89 levels "2013-05-01","2013-05-02",..: 55 51 46 48 49 50 13 12 52 55 ...

There are 151,484 rows in the file with columns: id, chain, offer, market, offerdate.

Let’s convert id, chain, offer, and market to categorical fields

testHistory$id <- factor(testHistory$id) #151,484 customers -> so each customer in test set has a unique offer in test history file
testHistory$chain <- factor(testHistory$chain) #131 store chains (the store where customer shopped) --> training set had 130.
testHistory$offer <- factor(testHistory$offer) #29 offers among 37 original offers show up in train history file --> training set had 24..
testHistory$market <- factor(testHistory$market) #34 markets --> same as train history
testHistory$offerdate <- as.Date(testHistory$offerdate)

Min and Max offerdates

min(testHistory$offerdate) #Minimum offer date in test History = '2013-05-01' 
## [1] "2013-05-01"
max(testHistory$offerdate) #Maximum offer date in train History = '2013-07-31'
## [1] "2013-07-31"

Test dataset offer dates are within the two month time frame right after the end of train dataset offers (which was ‘2013-04-30’). So, there is no overlap in train and test offer time frames, but time horizon length is the same (2 months).

Number of customers for each offer

testHistory <- within(testHistory, offer <- factor(offer, levels=names(sort(table(offer), decreasing=TRUE))))
qplot(offer, data = testHistory, geom = "bar")

sort(table(testHistory$offer), decreasing = TRUE) 
## 
## 1221663 1221658 1230218 1203439 1220503 1220502 1221665 1221666 1221667 
##   43819   32447   23350   12864    7709    5454    4802    3137    3095 
## 1213242 1219900 1219903 1190530 1208501 1200581 1208503 1204576 1200582 
##    3062    2390    1960    1827     894     853     671     539     535 
## 1208251 1208329 1198275 1198272 1200584 1198274 1204821 1198273 1198271 
##     501     350     237     209     160     148     114     112     108 
## 1208252 1204822 
##     105      32

Maximum customer count per offer = 43,819 (for offer 1221663) , and minimum customer count = 32 in testHistory

Any overlap in customers in train vs. test?

trainCustomers <- select(trainHistory, id) %>% distinct(id)
testCustomers <- select(testHistory, id) %>% distinct(id)
joint_ids <- merge(trainCustomers, testCustomers) 
nrow(joint_ids)
## [1] 0

There is no overlap of customers between train and test history datasets.

Offers and their customer count in train and test set

trainOffers <- select(trainHistory, id, offer)
testOffers <- select(testHistory, id, offer)
  
trainOffers <- trainOffers %>% group_by(offer) %>% summarize(CustomerCnt = n())
testOffers <- testOffers %>% group_by(offer) %>% summarize(CustomerCnt = n())
  
colnames(trainOffers) <- c("offer", "TrainCustomerCnt")
colnames(testOffers) <- c("offer", "TestCustomerCnt")
  
TestVsTrainOffersSmmry <- merge(trainOffers, testOffers, all = T)
TestVsTrainOffersSmmry
##      offer TrainCustomerCnt TestCustomerCnt
## 1  1197502            45652              NA
## 2  1208329            18767             350
## 3  1203052            15337              NA
## 4  1208251            15028             501
## 5  1199256             7971              NA
## 6  1204576             7293             539
## 7  1199258             7037              NA
## 8  1194044             6810              NA
## 9  1208501             6643             894
## 10 1200581             5032             853
## 11 1208252             3393             105
## 12 1200578             2922              NA
## 13 1208503             2413             671
## 14 1204821             2326             114
## 15 1204822             2290              32
## 16 1198275             1977             237
## 17 1198272             1924             209
## 18 1200988             1418              NA
## 19 1200582             1291             535
## 20 1200579             1238              NA
## 21 1198274             1124             148
## 22 1198273              992             112
## 23 1198271              957             108
## 24 1200584              222             160
## 25 1221663               NA           43819
## 26 1221658               NA           32447
## 27 1230218               NA           23350
## 28 1203439               NA           12864
## 29 1220503               NA            7709
## 30 1220502               NA            5454
## 31 1221665               NA            4802
## 32 1221666               NA            3137
## 33 1221667               NA            3095
## 34 1213242               NA            3062
## 35 1219900               NA            2390
## 36 1219903               NA            1960
## 37 1190530               NA            1827

All 37 offers show up in at least one of the train or test history files. Only 16 offers show up in both tables. - 8 offers in train history file do not show up in test history (33% of train offers) - 13 offers in test history file do not show up in train history (45% of test offers)

Categories of offers and their customer count in train and test set

trainOfferCat <- merge(trainOffers, offers)
testOfferCat <- merge(testOffers, offers)
  
trainOfferCat <- select(trainOfferCat, TrainCustomerCnt, category) %>% 
  group_by(category) %>% 
  summarize(TrainCustomerCnt = sum(TrainCustomerCnt))

testOfferCat <- select(testOfferCat, TestCustomerCnt, category) %>% 
  group_by(category) %>% 
  summarize(TestCustomerCnt = sum(TestCustomerCnt))
  
TestVsTrainOfferCatSmmry <- merge(trainOfferCat, testOfferCat, all = T)
TestVsTrainOfferCatSmmry
##    category TrainCustomerCnt TestCustomerCnt
## 1      5558             6974             814
## 2      7205               NA           87300
## 3       799               NA            4350
## 4      1703             4160              NA
## 5      1726             6323            1388
## 6      2202            18421             606
## 7      4401            15008              NA
## 8      4517               NA           13163
## 9      5619             4616             146
## 10     6202             9056            1565
## 11     9909            22147              NA
## 12      706               NA           23350
## 13     2119            18767             350
## 14     3203            45652              NA
## 15     3504              222             160
## 16     3509             1418              NA
## 17     5122               NA           12864
## 18     5616             7293             539
## 19     5824               NA            3062
## 20     9115               NA            1827

Since all 37 offers show up in at least one of the train or test history files, all 20 categories of offers are in either one of these files.

Only 8 offer categories (out of 20) show up in both tables - 5 offer categories in train history file do not show up in test history (Out of 13 train categories) - 7 offers categories in test history file do not show up in train history (Out of 15 test categories)

Reduced Transactions dataset

Transactions dataset contains transaction history for all customers for a period of at least 1 year prior to their offered incentive. Original file is more than 20 GB. The dataset is reduced to 1.7GB by keeping only the transactions with the company or the category exists in offers dataset.

str(reducedTrans)
## 'data.frame':    27764694 obs. of  11 variables:
##  $ id              : num  86246 86246 86246 86246 86246 ...
##  $ chain           : int  205 205 205 205 205 205 205 205 205 205 ...
##  $ dept            : int  99 21 26 58 26 37 72 41 55 29 ...
##  $ category        : int  9909 2106 2630 5824 2634 3703 7205 4109 5558 2903 ...
##  $ company         : num  1.05e+08 1.05e+08 1.04e+08 1.09e+08 1.04e+08 ...
##  $ brand           : int  15343 27873 14647 55172 3293 2820 3830 2820 5603 72482 ...
##  $ date            : Factor w/ 514 levels "2012-03-02","2012-03-03",..: 1 1 1 1 5 5 5 5 6 6 ...
##  $ productsize     : num  16 64 56 16 6 ...
##  $ productmeasure  : Factor w/ 6 levels "CT","LB","LT",..: 4 4 1 4 5 4 4 4 4 4 ...
##  $ purchasequantity: int  1 1 1 1 1 1 1 2 1 1 ...
##  $ purchaseamount  : num  2.49 3.29 1 3.29 4.99 0.79 3.99 1.18 1.25 0.99 ...

Original file - # of transactions: 349,655,789

Reduced file - # of transactions: 27,764,694 (including only the transactions for all the companies and categories in offers.csv)

reducedTrans$date <- as.Date(reducedTrans$date)

Get the subset of transactions for training set customers:

reducedTransTrain <- reducedTrans %>%
  filter(id %in% trainCustomers$id)

One more round of data type transaformations:

str(reducedTransTrain)
## 'data.frame':    14988648 obs. of  11 variables:
##  $ id              : num  86246 86246 86246 86246 86246 ...
##  $ chain           : int  205 205 205 205 205 205 205 205 205 205 ...
##  $ dept            : int  99 21 26 58 26 37 72 41 55 29 ...
##  $ category        : int  9909 2106 2630 5824 2634 3703 7205 4109 5558 2903 ...
##  $ company         : num  1.05e+08 1.05e+08 1.04e+08 1.09e+08 1.04e+08 ...
##  $ brand           : int  15343 27873 14647 55172 3293 2820 3830 2820 5603 72482 ...
##  $ date            : Date, format: "2012-03-02" "2012-03-02" ...
##  $ productsize     : num  16 64 56 16 6 ...
##  $ productmeasure  : Factor w/ 6 levels "CT","LB","LT",..: 4 4 1 4 5 4 4 4 4 4 ...
##  $ purchasequantity: int  1 1 1 1 1 1 1 2 1 1 ...
##  $ purchaseamount  : num  2.49 3.29 1 3.29 4.99 0.79 3.99 1.18 1.25 0.99 ...
reducedTransTrain$id <- factor(reducedTransTrain$id)
reducedTransTrain$chain <- factor(reducedTransTrain$chain)
reducedTransTrain$category <- factor(reducedTransTrain$category)
reducedTransTrain$company <- factor(reducedTransTrain$company)
reducedTransTrain$brand <- factor(reducedTransTrain$brand)
reducedTransTrain$dept <- factor(reducedTransTrain$dept)

Did the customer purchase the item in the specific offer before?

Let’s define the product in an offer by the category-brand-company combination. If we join train history offer details dataset with the reduced transactions train dataset (id,category,brand,company) and get records back, then these are the historical purchase records of customers who purchased the offer product even before the offer (so offer purchase itself is a repeat purchase):

trainHistory_OfferDetails.sub <- select(trainHistory_OfferDetails, id, category, company, brand, repeater)

historical_product_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.sub)

str(historical_product_purchase_train)
## 'data.frame':    83037 obs. of  12 variables:
##  $ id              : Factor w/ 159961 levels "86246","86252",..: 81514 81518 81520 81520 81520 646 646 646 646 646 ...
##  $ category        : Factor w/ 203 levels "201","305","410",..: 123 123 142 142 142 142 142 142 142 142 ...
##  $ company         : Factor w/ 1420 levels "1036030","10100010",..: 403 403 361 361 361 361 361 361 361 361 ...
##  $ brand           : Factor w/ 1838 levels "0","5","47","174",..: 568 568 662 662 662 662 662 662 662 662 ...
##  $ chain           : Factor w/ 130 levels "2","3","4","6",..: 22 22 22 22 22 10 10 10 10 10 ...
##  $ dept            : Factor w/ 53 levels "2","3","4","5",..: 29 29 37 37 37 37 37 37 37 37 ...
##  $ date            : Date, format: "2012-10-12" "2013-03-02" ...
##  $ productsize     : num  19.6 19.6 8 6.84 7.6 6.67 7.6 7.6 6.67 7.6 ...
##  $ productmeasure  : Factor w/ 6 levels "CT","LB","LT",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ purchasequantity: int  1 1 2 1 1 1 1 2 2 1 ...
##  $ purchaseamount  : num  2.5 2.5 5 2.5 2.5 2.5 3.99 6.1 5.09 2.99 ...
##  $ repeater        : Factor w/ 2 levels "f","t": 2 1 1 1 1 1 1 1 1 1 ...

How many of the training set customers purchased their offer product before?

tmp <- select(historical_product_purchase_train, id, repeater) %>% group_by(id, repeater) %>% summarize(transCnt = n())
nrow(tmp)
## [1] 25245
nrow(tmp %>% filter(repeater == "f"))
## [1] 14443

25,245 out of 160,057 (15.8%) total training set customers purchased the category-company-brand combination of the offer product in their transactions history. However, more interestingly, 14,443 out of 25,245 historical purchasers did not become repeaters after the offer.. (57.2%)

Here is the transaction count distribution for the offer product prior to the offer date for training set customers who purchased the offer category-brand-company combination before broken down by repeater flag.

summary(tmp$transCnt[tmp$repeater == "t"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   4.058   4.000 938.000
summary(tmp$transCnt[tmp$repeater == "f"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   1.000   2.714   3.000 105.000

And visually (by eliminating very high transaction counts):

tmp %>%
  ggplot(aes(x=transCnt))+
  geom_histogram(binwidth = 1, fill = "darkblue")+
  xlim(1,12)+
  ylim(0,5000)+
  facet_wrap(~repeater, nrow = 2)
## Warning: Removed 788 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

Now let’s take a look at historical purchases of the offer brand only by training set customers separated by repeaters vs. not repeaters.

trainHistory_OfferDetails.brand <- select(trainHistory_OfferDetails, id, brand, repeater)
historical_brand_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.brand)
str(historical_brand_purchase_train)
## 'data.frame':    272827 obs. of  12 variables:
##  $ id              : Factor w/ 159961 levels "86246","86252",..: 596 602 602 602 602 602 602 602 81514 611 ...
##  $ brand           : Factor w/ 1838 levels "0","5","47","174",..: 158 1816 1816 1816 1816 1816 1816 1816 568 158 ...
##  $ chain           : Factor w/ 130 levels "2","3","4","6",..: 10 10 10 10 10 10 10 10 22 10 ...
##  $ dept            : Factor w/ 53 levels "2","3","4","5",..: 15 37 37 37 37 37 37 37 29 15 ...
##  $ category        : Factor w/ 203 levels "201","305","410",..: 55 146 146 146 146 146 146 143 123 55 ...
##  $ company         : Factor w/ 1420 levels "1036030","10100010",..: 346 672 672 672 672 672 672 672 403 346 ...
##  $ date            : Date, format: "2012-08-24" "2012-04-09" ...
##  $ productsize     : num  96 7 7 7 7 7 7 7.3 19.6 96 ...
##  $ productmeasure  : Factor w/ 6 levels "CT","LB","LT",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ purchasequantity: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ purchaseamount  : num  2.69 3.58 3.99 3 0.49 3.99 3.99 3.99 2.5 2.69 ...
##  $ repeater        : Factor w/ 2 levels "f","t": 2 1 1 1 1 1 1 1 2 1 ...
#Get number of transactions with the brand for each customer
tmp1 <- select(historical_brand_purchase_train, id, repeater) %>%
  group_by(id) %>%
  summarize(OfferBrandTransCnt = n())
#Add missing customers who never shopped the brand before
tmp2 <- merge(select(trainHistory_OfferDetails, id, repeater), tmp1, all.x=T)
tmp2$OfferBrandTransCnt[is.na(tmp2$OfferBrandTransCnt)] <- 0
tmp2$OfferBrandPurchased <- ifelse(tmp2$OfferBrandTransCnt > 0, "Yes", "No")

Now we are ready to answer What percent of repeaters (or non-repeaters) purchased the brand before? (should we update reduced transactions by including all offer brands?)

tmp2$OfferBrandPurchased <- factor(tmp2$OfferBrandPurchased)
tmp2 %>% 
  ggplot(aes(x=repeater, fill=OfferBrandPurchased))+
  geom_bar()

About 40% of repeaters and 30% of non-repeaters purchased the offer brand before.

Similarly What percent of repeaters (or non-repeaters) purchased the category before?

trainHistory_OfferDetails.cat <- select(trainHistory_OfferDetails, id, category, repeater)
historical_cat_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.cat)
#Get number of transactions with the category for each customer
tmp1 <- select(historical_cat_purchase_train, id, repeater) %>%
  group_by(id) %>%
  summarize(OfferCatTransCnt = n())
#Add missing customers who never shopped the category before
tmp2 <- merge(select(trainHistory_OfferDetails, id, repeater), tmp1, all.x=T)
tmp2$OfferCatTransCnt[is.na(tmp2$OfferCatTransCnt)] <- 0
tmp2$OfferCatPurchased <- ifelse(tmp2$OfferCatTransCnt > 0, "Yes", "No")
tmp2$OfferCatPurchased <- factor(tmp2$OfferCatPurchased)
tmp2 %>% 
  ggplot(aes(x=repeater, fill=OfferCatPurchased))+
  geom_bar()

About 60% of repeaters and 50% of non-repeaters purchased the offer category before.

Similarly What percent of repeaters (or non-repeaters) purchased the company before?

trainHistory_OfferDetails.comp <- select(trainHistory_OfferDetails, id, company, repeater)
historical_comp_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.comp)
#Get number of transactions with the company for each customer
tmp1 <- select(historical_comp_purchase_train, id, repeater) %>%
  group_by(id) %>%
  summarize(OfferCompanyTransCnt = n())
#Add missing customers who never shopped the company before
tmp2 <- merge(select(trainHistory_OfferDetails, id, repeater), tmp1, all.x=T)
tmp2$OfferCompanyTransCnt[is.na(tmp2$OfferCompanyTransCnt)] <- 0
tmp2$OfferCompanyPurchased <- ifelse(tmp2$OfferCompanyTransCnt > 0, "Yes", "No")
tmp2$OfferCompanyPurchased <- factor(tmp2$OfferCompanyPurchased)
tmp2 %>% 
  ggplot(aes(x=repeater, fill=OfferCompanyPurchased))+
  geom_bar()

About the same percentage of repeaters and non-repeaters (~55%) purchased the offer company before.

Given above findings what should be the list of base features to use for our model inputs and how should we treat outliers.. cont’d ..