In this markdown document, we provide summary findings from exploratory data analysis of the four datasets provided by Kaggle on the Acquire Valued Shoppers project. Project and dataset descriptions are available at https://www.kaggle.com/c/acquire-valued-shoppers-challenge
First download necessary libraries and set work directory
library(ggplot2)
library(dplyr)
setwd("/Users/z001pt4/Documents/DeepLearning/Project/Kaggle_AcquireValuedShoppers/Datasets")
Now read in datasets
offers <- read.csv("offers.csv")
trainHistory <- read.csv("trainHistory.csv")
testHistory <- read.csv("testHistory.csv")
#reducedTrans <- read.csv("reduced_trans_to_offer_cat_comp_subset.csv")
#save(reducedTrans, file = "reducedTrans.Rdata")
load("reducedTrans.Rdata")
str(offers)
## 'data.frame': 37 obs. of 6 variables:
## $ offer : int 1190530 1194044 1197502 1198271 1198272 1198273 1198274 1198275 1199256 1199258 ...
## $ category : int 9115 9909 3203 5558 5558 5558 5558 5558 4401 4401 ...
## $ quantity : int 1 1 1 1 1 1 1 1 1 1 ...
## $ company : int 108500080 107127979 106414464 107120272 107120272 107120272 107120272 107120272 105100050 105100050 ...
## $ offervalue: num 5 1 0.75 1.5 1.5 1.5 1.5 1.5 2 2 ...
## $ brand : int 93904 6732 13474 5072 5072 5072 5072 5072 13791 13791 ...
Offers dataset has 37 rows and following columns: offer, category, quantity, company, offervalue, brand
Let’s convert offer, company, category and brand to a categorical field
offers$offer <- factor(offers$offer) #37 offers
offers$category <- factor(offers$category) #20 categories
offers$company <- factor(offers$company) #18 companies
offers$brand <- factor(offers$brand) #19 brands
There are 37 distinct offers from 20 categories, 18 companies and 19 brands.
Number of offers by company (the manufacturer of the item):
sort(table(offers$company), decreasing=TRUE)
##
## 104460040 103700030 107120272 105100050 105450050 107717272
## 7 5 5 2 2 2
## 1076211171 1087744888 103320030 104127141 104610040 105190050
## 2 2 1 1 1 1
## 106414464 107106878 107127979 108079383 108500080 1089520383
## 1 1 1 1 1 1
offers <- within(offers, company <- factor(company, levels=names(sort(table(company), decreasing=TRUE))))
qplot(company, data = offers, geom = "bar", fill=I("pink"))+
theme(axis.text.x=element_text(angle=90,hjust=1))
Company 104460040 has 7 offers.
Number of offers by category
sort(table(offers$category), decreasing=TRUE)
##
## 5558 7205 799 1703 1726 2202 4401 4517 5619 6202 9909 706 2119 3203 3504
## 5 5 2 2 2 2 2 2 2 2 2 1 1 1 1
## 3509 5122 5616 5824 9115
## 1 1 1 1 1
offers <- within(offers, category <- factor(category, levels=names(sort(table(category), decreasing=TRUE))))
qplot(category, data = offers, geom = "bar", fill = I("lightgreen"))+
theme(axis.text.x=element_text(angle=90,hjust=1))
Categories 5558 and 7205 has 5 offers as maximum number of offers whereas the next top offer count is 2 in a category
Number of offers by brand
sort(table(offers$brand), decreasing=TRUE)
##
## 4294 5072 7668 1322 3718 13791 17286 64486 102504 875
## 5 5 5 2 2 2 2 2 2 1
## 6732 6926 13474 15889 17311 26189 26456 28840 93904
## 1 1 1 1 1 1 1 1 1
offers <- within(offers, brand <- factor(brand, levels=names(sort(table(brand), decreasing=TRUE))))
qplot(brand, data = offers, geom = "bar", fill = I("darkblue"))+
theme(axis.text.x=element_text(angle=90,hjust=1))
Top 3 brands have 5 offers, next from top brand has 2 offers
Offer value summary
summary(offers$offervalue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.750 1.500 1.500 1.649 1.500 5.000
Offer values are distributed from 75 cents to $5 with a mean value of $1.65 and median $1.5
Offer quantity summary
summary(offers$quantity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.027 1.000 2.000
offers %>% filter(quantity > 1)
## offer category quantity company offervalue brand
## 1 1221658 7205 2 103700030 3 4294
Number of units one must purchase to get the discount has a max of 2, and a min of 1, with a mean 1.027. Only one offer requires to get 2 units for the discount to apply.
str(trainHistory)
## 'data.frame': 160057 obs. of 7 variables:
## $ id : num 86246 86252 12682470 12996040 13089312 ...
## $ chain : int 205 205 18 15 15 14 15 14 4 4 ...
## $ offer : int 1208251 1197502 1197502 1197502 1204821 1197502 1200581 1200581 1204576 1197502 ...
## $ market : int 34 34 11 9 9 8 9 8 1 1 ...
## $ repeattrips: int 5 16 0 0 0 0 0 0 0 0 ...
## $ repeater : Factor w/ 2 levels "f","t": 2 2 1 1 1 1 1 1 1 1 ...
## $ offerdate : Factor w/ 56 levels "2013-03-01","2013-03-02",..: 50 22 23 20 27 24 25 25 31 21 ...
There are 160,057 rows in the dataset, with columns id, chain, offer, market, repeattrips, repeater, offerdate.
Let’s convert id, chain, offer, and market to categorical fields.
trainHistory$id <- factor(trainHistory$id) #160,057 customers
trainHistory$chain <- factor(trainHistory$chain) #130 store chains (the store where customer shopped)
trainHistory$offer <- factor(trainHistory$offer) #24 offers among 37 original offers show up in train history file
trainHistory$market <- factor(trainHistory$market) #34 markets
trainHistory$offerdate <- as.Date(trainHistory$offerdate)
Since number of distinct customer ids in the file matches number of rows, each customer in training set has only one offer.
Date range of offers
min(trainHistory$offerdate) #Minimum offer date in train History = '2013-03-01'
## [1] "2013-03-01"
max(trainHistory$offerdate) #Maximum offer date in train History = '2013-04-30'
## [1] "2013-04-30"
Number of customers for each offer
trainHistory <- within(trainHistory, offer <- factor(offer, levels=names(sort(table(offer), decreasing=TRUE))))
qplot(offer, data = trainHistory, geom = "bar", fill=I("darkblue"))+
theme(axis.text.x=element_text(angle=90,hjust=1))
sort(table(trainHistory$offer), decreasing = TRUE)
##
## 1197502 1208329 1203052 1208251 1199256 1204576 1199258 1194044 1208501
## 45652 18767 15337 15028 7971 7293 7037 6810 6643
## 1200581 1208252 1200578 1208503 1204821 1204822 1198275 1198272 1200988
## 5032 3393 2922 2413 2326 2290 1977 1924 1418
## 1200582 1200579 1198274 1198273 1198271 1200584
## 1291 1238 1124 992 957 222
Maximum customer count per offer = 45,652 (for offer 1197502),and minimum customer count = 222 in trainHistory.
Repeaters
table(trainHistory$repeater) #43,438 repeaters
##
## f t
## 116619 43438
nrow(trainHistory %>% filter(repeater == "t"))/nrow(trainHistory) #27.1% of all customers are repeaters
## [1] 0.2713908
Repeat percentage within a category
trainHistory_OfferDetails <- merge(trainHistory, offers)
trainHistory_OfferDetails$category <- factor(trainHistory_OfferDetails$category, levels=names(sort(table(trainHistory_OfferDetails$category), decreasing=TRUE)))
trainHistory_OfferDetails %>%
ggplot(aes(x=category, fill=repeater))+
geom_bar()+
theme(axis.text.x=element_text(angle=90,hjust=1))
Visually, categories 9909 and 2119 seem to have the largest percentage of repeaters. Let’s print the percentage of repeaters within a category in a table format:
a <- trainHistory_OfferDetails %>% filter(repeater == "t") %>% group_by(category) %>% summarize(RepeatCustomerCnt = n())
b <- trainHistory_OfferDetails %>% group_by(category) %>% summarize(TotalCustomerCnt = n())
c <- merge(a, b, all = T)
c$PercentRepeat <- c$RepeatCustomerCnt*100.0/c$TotalCustomerCnt
c %>% arrange(-PercentRepeat)
## category RepeatCustomerCnt TotalCustomerCnt PercentRepeat
## 1 9909 9950 22147 44.927078
## 2 2119 8152 18767 43.437950
## 3 6202 3333 9056 36.804329
## 4 2202 5857 18421 31.795234
## 5 5616 2075 7293 28.451940
## 6 3504 51 222 22.972973
## 7 5558 1385 6974 19.859478
## 8 3203 8940 45652 19.582932
## 9 3509 264 1418 18.617772
## 10 5619 792 4616 17.157712
## 11 1726 928 6323 14.676578
## 12 1703 379 4160 9.110577
## 13 4401 1332 15008 8.875267
Repeat percentage within a brand
trainHistory_OfferDetails$brand <- factor(trainHistory_OfferDetails$brand, levels=names(sort(table(trainHistory_OfferDetails$brand), decreasing=TRUE)))
trainHistory_OfferDetails %>%
ggplot(aes(x=brand, fill=repeater))+
geom_bar()+
theme(axis.text.x=element_text(angle=90,hjust=1))
Visually, brands 6926, 6732 and 28840 seem to have the largest percentage of repeaters. Let’s print the percentage of repeaters within a brand in a table format:
trainHistory_OfferDetails <- merge(trainHistory, offers)
a <- trainHistory_OfferDetails %>% filter(repeater == "t") %>% group_by(brand) %>% summarize(RepeatCustomerCnt = n())
b <- trainHistory_OfferDetails %>% group_by(brand) %>% summarize(TotalCustomerCnt = n())
c <- merge(a, b, all = T)
c$PercentRepeat <- c$RepeatCustomerCnt*100.0/c$TotalCustomerCnt
c %>% arrange(-PercentRepeat)
## brand RepeatCustomerCnt TotalCustomerCnt PercentRepeat
## 1 6732 3453 6810 50.704846
## 2 6926 8152 18767 43.437950
## 3 28840 6497 15337 42.361609
## 4 64486 3333 9056 36.804329
## 5 3718 5857 18421 31.795234
## 6 15889 2075 7293 28.451940
## 7 5072 1385 6974 19.859478
## 8 13474 8940 45652 19.582932
## 9 875 264 1418 18.617772
## 10 102504 792 4616 17.157712
## 11 7668 1358 10705 12.685661
## 12 13791 1332 15008 8.875267
Repeat Trip Count Distribution
Let’s take a look at the summary distribution of repeat trip counts of repeaters in the train history dataset.
summary(trainHistory_OfferDetails$repeattrips[trainHistory_OfferDetails$repeater=="t"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 2.418 2.000 2124.000
Median is 1 repeat trip and mean is 2.418 repeat trips. Strangely, the maximum repeat trip count is too high: 2124 trips for one customer for a specific offer product (data error?). Let’s take a look at all the offers that generated more than 100 repeat trips after using the offer.
trainHistory_OfferDetails %>% filter(repeattrips > 100)
## offer id chain market repeattrips repeater offerdate category
## 1 1197502 3465135195 306 6 2124 t 2013-03-25 3203
## 2 1197502 3450535153 306 6 1418 t 2013-04-18 3203
## 3 1203052 4176323168 166 14 104 t 2013-04-01 9909
## 4 1203052 4427711419 152 10 549 t 2013-04-02 9909
## quantity company offervalue brand
## 1 1 106414464 0.75 13474
## 2 1 106414464 0.75 13474
## 3 1 1089520383 1.00 28840
## 4 1 1089520383 1.00 28840
What is the repeat trip count distribution by offer?
trainHistory_OfferDetails %>%
filter(repeater == "t") %>%
ggplot(aes(x=offer, y=repeattrips))+
geom_boxplot()+
theme(axis.text.x=element_text(angle=90,hjust=1))
There are very big outliers as we saw in the table summary as well. Let’s take a look at a zoomed in version.
trainHistory_OfferDetails %>%
filter(repeater == "t") %>%
ggplot(aes(x=offer, y=repeattrips))+
geom_boxplot()+
ylim(0,20)+
theme(axis.text.x=element_text(angle=90,hjust=1))
The above chart removed 154 rows of train history dataset where number of repeat trips were greater than 20. Median repeat trips is 1 for 20 out of 24 offers, and 2 for the remaining 4 offers in train history dataset.
Let’s take a look at the same summary by pooling in offers by category. So here is the distribution of repeat trips of repeaters by category:
trainHistory_OfferDetails %>%
filter(repeater == "t") %>%
ggplot(aes(x=category, y=repeattrips))+
geom_boxplot()+
ylim(0,20)+
theme(axis.text.x=element_text(angle=90,hjust=1))
And repeat trip counts of repeaters by brand:
trainHistory_OfferDetails %>%
filter(repeater == "t") %>%
ggplot(aes(x=brand, y=repeattrips))+
geom_boxplot()+
ylim(0,20)+
theme(axis.text.x=element_text(angle=90,hjust=1))
Let’s take a look at the data in test history file.
str(testHistory)
## 'data.frame': 151484 obs. of 5 variables:
## $ id : num 12262064 12277270 12332190 12524696 13074629 ...
## $ chain : int 95 95 95 4 14 14 4 15 14 14 ...
## $ offer : int 1219903 1221658 1213242 1221665 1221658 1221658 1203439 1203439 1221665 1221665 ...
## $ market : int 39 39 39 1 8 8 1 9 8 8 ...
## $ offerdate: Factor w/ 89 levels "2013-05-01","2013-05-02",..: 55 51 46 48 49 50 13 12 52 55 ...
There are 151,484 rows in the file with columns: id, chain, offer, market, offerdate.
Let’s convert id, chain, offer, and market to categorical fields
testHistory$id <- factor(testHistory$id) #151,484 customers -> so each customer in test set has a unique offer in test history file
testHistory$chain <- factor(testHistory$chain) #131 store chains (the store where customer shopped) --> training set had 130.
testHistory$offer <- factor(testHistory$offer) #29 offers among 37 original offers show up in train history file --> training set had 24..
testHistory$market <- factor(testHistory$market) #34 markets --> same as train history
testHistory$offerdate <- as.Date(testHistory$offerdate)
Min and Max offerdates
min(testHistory$offerdate) #Minimum offer date in test History = '2013-05-01'
## [1] "2013-05-01"
max(testHistory$offerdate) #Maximum offer date in train History = '2013-07-31'
## [1] "2013-07-31"
Test dataset offer dates are within the two month time frame right after the end of train dataset offers (which was ‘2013-04-30’). So, there is no overlap in train and test offer time frames, but time horizon length is the same (2 months).
Number of customers for each offer
testHistory <- within(testHistory, offer <- factor(offer, levels=names(sort(table(offer), decreasing=TRUE))))
qplot(offer, data = testHistory, geom = "bar")
sort(table(testHistory$offer), decreasing = TRUE)
##
## 1221663 1221658 1230218 1203439 1220503 1220502 1221665 1221666 1221667
## 43819 32447 23350 12864 7709 5454 4802 3137 3095
## 1213242 1219900 1219903 1190530 1208501 1200581 1208503 1204576 1200582
## 3062 2390 1960 1827 894 853 671 539 535
## 1208251 1208329 1198275 1198272 1200584 1198274 1204821 1198273 1198271
## 501 350 237 209 160 148 114 112 108
## 1208252 1204822
## 105 32
Maximum customer count per offer = 43,819 (for offer 1221663) , and minimum customer count = 32 in testHistory
Any overlap in customers in train vs. test?
trainCustomers <- select(trainHistory, id) %>% distinct(id)
testCustomers <- select(testHistory, id) %>% distinct(id)
joint_ids <- merge(trainCustomers, testCustomers)
nrow(joint_ids)
## [1] 0
There is no overlap of customers between train and test history datasets.
Offers and their customer count in train and test set
trainOffers <- select(trainHistory, id, offer)
testOffers <- select(testHistory, id, offer)
trainOffers <- trainOffers %>% group_by(offer) %>% summarize(CustomerCnt = n())
testOffers <- testOffers %>% group_by(offer) %>% summarize(CustomerCnt = n())
colnames(trainOffers) <- c("offer", "TrainCustomerCnt")
colnames(testOffers) <- c("offer", "TestCustomerCnt")
TestVsTrainOffersSmmry <- merge(trainOffers, testOffers, all = T)
TestVsTrainOffersSmmry
## offer TrainCustomerCnt TestCustomerCnt
## 1 1197502 45652 NA
## 2 1208329 18767 350
## 3 1203052 15337 NA
## 4 1208251 15028 501
## 5 1199256 7971 NA
## 6 1204576 7293 539
## 7 1199258 7037 NA
## 8 1194044 6810 NA
## 9 1208501 6643 894
## 10 1200581 5032 853
## 11 1208252 3393 105
## 12 1200578 2922 NA
## 13 1208503 2413 671
## 14 1204821 2326 114
## 15 1204822 2290 32
## 16 1198275 1977 237
## 17 1198272 1924 209
## 18 1200988 1418 NA
## 19 1200582 1291 535
## 20 1200579 1238 NA
## 21 1198274 1124 148
## 22 1198273 992 112
## 23 1198271 957 108
## 24 1200584 222 160
## 25 1221663 NA 43819
## 26 1221658 NA 32447
## 27 1230218 NA 23350
## 28 1203439 NA 12864
## 29 1220503 NA 7709
## 30 1220502 NA 5454
## 31 1221665 NA 4802
## 32 1221666 NA 3137
## 33 1221667 NA 3095
## 34 1213242 NA 3062
## 35 1219900 NA 2390
## 36 1219903 NA 1960
## 37 1190530 NA 1827
All 37 offers show up in at least one of the train or test history files. Only 16 offers show up in both tables. - 8 offers in train history file do not show up in test history (33% of train offers) - 13 offers in test history file do not show up in train history (45% of test offers)
Categories of offers and their customer count in train and test set
trainOfferCat <- merge(trainOffers, offers)
testOfferCat <- merge(testOffers, offers)
trainOfferCat <- select(trainOfferCat, TrainCustomerCnt, category) %>%
group_by(category) %>%
summarize(TrainCustomerCnt = sum(TrainCustomerCnt))
testOfferCat <- select(testOfferCat, TestCustomerCnt, category) %>%
group_by(category) %>%
summarize(TestCustomerCnt = sum(TestCustomerCnt))
TestVsTrainOfferCatSmmry <- merge(trainOfferCat, testOfferCat, all = T)
TestVsTrainOfferCatSmmry
## category TrainCustomerCnt TestCustomerCnt
## 1 5558 6974 814
## 2 7205 NA 87300
## 3 799 NA 4350
## 4 1703 4160 NA
## 5 1726 6323 1388
## 6 2202 18421 606
## 7 4401 15008 NA
## 8 4517 NA 13163
## 9 5619 4616 146
## 10 6202 9056 1565
## 11 9909 22147 NA
## 12 706 NA 23350
## 13 2119 18767 350
## 14 3203 45652 NA
## 15 3504 222 160
## 16 3509 1418 NA
## 17 5122 NA 12864
## 18 5616 7293 539
## 19 5824 NA 3062
## 20 9115 NA 1827
Since all 37 offers show up in at least one of the train or test history files, all 20 categories of offers are in either one of these files.
Only 8 offer categories (out of 20) show up in both tables - 5 offer categories in train history file do not show up in test history (Out of 13 train categories) - 7 offers categories in test history file do not show up in train history (Out of 15 test categories)
Transactions dataset contains transaction history for all customers for a period of at least 1 year prior to their offered incentive. Original file is more than 20 GB. The dataset is reduced to 1.7GB by keeping only the transactions with the company or the category exists in offers dataset.
str(reducedTrans)
## 'data.frame': 27764694 obs. of 11 variables:
## $ id : num 86246 86246 86246 86246 86246 ...
## $ chain : int 205 205 205 205 205 205 205 205 205 205 ...
## $ dept : int 99 21 26 58 26 37 72 41 55 29 ...
## $ category : int 9909 2106 2630 5824 2634 3703 7205 4109 5558 2903 ...
## $ company : num 1.05e+08 1.05e+08 1.04e+08 1.09e+08 1.04e+08 ...
## $ brand : int 15343 27873 14647 55172 3293 2820 3830 2820 5603 72482 ...
## $ date : Factor w/ 514 levels "2012-03-02","2012-03-03",..: 1 1 1 1 5 5 5 5 6 6 ...
## $ productsize : num 16 64 56 16 6 ...
## $ productmeasure : Factor w/ 6 levels "CT","LB","LT",..: 4 4 1 4 5 4 4 4 4 4 ...
## $ purchasequantity: int 1 1 1 1 1 1 1 2 1 1 ...
## $ purchaseamount : num 2.49 3.29 1 3.29 4.99 0.79 3.99 1.18 1.25 0.99 ...
Original file - # of transactions: 349,655,789
Reduced file - # of transactions: 27,764,694 (including only the transactions for all the companies and categories in offers.csv)
reducedTrans$date <- as.Date(reducedTrans$date)
Get the subset of transactions for training set customers:
reducedTransTrain <- reducedTrans %>%
filter(id %in% trainCustomers$id)
One more round of data type transaformations:
str(reducedTransTrain)
## 'data.frame': 14988648 obs. of 11 variables:
## $ id : num 86246 86246 86246 86246 86246 ...
## $ chain : int 205 205 205 205 205 205 205 205 205 205 ...
## $ dept : int 99 21 26 58 26 37 72 41 55 29 ...
## $ category : int 9909 2106 2630 5824 2634 3703 7205 4109 5558 2903 ...
## $ company : num 1.05e+08 1.05e+08 1.04e+08 1.09e+08 1.04e+08 ...
## $ brand : int 15343 27873 14647 55172 3293 2820 3830 2820 5603 72482 ...
## $ date : Date, format: "2012-03-02" "2012-03-02" ...
## $ productsize : num 16 64 56 16 6 ...
## $ productmeasure : Factor w/ 6 levels "CT","LB","LT",..: 4 4 1 4 5 4 4 4 4 4 ...
## $ purchasequantity: int 1 1 1 1 1 1 1 2 1 1 ...
## $ purchaseamount : num 2.49 3.29 1 3.29 4.99 0.79 3.99 1.18 1.25 0.99 ...
reducedTransTrain$id <- factor(reducedTransTrain$id)
reducedTransTrain$chain <- factor(reducedTransTrain$chain)
reducedTransTrain$category <- factor(reducedTransTrain$category)
reducedTransTrain$company <- factor(reducedTransTrain$company)
reducedTransTrain$brand <- factor(reducedTransTrain$brand)
reducedTransTrain$dept <- factor(reducedTransTrain$dept)
Did the customer purchase the item in the specific offer before?
Let’s define the product in an offer by the category-brand-company combination. If we join train history offer details dataset with the reduced transactions train dataset (id,category,brand,company) and get records back, then these are the historical purchase records of customers who purchased the offer product even before the offer (so offer purchase itself is a repeat purchase):
trainHistory_OfferDetails.sub <- select(trainHistory_OfferDetails, id, category, company, brand, repeater)
historical_product_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.sub)
str(historical_product_purchase_train)
## 'data.frame': 83037 obs. of 12 variables:
## $ id : Factor w/ 159961 levels "86246","86252",..: 81514 81518 81520 81520 81520 646 646 646 646 646 ...
## $ category : Factor w/ 203 levels "201","305","410",..: 123 123 142 142 142 142 142 142 142 142 ...
## $ company : Factor w/ 1420 levels "1036030","10100010",..: 403 403 361 361 361 361 361 361 361 361 ...
## $ brand : Factor w/ 1838 levels "0","5","47","174",..: 568 568 662 662 662 662 662 662 662 662 ...
## $ chain : Factor w/ 130 levels "2","3","4","6",..: 22 22 22 22 22 10 10 10 10 10 ...
## $ dept : Factor w/ 53 levels "2","3","4","5",..: 29 29 37 37 37 37 37 37 37 37 ...
## $ date : Date, format: "2012-10-12" "2013-03-02" ...
## $ productsize : num 19.6 19.6 8 6.84 7.6 6.67 7.6 7.6 6.67 7.6 ...
## $ productmeasure : Factor w/ 6 levels "CT","LB","LT",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ purchasequantity: int 1 1 2 1 1 1 1 2 2 1 ...
## $ purchaseamount : num 2.5 2.5 5 2.5 2.5 2.5 3.99 6.1 5.09 2.99 ...
## $ repeater : Factor w/ 2 levels "f","t": 2 1 1 1 1 1 1 1 1 1 ...
How many of the training set customers purchased their offer product before?
tmp <- select(historical_product_purchase_train, id, repeater) %>% group_by(id, repeater) %>% summarize(transCnt = n())
nrow(tmp)
## [1] 25245
nrow(tmp %>% filter(repeater == "f"))
## [1] 14443
25,245 out of 160,057 (15.8%) total training set customers purchased the category-company-brand combination of the offer product in their transactions history. However, more interestingly, 14,443 out of 25,245 historical purchasers did not become repeaters after the offer.. (57.2%)
Here is the transaction count distribution for the offer product prior to the offer date for training set customers who purchased the offer category-brand-company combination before broken down by repeater flag.
summary(tmp$transCnt[tmp$repeater == "t"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 4.058 4.000 938.000
summary(tmp$transCnt[tmp$repeater == "f"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 2.714 3.000 105.000
And visually (by eliminating very high transaction counts):
tmp %>%
ggplot(aes(x=transCnt))+
geom_histogram(binwidth = 1, fill = "darkblue")+
xlim(1,12)+
ylim(0,5000)+
facet_wrap(~repeater, nrow = 2)
## Warning: Removed 788 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
Now let’s take a look at historical purchases of the offer brand only by training set customers separated by repeaters vs. not repeaters.
trainHistory_OfferDetails.brand <- select(trainHistory_OfferDetails, id, brand, repeater)
historical_brand_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.brand)
str(historical_brand_purchase_train)
## 'data.frame': 272827 obs. of 12 variables:
## $ id : Factor w/ 159961 levels "86246","86252",..: 596 602 602 602 602 602 602 602 81514 611 ...
## $ brand : Factor w/ 1838 levels "0","5","47","174",..: 158 1816 1816 1816 1816 1816 1816 1816 568 158 ...
## $ chain : Factor w/ 130 levels "2","3","4","6",..: 10 10 10 10 10 10 10 10 22 10 ...
## $ dept : Factor w/ 53 levels "2","3","4","5",..: 15 37 37 37 37 37 37 37 29 15 ...
## $ category : Factor w/ 203 levels "201","305","410",..: 55 146 146 146 146 146 146 143 123 55 ...
## $ company : Factor w/ 1420 levels "1036030","10100010",..: 346 672 672 672 672 672 672 672 403 346 ...
## $ date : Date, format: "2012-08-24" "2012-04-09" ...
## $ productsize : num 96 7 7 7 7 7 7 7.3 19.6 96 ...
## $ productmeasure : Factor w/ 6 levels "CT","LB","LT",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ purchasequantity: int 1 1 1 1 1 1 1 1 1 1 ...
## $ purchaseamount : num 2.69 3.58 3.99 3 0.49 3.99 3.99 3.99 2.5 2.69 ...
## $ repeater : Factor w/ 2 levels "f","t": 2 1 1 1 1 1 1 1 2 1 ...
#Get number of transactions with the brand for each customer
tmp1 <- select(historical_brand_purchase_train, id, repeater) %>%
group_by(id) %>%
summarize(OfferBrandTransCnt = n())
#Add missing customers who never shopped the brand before
tmp2 <- merge(select(trainHistory_OfferDetails, id, repeater), tmp1, all.x=T)
tmp2$OfferBrandTransCnt[is.na(tmp2$OfferBrandTransCnt)] <- 0
tmp2$OfferBrandPurchased <- ifelse(tmp2$OfferBrandTransCnt > 0, "Yes", "No")
Now we are ready to answer What percent of repeaters (or non-repeaters) purchased the brand before? (should we update reduced transactions by including all offer brands?)
tmp2$OfferBrandPurchased <- factor(tmp2$OfferBrandPurchased)
tmp2 %>%
ggplot(aes(x=repeater, fill=OfferBrandPurchased))+
geom_bar()
About 40% of repeaters and 30% of non-repeaters purchased the offer brand before.
Similarly What percent of repeaters (or non-repeaters) purchased the category before?
trainHistory_OfferDetails.cat <- select(trainHistory_OfferDetails, id, category, repeater)
historical_cat_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.cat)
#Get number of transactions with the category for each customer
tmp1 <- select(historical_cat_purchase_train, id, repeater) %>%
group_by(id) %>%
summarize(OfferCatTransCnt = n())
#Add missing customers who never shopped the category before
tmp2 <- merge(select(trainHistory_OfferDetails, id, repeater), tmp1, all.x=T)
tmp2$OfferCatTransCnt[is.na(tmp2$OfferCatTransCnt)] <- 0
tmp2$OfferCatPurchased <- ifelse(tmp2$OfferCatTransCnt > 0, "Yes", "No")
tmp2$OfferCatPurchased <- factor(tmp2$OfferCatPurchased)
tmp2 %>%
ggplot(aes(x=repeater, fill=OfferCatPurchased))+
geom_bar()
About 60% of repeaters and 50% of non-repeaters purchased the offer category before.
Similarly What percent of repeaters (or non-repeaters) purchased the company before?
trainHistory_OfferDetails.comp <- select(trainHistory_OfferDetails, id, company, repeater)
historical_comp_purchase_train <- merge(reducedTransTrain, trainHistory_OfferDetails.comp)
#Get number of transactions with the company for each customer
tmp1 <- select(historical_comp_purchase_train, id, repeater) %>%
group_by(id) %>%
summarize(OfferCompanyTransCnt = n())
#Add missing customers who never shopped the company before
tmp2 <- merge(select(trainHistory_OfferDetails, id, repeater), tmp1, all.x=T)
tmp2$OfferCompanyTransCnt[is.na(tmp2$OfferCompanyTransCnt)] <- 0
tmp2$OfferCompanyPurchased <- ifelse(tmp2$OfferCompanyTransCnt > 0, "Yes", "No")
tmp2$OfferCompanyPurchased <- factor(tmp2$OfferCompanyPurchased)
tmp2 %>%
ggplot(aes(x=repeater, fill=OfferCompanyPurchased))+
geom_bar()
About the same percentage of repeaters and non-repeaters (~55%) purchased the offer company before.
Given above findings what should be the list of base features to use for our model inputs and how should we treat outliers.. cont’d ..