Problem Statement

A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month.

The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month. Now, they want to build a model to predict the purchase amount of customer against various products which will help them to create personalized offer for customers against different products.

Data

Variable Definition
User_ID User ID
Product_ID Product ID
Gender Sex of User
Age Age in bins
Occupation Occupation (Masked)
City_Category “Category of the City (A,B,C)”
Stay_In_Current_City_Years Number of years stay in current city
Marital_Status Marital Status
Product_Category_1 Product Category (Masked)
Product_Category_2 Product may belongs to otder category also (Masked)
Product_Category_3 Product may belong to other category also (Masked)
Purchase Purchase Amount (Target Variable)
train<-read.csv("train.csv") #233599 rows, 12 cols
test<-read.csv("test-comb.csv") #550068 rows,13 cols

library(ggplot2)
library(caret)
sapply(train,class)
##                    User_ID                 Product_ID 
##                  "integer"                   "factor" 
##                     Gender                        Age 
##                   "factor"                   "factor" 
##                 Occupation              City_Category 
##                  "integer"                   "factor" 
## Stay_In_Current_City_Years             Marital_Status 
##                   "factor"                  "integer" 
##         Product_Category_1         Product_Category_2 
##                  "integer"                  "integer" 
##         Product_Category_3                   Purchase 
##                  "integer"                  "integer"
sapply(test,class)
##                          X                    User_ID 
##                  "integer"                  "integer" 
##                 Product_ID                     Gender 
##                   "factor"                   "factor" 
##                        Age                 Occupation 
##                   "factor"                  "integer" 
##              City_Category Stay_In_Current_City_Years 
##                   "factor"                   "factor" 
##             Marital_Status         Product_Category_1 
##                  "integer"                  "integer" 
##         Product_Category_2         Product_Category_3 
##                  "numeric"                  "numeric" 
##                       Comb 
##                   "factor"
#some of the classes don't look right

#are there any missing values?
as.matrix(sapply(train,function(x)sum(is.na(x))))
##                              [,1]
## User_ID                         0
## Product_ID                      0
## Gender                          0
## Age                             0
## Occupation                      0
## City_Category                   0
## Stay_In_Current_City_Years      0
## Marital_Status                  0
## Product_Category_1              0
## Product_Category_2         173638
## Product_Category_3         383247
## Purchase                        0
as.matrix(sapply(test,function(x)sum(is.na(x))))
##                              [,1]
## X                               0
## User_ID                         0
## Product_ID                      0
## Gender                          0
## Age                             0
## Occupation                      0
## City_Category                   0
## Stay_In_Current_City_Years      0
## Marital_Status                  0
## Product_Category_1              0
## Product_Category_2          72344
## Product_Category_3         162562
## Comb                            0
head(train)
##   User_ID Product_ID Gender   Age Occupation City_Category
## 1 1000001  P00069042      F  0-17         10             A
## 2 1000001  P00248942      F  0-17         10             A
## 3 1000001  P00087842      F  0-17         10             A
## 4 1000001  P00085442      F  0-17         10             A
## 5 1000002  P00285442      M   55+         16             C
## 6 1000003  P00193542      M 26-35         15             A
##   Stay_In_Current_City_Years Marital_Status Product_Category_1
## 1                          2              0                  3
## 2                          2              0                  1
## 3                          2              0                 12
## 4                          2              0                 12
## 5                         4+              0                  8
## 6                          3              0                  1
##   Product_Category_2 Product_Category_3 Purchase
## 1                 NA                 NA     8370
## 2                  6                 14    15200
## 3                 NA                 NA     1422
## 4                 14                 NA     1057
## 5                 NA                 NA     7969
## 6                  2                 NA    15227
#User Id is integer. Should actually be factor.
length(intersect(train$User_ID,test$User_ID))
## [1] 5891
length(unique(train$User_ID)) #so all the users are in both the train and test sets
## [1] 5891
train$User_ID <- as.factor(train$User_ID)
test$User_ID <- as.factor(test$User_ID)

#Product Id - how many different product lines are there?
length(levels(train$Product_ID))
## [1] 3631
#how many purchases of each product line in the train set?
summary(as.vector(table(train$Product_ID)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    19.5    71.0   151.5   194.0  1880.0
#gender
print(ggplot(train,aes(x=Gender))+geom_bar())

#age
levels(train$Age)
## [1] "0-17"  "18-25" "26-35" "36-45" "46-50" "51-55" "55+"
print(ggplot(train,aes(x=Age,fill=Gender))+geom_bar(position="dodge"))

#occupation - transform to factor. How many types are there?
train$Occupation <- as.factor(train$Occupation)
test$Occupation <- as.factor(test$Occupation)
length(levels(train$Occupation))
## [1] 21
print(ggplot(train,aes(Occupation,fill=Age))+geom_bar()+facet_grid(Gender~.))

#city category
levels(train$City_Category)
## [1] "A" "B" "C"
print(ggplot(train,aes(City_Category,fill=Age))+geom_bar())

#stay in current city
print(ggplot(train,aes(Stay_In_Current_City_Years,fill=Age))+geom_bar()+facet_grid(City_Category~.))

#marital status - should be factor
train$Marital_Status <- as.factor(train$Marital_Status)
test$Marital_Status <- as.factor(test$Marital_Status)
print(ggplot(train,aes(Marital_Status,fill=Age))+geom_bar()+facet_grid(Gender~.))

#since none of the 0-17 age group have status 1, it looks like status 0 is not married, status 1 is married.

#product categories should be factors
library(plyr)
train[,9:11]<-colwise(as.factor,~Product_Category_1+Product_Category_2+Product_Category_3)(train)
test[,10:12]<-colwise(as.factor,~Product_Category_1+Product_Category_2+Product_Category_3)(test)

#purchase - target variable
print(ggplot(train,aes(x=City_Category, y=Purchase))+geom_boxplot())

#In the test set there is a variable X - just appears to be an index starting from zero

#Comb. This variable appears in the test set. It is a concatenation of the user id with the product id. 
length(unique(test$Comb))  #all unique
## [1] 233599
test$Comb <- as.character(test$Comb)

Model for prediction of Purchase

To get a first benchmark I’ll take Gender,Age,Occupation,City Category,Stay in current city, Marital status and Product category 1 as predictor variables build a glm to predict Purchase.

train1 <- train[,c(3:9,12)]
set.seed(1968)
glmcontrol <- trainControl(method="repeatedcv",number=10,repeats=3,verboseIter = TRUE)
glmfit <- train(Purchase~.,data=train1,method="glm",trControl=glmcontrol,metric="RMSE")
r1<-RMSE( predict(glmfit,train1),train$Purchase)
#predict on test set
Purchase<-predict(glmfit,test)
#prepare file for submission
submit1 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase)
#write.csv(submit1,file="sol1.csv",row.names = FALSE)
r1 #rmse on train set
## [1] 3013.401

That gave RMSE of 3026 on the test set. The lowest RMSE on the public leaderboard is currently 2425, so there’s plenty room for improvement. Possibly more powerful models could improve the score, but I think that the features should be dealt with first.
Let’s see which features were most important for the glm.

v1<-varImp(glmfit)
v1
## glm variable importance
## 
##   only 20 most important variables shown (out of 53)
## 
##                      Overall
## Product_Category_15  100.000
## Product_Category_18   77.019
## Product_Category_111  64.841
## Product_Category_14   59.359
## Product_Category_113  47.775
## Product_Category_112  38.588
## Product_Category_120  33.887
## Product_Category_118  29.923
## Product_Category_119  27.673
## Product_Category_13   23.503
## Product_Category_110  21.537
## Product_Category_12   16.999
## Product_Category_16   15.212
## Product_Category_17    8.601
## City_CategoryC         7.701
## Product_Category_116   5.708
## Product_Category_115   4.634
## Product_Category_117   4.293
## Occupation12           2.123
## City_CategoryB         2.028

Well, product category plays a vital role, but I haven’t used product category 2 or 3 in this model. It looks like these features need to be worked on.
As I understand, the product category features are describing which category a product id belongs to. The category 1 is the main category that this product id belongs to, but many products belong to a second or third category aswell. This is a way of describing a group property of the products, as if they were clustered, rather than looking at each product id individually. So each row that a particular product id is mentioned it should have the same values for each of the product category features.

library(data.table)
prod<-rbind(train[,c(1:2,9:11)],test[,c(2:3,10:12)])
#how many unique product ids are in the train and test sets together
length(unique(prod$Product_ID))
## [1] 3677
#how many unique rows of products together with their categories are there?
prod <- data.table(prod)
setkey(prod,Product_ID,Product_Category_1,Product_Category_2,Product_Category_3)
prod<-unique(prod)
dim(prod)
## [1] 3677    5

So every time a product id appears in the data, it’s categories are always the same. This means that the products which have missing categories 2 or 3 are never listed under other categories. It would seem to be reasonable to fill in the missing values with the same category that is listed as category 1.

test1<-test
X <- rep(NA,550068)
train <-cbind(X,train)
alldata <- rbind(train[,1:12],test1[,1:12])

alldata$Product_Category_2 <- as.factor(ifelse(is.na(alldata$Product_Category_2),alldata$Product_Category_1,as.character(alldata$Product_Category_2)))
alldata$Product_Category_3 <- as.factor(ifelse(is.na(alldata$Product_Category_3),as.character(alldata$Product_Category_2),as.character(alldata$Product_Category_3)))

train2 <- cbind(alldata[1:550068,],Purchase=train$Purchase)
test2 <- alldata[550069:783667,]

#next attempt at predictive model, glm including categories 2 and 3
set.seed(1968)
glmcontrol <- trainControl(method="repeatedcv",number=10,repeats=3,verboseIter = TRUE)
glmfit2 <- train(Purchase~Product_Category_1+Product_Category_2+Product_Category_3+Occupation+City_Category,data=train2,method="glm",trControl=glmcontrol,metric="RMSE")

r2<-RMSE(predict(glmfit2,train2),train2$Purchase)
r2
## [1] 2988.062
p2<-predict(glmfit2,test2)
submit2 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=p2)
#write.csv(submit2,file="sol2.csv",row.names = FALSE)

That gave a small improvement. The RMSE on the test set is down to 2999. Still a long way to go.
I think that categorization of the product id’s by their actual listing category is not enough. There might well be prodcut id’s in the same category but with very different price ranges. For example:

#the first product id in the training set, as an example
pid <- train$Product_ID[1]

boxplot(train$Purchase[train$Product_ID==pid])

#now compare with the distribution of all items in the same category as pid
boxplot(train$Purchase[train$Product_Category_1==train$Product_Category_1[1]])

So it might help to add the quantiles of the distribution of the product id as features.

rm(alldata,prod,test1,train1)
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2064960 110.3    3205452 171.2  3205452 171.2
## Vcells 13707145 104.6   27784779 212.0 27782229 212.0
library(data.table)
train2 <- data.table(train2)
train3<-train2[,c("Q1","Med","Q3"):=as.list(quantile(Purchase,c(0.25,0.5,0.75))),by=Product_ID]
train3[,c("X","User_ID","Product_ID"):=NULL]
set.seed(1968)
glmfit3 <- train(Purchase~.,data=train3,method="glm",trControl=glmcontrol,metric="RMSE")

r3 <-RMSE(predict(glmfit3,train3),train3$Purchase)
v3<- varImp(glmfit3)
r3
## [1] 2654.765
v3
## glm variable importance
## 
##   only 20 most important variables shown (out of 88)
## 
##                      Overall
## Med                  100.000
## Q1                    86.451
## Q3                    78.488
## City_CategoryC        43.167
## Product_Category_15   36.046
## Product_Category_18   33.868
## Product_Category_14   33.137
## Product_Category_111  31.991
## Product_Category_113  31.799
## Product_Category_120  27.973
## Product_Category_112  27.735
## Product_Category_119  25.149
## Product_Category_118  17.546
## City_CategoryB        15.379
## Occupation20          13.309
## Occupation1           13.292
## Product_Category_110  11.958
## Occupation19          11.542
## Occupation3            9.883
## Product_Category_12    9.835
#so product categories 2 and 3 are no longer important. The new features are definitely good!

That looks like quite a good improvement, however, in order to predict on the test set, the new features must be present.
If a Product id from the test set is present in the train set, it’s not a problem to extract the values. If it’s not present, I’ll just impute the values from its category 1.

test3<-test2
setDT(test3)
train3<- cbind(Product_ID=train$Product_ID,train3)
#make temporary table of product ids from test3 that appear in train and get appropriate feature values
temp <- train3[test3$Product_ID %in% train$Product_ID,.(Product_ID,Q1,Med,Q3)]
temp <-unique(temp)
#merge test3 with temp. Now only the product id's that aren't in train have na's
setkey(temp,Product_ID)
setkey(test3,Product_ID)
test3<-merge(test3,temp,all.x=TRUE)

missing <- test3[is.na(test3$Q1)]
for (i in 1:dim(missing)[1]){
        missing[i,c("Q1","Med","Q3")]<-as.list(quantile(train3[Product_Category_1==missing[i,Product_Category_1]]$Purchase,c(0.25,0.5,0.75)))
}
#replace rows with na's by imputed rows
test3<-merge(test3,missing,by=names(missing),all=TRUE)
test3<-test3[complete.cases(test3)]
#test3 has been sorted by key=Product_ID. the sample submission requires to be sorted by X
setkey(test3,X)
#now can predict
p<-predict(glmfit3,test3)
submit3 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=p)
#write.csv(submit3,file="sol3.csv",row.names = FALSE)

Well, that’s nice , got RMSE of 2691 and jumped up about 40 positions on the leaderboard. Still needs to be improved!
How about the users? All of the user id’s that are present in the test set are also present in the train set. So this could theoretically be a good predictor, but there are far too many levels. So we’ll need to categorize the users. It doesn’t make sense to use the quantiles of the distrbutions of purchases of each user, because it depends on the products they are buying, but we could create an average score of how high or low the prices they paid are relative to the median price of that product. In other words, we need to first create a feature describing the ratio of the purchase to the Median purchase, if a higher price was paid the ratio will be greater than 1, and if a lower price was paid the ratio will be less than 1. Then for each user take the mean of all his purchase ratios. This number categorizes the user as how much above or below the median he pays on average. Let’s see how that performs:

train4 <- cbind(User_ID=train$User_ID,train3)
train4[,ratio:=Purchase/Med]
train4[,meanRatio := mean(ratio),by=User_ID]
train4[,ratio:=NULL]

temp<-train4[,.(User_ID,meanRatio)]
temp<-unique(temp)

test4 <- test3
setkey(temp,User_ID)
setkey(test4,User_ID)
test4<-merge(test4,temp,all=TRUE)
setkey(test4,X)

rm(glmfit3,test2,test3,train2,train3)
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2086192 111.5    3205452 171.2  3205452 171.2
## Vcells 16806046 128.3   40186080 306.6 40099649 306.0
glmcontrol <- trainControl(method="repeatedcv",number=10,repeats=3,verboseIter = TRUE)
set.seed(1968)
setDF(train4)
glmfit4 <- train(Purchase~.,data=train4[,3:16],method="glm",trControl=glmcontrol,metric="RMSE")

r4<-RMSE(predict(glmfit4,train4),train4$Purchase)
v4<-varImp(glmfit4)
r4
## [1] 2484.336
#that's great, the RMSE has got smaller
v4
## glm variable importance
## 
##   only 20 most important variables shown (out of 89)
## 
##                      Overall
## meanRatio            100.000
## Med                   31.080
## Q1                    26.140
## Q3                    24.121
## Product_Category_15    9.566
## Product_Category_14    9.433
## Product_Category_111   9.005
## Product_Category_113   8.440
## Product_Category_18    8.435
## Product_Category_120   7.934
## Product_Category_112   7.558
## Product_Category_119   7.327
## Product_Category_118   4.452
## Product_Category_110   4.229
## Product_Category_24    3.121
## Product_Category_215   2.956
## Product_Category_16    2.868
## Product_Category_214   2.840
## Product_Category_12    2.751
## Product_Category_315   2.638
p<-predict(glmfit4,test4)
submit4 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=p)
#write.csv(submit4,file="sol4.csv",row.names = FALSE)

Good work! The RMSE on the leaderboard is now 2539 and I jumped up to position 19. Would a more powerful algorithm drastically reduce the RMSE? Let’s try.

set.seed(1968)


#It will take far too long to go through every combination of parameters, I'll use random control
detach(package:caret,unload=TRUE)
library(mlr)
train5<-train4[,4:16]
test5<-data.frame(test4)
test5<-test5[,4:16]
test5$Purchase <- rep(100,233599)


#create task
trainTask <- makeRegrTask(data = train5,target = "Purchase")
testTask <- makeRegrTask(data = test5, target = "Purchase")

gbmlrn <- makeLearner("regr.gbm", predict.type = "response")

#set 5 fold cross validation
set_cv <- makeResampleDesc("CV",iters = 5)

#Search for hyperparameters
gbmpars <- makeParamSet(
        makeIntegerParam("n.trees",  lower=50,upper=2000),
        makeNumericParam("shrinkage", lower=.0005,upper= .05),
       # makeIntegerParam(" n.minobsinnode",lower=8, upper=12) ,
        makeIntegerParam("interaction.depth", lower = 1 , upper = 10)
                      )

#try 10 different combinations of values, (the more the better, but each takes a long time!)
gbmcontrol <- makeTuneControlRandom(maxit = 10)

#hypertune the parameters-do this overnight!
set.seed(11)
gbmtune <- tuneParams(learner = gbmlrn, resampling = set_cv,
      task = trainTask, par.set = gbmpars, control = gbmcontrol, measures=rmse)


 #using hyperparameters for modeling
 tunedgbm <- setHyperPars(gbmlrn, par.vals=gbmtune$x)


#train the model
gbmfit <- train(tunedgbm, trainTask)

p<-predict(gbmfit,trainTask)

r5<-measureRMSE(p$data$truth,p$data$response)

#make predictions
p <- predict(gbmfit, testTask)
r5
## [1] 2400.695
#great!

This decreased the rmse to 2477 and got place 11 on the public leaderboard.