A retail company, “ABC Private Limited”, wants to understand customer purchase behaviour (specifically, purchase amount) across various products from different categories. They have shared a purchase summary of various customers for selected high-volume products from last month.
The data set also contains customer demographics (age, gender, marital status, city category, years in current city), product details (product ID and product category) and the total purchase amount from last month. They now want to build a model to predict the purchase amount of a customer for various products, which will help them create personalized offers for customers on different products.
| Variable | Definition |
|---|---|
| User_ID | User ID |
| Product_ID | Product ID |
| Gender | Sex of User |
| Age | Age in bins |
| Occupation | Occupation (Masked) |
| City_Category | Category of the City (A, B, C) |
| Stay_In_Current_City_Years | Number of years of stay in current city |
| Marital_Status | Marital Status |
| Product_Category_1 | Product Category (Masked) |
| Product_Category_2 | Product may belong to another category as well (Masked) |
| Product_Category_3 | Product may belong to another category as well (Masked) |
| Purchase | Purchase Amount (Target Variable) |
train<-read.csv("train.csv") #550068 rows, 12 cols
test<-read.csv("test-comb.csv") #233599 rows, 13 cols
library(ggplot2)
library(caret)
sapply(train,class)
## User_ID Product_ID
## "integer" "factor"
## Gender Age
## "factor" "factor"
## Occupation City_Category
## "integer" "factor"
## Stay_In_Current_City_Years Marital_Status
## "factor" "integer"
## Product_Category_1 Product_Category_2
## "integer" "integer"
## Product_Category_3 Purchase
## "integer" "integer"
sapply(test,class)
## X User_ID
## "integer" "integer"
## Product_ID Gender
## "factor" "factor"
## Age Occupation
## "factor" "integer"
## City_Category Stay_In_Current_City_Years
## "factor" "factor"
## Marital_Status Product_Category_1
## "integer" "integer"
## Product_Category_2 Product_Category_3
## "numeric" "numeric"
## Comb
## "factor"
#some of the classes don't look right
#are there any missing values?
as.matrix(sapply(train,function(x)sum(is.na(x))))
## [,1]
## User_ID 0
## Product_ID 0
## Gender 0
## Age 0
## Occupation 0
## City_Category 0
## Stay_In_Current_City_Years 0
## Marital_Status 0
## Product_Category_1 0
## Product_Category_2 173638
## Product_Category_3 383247
## Purchase 0
as.matrix(sapply(test,function(x)sum(is.na(x))))
## [,1]
## X 0
## User_ID 0
## Product_ID 0
## Gender 0
## Age 0
## Occupation 0
## City_Category 0
## Stay_In_Current_City_Years 0
## Marital_Status 0
## Product_Category_1 0
## Product_Category_2 72344
## Product_Category_3 162562
## Comb 0
head(train)
## User_ID Product_ID Gender Age Occupation City_Category
## 1 1000001 P00069042 F 0-17 10 A
## 2 1000001 P00248942 F 0-17 10 A
## 3 1000001 P00087842 F 0-17 10 A
## 4 1000001 P00085442 F 0-17 10 A
## 5 1000002 P00285442 M 55+ 16 C
## 6 1000003 P00193542 M 26-35 15 A
## Stay_In_Current_City_Years Marital_Status Product_Category_1
## 1 2 0 3
## 2 2 0 1
## 3 2 0 12
## 4 2 0 12
## 5 4+ 0 8
## 6 3 0 1
## Product_Category_2 Product_Category_3 Purchase
## 1 NA NA 8370
## 2 6 14 15200
## 3 NA NA 1422
## 4 14 NA 1057
## 5 NA NA 7969
## 6 2 NA 15227
#User_ID is integer; it should actually be a factor.
length(intersect(train$User_ID,test$User_ID))
## [1] 5891
length(unique(train$User_ID)) #so every user in the train set also appears in the test set
## [1] 5891
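#strictly, the above only shows that every train user appears in the test set; to confirm the
#user sets coincide in both directions (relied on later), the test side can be checked too:
length(unique(test$User_ID))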
train$User_ID <- as.factor(train$User_ID)
test$User_ID <- as.factor(test$User_ID)
#Product Id - how many different product lines are there?
length(levels(train$Product_ID))
## [1] 3631
#how many purchases of each product line in the train set?
summary(as.vector(table(train$Product_ID)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 19.5 71.0 151.5 194.0 1880.0
#gender
print(ggplot(train,aes(x=Gender))+geom_bar())
#age
levels(train$Age)
## [1] "0-17" "18-25" "26-35" "36-45" "46-50" "51-55" "55+"
print(ggplot(train,aes(x=Age,fill=Gender))+geom_bar(position="dodge"))
#occupation - transform to factor. How many types are there?
train$Occupation <- as.factor(train$Occupation)
test$Occupation <- as.factor(test$Occupation)
length(levels(train$Occupation))
## [1] 21
print(ggplot(train,aes(Occupation,fill=Age))+geom_bar()+facet_grid(Gender~.))
#city category
levels(train$City_Category)
## [1] "A" "B" "C"
print(ggplot(train,aes(City_Category,fill=Age))+geom_bar())
#stay in current city
print(ggplot(train,aes(Stay_In_Current_City_Years,fill=Age))+geom_bar()+facet_grid(City_Category~.))
#marital status - should be factor
train$Marital_Status <- as.factor(train$Marital_Status)
test$Marital_Status <- as.factor(test$Marital_Status)
print(ggplot(train,aes(Marital_Status,fill=Age))+geom_bar()+facet_grid(Gender~.))
#since none of the 0-17 age group has status 1, it looks like status 0 means not married and status 1 means married.
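#a quick cross-tabulation to back up that reading (a check, not part of the original run):
table(train$Age,train$Marital_Status)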
#product categories should be factors
library(plyr)
train[,9:11]<-colwise(as.factor,~Product_Category_1+Product_Category_2+Product_Category_3)(train)
test[,10:12]<-colwise(as.factor,~Product_Category_1+Product_Category_2+Product_Category_3)(test)
#purchase - target variable
print(ggplot(train,aes(x=City_Category, y=Purchase))+geom_boxplot())
#In the test set there is a variable X, which just appears to be a row index starting from zero
#Comb is another variable that appears only in the test set; it is a concatenation of the user ID with the product ID
length(unique(test$Comb)) #all unique
## [1] 233599
test$Comb <- as.character(test$Comb)
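#a quick sanity check of that reading (the separator used in Comb is an assumption here;
#inspect a few values first and adjust the paste accordingly):
head(test$Comb,3)
all(test$Comb==paste0(test$User_ID,test$Product_ID))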
To get a first benchmark, I’ll take Gender, Age, Occupation, City_Category, Stay_In_Current_City_Years, Marital_Status and Product_Category_1 as predictor variables and build a glm to predict Purchase.
train1 <- train[,c(3:9,12)]
set.seed(1968)
glmcontrol <- trainControl(method="repeatedcv",number=10,repeats=3,verboseIter = TRUE)
glmfit <- train(Purchase~.,data=train1,method="glm",trControl=glmcontrol,metric="RMSE")
r1<-RMSE( predict(glmfit,train1),train$Purchase)
#predict on test set
Purchase<-predict(glmfit,test)
#prepare file for submission
submit1 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase)
#write.csv(submit1,file="sol1.csv",row.names = FALSE)
r1 #rmse on train set
## [1] 3013.401
That gave an RMSE of 3026 on the test set. The lowest RMSE on the public leaderboard is currently 2425, so there’s plenty of room for improvement. More powerful models might improve the score, but I think the features should be dealt with first.
Let’s see which features were most important for the glm.
v1<-varImp(glmfit)
v1
## glm variable importance
##
## only 20 most important variables shown (out of 53)
##
## Overall
## Product_Category_15 100.000
## Product_Category_18 77.019
## Product_Category_111 64.841
## Product_Category_14 59.359
## Product_Category_113 47.775
## Product_Category_112 38.588
## Product_Category_120 33.887
## Product_Category_118 29.923
## Product_Category_119 27.673
## Product_Category_13 23.503
## Product_Category_110 21.537
## Product_Category_12 16.999
## Product_Category_16 15.212
## Product_Category_17 8.601
## City_CategoryC 7.701
## Product_Category_116 5.708
## Product_Category_115 4.634
## Product_Category_117 4.293
## Occupation12 2.123
## City_CategoryB 2.028
Well, product category plays a vital role, but I haven’t used product category 2 or 3 in this model. It looks like these features need to be worked on.
As I understand it, the product category features describe which categories a product ID belongs to. Category 1 is the main category of the product, but many products belong to a second or third category as well. This is a way of describing a group property of the products, as if they were clustered, rather than looking at each product ID individually. So every row in which a particular product ID appears should have the same values for the product category features.
library(data.table)
prod<-rbind(train[,c(1:2,9:11)],test[,c(2:3,10:12)])
#how many unique product ids are in the train and test sets together
length(unique(prod$Product_ID))
## [1] 3677
#how many unique rows of products together with their categories are there?
prod <- data.table(prod)
setkey(prod,Product_ID,Product_Category_1,Product_Category_2,Product_Category_3)
prod<-unique(prod)
dim(prod)
## [1] 3677 5
So every time a product ID appears in the data, its categories are always the same. This means that the products which have missing categories 2 or 3 are never listed under other categories. It seems reasonable to fill in the missing values with the same category that is listed as category 1.
test1<-test
#add a dummy X column to train so that its first 12 columns line up with test for rbind
X <- rep(NA,550068)
train <-cbind(X,train)
alldata <- rbind(train[,1:12],test1[,1:12])
alldata$Product_Category_2 <- as.factor(ifelse(is.na(alldata$Product_Category_2),alldata$Product_Category_1,as.character(alldata$Product_Category_2)))
alldata$Product_Category_3 <- as.factor(ifelse(is.na(alldata$Product_Category_3),as.character(alldata$Product_Category_2),as.character(alldata$Product_Category_3)))
train2 <- cbind(alldata[1:550068,],Purchase=train$Purchase)
test2 <- alldata[550069:783667,]
#next attempt at predictive model, glm including categories 2 and 3
set.seed(1968)
glmcontrol <- trainControl(method="repeatedcv",number=10,repeats=3,verboseIter = TRUE)
glmfit2 <- train(Purchase~Product_Category_1+Product_Category_2+Product_Category_3+Occupation+City_Category,data=train2,method="glm",trControl=glmcontrol,metric="RMSE")
r2<-RMSE(predict(glmfit2,train2),train2$Purchase)
r2
## [1] 2988.062
p2<-predict(glmfit2,test2)
submit2 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=p2)
#write.csv(submit2,file="sol2.csv",row.names = FALSE)
That gave a small improvement. The RMSE on the test set is down to 2999. Still a long way to go.
I think that categorizing the product IDs by their actual listing category is not enough. There might well be product IDs in the same category but with very different price ranges. For example:
#the first product id in the training set, as an example
pid <- train$Product_ID[1]
boxplot(train$Purchase[train$Product_ID==pid])
#now compare with the distribution of all items in the same category as pid
boxplot(train$Purchase[train$Product_Category_1==train$Product_Category_1[1]])
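#to put numbers on what the two boxplots show, the summaries can be compared directly
#(a quick check, not part of the original run):
summary(train$Purchase[train$Product_ID==pid])
summary(train$Purchase[train$Product_Category_1==train$Product_Category_1[1]])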
So it might help to add the quantiles of each product ID’s purchase distribution as features.
rm(alldata,prod,test1,train1)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2064960 110.3 3205452 171.2 3205452 171.2
## Vcells 13707145 104.6 27784779 212.0 27782229 212.0
library(data.table)
train2 <- data.table(train2)
train3<-train2[,c("Q1","Med","Q3"):=as.list(quantile(Purchase,c(0.25,0.5,0.75))),by=Product_ID]
train3[,c("X","User_ID","Product_ID"):=NULL]
set.seed(1968)
glmfit3 <- train(Purchase~.,data=train3,method="glm",trControl=glmcontrol,metric="RMSE")
r3 <-RMSE(predict(glmfit3,train3),train3$Purchase)
v3<- varImp(glmfit3)
r3
## [1] 2654.765
v3
## glm variable importance
##
## only 20 most important variables shown (out of 88)
##
## Overall
## Med 100.000
## Q1 86.451
## Q3 78.488
## City_CategoryC 43.167
## Product_Category_15 36.046
## Product_Category_18 33.868
## Product_Category_14 33.137
## Product_Category_111 31.991
## Product_Category_113 31.799
## Product_Category_120 27.973
## Product_Category_112 27.735
## Product_Category_119 25.149
## Product_Category_118 17.546
## City_CategoryB 15.379
## Occupation20 13.309
## Occupation1 13.292
## Product_Category_110 11.958
## Occupation19 11.542
## Occupation3 9.883
## Product_Category_12 9.835
#so product categories 2 and 3 are no longer important. The new features are definitely good!
That looks like quite a good improvement. However, in order to predict on the test set, the new features must be present there too.
If a product ID from the test set is present in the train set, it’s not a problem to extract the values. If it’s not present, I’ll just impute the values from its category 1.
test3<-test2
setDT(test3)
train3<- cbind(Product_ID=train$Product_ID,train3)
#make a temporary table of the product IDs that appear in the test set and are present in train, together with their quantile features
temp <- train3[Product_ID %in% test3$Product_ID,.(Product_ID,Q1,Med,Q3)]
temp <-unique(temp)
#merge test3 with temp. Now only the product id's that aren't in train have na's
setkey(temp,Product_ID)
setkey(test3,Product_ID)
test3<-merge(test3,temp,all.x=TRUE)
missing <- test3[is.na(test3$Q1)]
#for each product ID not seen in train, impute the quantiles of its category 1 distribution
for (i in 1:dim(missing)[1]){
  missing[i,c("Q1","Med","Q3")]<-as.list(quantile(train3[Product_Category_1==missing[i,Product_Category_1]]$Purchase,c(0.25,0.5,0.75)))
}
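#a vectorized alternative to the loop above (a sketch, not used for the submission):
#compute the category-level quantiles once and join them onto the rows that had NAs
catq <- train3[,as.list(quantile(Purchase,c(0.25,0.5,0.75))),by=Product_Category_1]
setnames(catq,c("Product_Category_1","Q1","Med","Q3"))
missing2 <- merge(missing[,!c("Q1","Med","Q3"),with=FALSE],catq,by="Product_Category_1",all.x=TRUE)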
#replace rows with na's by imputed rows
test3<-merge(test3,missing,by=names(missing),all=TRUE)
test3<-test3[complete.cases(test3)]
#test3 has been sorted by key=Product_ID; the sample submission requires the rows to be sorted by X
setkey(test3,X)
#now can predict
p<-predict(glmfit3,test3)
submit3 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=p)
#write.csv(submit3,file="sol3.csv",row.names = FALSE)
Well, that’s nice: an RMSE of 2691 on the test set and a jump of about 40 places up the leaderboard. It still needs improving, though!
How about the users? All of the user IDs that are present in the test set are also present in the train set, so this could in theory be a good predictor, but there are far too many levels. So we’ll need to characterize the users somehow. It doesn’t make sense to use the quantiles of the distributions of each user’s purchases, because they depend on the products being bought, but we could create an average score of how high or low the prices a user paid are relative to the median price of each product. In other words, first create a feature giving the ratio of the purchase to the median purchase for that product: if a higher price was paid the ratio is greater than 1, and if a lower price was paid it is less than 1. Then, for each user, take the mean of all their purchase ratios. This number characterizes how much above or below the median a user pays on average. Let’s see how that performs:
train4 <- cbind(User_ID=train$User_ID,train3)
train4[,ratio:=Purchase/Med]
train4[,meanRatio := mean(ratio),by=User_ID]
train4[,ratio:=NULL]
temp<-train4[,.(User_ID,meanRatio)]
temp<-unique(temp)
test4 <- test3
setkey(temp,User_ID)
setkey(test4,User_ID)
test4<-merge(test4,temp,all=TRUE)
setkey(test4,X)
rm(glmfit3,test2,test3,train2,train3)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2086192 111.5 3205452 171.2 3205452 171.2
## Vcells 16806046 128.3 40186080 306.6 40099649 306.0
glmcontrol <- trainControl(method="repeatedcv",number=10,repeats=3,verboseIter = TRUE)
set.seed(1968)
setDF(train4)
glmfit4 <- train(Purchase~.,data=train4[,3:16],method="glm",trControl=glmcontrol,metric="RMSE")
r4<-RMSE(predict(glmfit4,train4),train4$Purchase)
v4<-varImp(glmfit4)
r4
## [1] 2484.336
#that's great, the RMSE has got smaller
v4
## glm variable importance
##
## only 20 most important variables shown (out of 89)
##
## Overall
## meanRatio 100.000
## Med 31.080
## Q1 26.140
## Q3 24.121
## Product_Category_15 9.566
## Product_Category_14 9.433
## Product_Category_111 9.005
## Product_Category_113 8.440
## Product_Category_18 8.435
## Product_Category_120 7.934
## Product_Category_112 7.558
## Product_Category_119 7.327
## Product_Category_118 4.452
## Product_Category_110 4.229
## Product_Category_24 3.121
## Product_Category_215 2.956
## Product_Category_16 2.868
## Product_Category_214 2.840
## Product_Category_12 2.751
## Product_Category_315 2.638
p<-predict(glmfit4,test4)
submit4 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=p)
#write.csv(submit4,file="sol4.csv",row.names = FALSE)
Good work! The RMSE on the leaderboard is now 2539 and I jumped up to position 19. Would a more powerful algorithm drastically reduce the RMSE? Let’s try.
set.seed(1968)
#It would take far too long to go through every combination of parameters, so I'll use random search
detach(package:caret,unload=TRUE)
library(mlr)
train5<-train4[,4:16]
test5<-data.frame(test4)
test5<-test5[,4:16]
test5$Purchase <- rep(100,233599) #makeRegrTask needs a target column, so add a dummy one for the test set
#create task
trainTask <- makeRegrTask(data = train5,target = "Purchase")
testTask <- makeRegrTask(data = test5, target = "Purchase")
gbmlrn <- makeLearner("regr.gbm", predict.type = "response")
#set 5 fold cross validation
set_cv <- makeResampleDesc("CV",iters = 5)
#Search for hyperparameters
gbmpars <- makeParamSet(
makeIntegerParam("n.trees", lower=50,upper=2000),
makeNumericParam("shrinkage", lower=.0005,upper= .05),
# makeIntegerParam("n.minobsinnode",lower=8, upper=12),
makeIntegerParam("interaction.depth", lower = 1 , upper = 10)
)
#try 10 different combinations of values, (the more the better, but each takes a long time!)
gbmcontrol <- makeTuneControlRandom(maxit = 10)
#tune the hyperparameters - do this overnight!
set.seed(11)
gbmtune <- tuneParams(learner = gbmlrn, resampling = set_cv,
task = trainTask, par.set = gbmpars, control = gbmcontrol, measures=rmse)
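#inspect the tuning result before refitting: gbmtune$x holds the chosen hyperparameters
#and gbmtune$y the cross-validated RMSE (a quick check, not shown in the original run)
gbmtune$x
gbmtune$y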
#using hyperparameters for modeling
tunedgbm <- setHyperPars(gbmlrn, par.vals=gbmtune$x)
#train the model
gbmfit <- train(tunedgbm, trainTask)
p<-predict(gbmfit,trainTask)
r5<-measureRMSE(p$data$truth,p$data$response)
#make predictions
p <- predict(gbmfit, testTask)
r5
## [1] 2400.695
#great!
This decreased the RMSE on the test set to 2477 and took 11th place on the public leaderboard.
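For completeness, a submission file for the gbm predictions can be prepared in the same way as the earlier ones (a sketch; this step isn’t shown above, and mlr stores the predicted values in p$data$response):
submit5 <- data.frame(User_ID=test$User_ID,Product_ID=test$Product_ID,Purchase=p$data$response)
#write.csv(submit5,file="sol5.csv",row.names = FALSE)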