To understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories so that sales of a retail company can be boosted.
We will be building models to predict the purchase amount of customer against various products which will help us to create personalized offer for customers against different products.
Setting up environment for analysis, loading data, packages, understanding variables.
getwd()
## [1] "D:/black friday project"
data_set<-read.csv(paste("train.csv",sep = ""))
## 75% of the sample size
smp_size <- floor(0.75 * nrow(data_set))
## set the seed to make your partition reproductible
set.seed(123)
train_ind <- sample(seq_len(nrow(data_set)), size = smp_size)
train_set <- data_set[train_ind, ]
test_set <- data_set[-train_ind, ]
y_test<-as.data.frame(test_set[,12], drop=false)
names(y_test)<-c("purchase")
test_set<-as.data.frame(test_set[,1:11], drop=false)
dim(test_set)
## [1] 137517 11
dim(train_set)
## [1] 412551 12
dim(test_set)
## [1] 137517 11
Training data set consists of 4,12,551 entries across 12 variables.
Test Data set consists of 1,37,517 entries across 11 variables.
12th variable in Training Data set is our response Variable which needs to be predicted in test data set.
library(psych)
describe(train_set)
## vars n mean sd median
## User_ID 1 412551 1003030.35 1728.22 1003080
## Product_ID* 2 412551 1708.53 1011.98 1668
## Gender* 3 412551 1.75 0.43 2
## Age* 4 412551 3.50 1.35 3
## Occupation 5 412551 8.08 6.53 7
## City_Category* 6 412551 2.04 0.76 2
## Stay_In_Current_City_Years* 7 412551 2.86 1.29 3
## Marital_Status 8 412551 0.41 0.49 0
## Product_Category_1 9 412551 5.41 3.94 5
## Product_Category_2 10 282116 9.85 5.09 9
## Product_Category_3 11 124786 12.67 4.12 14
## Purchase 12 412551 9261.00 5022.12 8044
## trimmed mad min max range skew
## User_ID 1003028.59 2180.90 1000001 1006040 6039 0.00
## Product_ID* 1688.75 1196.46 1 3631 3630 0.15
## Gender* 1.82 0.00 1 2 1 -1.17
## Age* 3.36 1.48 1 7 6 0.81
## Occupation 7.69 8.90 0 20 20 0.40
## City_Category* 2.05 1.48 1 3 2 -0.07
## Stay_In_Current_City_Years* 2.82 1.48 1 5 4 0.32
## Marital_Status 0.39 0.00 0 1 1 0.37
## Product_Category_1 4.91 4.45 1 20 19 1.02
## Product_Category_2 10.00 7.41 2 18 16 -0.17
## Product_Category_3 13.07 2.97 3 18 15 -0.77
## Purchase 8925.40 4250.61 12 23961 23949 0.60
## kurtosis se
## User_ID -1.20 2.69
## Product_ID* -1.09 1.58
## Gender* -0.63 0.00
## Age* 0.30 0.00
## Occupation -1.22 0.01
## City_Category* -1.27 0.00
## Stay_In_Current_City_Years* -1.07 0.00
## Marital_Status -1.86 0.00
## Product_Category_1 1.22 0.01
## Product_Category_2 -1.43 0.01
## Product_Category_3 -0.81 0.01
## Purchase -0.34 7.82
describe(test_set)
## vars n mean sd median
## User_ID 1 137517 1003024.33 1725.69 1003070
## Product_ID* 2 137517 1708.31 1012.87 1665
## Gender* 3 137517 1.75 0.43 2
## Age* 4 137517 3.49 1.36 3
## Occupation 5 137517 8.07 6.51 7
## City_Category* 6 137517 2.04 0.76 2
## Stay_In_Current_City_Years* 7 137517 2.86 1.29 3
## Marital_Status 8 137517 0.41 0.49 0
## Product_Category_1 9 137517 5.38 3.93 5
## Product_Category_2 10 94314 9.82 5.08 9
## Product_Category_3 11 42035 12.67 4.13 14
## trimmed mad min max range skew
## User_ID 1003021.88 2172.01 1000001 1006040 6039 0.00
## Product_ID* 1688.48 1200.91 1 3631 3630 0.15
## Gender* 1.82 0.00 1 2 1 -1.18
## Age* 3.35 1.48 1 7 6 0.81
## Occupation 7.69 8.90 0 20 20 0.40
## City_Category* 2.05 1.48 1 3 2 -0.07
## Stay_In_Current_City_Years* 2.82 1.48 1 5 4 0.32
## Marital_Status 0.39 0.00 0 1 1 0.37
## Product_Category_1 4.88 4.45 1 20 19 1.04
## Product_Category_2 9.97 7.41 2 18 16 -0.15
## Product_Category_3 13.07 2.97 3 18 15 -0.76
## kurtosis se
## User_ID -1.19 4.65
## Product_ID* -1.09 2.73
## Gender* -0.60 0.00
## Age* 0.30 0.00
## Occupation -1.22 0.02
## City_Category* -1.27 0.00
## Stay_In_Current_City_Years* -1.07 0.00
## Marital_Status -1.87 0.00
## Product_Category_1 1.27 0.01
## Product_Category_2 -1.43 0.02
## Product_Category_3 -0.81 0.02
Let’s First Find categorical and continuos variables in our training data set.
str(train_set)
## 'data.frame': 412551 obs. of 12 variables:
## $ User_ID : int 1000442 1000778 1004647 1002875 1001675 1003848 1002801 1003626 1004682 1002774 ...
## $ Product_ID : Factor w/ 3631 levels "P00000142","P00000242",..: 356 1673 2771 1017 2244 971 3510 1866 2134 1183 ...
## $ Gender : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 2 2 2 ...
## $ Age : Factor w/ 7 levels "0-17","18-25",..: 3 2 4 3 3 3 3 3 3 3 ...
## $ Occupation : int 1 17 20 2 6 15 3 17 7 1 ...
## $ City_Category : Factor w/ 3 levels "A","B","C": 1 3 2 2 2 3 1 2 2 1 ...
## $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 3 5 2 4 2 2 2 4 4 2 ...
## $ Marital_Status : int 0 0 1 1 1 0 1 0 1 0 ...
## $ Product_Category_1 : int 5 8 8 1 11 5 5 5 8 1 ...
## $ Product_Category_2 : int NA NA NA 2 NA NA 8 11 NA 2 ...
## $ Product_Category_3 : int NA NA NA 8 NA NA NA NA NA 15 ...
## $ Purchase : int 5472 9996 5864 19483 4592 8724 5183 5141 8050 15199 ...
Firstly doing the univariate exploration and modifying the data if deemed necessary.
one way contigency table
mytable <- with(train_set,table(Gender))
mytable
## Gender
## F M
## 102021 310530
lbls <- c("M","F")
pct <- round(mytable/sum(mytable)*100)
lbls <- paste(lbls, pct)
lbls <- paste(lbls,"%",sep="")
pie(mytable,labels = lbls)
mytable1 <- with(train_set,table(Marital_Status))
mytable1
## Marital_Status
## 0 1
## 243555 168996
lbls1 <- c("Single","Married")
pct1 <- round(mytable1/sum(mytable1)*100)
lbls1 <- paste(lbls1, pct1)
lbls1<- paste(lbls1,"%",sep="")
pie(mytable1,labels = lbls1)
mytable2 <- with(train_set,table(City_Category))
mytable2
## City_Category
## A B C
## 110815 173429 128307
lbls2 <- c("A","B","C")
pct2 <- round(mytable2/sum(mytable2)*100)
lbls2 <- paste(lbls2, pct2)
lbls2<- paste(lbls2,"%",sep="")
pie(mytable2,labels = lbls2)
par(mfrow = c(1, 2))
#train data
with(train_set,table(Stay_In_Current_City_Years))
## Stay_In_Current_City_Years
## 0 1 2 3 4+
## 55676 145504 76472 71518 63381
#test data
with(test_set,table(Stay_In_Current_City_Years))
## Stay_In_Current_City_Years
## 0 1 2 3 4+
## 18722 48317 25366 23767 21345
par(mfrow = c(1, 2))
a<-with(train_set,table(Product_Category_1))
#a
barplot(a,main ="train_data")
b<-with(test_set,table(Product_Category_1))
#b
barplot(b,main ="test_data")
par(mfrow = c(1, 2))
a<-with(train_set,table(Product_Category_2))
#a
barplot(a,main ="train_data")
b<-with(test_set,table(Product_Category_2))
#b
barplot(b,main ="test_data")
library(lattice)
par(mfrow = c(1, 2))
a<-with(train_set,table(Product_Category_3))
#a
barchart(a,main ="train_data",horizontal = "FALSE")
b<-with(test_set,table(Product_Category_3))
#b
barchart(b,main ="test_data",horizontal = "FALSE")
2 way table
xtabs(~Marital_Status+Gender,data=train_set)
## Gender
## Marital_Status F M
## 0 59207 184348
## 1 42814 126182
xtabs(Purchase~Product_Category_3,aggregate(Purchase~Product_Category_3,train_set,mean))
## Product_Category_3
## 3 4 5 6 8 9 10
## 14058.795 9806.369 12128.368 13179.604 13047.715 10388.366 13507.104
## 11 12 13 14 15 16 17
## 12113.981 8703.027 13237.030 10036.091 12342.484 11960.804 11737.853
## 18
## 11015.255
xtabs(Purchase~Product_Category_2,aggregate(Purchase~Product_Category_2,train_set,mean))
## Product_Category_2
## 2 3 4 5 6 7 8
## 13610.833 11222.781 10216.105 9033.057 11516.350 6894.512 10247.663
## 9 10 11 12 13 14 15
## 7329.170 15715.654 8893.459 6996.297 9662.622 7101.230 10345.805
## 16 17 18
## 10289.805 9421.296 9328.248
xtabs(Purchase~Product_Category_1,aggregate(Purchase~Product_Category_1,train_set,mean))
## Product_Category_1
## 1 2 3 4 5 6
## 13600.94860 11232.00870 10109.66465 2334.29817 6240.79881 15830.96146
## 7 8 9 10 11 12
## 16374.65185 7497.88064 15754.68932 19668.12699 4687.24631 1344.33266
## 13 14 15 16 17 18
## 720.56800 13143.73906 14790.16472 14754.58678 10167.59811 2979.82756
## 19 20
## 36.78061 369.85307
Since our data is stored based on product id i.e. if a person buying 10 products then his data will be stored in 10 observation and hence there will cause a repetition of same person data.
train_set$User_ID <- as.factor(train_set$User_ID)
train_set$Product_ID <- as.factor(train_set$Product_ID)
train_set$Marital_Status <- as.factor(ifelse(train_set$Marital_Status == 1, 'Married', 'Single'))
train_set$Age <- as.factor(train_set$Age)
train_set$Gender <- as.factor(ifelse(train_set$Gender=='M', 'Male', 'Female'))
train_set$Occupation <- as.factor(train_set$Occupation)
train_set$City_Category <- as.factor(train_set$City_Category)
train_set$Stay_In_Current_City_Years <- as.factor(train_set$Stay_In_Current_City_Years)
test_set$User_ID <- as.factor(test_set$User_ID)
test_set$Product_ID <- as.factor(test_set$Product_ID)
test_set$Marital_Status <- as.factor(ifelse(test_set$Marital_Status == 1, 'Married', 'Single'))
test_set$Age <- as.factor(test_set$Age)
test_set$Gender <- as.factor(ifelse(test_set$Gender=='M', 'Male', 'Female'))
test_set$Occupation <- as.factor(test_set$Occupation)
test_set$City_Category <- as.factor(test_set$City_Category)
test_set$Stay_In_Current_City_Years <- as.factor(test_set$Stay_In_Current_City_Years)
#str(train_set)
#str(test_set)
The function distinct() in dplyr package can be used to keep only unique/distinct rows from a data frame. If there are duplicate rows, only the first row is preserved. It’s an efficient version of the R base function unique().
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
EDA_Distinct <- distinct(train_set, User_ID, Age, Gender, Marital_Status, Occupation, City_Category, Stay_In_Current_City_Years)
#str(EDA_Distinct)
head(EDA_Distinct)
## User_ID Gender Age Occupation City_Category Stay_In_Current_City_Years
## 1 1000442 Male 26-35 1 A 2
## 2 1000778 Male 18-25 17 C 4+
## 3 1004647 Male 36-45 20 B 1
## 4 1002875 Male 26-35 2 B 3
## 5 1001675 Female 26-35 6 B 1
## 6 1003848 Male 26-35 15 C 1
## Marital_Status
## 1 Single
## 2 Single
## 3 Married
## 4 Married
## 5 Married
## 6 Single
Creating new variables User_Purchase_Count, Total_Spending and Avg_Spending_on_each_Product
Total number of product purchased bu customer
#creating a new data frame to stor the number of purchase made by each user
userIDCount <- as.data.frame(table(train_set$User_ID))
names(userIDCount) <- c("User_ID","User_Purchase_Count")
head(userIDCount)
## User_ID User_Purchase_Count
## 1 1000001 24
## 2 1000002 63
## 3 1000003 25
## 4 1000004 13
## 5 1000005 78
## 6 1000006 37
merging df
train_set <- merge(x = train_set, y = userIDCount, by = "User_ID", all.x = TRUE)
changing the test dataset too
# writing code such that if a new user comes for the first time his count is set to one in test dataset
test_set <- merge(x = test_set, y = userIDCount, by = "User_ID", all.x = TRUE)
test_set[is.na(test_set$User_Purchase_Count), "User_Purchase_Count"] <- 1
#Changing the datatype
test_set$User_Purchase_Count <- as.integer(test_set$User_Purchase_Count)
Total Spending by a User
Changing the Training Dataset
# creating a data frame to store the total spending by a user
totspend <- aggregate(train_set$Purchase, by=list(Category=train_set$User_ID), FUN=sum)
names(totspend) <- c("User_ID","Total_Spending")
train_set <- merge(x = train_set, y = totspend, by = "User_ID", all.x = TRUE)
#head(train_set)
changing test too
test_set <- merge(x = test_set, y = totspend, by = "User_ID", all.x = TRUE)
# writing code such that if a new user comes for the first time his count is set to zero for total spending in test dataset
test_set[is.na(test_set$Total_Spending), "Total_Spending"] <- 0
#Changing the datatype
test_set$Total_Spending <- as.integer(test_set$Total_Spending)
Average Spending on each product by a customer
Now calculating the average spending of user on each product we can divide Total_Spending by User_Purchase Count. Avg_Spending_on_each_Product = Total_Spending / User_Purchase_Count
train_set$Avg_Spending_on_each_Product <- train_set$Total_Spending / train_set$User_Purchase_Count
test_set$Avg_Spending_on_each_Product <- test_set$Total_Spending / test_set$User_Purchase_Count
#Changing the datatype
train_set$Avg_Spending_on_each_Product <- as.integer(train_set$Avg_Spending_on_each_Product)
test_set$Avg_Spending_on_each_Product <- as.integer(test_set$Avg_Spending_on_each_Product)
summary(totspend$Total_Spending)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20125 178411 388922 648555 842252 8050867
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(totspend, aes(x=Total_Spending)) + geom_density(fill="red", col="black", alpha=0.80)
removing unused df
rm(userIDCount, totspend)
#Updating EDA_Distinct dataframe
EDA_Distinct <- distinct(train_set, User_ID, Age, Gender, Marital_Status, Occupation, City_Category, Stay_In_Current_City_Years, User_Purchase_Count)
d1 <- summary(EDA_Distinct$User_Purchase_Count)
d1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 19.00 40.00 70.03 87.00 779.00
ggplot(EDA_Distinct, aes(x=User_Purchase_Count)) + geom_density(fill="red", col="black", alpha=0.80) + annotate(geom = "text", x = 6, y = 0.0125, label = "Min")
histogram(EDA_Distinct$User_Purchase_Count)
bwplot(Age~User_Purchase_Count|Marital_Status+Gender,data = EDA_Distinct)
bwplot(User_Purchase_Count~Occupation|Marital_Status+Gender,data = EDA_Distinct)
head(train_set$Product_ID,15)
## [1] P00111842 P00297042 P0096542 P00210342 P00000142 P00248942 P00064042
## [8] P00004842 P00051842 P00025442 P00085442 P00051442 P00058142 P0097142
## [15] P00214842
## 3631 Levels: P00000142 P00000242 P00000342 P00000442 P00000542 ... P0099942
i.e. there are a total of 3631 number of product
Creating new variables Total number of product unit sold, Average price of a product, Sd of the product price
SoldProdCount <- as.data.frame(table(train_set$Product_ID))
names(SoldProdCount) <- c("Product_ID","Product_Sold_Count")
SoldProdPriceMean <- aggregate(train_set$Purchase, by=list(Category=train_set$Product_ID), FUN=mean)
names(SoldProdPriceMean) <- c("Product_ID","Product_Mean_Price")
SoldProdPriceSD <- aggregate(train_set$Purchase, by=list(Category=train_set$Product_ID), FUN=sd)
names(SoldProdPriceSD) <- c("Product_ID","Product_SD_Price")
summary(SoldProdCount$Product_Sold_Count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 53.0 113.6 146.0 1420.0
histogram(SoldProdCount$Product_Sold_Count)
histogram(SoldProdPriceMean$Product_Mean_Price)
histogram(SoldProdPriceSD$Product_SD_Price)
changing variables type to numeric
corr_train <- train_set
corr_train$User_ID <- as.numeric(corr_train$User_ID)
corr_train$User_Purchase_Count <- as.numeric(corr_train$User_Purchase_Count)
corr_train$Product_ID <- as.numeric(corr_train$Product_ID)
corr_train$Gender <- as.numeric(ifelse(corr_train$Gender=="Male", 1, 0))
corr_train$Age <- as.numeric(ifelse(corr_train$Age=='0-17', 17, ifelse(corr_train$Age=='18-25', 25, ifelse(corr_train$Age=='26-35', 35, ifelse(corr_train$Age=='36-45', 45, ifelse(corr_train$Age=='46-50', 50, ifelse(corr_train$Age=='51-55', 55, 65)))))))
corr_train$Marital_Status <- as.numeric(ifelse(corr_train$Marital_Status=="Married", '1', '0'))
corr_train$Occupation <- as.numeric(corr_train$Occupation)
corr_train$City_Category <- as.numeric(ifelse(corr_train$City_Category=='A', 1, ifelse(corr_train$City_Category=='B', 2, 3)))
corr_train$Stay_In_Current_City_Years <- as.numeric(ifelse(corr_train$Stay_In_Current_City_Years=='4+', 6, corr_train$Stay_In_Current_City_Years))
corr_train$Total_Spending <- as.numeric(corr_train$Total_Spending)
corr_train$Avg_Spending_on_each_Product <- as.numeric(corr_train$Avg_Spending_on_each_Product)
#target Variable
Y <- as.numeric(corr_train$Purchase)
corr_train$Product_Category_2<-NULL
corr_train$Product_Category_3<-NULL
cor(corr_train)
## User_ID Product_ID Gender
## User_ID 1.000000000 -0.017397001 -0.03366321
## Product_ID -0.017397001 1.000000000 0.01603059
## Gender -0.033663212 0.016030587 1.00000000
## Age 0.038613234 0.023934173 -0.00377867
## Occupation -0.024521599 0.008086456 0.11825402
## City_Category 0.023437825 0.007097120 -0.00510242
## Stay_In_Current_City_Years -0.029746261 -0.002536608 0.01863580
## Marital_Status 0.020102169 0.012267187 -0.01168131
## Product_Category_1 0.003570070 0.074596632 -0.04595016
## Purchase 0.004244348 -0.109294620 0.06084054
## User_Purchase_Count -0.031272142 -0.002597573 0.07374752
## Total_Spending -0.027597276 -0.004162652 0.10852446
## Avg_Spending_on_each_Product 0.012974786 -0.020602205 0.18595405
## Age Occupation City_Category
## User_ID 0.0386132336 -0.024521599 0.02343783
## Product_ID 0.0239341730 0.008086456 0.00709712
## Gender -0.0037786703 0.118254023 -0.00510242
## Age 1.0000000000 0.096776311 0.11337164
## Occupation 0.0967763113 1.000000000 0.03540659
## City_Category 0.1133716424 0.035406593 1.00000000
## Stay_In_Current_City_Years -0.0001599429 0.027063059 0.01808487
## Marital_Status 0.3051791632 0.024521869 0.04014877
## Product_Category_1 0.0620364981 -0.006648511 -0.01352877
## Purchase 0.0140026938 0.020191945 0.06214326
## User_Purchase_Count -0.0436288574 -0.005070194 -0.52066133
## Total_Spending -0.0371116592 0.002349550 -0.50084781
## Avg_Spending_on_each_Product 0.0428040268 0.061716704 0.18993773
## Stay_In_Current_City_Years Marital_Status
## User_ID -0.0297462606 0.020102169
## Product_ID -0.0025366084 0.012267187
## Gender 0.0186357973 -0.011681306
## Age -0.0001599429 0.305179163
## Occupation 0.0270630585 0.024521869
## City_Category 0.0180848717 0.040148774
## Stay_In_Current_City_Years 1.0000000000 -0.011127002
## Marital_Status -0.0111270023 1.000000000
## Product_Category_1 -0.0016295069 0.019947693
## Purchase 0.0040676009 -0.002036816
## User_Purchase_Count 0.0028393079 0.001716275
## Total_Spending 0.0012801978 -0.004697633
## Avg_Spending_on_each_Product 0.0124346645 -0.006223832
## Product_Category_1 Purchase
## User_ID 0.003570070 0.004244348
## Product_ID 0.074596632 -0.109294620
## Gender -0.045950164 0.060840541
## Age 0.062036498 0.014002694
## Occupation -0.006648511 0.020191945
## City_Category -0.013528773 0.062143257
## Stay_In_Current_City_Years -0.001629507 0.004067601
## Marital_Status 0.019947693 -0.002036816
## Product_Category_1 1.000000000 -0.342337409
## Purchase -0.342337409 1.000000000
## User_Purchase_Count 0.017942766 -0.090045377
## Total_Spending 0.003742677 -0.037563844
## Avg_Spending_on_each_Product -0.114980526 0.327180095
## User_Purchase_Count Total_Spending
## User_ID -0.031272142 -0.027597276
## Product_ID -0.002597573 -0.004162652
## Gender 0.073747517 0.108524461
## Age -0.043628857 -0.037111659
## Occupation -0.005070194 0.002349550
## City_Category -0.520661326 -0.500847805
## Stay_In_Current_City_Years 0.002839308 0.001280198
## Marital_Status 0.001716275 -0.004697633
## Product_Category_1 0.017942766 0.003742677
## Purchase -0.090045377 -0.037563844
## User_Purchase_Count 1.000000000 0.973065973
## Total_Spending 0.973065973 1.000000000
## Avg_Spending_on_each_Product -0.275222715 -0.114816494
## Avg_Spending_on_each_Product
## User_ID 0.012974786
## Product_ID -0.020602205
## Gender 0.185954046
## Age 0.042804027
## Occupation 0.061716704
## City_Category 0.189937728
## Stay_In_Current_City_Years 0.012434664
## Marital_Status -0.006223832
## Product_Category_1 -0.114980526
## Purchase 0.327180095
## User_Purchase_Count -0.275222715
## Total_Spending -0.114816494
## Avg_Spending_on_each_Product 1.000000000
library(corrplot)
## corrplot 0.84 loaded
M <- cor(corr_train)
corrplot(M, method="number")