loading libraries
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(plyr)
library(ggplot2)
#train <- read.csv("C:/Users/Admin/Downloads/Black Friday/train_oSwQCTC/train.csv", stringsAsFactors=F)
train <- read.csv("train.csv", stringsAsFactors=F)
onehot-encoding city variable
X_train <- train
X_train <- dummy.data.frame(X_train, names=c("City_Category"), sep="_")
Converting age variable to numeric
X_train$Age[X_train$Age == "0-17"] <- "15"
X_train$Age[X_train$Age == "18-25"] <- "21"
X_train$Age[X_train$Age == "26-35"] <- "30"
X_train$Age[X_train$Age == "36-45"] <- "40"
X_train$Age[X_train$Age == "46-50"] <- "48"
X_train$Age[X_train$Age == "51-55"] <- "53"
X_train$Age[X_train$Age == "55+"] <- "60"
X_train$Age <- as.integer(X_train$Age)
Converting stay in current city to numeric
X_train$Stay_In_Current_City_Years[X_train$Stay_In_Current_City_Years == "4+"] <- "4"
X_train$Stay_In_Current_City_Years <- as.integer(X_train$Stay_In_Current_City_Years)
converting Gender to binary
X_train$Gender <- ifelse(X_train$Gender == "F", 1, 0)
Feature representing the count of each user
user_count <- ddply(X_train, .(User_ID), nrow)
names(user_count)[2] <- "User_Count"
X_train <- merge(X_train, user_count, by="User_ID")
Feature representing the count of each product
product_count <- ddply(X_train, .(Product_ID), nrow)
names(product_count)[2] <- "Product_Count"
X_train <- merge(X_train, product_count, by="Product_ID")
Feature representing the average Purchase of each product
product_mean <- ddply(X_train, .(Product_ID), summarize, Product_Mean=mean(Purchase))
X_train <- merge(X_train, product_mean, by="Product_ID")
feature representing the proportion of times the user purchases the product more than the product’s average
X_train$flag_high <- ifelse(X_train$Purchase > X_train$Product_Mean,1,0)
user_high <- ddply(X_train, .(User_ID), summarize, User_High=mean(flag_high))
X_train <- merge(X_train, user_high, by="User_ID")
Add User_High column to the original dataset also
train$flag_high <- ifelse(X_train$Purchase > X_train$Product_Mean,1,0)
user_high <- ddply(train, .(User_ID), summarize, User_High=mean(flag_high))
train <- merge(train, user_high, by="User_ID")
X_train <- merge(X_train, user_high, by="User_ID")
Structure of the train dataset
str(train)
## 'data.frame': 550068 obs. of 14 variables:
## $ User_ID : int 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 ...
## $ Product_ID : chr "P00069042" "P00248942" "P00087842" "P00085442" ...
## $ Gender : chr "F" "F" "F" "F" ...
## $ Age : chr "0-17" "0-17" "0-17" "0-17" ...
## $ Occupation : int 10 10 10 10 10 10 10 10 10 10 ...
## $ City_Category : chr "A" "A" "A" "A" ...
## $ Stay_In_Current_City_Years: chr "2" "2" "2" "2" ...
## $ Marital_Status : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Product_Category_1 : int 3 1 12 12 1 3 14 8 8 2 ...
## $ Product_Category_2 : int NA 6 NA 14 2 4 NA NA NA 4 ...
## $ Product_Category_3 : int NA 14 NA NA 9 12 NA NA NA 8 ...
## $ Purchase : int 8370 15200 1422 1057 15416 10572 11011 10003 8094 12842 ...
## $ flag_high : num 1 0 1 0 0 0 1 1 1 1 ...
## $ User_High : num 0.486 0.486 0.486 0.486 0.486 ...
Structure of the X_train dataset which has newly created
str(X_train)
## 'data.frame': 550068 obs. of 20 variables:
## $ User_ID : int 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 ...
## $ Product_ID : chr "P00051442" "P00258742" "P00178342" "P00085442" ...
## $ Gender : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 15 15 15 15 15 15 15 15 15 15 ...
## $ Occupation : int 10 10 10 10 10 10 10 10 10 10 ...
## $ City_Category_A : int 1 1 1 1 1 1 1 1 1 1 ...
## $ City_Category_B : int 0 0 0 0 0 0 0 0 0 0 ...
## $ City_Category_C : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Stay_In_Current_City_Years: int 2 2 2 2 2 2 2 2 2 2 ...
## $ Marital_Status : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Product_Category_1 : int 8 5 8 12 14 8 3 1 3 6 ...
## $ Product_Category_2 : int 17 NA NA 14 NA NA 4 6 4 8 ...
## $ Product_Category_3 : int NA NA NA NA NA NA 5 14 NA 16 ...
## $ Purchase : int 9938 6910 7887 1057 11011 7953 13650 15200 7943 16622 ...
## $ User_Count : int 35 35 35 35 35 35 35 35 35 35 ...
## $ Product_Count : int 1249 964 369 341 179 197 1152 581 232 1406 ...
## $ Product_Mean : num 8956 7003 7706 1455 12591 ...
## $ flag_high : num 1 0 1 0 0 1 1 0 0 0 ...
## $ User_High.x : num 0.543 0.543 0.543 0.543 0.543 ...
## $ User_High.y : num 0.486 0.486 0.486 0.486 0.486 ...
Random sample the data set. Just 5% of the data set was take as a sample to explore
library(caTools)
set.seed(3000)
spl = sample.split(train$User_ID, SplitRatio = 0.05)
Pdata=train[spl,]
Pdata$Marital_Status <- ifelse(Pdata$Marital_Status == 1, 'M', 'UM')
How is the distribution of Purchases?. The distributions right skewed and the mean amd meedian for the sample and the population are approximately same.
summary(X_train[spl,]$Purchase)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5834 8045 9252 12040 23960
summary(X_train$Purchase)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5823 8047 9264 12050 23960
Logtransfermation
library(gridExtra)
NoScale<-qplot(x=Purchase,data=Pdata) # no scaleing
logScale<-qplot(x=log10(Purchase),data=Pdata) # x ais is not scaled by log10
#logScale<-ggplot(aes(x=Purchase),data=Pdata)+geom_histogram()+scale_x_log10() --> X-axis is scaled by log 10, this also works
grid.arrange(NoScale,logScale,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Look at the gender inbalance in the data set
table(Pdata$Gender)
##
## F M
## 6854 20869
The below frequency poligon says that the proportion of male customer is higher than female purchases
ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Gender))) +
geom_freqpoly(aes(color = Gender), binwidth=10) +
scale_x_continuous(limits = c(0, 27723), breaks = seq(0, 27723, 50)) +
xlab('Purchases') +
ylab('Proportion of customers with that Purchase amount')
## Warning: Removed 4 rows containing missing values (geom_path).
Lets transform the data and see more in detail. Yes male proportions are in high in purchase
ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Gender))) +
geom_freqpoly(aes(color = Gender), binwidth=10) +
xlab('Purchases') +
ylab('Proportion of customers with that Purchase amount')+scale_x_sqrt()
How prices are distributed The below graph shows that the prices are spread in a multy model distribution.There are many peaks and crests,therefore it is clear the data is highly variable.
ggplot(train,aes(x=Purchase))+geom_density()
Purchase by gender, statistical exploration. Females distribution is more skewed right
by(Pdata$Purchase,Pdata$Gender,summary)
## Pdata$Gender: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5418 7894 8691 11410 23950
## --------------------------------------------------------
## Pdata$Gender: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5892 8099 9476 12540 23950
There is a gender imbalance
There are more outliers in the female purchases and an average female shops lesser than male.
qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot')
Outlier : is 1.5 times IQR distance from the median. Set the limits to remove the outliers. So when we see these plots it is clear that the female customers shoppe less than the male
qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot',ylim=c(0,17500))
## Warning: Removed 2243 rows containing non-finite values (stat_boxplot).
qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot')+
scale_y_continuous(limits=c(0,15750))
## Warning: Removed 3727 rows containing non-finite values (stat_boxplot).
Set the limits to remove the outliers.Suppose, we just wanted to remove the outliers in the visualization, but not in the actual plotting (which means for calculating mean, median etc) .You can see that the graph details matches with the above statistics by using coord_cartesian function
by(Pdata$Purchase,Pdata$Gender,summary) #This is to summarize the Purchases by Gender numerically.
## Pdata$Gender: F
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5418 7894 8691 11410 23950
## --------------------------------------------------------
## Pdata$Gender: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5892 8099 9476 12540 23950
qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot')+coord_cartesian(ylim=c(0,15750))
product_count is the number of times the product is appearing. This means products appearing from 50 to 500 are sold more frequently compared to 1000 to the end. This means very less proportion ifrequently sold
p1<-ggplot(X_train,aes(x=Product_Count))+xlab('Number of times a product sold')+geom_density()
p2<-ggplot(X_train,aes(x=Product_Count))+xlab('Number of times a product sold')+geom_density()+scale_x_log10() #lets do a transform for fun
grid.arrange(p1,p2)
Below plot shows the distribution of purchase frequencies and distribution of Occupation frequencies and age frequencies as well
library(gridExtra)
pl1<-ggplot(X_train[spl,],aes(x=Purchase))+geom_density()+ylab('Frequancy')
pl2<-ggplot(X_train[spl,],aes(x=Occupation))+geom_density()+ylab('Shopper frequency')
pl3<-ggplot(X_train[spl,],aes(x=Age))+geom_density()+ylab('Frequancy')
grid.arrange(pl1,pl2,pl3,ncol=1)
If we take product frequency(Product_count) , the city C has higher in low frequency purchased products.
pl1<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_A))+geom_density()+facet_grid(City_Category_A ~.)+
scale_x_continuous(limits = c(0,2000))
pl2<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_B))+geom_density()+facet_grid(City_Category_B ~.)+
scale_x_continuous(limits = c(0,2000))
pl3<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_C))+geom_density()+facet_grid(City_Category_C ~.)+
scale_x_continuous(limits = c(0,2000))
grid.arrange(pl1,pl2,pl3,ncol=1)
When we zoomed more on the right, City B has higher in product frequency in high volume purchases.
pl1<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_A))+geom_density()+facet_grid(City_Category_A ~.)+
scale_x_continuous(limits = c(1500,1700))
pl2<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_B))+geom_density()+facet_grid(City_Category_B ~.)+
scale_x_continuous(limits = c(1500,1700))
pl3<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_C))+geom_density()+facet_grid(City_Category_C ~.)+
scale_x_continuous(limits = c(1500,1700))
grid.arrange(pl1,pl2,pl3,ncol=1)
## Warning: Removed 27497 rows containing non-finite values (stat_density).
## Warning: Removed 27497 rows containing non-finite values (stat_density).
## Warning: Removed 27497 rows containing non-finite values (stat_density).
How Marrital status is effecting the purchase? Check for imbalance
table(Pdata$Marital_Status)
##
## M UM
## 11345 16378
Imbalance observed
by(Pdata$Purchase,Pdata$Marital_Status,summary)
## Pdata$Marital_Status: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5882 8040 9306 12060 23950
## --------------------------------------------------------
## Pdata$Marital_Status: UM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5481 8042 9265 12070 23950
How about the purchases in the case of married an unmarried? Unmarried are the frquent shoppers
ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Marital_Status))) +
geom_freqpoly(aes(color = Marital_Status), binwidth=10) +
scale_x_continuous(limits = c(0, 27723), breaks = seq(0, 27723, 50)) +
xlab('Purchases') +
ylab('Proportion of customers Married and unmarried')
## Warning: Removed 4 rows containing missing values (geom_path).
Not clear. Lets transform data and see!
ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Marital_Status))) +
geom_freqpoly(aes(color = Marital_Status), binwidth=10) +
scale_x_sqrt()+
xlab('Purchases') +
ylab('Proportion of customers Married and unmarried')
By looking at these plots , It is obvious that the unmarried males are buying more!. What is happening? Once they get married males are not buying due to financial planning or something else?
p1<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Gender))) +
geom_freqpoly(aes(color = Gender), binwidth=10) +
xlab('Purchases') +
ylab('Proportion of customers with that Purchase amount')+scale_x_sqrt()
p2<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Marital_Status))) +
geom_freqpoly(aes(color = Marital_Status), binwidth=10) +
scale_x_sqrt()+
xlab('Purchases') +
ylab('Proportion of customers Married and unmarried')
grid.arrange(p1,p2,ncol=2)
Lets see how married males and married females are doing
p1<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Gender=='M')) +
geom_freqpoly(aes(color = Marital_Status), binwidth=10) +
scale_x_sqrt()+
xlab('Male Purchases') +
ylab('Proportion of male customers Married and unmarried')
p2<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Gender=='F')) +
geom_freqpoly(aes(color = Marital_Status), binwidth=10) +
scale_x_sqrt()+
xlab('Female Purchases') +
ylab('Proportion of female customers Married and unmarried')
p3<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='M')) +
geom_freqpoly(aes(color = Gender), binwidth=10) +
scale_x_sqrt()+
xlab('Male Purchases') +
ylab('Proportion of Married customers Male and Female')
p4<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='UM')) +
geom_freqpoly(aes(color = Gender), binwidth=10) +
scale_x_sqrt()+
xlab('Female Purchases') +
ylab('Proportion of unmarried customers Male and Female')
grid.arrange(p1,p2,p3,p4,ncol=2)
There are more changes in female purchase after marriage than male purchase.
t1<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='M' & Pdata$Gender=='M')) +
geom_freqpoly(binwidth=10) +
scale_x_sqrt()+
xlab('Male Purchases') +
ylab('Sale Proportion of married Male ')
t2<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='UM' & Pdata$Gender=='M')) +
geom_freqpoly(binwidth=10) +
scale_x_sqrt()+
xlab('Male Purchases') +
ylab('Sale Proportion of unmarried Male')
grid.arrange(t1,t2)
t3<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='M' & Pdata$Gender=='F')) +
geom_freqpoly(binwidth=10) +
scale_x_sqrt()+
xlab('Female Purchases') +
ylab('Sale Proportion of Married Female')
t4<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='UM' & Pdata$Gender=='F')) +
geom_freqpoly(binwidth=10) +
scale_x_sqrt()+
xlab('Female Purchases') +
ylab('Sale Proportion of unmarried Female')
grid.arrange(t1,t2,t3,t4)
It is evident after looking at the below stats
data = subset(Pdata,Pdata$Gender=='M')
by(data$Purchase,data$Marital_Status,summary)
## data$Marital_Status: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5909 8080 9444 12400 23930
## --------------------------------------------------------
## data$Marital_Status: UM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5876 8112 9497 12630 23950
data = subset(Pdata,Pdata$Gender=='F')
by(data$Purchase,data$Marital_Status,summary)
## data$Marital_Status: M
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14 5778 7937 8897 11550 23950
## --------------------------------------------------------
## data$Marital_Status: UM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12 5362 7860 8543 10920 23700