Data explored here istaken from Analytics Vidya practice problem.

The code to create new features is taken from another member (vopani)

loading libraries

library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(plyr)
library(ggplot2)
#train <- read.csv("C:/Users/Admin/Downloads/Black Friday/train_oSwQCTC/train.csv", stringsAsFactors=F)
train <- read.csv("train.csv", stringsAsFactors=F)

onehot-encoding city variable

X_train <- train
X_train <- dummy.data.frame(X_train, names=c("City_Category"), sep="_")

Converting age variable to numeric

X_train$Age[X_train$Age == "0-17"] <- "15"
X_train$Age[X_train$Age == "18-25"] <- "21"
X_train$Age[X_train$Age == "26-35"] <- "30"
X_train$Age[X_train$Age == "36-45"] <- "40"
X_train$Age[X_train$Age == "46-50"] <- "48"
X_train$Age[X_train$Age == "51-55"] <- "53"
X_train$Age[X_train$Age == "55+"] <- "60"
X_train$Age <- as.integer(X_train$Age)

Converting stay in current city to numeric

X_train$Stay_In_Current_City_Years[X_train$Stay_In_Current_City_Years == "4+"] <- "4"
X_train$Stay_In_Current_City_Years <- as.integer(X_train$Stay_In_Current_City_Years)

converting Gender to binary

X_train$Gender <- ifelse(X_train$Gender == "F", 1, 0)

Feature representing the count of each user

user_count <- ddply(X_train, .(User_ID), nrow)
names(user_count)[2] <- "User_Count"
X_train <- merge(X_train, user_count, by="User_ID")

Feature representing the count of each product

product_count <- ddply(X_train, .(Product_ID), nrow)
names(product_count)[2] <- "Product_Count"
X_train <- merge(X_train, product_count, by="Product_ID")

Feature representing the average Purchase of each product

product_mean <- ddply(X_train, .(Product_ID), summarize, Product_Mean=mean(Purchase))
X_train <- merge(X_train, product_mean, by="Product_ID")

feature representing the proportion of times the user purchases the product more than the product’s average

X_train$flag_high <- ifelse(X_train$Purchase > X_train$Product_Mean,1,0)
user_high <- ddply(X_train, .(User_ID), summarize, User_High=mean(flag_high))
X_train <- merge(X_train, user_high, by="User_ID")

Add User_High column to the original dataset also

train$flag_high <- ifelse(X_train$Purchase > X_train$Product_Mean,1,0)
user_high <- ddply(train, .(User_ID), summarize, User_High=mean(flag_high))
train <- merge(train, user_high, by="User_ID")
X_train <- merge(X_train, user_high, by="User_ID")

Structure of the train dataset

str(train)
## 'data.frame':    550068 obs. of  14 variables:
##  $ User_ID                   : int  1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 ...
##  $ Product_ID                : chr  "P00069042" "P00248942" "P00087842" "P00085442" ...
##  $ Gender                    : chr  "F" "F" "F" "F" ...
##  $ Age                       : chr  "0-17" "0-17" "0-17" "0-17" ...
##  $ Occupation                : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ City_Category             : chr  "A" "A" "A" "A" ...
##  $ Stay_In_Current_City_Years: chr  "2" "2" "2" "2" ...
##  $ Marital_Status            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Product_Category_1        : int  3 1 12 12 1 3 14 8 8 2 ...
##  $ Product_Category_2        : int  NA 6 NA 14 2 4 NA NA NA 4 ...
##  $ Product_Category_3        : int  NA 14 NA NA 9 12 NA NA NA 8 ...
##  $ Purchase                  : int  8370 15200 1422 1057 15416 10572 11011 10003 8094 12842 ...
##  $ flag_high                 : num  1 0 1 0 0 0 1 1 1 1 ...
##  $ User_High                 : num  0.486 0.486 0.486 0.486 0.486 ...

Structure of the X_train dataset which has newly created

str(X_train)
## 'data.frame':    550068 obs. of  20 variables:
##  $ User_ID                   : int  1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 1000001 ...
##  $ Product_ID                : chr  "P00051442" "P00258742" "P00178342" "P00085442" ...
##  $ Gender                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Age                       : int  15 15 15 15 15 15 15 15 15 15 ...
##  $ Occupation                : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ City_Category_A           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ City_Category_B           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ City_Category_C           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stay_In_Current_City_Years: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ Marital_Status            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Product_Category_1        : int  8 5 8 12 14 8 3 1 3 6 ...
##  $ Product_Category_2        : int  17 NA NA 14 NA NA 4 6 4 8 ...
##  $ Product_Category_3        : int  NA NA NA NA NA NA 5 14 NA 16 ...
##  $ Purchase                  : int  9938 6910 7887 1057 11011 7953 13650 15200 7943 16622 ...
##  $ User_Count                : int  35 35 35 35 35 35 35 35 35 35 ...
##  $ Product_Count             : int  1249 964 369 341 179 197 1152 581 232 1406 ...
##  $ Product_Mean              : num  8956 7003 7706 1455 12591 ...
##  $ flag_high                 : num  1 0 1 0 0 1 1 0 0 0 ...
##  $ User_High.x               : num  0.543 0.543 0.543 0.543 0.543 ...
##  $ User_High.y               : num  0.486 0.486 0.486 0.486 0.486 ...

Random sample the data set. Just 5% of the data set was take as a sample to explore

library(caTools)
set.seed(3000)
spl = sample.split(train$User_ID, SplitRatio = 0.05)
Pdata=train[spl,]
Pdata$Marital_Status <- ifelse(Pdata$Marital_Status == 1, 'M', 'UM')

How is the distribution of Purchases?. The distributions right skewed and the mean amd meedian for the sample and the population are approximately same.

summary(X_train[spl,]$Purchase)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5834    8045    9252   12040   23960
summary(X_train$Purchase)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5823    8047    9264   12050   23960

Logtransfermation

library(gridExtra) 
NoScale<-qplot(x=Purchase,data=Pdata) # no scaleing
logScale<-qplot(x=log10(Purchase),data=Pdata) # x ais is not scaled by log10
#logScale<-ggplot(aes(x=Purchase),data=Pdata)+geom_histogram()+scale_x_log10() --> X-axis is scaled by log 10, this also works
grid.arrange(NoScale,logScale,ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Look at the gender inbalance in the data set

table(Pdata$Gender)   
## 
##     F     M 
##  6854 20869

The below frequency poligon says that the proportion of male customer is higher than female purchases

ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Gender))) + 
  geom_freqpoly(aes(color = Gender), binwidth=10) + 
  scale_x_continuous(limits = c(0, 27723), breaks = seq(0, 27723, 50)) + 
  xlab('Purchases') + 
  ylab('Proportion of customers with that Purchase amount')
## Warning: Removed 4 rows containing missing values (geom_path).

Lets transform the data and see more in detail. Yes male proportions are in high in purchase

ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Gender))) + 
  geom_freqpoly(aes(color = Gender), binwidth=10) + 
  xlab('Purchases') + 
  ylab('Proportion of customers with that Purchase amount')+scale_x_sqrt()

How prices are distributed The below graph shows that the prices are spread in a multy model distribution.There are many peaks and crests,therefore it is clear the data is highly variable.

ggplot(train,aes(x=Purchase))+geom_density()

Purchase by gender, statistical exploration. Females distribution is more skewed right

by(Pdata$Purchase,Pdata$Gender,summary)   
## Pdata$Gender: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5418    7894    8691   11410   23950 
## -------------------------------------------------------- 
## Pdata$Gender: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5892    8099    9476   12540   23950

There is a gender imbalance

There are more outliers in the female purchases and an average female shops lesser than male.

qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot')

Outlier : is 1.5 times IQR distance from the median. Set the limits to remove the outliers. So when we see these plots it is clear that the female customers shoppe less than the male

qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot',ylim=c(0,17500))  
## Warning: Removed 2243 rows containing non-finite values (stat_boxplot).

qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot')+
scale_y_continuous(limits=c(0,15750))  
## Warning: Removed 3727 rows containing non-finite values (stat_boxplot).

Set the limits to remove the outliers.Suppose, we just wanted to remove the outliers in the visualization, but not in the actual plotting (which means for calculating mean, median etc) .You can see that the graph details matches with the above statistics by using coord_cartesian function

by(Pdata$Purchase,Pdata$Gender,summary)  #This is to summarize the Purchases by Gender numerically.
## Pdata$Gender: F
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5418    7894    8691   11410   23950 
## -------------------------------------------------------- 
## Pdata$Gender: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5892    8099    9476   12540   23950
qplot(x=Gender, y=Purchase,data=subset(Pdata, !is.na(Gender)),geom='boxplot')+coord_cartesian(ylim=c(0,15750))

product_count is the number of times the product is appearing. This means products appearing from 50 to 500 are sold more frequently compared to 1000 to the end. This means very less proportion ifrequently sold

p1<-ggplot(X_train,aes(x=Product_Count))+xlab('Number of times a product sold')+geom_density() 
p2<-ggplot(X_train,aes(x=Product_Count))+xlab('Number of times a product sold')+geom_density()+scale_x_log10() #lets do a transform for fun
grid.arrange(p1,p2)

Below plot shows the distribution of purchase frequencies and distribution of Occupation frequencies and age frequencies as well

library(gridExtra) 
pl1<-ggplot(X_train[spl,],aes(x=Purchase))+geom_density()+ylab('Frequancy')     
pl2<-ggplot(X_train[spl,],aes(x=Occupation))+geom_density()+ylab('Shopper frequency') 
pl3<-ggplot(X_train[spl,],aes(x=Age))+geom_density()+ylab('Frequancy')
grid.arrange(pl1,pl2,pl3,ncol=1)

If we take product frequency(Product_count) , the city C has higher in low frequency purchased products.

pl1<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_A))+geom_density()+facet_grid(City_Category_A ~.)+
     scale_x_continuous(limits = c(0,2000))
pl2<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_B))+geom_density()+facet_grid(City_Category_B ~.)+
     scale_x_continuous(limits = c(0,2000))
pl3<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_C))+geom_density()+facet_grid(City_Category_C ~.)+
     scale_x_continuous(limits = c(0,2000))
grid.arrange(pl1,pl2,pl3,ncol=1)

When we zoomed more on the right, City B has higher in product frequency in high volume purchases.

pl1<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_A))+geom_density()+facet_grid(City_Category_A ~.)+
  scale_x_continuous(limits = c(1500,1700))
pl2<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_B))+geom_density()+facet_grid(City_Category_B ~.)+
  scale_x_continuous(limits = c(1500,1700))
pl3<-ggplot(X_train[spl,],aes(x=Product_Count,fill=City_Category_C))+geom_density()+facet_grid(City_Category_C ~.)+
scale_x_continuous(limits = c(1500,1700))
grid.arrange(pl1,pl2,pl3,ncol=1)
## Warning: Removed 27497 rows containing non-finite values (stat_density).

## Warning: Removed 27497 rows containing non-finite values (stat_density).

## Warning: Removed 27497 rows containing non-finite values (stat_density).

How Marrital status is effecting the purchase? Check for imbalance

table(Pdata$Marital_Status)
## 
##     M    UM 
## 11345 16378

Imbalance observed

by(Pdata$Purchase,Pdata$Marital_Status,summary)   
## Pdata$Marital_Status: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5882    8040    9306   12060   23950 
## -------------------------------------------------------- 
## Pdata$Marital_Status: UM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5481    8042    9265   12070   23950

How about the purchases in the case of married an unmarried? Unmarried are the frquent shoppers

ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Marital_Status))) + 
  geom_freqpoly(aes(color = Marital_Status), binwidth=10) + 
  scale_x_continuous(limits = c(0, 27723), breaks = seq(0, 27723, 50)) + 
  xlab('Purchases') + 
  ylab('Proportion of customers Married and unmarried')
## Warning: Removed 4 rows containing missing values (geom_path).

Not clear. Lets transform data and see!

ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Marital_Status))) + 
  geom_freqpoly(aes(color = Marital_Status), binwidth=10) + 
  scale_x_sqrt()+
  xlab('Purchases') + 
  ylab('Proportion of customers Married and unmarried')

Conclusion.

By looking at these plots , It is obvious that the unmarried males are buying more!. What is happening? Once they get married males are not buying due to financial planning or something else?

p1<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Gender))) + 
  geom_freqpoly(aes(color = Gender), binwidth=10) + 
  xlab('Purchases') + 
  ylab('Proportion of customers with that Purchase amount')+scale_x_sqrt()

p2<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Marital_Status))) + 
  geom_freqpoly(aes(color = Marital_Status), binwidth=10) + 
  scale_x_sqrt()+
  xlab('Purchases') + 
  ylab('Proportion of customers Married and unmarried')

grid.arrange(p1,p2,ncol=2)

Lets see how married males and married females are doing

p1<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Gender=='M')) + 
  geom_freqpoly(aes(color = Marital_Status), binwidth=10) + 
  scale_x_sqrt()+
  xlab('Male Purchases') + 
  ylab('Proportion of male customers Married and unmarried')

p2<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Gender=='F')) + 
  geom_freqpoly(aes(color = Marital_Status), binwidth=10) + 
  scale_x_sqrt()+
  xlab('Female Purchases') + 
  ylab('Proportion of female customers Married and unmarried')

p3<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='M')) + 
  geom_freqpoly(aes(color = Gender), binwidth=10) + 
  scale_x_sqrt()+
  xlab('Male Purchases') + 
  ylab('Proportion of Married customers Male and Female')

p4<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='UM')) + 
  geom_freqpoly(aes(color = Gender), binwidth=10) + 
  scale_x_sqrt()+
  xlab('Female Purchases') + 
  ylab('Proportion of unmarried customers Male and Female')

grid.arrange(p1,p2,p3,p4,ncol=2)

There are more changes in female purchase after marriage than male purchase.

t1<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='M' & Pdata$Gender=='M')) + 
  geom_freqpoly(binwidth=10) + 
  scale_x_sqrt()+
  xlab('Male Purchases') + 
  ylab('Sale Proportion of married Male ')

t2<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='UM' & Pdata$Gender=='M')) + 
  geom_freqpoly(binwidth=10) + 
  scale_x_sqrt()+
  xlab('Male Purchases') + 
  ylab('Sale Proportion of unmarried Male')
grid.arrange(t1,t2)

t3<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='M' & Pdata$Gender=='F')) + 
  geom_freqpoly(binwidth=10) + 
  scale_x_sqrt()+
  xlab('Female Purchases') + 
  ylab('Sale Proportion of Married Female')

t4<-ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata,Pdata$Marital_Status=='UM' & Pdata$Gender=='F')) + 
  geom_freqpoly(binwidth=10) + 
  scale_x_sqrt()+
  xlab('Female Purchases') + 
  ylab('Sale Proportion of unmarried Female')
grid.arrange(t1,t2,t3,t4)

It is evident after looking at the below stats

data = subset(Pdata,Pdata$Gender=='M')
by(data$Purchase,data$Marital_Status,summary)
## data$Marital_Status: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5909    8080    9444   12400   23930 
## -------------------------------------------------------- 
## data$Marital_Status: UM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5876    8112    9497   12630   23950
data = subset(Pdata,Pdata$Gender=='F')
by(data$Purchase,data$Marital_Status,summary)
## data$Marital_Status: M
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      14    5778    7937    8897   11550   23950 
## -------------------------------------------------------- 
## data$Marital_Status: UM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      12    5362    7860    8543   10920   23700