1. Problem Statement

To understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories so that sales of a retail company can be boosted.

We will be building models to predict the purchase amount of customer against various products which will help us to create personalized offer for customers against different products.

2. Setting up the environment

Setting up environment for analysis, loading data, packages, understanding variables.

2.1 Loading Data

getwd()

## [1] "D:/black friday project"

data_set<-read.csv(paste("train.csv",sep = ""))

## 75% of the sample size
smp_size <- floor(0.75 * nrow(data_set))

## set the seed to make your partition reproductible
set.seed(123)
train_ind <- sample(seq_len(nrow(data_set)), size = smp_size)

train_set <- data_set[train_ind, ]
test_set <- data_set[-train_ind, ]

y_test<-as.data.frame(test_set[,12], drop=false)
names(y_test)<-c("purchase")

test_set<-as.data.frame(test_set[,1:11], drop=false)
dim(test_set)

## [1] 137517     11

2.2 Checking dimensions of the data

dim(train_set)

## [1] 412551     12

dim(test_set)

## [1] 137517     11

Training data set consists of 4,12,551 entries across 12 variables.
Test Data set consists of 1,37,517 entries across 11 variables.

12th variable in Training Data set is our response Variable which needs to be predicted in test data set.

2.3 Understanding the Data set

library(psych)
describe(train_set)

##                             vars      n       mean      sd  median
## User_ID                        1 412551 1003030.35 1728.22 1003080
## Product_ID*                    2 412551    1708.53 1011.98    1668
## Gender*                        3 412551       1.75    0.43       2
## Age*                           4 412551       3.50    1.35       3
## Occupation                     5 412551       8.08    6.53       7
## City_Category*                 6 412551       2.04    0.76       2
## Stay_In_Current_City_Years*    7 412551       2.86    1.29       3
## Marital_Status                 8 412551       0.41    0.49       0
## Product_Category_1             9 412551       5.41    3.94       5
## Product_Category_2            10 282116       9.85    5.09       9
## Product_Category_3            11 124786      12.67    4.12      14
## Purchase                      12 412551    9261.00 5022.12    8044
##                                trimmed     mad     min     max range  skew
## User_ID                     1003028.59 2180.90 1000001 1006040  6039  0.00
## Product_ID*                    1688.75 1196.46       1    3631  3630  0.15
## Gender*                           1.82    0.00       1       2     1 -1.17
## Age*                              3.36    1.48       1       7     6  0.81
## Occupation                        7.69    8.90       0      20    20  0.40
## City_Category*                    2.05    1.48       1       3     2 -0.07
## Stay_In_Current_City_Years*       2.82    1.48       1       5     4  0.32
## Marital_Status                    0.39    0.00       0       1     1  0.37
## Product_Category_1                4.91    4.45       1      20    19  1.02
## Product_Category_2               10.00    7.41       2      18    16 -0.17
## Product_Category_3               13.07    2.97       3      18    15 -0.77
## Purchase                       8925.40 4250.61      12   23961 23949  0.60
##                             kurtosis   se
## User_ID                        -1.20 2.69
## Product_ID*                    -1.09 1.58
## Gender*                        -0.63 0.00
## Age*                            0.30 0.00
## Occupation                     -1.22 0.01
## City_Category*                 -1.27 0.00
## Stay_In_Current_City_Years*    -1.07 0.00
## Marital_Status                 -1.86 0.00
## Product_Category_1              1.22 0.01
## Product_Category_2             -1.43 0.01
## Product_Category_3             -0.81 0.01
## Purchase                       -0.34 7.82

describe(test_set)

##                             vars      n       mean      sd  median
## User_ID                        1 137517 1003024.33 1725.69 1003070
## Product_ID*                    2 137517    1708.31 1012.87    1665
## Gender*                        3 137517       1.75    0.43       2
## Age*                           4 137517       3.49    1.36       3
## Occupation                     5 137517       8.07    6.51       7
## City_Category*                 6 137517       2.04    0.76       2
## Stay_In_Current_City_Years*    7 137517       2.86    1.29       3
## Marital_Status                 8 137517       0.41    0.49       0
## Product_Category_1             9 137517       5.38    3.93       5
## Product_Category_2            10  94314       9.82    5.08       9
## Product_Category_3            11  42035      12.67    4.13      14
##                                trimmed     mad     min     max range  skew
## User_ID                     1003021.88 2172.01 1000001 1006040  6039  0.00
## Product_ID*                    1688.48 1200.91       1    3631  3630  0.15
## Gender*                           1.82    0.00       1       2     1 -1.18
## Age*                              3.35    1.48       1       7     6  0.81
## Occupation                        7.69    8.90       0      20    20  0.40
## City_Category*                    2.05    1.48       1       3     2 -0.07
## Stay_In_Current_City_Years*       2.82    1.48       1       5     4  0.32
## Marital_Status                    0.39    0.00       0       1     1  0.37
## Product_Category_1                4.88    4.45       1      20    19  1.04
## Product_Category_2                9.97    7.41       2      18    16 -0.15
## Product_Category_3               13.07    2.97       3      18    15 -0.76
##                             kurtosis   se
## User_ID                        -1.19 4.65
## Product_ID*                    -1.09 2.73
## Gender*                        -0.60 0.00
## Age*                            0.30 0.00
## Occupation                     -1.22 0.02
## City_Category*                 -1.27 0.00
## Stay_In_Current_City_Years*    -1.07 0.00
## Marital_Status                 -1.87 0.00
## Product_Category_1              1.27 0.01
## Product_Category_2             -1.43 0.02
## Product_Category_3             -0.81 0.02

3. Univariate Analysis

Let’s First Find categorical and continuos variables in our training data set.

str(train_set)

## 'data.frame':    412551 obs. of  12 variables:
##  $ User_ID                   : int  1000442 1000778 1004647 1002875 1001675 1003848 1002801 1003626 1004682 1002774 ...
##  $ Product_ID                : Factor w/ 3631 levels "P00000142","P00000242",..: 356 1673 2771 1017 2244 971 3510 1866 2134 1183 ...
##  $ Gender                    : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 2 2 2 ...
##  $ Age                       : Factor w/ 7 levels "0-17","18-25",..: 3 2 4 3 3 3 3 3 3 3 ...
##  $ Occupation                : int  1 17 20 2 6 15 3 17 7 1 ...
##  $ City_Category             : Factor w/ 3 levels "A","B","C": 1 3 2 2 2 3 1 2 2 1 ...
##  $ Stay_In_Current_City_Years: Factor w/ 5 levels "0","1","2","3",..: 3 5 2 4 2 2 2 4 4 2 ...
##  $ Marital_Status            : int  0 0 1 1 1 0 1 0 1 0 ...
##  $ Product_Category_1        : int  5 8 8 1 11 5 5 5 8 1 ...
##  $ Product_Category_2        : int  NA NA NA 2 NA NA 8 11 NA 2 ...
##  $ Product_Category_3        : int  NA NA NA 8 NA NA NA NA NA 15 ...
##  $ Purchase                  : int  5472 9996 5864 19483 4592 8724 5183 5141 8050 15199 ...

Data Prepartion & Exploratory Data Analysis

Firstly doing the univariate exploration and modifying the data if deemed necessary.

one way contigency table

mytable <- with(train_set,table(Gender))
mytable

## Gender
##      F      M 
## 102021 310530

lbls <- c("M","F")
pct <- round(mytable/sum(mytable)*100)
lbls <- paste(lbls, pct) 
lbls <- paste(lbls,"%",sep="")
pie(mytable,labels = lbls)

mytable1 <- with(train_set,table(Marital_Status))
mytable1

## Marital_Status
##      0      1 
## 243555 168996

lbls1 <- c("Single","Married")
pct1 <- round(mytable1/sum(mytable1)*100)
lbls1 <- paste(lbls1, pct1) 
lbls1<- paste(lbls1,"%",sep="")
pie(mytable1,labels = lbls1)

mytable2 <- with(train_set,table(City_Category))
mytable2

## City_Category
##      A      B      C 
## 110815 173429 128307

lbls2 <- c("A","B","C")
pct2 <- round(mytable2/sum(mytable2)*100)
lbls2 <- paste(lbls2, pct2) 
lbls2<- paste(lbls2,"%",sep="")
pie(mytable2,labels = lbls2)

par(mfrow = c(1, 2))
#train data
with(train_set,table(Stay_In_Current_City_Years))

## Stay_In_Current_City_Years
##      0      1      2      3     4+ 
##  55676 145504  76472  71518  63381

#test data
with(test_set,table(Stay_In_Current_City_Years))

## Stay_In_Current_City_Years
##     0     1     2     3    4+ 
## 18722 48317 25366 23767 21345

par(mfrow = c(1, 2))
a<-with(train_set,table(Product_Category_1))
#a
barplot(a,main ="train_data")
b<-with(test_set,table(Product_Category_1))
#b
barplot(b,main ="test_data")

par(mfrow = c(1, 2))
a<-with(train_set,table(Product_Category_2))
#a
barplot(a,main ="train_data")
b<-with(test_set,table(Product_Category_2))
#b
barplot(b,main ="test_data")

library(lattice)
par(mfrow = c(1, 2))
a<-with(train_set,table(Product_Category_3))
#a
barchart(a,main ="train_data",horizontal = "FALSE")

b<-with(test_set,table(Product_Category_3))
#b
barchart(b,main ="test_data",horizontal = "FALSE")

2 way table

xtabs(~Marital_Status+Gender,data=train_set)

##               Gender
## Marital_Status      F      M
##              0  59207 184348
##              1  42814 126182

xtabs(Purchase~Product_Category_3,aggregate(Purchase~Product_Category_3,train_set,mean))

## Product_Category_3
##         3         4         5         6         8         9        10 
## 14058.795  9806.369 12128.368 13179.604 13047.715 10388.366 13507.104 
##        11        12        13        14        15        16        17 
## 12113.981  8703.027 13237.030 10036.091 12342.484 11960.804 11737.853 
##        18 
## 11015.255

xtabs(Purchase~Product_Category_2,aggregate(Purchase~Product_Category_2,train_set,mean))

## Product_Category_2
##         2         3         4         5         6         7         8 
## 13610.833 11222.781 10216.105  9033.057 11516.350  6894.512 10247.663 
##         9        10        11        12        13        14        15 
##  7329.170 15715.654  8893.459  6996.297  9662.622  7101.230 10345.805 
##        16        17        18 
## 10289.805  9421.296  9328.248

xtabs(Purchase~Product_Category_1,aggregate(Purchase~Product_Category_1,train_set,mean))

## Product_Category_1
##           1           2           3           4           5           6 
## 13600.94860 11232.00870 10109.66465  2334.29817  6240.79881 15830.96146 
##           7           8           9          10          11          12 
## 16374.65185  7497.88064 15754.68932 19668.12699  4687.24631  1344.33266 
##          13          14          15          16          17          18 
##   720.56800 13143.73906 14790.16472 14754.58678 10167.59811  2979.82756 
##          19          20 
##    36.78061   369.85307

3.1. Unique Data For EDA & Data type modification

Since our data is stored based on product id i.e. if a person buying 10 products then his data will be stored in 10 observation and hence there will cause a repetition of same person data.

train_set$User_ID <- as.factor(train_set$User_ID)
train_set$Product_ID <- as.factor(train_set$Product_ID)
train_set$Marital_Status <- as.factor(ifelse(train_set$Marital_Status == 1, 'Married', 'Single'))
train_set$Age <- as.factor(train_set$Age)
train_set$Gender <- as.factor(ifelse(train_set$Gender=='M', 'Male', 'Female'))
train_set$Occupation <- as.factor(train_set$Occupation)
train_set$City_Category <- as.factor(train_set$City_Category)
train_set$Stay_In_Current_City_Years <- as.factor(train_set$Stay_In_Current_City_Years)

test_set$User_ID <- as.factor(test_set$User_ID)
test_set$Product_ID <- as.factor(test_set$Product_ID)
test_set$Marital_Status <- as.factor(ifelse(test_set$Marital_Status == 1, 'Married', 'Single'))
test_set$Age <- as.factor(test_set$Age)
test_set$Gender <- as.factor(ifelse(test_set$Gender=='M', 'Male', 'Female'))
test_set$Occupation <- as.factor(test_set$Occupation)
test_set$City_Category <- as.factor(test_set$City_Category)
test_set$Stay_In_Current_City_Years <- as.factor(test_set$Stay_In_Current_City_Years)

#str(train_set)
#str(test_set)

The function distinct() in dplyr package can be used to keep only unique/distinct rows from a data frame. If there are duplicate rows, only the first row is preserved. Itâs an efficient version of the R base function unique().

library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

EDA_Distinct <- distinct(train_set, User_ID, Age, Gender, Marital_Status, Occupation, City_Category, Stay_In_Current_City_Years)
#str(EDA_Distinct)
head(EDA_Distinct)

##   User_ID Gender   Age Occupation City_Category Stay_In_Current_City_Years
## 1 1000442   Male 26-35          1             A                          2
## 2 1000778   Male 18-25         17             C                         4+
## 3 1004647   Male 36-45         20             B                          1
## 4 1002875   Male 26-35          2             B                          3
## 5 1001675 Female 26-35          6             B                          1
## 6 1003848   Male 26-35         15             C                          1
##   Marital_Status
## 1         Single
## 2         Single
## 3        Married
## 4        Married
## 5        Married
## 6         Single

3.2. User_ID

Creating new variables User_Purchase_Count, Total_Spending and Avg_Spending_on_each_Product

Total number of product purchased bu customer

#creating a new data frame to stor the number of purchase made by each user
userIDCount <- as.data.frame(table(train_set$User_ID))
names(userIDCount) <- c("User_ID","User_Purchase_Count")
head(userIDCount)

##   User_ID User_Purchase_Count
## 1 1000001                  24
## 2 1000002                  63
## 3 1000003                  25
## 4 1000004                  13
## 5 1000005                  78
## 6 1000006                  37

merging df

train_set <- merge(x = train_set, y = userIDCount, by = "User_ID", all.x = TRUE)

changing the test dataset too

# writing code such that if a new user comes for the first time his count is set to one in test dataset
test_set <- merge(x = test_set, y = userIDCount, by = "User_ID", all.x = TRUE)
test_set[is.na(test_set$User_Purchase_Count), "User_Purchase_Count"] <- 1
#Changing the datatype
test_set$User_Purchase_Count <- as.integer(test_set$User_Purchase_Count)

Total Spending by a User

Changing the Training Dataset

# creating a data frame to store the total spending by a user
totspend <- aggregate(train_set$Purchase, by=list(Category=train_set$User_ID), FUN=sum)
names(totspend) <- c("User_ID","Total_Spending")
train_set <- merge(x = train_set, y = totspend, by = "User_ID", all.x = TRUE)
#head(train_set)

changing test too

test_set <- merge(x = test_set, y = totspend, by = "User_ID", all.x = TRUE)
# writing code such that if a new user comes for the first time his count is set to zero for total spending in test dataset
test_set[is.na(test_set$Total_Spending), "Total_Spending"] <- 0
#Changing the datatype
test_set$Total_Spending <- as.integer(test_set$Total_Spending)

Average Spending on each product by a customer

Now calculating the average spending of user on each product we can divide Total_Spending by User_Purchase Count. Avg_Spending_on_each_Product = Total_Spending / User_Purchase_Count

train_set$Avg_Spending_on_each_Product <- train_set$Total_Spending / train_set$User_Purchase_Count
test_set$Avg_Spending_on_each_Product <- test_set$Total_Spending / test_set$User_Purchase_Count
#Changing the datatype 
train_set$Avg_Spending_on_each_Product <- as.integer(train_set$Avg_Spending_on_each_Product)
test_set$Avg_Spending_on_each_Product <- as.integer(test_set$Avg_Spending_on_each_Product)

summary(totspend$Total_Spending)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20125  178411  388922  648555  842252 8050867

library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

ggplot(totspend, aes(x=Total_Spending)) + geom_density(fill="red", col="black", alpha=0.80)

removing unused df

rm(userIDCount, totspend)

Analysing using distict data

#Updating EDA_Distinct dataframe
EDA_Distinct <- distinct(train_set, User_ID, Age, Gender, Marital_Status, Occupation, City_Category, Stay_In_Current_City_Years, User_Purchase_Count)
d1 <- summary(EDA_Distinct$User_Purchase_Count)
d1

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   19.00   40.00   70.03   87.00  779.00

ggplot(EDA_Distinct, aes(x=User_Purchase_Count)) + geom_density(fill="red", col="black", alpha=0.80) + annotate(geom = "text", x = 6, y = 0.0125, label = "Min")

histogram(EDA_Distinct$User_Purchase_Count)

bwplot(Age~User_Purchase_Count|Marital_Status+Gender,data = EDA_Distinct)

bwplot(User_Purchase_Count~Occupation|Marital_Status+Gender,data = EDA_Distinct)

3.3 Product_id

head(train_set$Product_ID,15)

##  [1] P00111842 P00297042 P0096542  P00210342 P00000142 P00248942 P00064042
##  [8] P00004842 P00051842 P00025442 P00085442 P00051442 P00058142 P0097142 
## [15] P00214842
## 3631 Levels: P00000142 P00000242 P00000342 P00000442 P00000542 ... P0099942

i.e. there are a total of 3631 number of product

Creating new variables Total number of product unit sold, Average price of a product, Sd of the product price

SoldProdCount <- as.data.frame(table(train_set$Product_ID))
names(SoldProdCount) <- c("Product_ID","Product_Sold_Count")

SoldProdPriceMean <- aggregate(train_set$Purchase, by=list(Category=train_set$Product_ID), FUN=mean)
names(SoldProdPriceMean) <- c("Product_ID","Product_Mean_Price")

SoldProdPriceSD <- aggregate(train_set$Purchase, by=list(Category=train_set$Product_ID), FUN=sd)
names(SoldProdPriceSD) <- c("Product_ID","Product_SD_Price")

summary(SoldProdCount$Product_Sold_Count)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    15.0    53.0   113.6   146.0  1420.0

histogram(SoldProdCount$Product_Sold_Count)

histogram(SoldProdPriceMean$Product_Mean_Price)

histogram(SoldProdPriceSD$Product_SD_Price)

Correlation Matrix

changing variables type to numeric

corr_train <- train_set
corr_train$User_ID <- as.numeric(corr_train$User_ID)
corr_train$User_Purchase_Count <- as.numeric(corr_train$User_Purchase_Count)
corr_train$Product_ID <- as.numeric(corr_train$Product_ID)

corr_train$Gender <- as.numeric(ifelse(corr_train$Gender=="Male", 1, 0))

corr_train$Age <- as.numeric(ifelse(corr_train$Age=='0-17', 17, ifelse(corr_train$Age=='18-25', 25, ifelse(corr_train$Age=='26-35', 35, ifelse(corr_train$Age=='36-45', 45, ifelse(corr_train$Age=='46-50', 50, ifelse(corr_train$Age=='51-55', 55, 65)))))))

corr_train$Marital_Status <- as.numeric(ifelse(corr_train$Marital_Status=="Married", '1', '0'))

corr_train$Occupation <- as.numeric(corr_train$Occupation)

corr_train$City_Category <- as.numeric(ifelse(corr_train$City_Category=='A', 1, ifelse(corr_train$City_Category=='B', 2, 3)))

corr_train$Stay_In_Current_City_Years <- as.numeric(ifelse(corr_train$Stay_In_Current_City_Years=='4+', 6, corr_train$Stay_In_Current_City_Years))

corr_train$Total_Spending <- as.numeric(corr_train$Total_Spending)
corr_train$Avg_Spending_on_each_Product <- as.numeric(corr_train$Avg_Spending_on_each_Product)


#target Variable 
Y <- as.numeric(corr_train$Purchase)
corr_train$Product_Category_2<-NULL
corr_train$Product_Category_3<-NULL
cor(corr_train)

##                                   User_ID   Product_ID      Gender
## User_ID                       1.000000000 -0.017397001 -0.03366321
## Product_ID                   -0.017397001  1.000000000  0.01603059
## Gender                       -0.033663212  0.016030587  1.00000000
## Age                           0.038613234  0.023934173 -0.00377867
## Occupation                   -0.024521599  0.008086456  0.11825402
## City_Category                 0.023437825  0.007097120 -0.00510242
## Stay_In_Current_City_Years   -0.029746261 -0.002536608  0.01863580
## Marital_Status                0.020102169  0.012267187 -0.01168131
## Product_Category_1            0.003570070  0.074596632 -0.04595016
## Purchase                      0.004244348 -0.109294620  0.06084054
## User_Purchase_Count          -0.031272142 -0.002597573  0.07374752
## Total_Spending               -0.027597276 -0.004162652  0.10852446
## Avg_Spending_on_each_Product  0.012974786 -0.020602205  0.18595405
##                                        Age   Occupation City_Category
## User_ID                       0.0386132336 -0.024521599    0.02343783
## Product_ID                    0.0239341730  0.008086456    0.00709712
## Gender                       -0.0037786703  0.118254023   -0.00510242
## Age                           1.0000000000  0.096776311    0.11337164
## Occupation                    0.0967763113  1.000000000    0.03540659
## City_Category                 0.1133716424  0.035406593    1.00000000
## Stay_In_Current_City_Years   -0.0001599429  0.027063059    0.01808487
## Marital_Status                0.3051791632  0.024521869    0.04014877
## Product_Category_1            0.0620364981 -0.006648511   -0.01352877
## Purchase                      0.0140026938  0.020191945    0.06214326
## User_Purchase_Count          -0.0436288574 -0.005070194   -0.52066133
## Total_Spending               -0.0371116592  0.002349550   -0.50084781
## Avg_Spending_on_each_Product  0.0428040268  0.061716704    0.18993773
##                              Stay_In_Current_City_Years Marital_Status
## User_ID                                   -0.0297462606    0.020102169
## Product_ID                                -0.0025366084    0.012267187
## Gender                                     0.0186357973   -0.011681306
## Age                                       -0.0001599429    0.305179163
## Occupation                                 0.0270630585    0.024521869
## City_Category                              0.0180848717    0.040148774
## Stay_In_Current_City_Years                 1.0000000000   -0.011127002
## Marital_Status                            -0.0111270023    1.000000000
## Product_Category_1                        -0.0016295069    0.019947693
## Purchase                                   0.0040676009   -0.002036816
## User_Purchase_Count                        0.0028393079    0.001716275
## Total_Spending                             0.0012801978   -0.004697633
## Avg_Spending_on_each_Product               0.0124346645   -0.006223832
##                              Product_Category_1     Purchase
## User_ID                             0.003570070  0.004244348
## Product_ID                          0.074596632 -0.109294620
## Gender                             -0.045950164  0.060840541
## Age                                 0.062036498  0.014002694
## Occupation                         -0.006648511  0.020191945
## City_Category                      -0.013528773  0.062143257
## Stay_In_Current_City_Years         -0.001629507  0.004067601
## Marital_Status                      0.019947693 -0.002036816
## Product_Category_1                  1.000000000 -0.342337409
## Purchase                           -0.342337409  1.000000000
## User_Purchase_Count                 0.017942766 -0.090045377
## Total_Spending                      0.003742677 -0.037563844
## Avg_Spending_on_each_Product       -0.114980526  0.327180095
##                              User_Purchase_Count Total_Spending
## User_ID                             -0.031272142   -0.027597276
## Product_ID                          -0.002597573   -0.004162652
## Gender                               0.073747517    0.108524461
## Age                                 -0.043628857   -0.037111659
## Occupation                          -0.005070194    0.002349550
## City_Category                       -0.520661326   -0.500847805
## Stay_In_Current_City_Years           0.002839308    0.001280198
## Marital_Status                       0.001716275   -0.004697633
## Product_Category_1                   0.017942766    0.003742677
## Purchase                            -0.090045377   -0.037563844
## User_Purchase_Count                  1.000000000    0.973065973
## Total_Spending                       0.973065973    1.000000000
## Avg_Spending_on_each_Product        -0.275222715   -0.114816494
##                              Avg_Spending_on_each_Product
## User_ID                                       0.012974786
## Product_ID                                   -0.020602205
## Gender                                        0.185954046
## Age                                           0.042804027
## Occupation                                    0.061716704
## City_Category                                 0.189937728
## Stay_In_Current_City_Years                    0.012434664
## Marital_Status                               -0.006223832
## Product_Category_1                           -0.114980526
## Purchase                                      0.327180095
## User_Purchase_Count                          -0.275222715
## Total_Spending                               -0.114816494
## Avg_Spending_on_each_Product                  1.000000000

Corrplot

library(corrplot)

## corrplot 0.84 loaded

M <- cor(corr_train)
corrplot(M, method="number")

Black Friday Data Visualization

Parv

8 March 2018