Carrefour Kenya and are currently undertaking a project that will inform the marketing department on the most relevant marketing strategies that will result in the highest no. of sales (total price including tax). The project has been divided into four parts where it explore a recent marketing dataset by performing various unsupervised learning techniques and later providing recommendations based on the established insights.
The metric of success for this project is to explore a recent marketing dataset by performing various unsupervised learning techniques and later providing recommendations based on the established insights.
The project is centered around the retail sector. Exploring the marketing dataset through performing unsupervised learning techniques will help in providing relevant insight that will help in making informed marketing decisions.
For this analysis, I will perform the following actions:
Loading the Data.
Reading the Data.
Cleaning the Dataset.
Performing EDA:
Univariate Analysis.
Bivariate Analysis.
Multivariate Analysis.
Modelling.
Dimensionality Reduction(Part 1).
Feature Selection (Part 2).
Recommendation.
This data is relevant because it helps in providing relevant insight that will help Carrefour Kenya make informed marketing decisions.
# calling the libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(chron)
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(superml)
## Loading required package: R6
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
options(warn=-1)
# changing the working directory
setwd("C:\\Users\\USER\\Documents\\R\\Projects\\SUPERMARKET_ANALYSIS")
# loading the dataset
supermarket = read.csv("Supermarket_Dataset.csv",header=TRUE)
# displaying the first six observations of the dataset
head(supermarket)
## Invoice.ID Branch Customer.type Gender Product.line Unit.price
## 1 750-67-8428 A Member Female Health and beauty 74.69
## 2 226-31-3081 C Normal Female Electronic accessories 15.28
## 3 631-41-3108 A Normal Male Home and lifestyle 46.33
## 4 123-19-1176 A Member Male Health and beauty 58.22
## 5 373-73-7910 A Normal Male Sports and travel 86.31
## 6 699-14-3026 C Normal Male Electronic accessories 85.39
## Quantity Tax Date Time Payment cogs gross.margin.percentage
## 1 7 26.1415 1/5/2019 13:08 Ewallet 522.83 4.761905
## 2 5 3.8200 3/8/2019 10:29 Cash 76.40 4.761905
## 3 7 16.2155 3/3/2019 13:23 Credit card 324.31 4.761905
## 4 8 23.2880 1/27/2019 20:33 Ewallet 465.76 4.761905
## 5 7 30.2085 2/8/2019 10:37 Ewallet 604.17 4.761905
## 6 7 29.8865 3/25/2019 18:30 Ewallet 597.73 4.761905
## gross.income Rating Total
## 1 26.1415 9.1 548.9715
## 2 3.8200 9.6 80.2200
## 3 16.2155 7.4 340.5255
## 4 23.2880 8.4 489.0480
## 5 30.2085 5.3 634.3785
## 6 29.8865 4.1 627.6165
# checking the shape of our dataset
dim(supermarket)
## [1] 1000 16
The dataset is comprised of 1000 observations and 16 variable.
# checking the class of the file
class(supermarket)
## [1] "data.frame"
# printing information on the structure of a data frame
describe(supermarket)
## vars n mean sd median trimmed mad min
## Invoice.ID* 1 1000 500.50 288.82 500.50 500.50 370.65 1.00
## Branch* 2 1000 1.99 0.82 2.00 1.99 1.48 1.00
## Customer.type* 3 1000 1.50 0.50 1.00 1.50 0.00 1.00
## Gender* 4 1000 1.50 0.50 1.00 1.50 0.00 1.00
## Product.line* 5 1000 3.45 1.72 3.00 3.44 1.48 1.00
## Unit.price 6 1000 55.67 26.49 55.23 55.62 33.37 10.08
## Quantity 7 1000 5.51 2.92 5.00 5.51 2.97 1.00
## Tax 8 1000 15.38 11.71 12.09 14.00 11.13 0.51
## Date* 9 1000 45.58 25.89 47.00 45.63 34.10 1.00
## Time* 10 1000 252.18 147.07 249.00 252.49 190.51 1.00
## Payment* 11 1000 2.00 0.83 2.00 2.00 1.48 1.00
## cogs 12 1000 307.59 234.18 241.76 279.91 222.65 10.17
## gross.margin.percentage 13 1000 4.76 0.00 4.76 4.76 0.00 4.76
## gross.income 14 1000 15.38 11.71 12.09 14.00 11.13 0.51
## Rating 15 1000 6.97 1.72 7.00 6.97 2.22 4.00
## Total 16 1000 322.97 245.89 253.85 293.91 233.78 10.68
## max range skew kurtosis se
## Invoice.ID* 1000.00 999.00 0.00 -1.20 9.13
## Branch* 3.00 2.00 0.02 -1.51 0.03
## Customer.type* 2.00 1.00 0.00 -2.00 0.02
## Gender* 2.00 1.00 0.00 -2.00 0.02
## Product.line* 6.00 5.00 0.06 -1.28 0.05
## Unit.price 99.96 89.88 0.01 -1.22 0.84
## Quantity 10.00 9.00 0.01 -1.22 0.09
## Tax 49.65 49.14 0.89 -0.09 0.37
## Date* 89.00 88.00 -0.03 -1.23 0.82
## Time* 506.00 505.00 0.00 -1.25 4.65
## Payment* 3.00 2.00 0.00 -1.55 0.03
## cogs 993.00 982.83 0.89 -0.09 7.41
## gross.margin.percentage 4.76 0.00 NaN NaN 0.00
## gross.income 49.65 49.14 0.89 -0.09 0.37
## Rating 10.00 6.00 0.01 -1.16 0.05
## Total 1042.65 1031.97 0.89 -0.09 7.78
# checking the data type of each variable in the dataframe
str(supermarket)
## 'data.frame': 1000 obs. of 16 variables:
## $ Invoice.ID : chr "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr "A" "C" "A" "A" ...
## $ Customer.type : chr "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Product.line : chr "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
## $ Unit.price : num 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : int 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Time : chr "13:08" "10:29" "13:23" "20:33" ...
## $ Payment : chr "Ewallet" "Cash" "Credit card" "Ewallet" ...
## $ cogs : num 522.8 76.4 324.3 465.8 604.2 ...
## $ gross.margin.percentage: num 4.76 4.76 4.76 4.76 4.76 ...
## $ gross.income : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
## $ Total : num 549 80.2 340.5 489 634.4 ...
# Checking for outliers in every numerical column
Unit.price <- supermarket$Unit.price
Quantity <- supermarket$Quantity
Tax <- supermarket$Tax
cogs <- supermarket$cogs
gross.income <- supermarket$gross.income
Rating <- supermarket$Rating
boxplot(Unit.price, Quantity, Tax, cogs,gross.income,Rating,
main = "Multiple boxplots for comparision",
names = c("Unit.price", "Quantity", "Tax", "cogs","gross.income","Rating"),
las = 2,
col = c("orange","red"),
border = "black",
horizontal = TRUE,
notch = TRUE
)
# Calculating the total number of missing values in our dataset.
sum(is.na(supermarket))
## [1] 0
sum(duplicated(supermarket))
## [1] 0
# removing the (-) from the values in the Invoice.ID column
supermarket$Invoice.ID<- gsub("-", "", supermarket$Invoice.ID)
# changing its datatype from character to integer
supermarket$Invoice.ID <- as.integer(supermarket$Invoice.ID)
# changing the datatype of date from character to date
supermarket$Date <- as.Date(supermarket$Date, "%m/%d/%Y")
# checking the datatypes of the modified columns
print(class(supermarket$Date))
## [1] "Date"
print(class(supermarket$Invoice.ID))
## [1] "integer"
# Converting the column names to lower case.
names(supermarket) <- tolower(names(supermarket))
# Removing (.) in column names and replacing them with (_).
names(supermarket) <- gsub("\\.", "_", names(supermarket))
# Previewing the column names
names(supermarket)
## [1] "invoice_id" "branch"
## [3] "customer_type" "gender"
## [5] "product_line" "unit_price"
## [7] "quantity" "tax"
## [9] "date" "time"
## [11] "payment" "cogs"
## [13] "gross_margin_percentage" "gross_income"
## [15] "rating" "total"
# Plotting the histogram of Rating Distribution
p<-ggplot(supermarket, aes(x=rating)) +
geom_histogram(color="black", fill="light blue")+
ggtitle("Histogram for Rating Distribution")
p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Observation: - As per the graph, the highest rating is 6. We can also see that majority of the ratings are high, meaning service delivery, as well as customer satisfaction is good.
# Counting the number of male and female customers
ggplot(supermarket, aes(x = gender)) +
geom_bar(fill="light blue", color="black")+
ggtitle("Bar Graph for The count of Male and Female")
Observation: - From the graph we see that the customers are balanced in terms of gender.
# Counting the number of customer type
ggplot(supermarket, aes(x = customer_type)) +
geom_bar(fill="light blue", color="black")+
ggtitle("Bar Graph for The count of Cutomer Type")
Observation: - From the plot we can also see that there is balance between the Normal and Member Customers.
# Counting the number of product line
ggplot(supermarket, aes(x = product_line)) +
geom_bar(fill="light blue", color="black")+
theme(axis.text.x = element_text(angle = 90, size = 7))+
ggtitle("Bar Graph for The count of Product Line")
Observation: - From the above plot we see observe that the products are exhibiting a competitive performance with all the products having a high count. Fashion accessories is leading, however, closely followed by Food and beverages.
# Count on the supermarket branches
ggplot(supermarket, aes(x = branch)) +
geom_bar(fill="light blue", color="black")+
ggtitle("Bar Graph for The count on the branches")
Observation: -From the bar plots, we see that all the branches are performing fairly well in terms of the total number of customers who visit the branches.Branch A, however, has slightly higher total customer count than the other two branches.
# Count on the payment methods used
ggplot(supermarket, aes(x = payment)) +
geom_bar(fill="light blue", color="black")+
ggtitle("Bar Graph for Payment Methods Used")
Observation: -From the bar graph, we see that the payment methods that are mostly used are Cash and Ewallet. Credit card use is slightly lower than the two.
# plotting a bar graph for The count of Male and Female Vs. Product line
library(ggplot2)
ggplot(supermarket, aes(x = gender, fill=product_line)) +
geom_bar(color="black")+
ggtitle("Bar Graph for The count of Male and Female Vs. Product line")
Observation: -From the bar graph we see that female customers buy more fashion accessories, followed by food and beverages. -Men, on the other hand, exhibit a balance on the product line purchase.
# Stacked Bar Graph for The Branches Vs Product line
library(ggplot2)
ggplot(supermarket, aes(x = branch, fill=product_line)) +
geom_bar(color="black")+
ggtitle("Stacked Bar Graph for The Branches Vs Product line")
Obseravation: -Branch A sells more of Home and Lifestyle, followed by Food and Beverages. -Branch B sells more of Fashion accessories, followed by Sports and Travel. -Branch C sells more of Food and Beverages, followed by Fashion accessories.
# Stacked Bar Graph for gender Vs payment
library(ggplot2)
ggplot(supermarket, aes(x = gender, fill=payment)) +
geom_bar(color="black")+
ggtitle("Bar Graph for Gender Vs Payment")
Observation: -From the graph, we see that most male prefer using the Ewallet, then cash. -For the female, their mode of payment is balanced across the three payment modes.
# Stacked Bar Graph for gender Vs branch
ggplot(supermarket, aes(x = branch, fill=gender)) +
geom_bar(color="black")+
ggtitle("Bar Graph for for Gender Vs Branch")
Observation: -From the graph, for Branch A and B, the ratio of male to female seems to be balanced. -On Branch C, however, there are slighly more female than male customers.
# Correlation matrix for numerical variables
supermarket_num <- Filter(is.numeric, supermarket)
corrplot(cor(supermarket_num))
# Plotting a jitter plot of branch, total, and gender
ggplot(supermarket, aes(x = branch, y = total, color = gender)) +
ggtitle("Jitterplot of Branch, Total, and Gender")+
theme(axis.text.x = element_text(angle = 90, size = 7))+
geom_jitter(width = .2)
# Plotting a jitter plot of product_line, total, and gender
ggplot(supermarket, aes(x = product_line, y = total, color = gender)) +
ggtitle("Jitterplot of Product line, Total, and Gender")+
theme(axis.text.x = element_text(angle = 90, size = 5))+
geom_jitter(width = .2)
# making a copy of the original dataset
super <-supermarket
# head(super)
# dropping the variable that we will not need
super_df <- super[ -c(1,9:10,13) ]
# head(super_df)
Columns that need to be encoded are: branch, customer_type, gender, product_line, payment.
lbl = LabelEncoder$new()
super_df$branch = lbl$fit_transform(super_df$branch)
super_df$customer_type = lbl$fit_transform(super_df$customer_type)
super_df$gender = lbl$fit_transform(super_df$gender)
super_df$product_line = lbl$fit_transform(super_df$product_line)
super_df$payment = lbl$fit_transform(super_df$payment)
#super_df
super.pca <- prcomp(scale(super_df), center = TRUE)
#Checking output of pca. prcomp function returns standard deviation (sdev), rotation and loadings.
names(super.pca)
## [1] "sdev" "rotation" "center" "scale" "x"
# Printing the summary of our dataset
summary(super.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.2195 1.06662 1.03407 1.02343 0.99818 0.98289 0.95229
## Proportion of Variance 0.4105 0.09481 0.08911 0.08728 0.08303 0.08051 0.07557
## Cumulative Proportion 0.4105 0.50531 0.59442 0.68171 0.76474 0.84524 0.92081
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.92765 0.29948 2.058e-16 1.988e-16 3.701e-17
## Proportion of Variance 0.07171 0.00747 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 0.99253 1.00000 1.000e+00 1.000e+00 1.000e+00
#getting the Eigenvalues
eig=get_eigenvalue(super.pca)
eig
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.926088e+00 4.105073e+01 41.05073
## Dim.2 1.137670e+00 9.480586e+00 50.53132
## Dim.3 1.069308e+00 8.910901e+00 59.44222
## Dim.4 1.047417e+00 8.728475e+00 68.17069
## Dim.5 9.963660e-01 8.303050e+00 76.47374
## Dim.6 9.660776e-01 8.050647e+00 84.52439
## Dim.7 9.068507e-01 7.557089e+00 92.08148
## Dim.8 8.605363e-01 7.171136e+00 99.25261
## Dim.9 8.968627e-02 7.473856e-01 100.00000
## Dim.10 4.237314e-32 3.531095e-31 100.00000
## Dim.11 3.953704e-32 3.294753e-31 100.00000
## Dim.12 1.369523e-33 1.141269e-32 100.00000
#Plotting the PC
fviz_eig(super.pca, addlabels = T, ylim = c(0, 30))
# Laoding the libraries
library(devtools)
## Loading required package: usethis
library(ggbiplot)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: scales
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: grid
# Plotting the expalined variance of PC1 Vs PC2
ggbiplot(super.pca)
# Adding more detail to the plot, we provide arguments rownames as labels
ggbiplot(super.pca, labels=rownames(super), obs.scale = 1, var.scale = 1)
#Plotting the contribution of attributes in each PC
fviz_contrib(super.pca,choice = "var", axes = 1)
fviz_contrib(super.pca, choice = "var", axes = 2)
fviz_contrib(super.pca, choice = "var", axes = 3)
fviz_contrib(super.pca, choice = "var", axes = 4)
fviz_contrib(super.pca, choice = "var", axes = 5)
fviz_contrib(super.pca, choice = "var", axes = 6)
fviz_contrib(super.pca, choice = "var", axes = 7)
fviz_contrib(super.pca, choice = "var", axes = 8)
fviz_contrib(super.pca, choice = "var", axes = 9)
fviz_contrib(super.pca, choice = "var", axes = 10)
# making a copy of the original dataset
super1 <-supermarket_num
# head(super)
# Installing and loading our caret package
# ---
#
#suppressWarnings(
#suppressMessages(if
#(!require(caret, #quietly=TRUE))
#install.packages("caret")))
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Installing and loading the corrplot package for plotting
# ---
#
#suppressWarnings(
#suppressMessages(if
#(!require(corrplot, quietly=TRUE))
#install.packages("corrplot")))
library(corrplot)
# Calculating the correlation matrix
# ---
#
correlationMatrix <- cor(super1)
# Find attributes that are highly correlated
# ---
#
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.75)
# Highly correlated attributes
# ---
#
highlyCorrelated
## [1] 4 5 7
names(super1[,highlyCorrelated])
## [1] "tax" "cogs" "gross_income"