#Exploratory Data Analysis on Supermarket Sales
#Introduction
#Definition: Exploratory Data Analysis (EDA) refers to the critical process of performing initial investigations on data to discover patterns, spot anomalies, test hypotheses and check assumptions with the help of summary statistics and graphical representations.
#Objective: To answer business analytics questions of different types (descriptive, predictive, associative) using the supermarket sales data.
#Problems/Questions:
# 1. Descriptive - What is the top payment method for each product line?
# 2. Prediction - When is the best time to offer discounts?
# 3. Association - Is there a relationship between total sales on weekdays and weekends?
#Data Understanding
#About the raw data
#Dataset Details
#Title: The Supermarket Sales Data
#Year: 2019
#Source: Kaggle: https://www.kaggle.com/aungpyaeap/supermarket-sales
#About: Supermarkets in highly populated cities are growing and market competition is high. This dataset contains the historical sales of a supermarket company, recorded across 3 branches over 3 months.
#Attributes (an optional readr column specification matching this dictionary is sketched after the list):
#Tax: 5% tax charged on the customer's purchase
#Total: Total price including tax
#Date: Date of purchase (Record available from January 2019 to March 2019)
#Time: Purchase time (10am to 9pm)
#Payment: Payment used by customer for purchase (Cash, Credit card and Ewallet)
#COGS: Cost of goods sold
#Gross margin percentage: Gross margin percentage
#Gross income: Gross income
#Rating: Customer satisfaction rating of their overall shopping experience (on a scale of 1 to 10)
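#The sketch below (optional, not part of the original workflow) writes the attribute types above out as an explicit readr column specification; it mirrors the specification readr reports when the file is read in the next section. The file path is the same Windows path used in the load step below and would need adjusting on another machine.
library(readr)
sales_spec <- cols(
  Invoice_ID = col_character(),
  Branch = col_character(),
  City = col_character(),
  Customer_type = col_character(),
  Gender = col_character(),
  Product_line = col_character(),
  Unit_price = col_double(),
  Quantity = col_double(),
  `Tax _5%` = col_double(),
  Total = col_double(),
  Date = col_character(),   # converted to Date during data cleaning
  Time = col_time(format = ""),
  Payment = col_character(),
  cogs = col_double(),
  gross_margin_percentage = col_double(),
  gross_income = col_double(),
  Rating = col_double()
)
#Passing col_types = sales_spec to read_csv() silences the parsing message and surfaces any unexpected column types as parsing problems:
#read_csv("C:/Users/user/Desktop/RSTUDIO/supermarket_sales.csv", col_types = sales_spec)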
#Data Overview
#3.1 COLLECT & DATA WRANGLING
#load readr for reading rectangular data
#load dplyr for data wrangling
#load ggplot2 for data visualization
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
#Load the dataset using the read_csv function
supermarket_sales_fig <- read_csv("C:/Users/user/Desktop/RSTUDIO/supermarket_sales.csv")
## Rows: 1000 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (8): Invoice_ID, Branch, City, Customer_type, Gender, Product_line, Dat...
## dbl (8): Unit_price, Quantity, Tax _5%, Total, cogs, gross_margin_percentag...
## time (1): Time
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Preview some of the data to get an overview.
head(supermarket_sales_fig)
## # A tibble: 6 x 17
## Invoice_ID Branch City Customer_type Gender Product_line Unit_price Quantity
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 750-67-8428 A Yang~ Member Female Health and ~ 74.7 7
## 2 226-31-3081 C Nayp~ Normal Female Electronic ~ 15.3 5
## 3 631-41-3108 A Yang~ Normal Male Home and li~ 46.3 7
## 4 123-19-1176 A Yang~ Member Male Health and ~ 58.2 8
## 5 373-73-7910 A Yang~ Normal Male Sports and ~ 86.3 7
## 6 699-14-3026 C Nayp~ Normal Male Electronic ~ 85.4 7
## # ... with 9 more variables: Tax _5% <dbl>, Total <dbl>, Date <chr>,
## # Time <time>, Payment <chr>, cogs <dbl>, gross_margin_percentage <dbl>,
## # gross_income <dbl>, Rating <dbl>
#3.2 Use descriptive analysis to understand the data.
str(supermarket_sales_fig)
## spec_tbl_df [1,000 x 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Invoice_ID : chr [1:1000] "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr [1:1000] "A" "C" "A" "A" ...
## $ City : chr [1:1000] "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
## $ Customer_type : chr [1:1000] "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr [1:1000] "Female" "Female" "Male" "Male" ...
## $ Product_line : chr [1:1000] "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
## $ Unit_price : num [1:1000] 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : num [1:1000] 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax _5% : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
## $ Total : num [1:1000] 549 80.2 340.5 489 634.4 ...
## $ Date : chr [1:1000] "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Time : 'hms' num [1:1000] 13:08:00 10:29:00 13:23:00 20:33:00 ...
## ..- attr(*, "units")= chr "secs"
## $ Payment : chr [1:1000] "Ewallet" "Cash" "Credit card" "Ewallet" ...
## $ cogs : num [1:1000] 522.8 76.4 324.3 465.8 604.2 ...
## $ gross_margin_percentage: num [1:1000] 4.76 4.76 4.76 4.76 4.76 ...
## $ gross_income : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num [1:1000] 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
## - attr(*, "spec")=
## .. cols(
## .. Invoice_ID = col_character(),
## .. Branch = col_character(),
## .. City = col_character(),
## .. Customer_type = col_character(),
## .. Gender = col_character(),
## .. Product_line = col_character(),
## .. Unit_price = col_double(),
## .. Quantity = col_double(),
## .. `Tax _5%` = col_double(),
## .. Total = col_double(),
## .. Date = col_character(),
## .. Time = col_time(format = ""),
## .. Payment = col_character(),
## .. cogs = col_double(),
## .. gross_margin_percentage = col_double(),
## .. gross_income = col_double(),
## .. Rating = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#Show the minimum, maximum & average unit price for each product line
supermarket_sales_price<-supermarket_sales_fig %>%
group_by(Product_line) %>%
summarise(
total_inv = n(),
min_price = min(Unit_price),
max_price = max(Unit_price),
avg_price = mean(Unit_price))
#check table
supermarket_sales_price
## # A tibble: 6 x 5
## Product_line total_inv min_price max_price avg_price
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Electronic accessories 170 10.6 99.7 53.6
## 2 Fashion accessories 178 10.2 99.9 57.2
## 3 Food and beverages 174 10.1 99.8 56.0
## 4 Health and beauty 152 10.1 100. 54.9
## 5 Home and lifestyle 160 10.5 99.9 55.3
## 6 Sports and travel 166 10.2 100. 57.0
#Data Preprocessing
#A key step to process and prepare the data for analysis.
#4. Data Cleaning
#1- Confirm the class of the loaded dataset is a data frame.
class(supermarket_sales_fig)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
#2- Check for duplicated sales transactions based on Invoice_ID.
sum(duplicated(supermarket_sales_fig[,1]))
## [1] 0
#3- Check for missing values across all columns in the data frame.
sapply(supermarket_sales_fig,function(x) table(as.character(x) =="")["TRUE"])
## Invoice_ID.NA Branch.NA
## NA NA
## City.NA Customer_type.NA
## NA NA
## Gender.NA Product_line.NA
## NA NA
## Unit_price.NA Quantity.NA
## NA NA
## Tax _5%.NA Total.NA
## NA NA
## Date.NA Time.NA
## NA NA
## Payment.NA cogs.NA
## NA NA
## gross_margin_percentage.NA gross_income.NA
## NA NA
## Rating.NA
## NA
sapply(supermarket_sales_fig,function(x) sum(is.na(x)))
## Invoice_ID Branch City
## 0 0 0
## Customer_type Gender Product_line
## 0 0 0
## Unit_price Quantity Tax _5%
## 0 0 0
## Total Date Time
## 0 0 0
## Payment cogs gross_margin_percentage
## 0 0 0
## gross_income Rating
## 0 0
#4- Convert date from character to datetime.
typeof(supermarket_sales_fig$Date) #check type of object for date
## [1] "character"
supermarket_sales_fig$Date<-as.Date(supermarket_sales_fig$Date, format = "%m/%d/%Y")
#5- Remove irrelevant information: keep only the columns relevant to the sales analysis for the business.
supermarket_sales_fig<-subset(supermarket_sales_fig,
select=c("Branch","City","Customer_type","Gender",
"Product_line","Quantity","Total","Date","Time","Payment"))
#Data Transformation
#1- Create new columns for month and day.
supermarket_sales_fig<-transform(supermarket_sales_fig,Months= months(supermarket_sales_fig$Date))
supermarket_sales_fig<-transform(supermarket_sales_fig,Days= weekdays(supermarket_sales_fig$Date))
#2- Re-arrange the columns & assign the result to a new processed dataset.
SS_df_cleaned<-supermarket_sales_fig[,c(1:8,11:12,9:10)]
#3- Check the dimensions of the transformed dataset.
dim(SS_df_cleaned) # by rows & columns
## [1] 1000 12
#4- Get the details of the transformed dataset to confirm the changes made.
library(funModeling) #for analysis
## Warning: package 'funModeling' was built under R version 4.1.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 4.1.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
status(SS_df_cleaned)
## variable q_zeros p_zeros q_na p_na q_inf p_inf type
## Branch Branch 0 0 0 0 0 0 character
## City City 0 0 0 0 0 0 character
## Customer_type Customer_type 0 0 0 0 0 0 character
## Gender Gender 0 0 0 0 0 0 character
## Product_line Product_line 0 0 0 0 0 0 character
## Quantity Quantity 0 0 0 0 0 0 numeric
## Total Total 0 0 0 0 0 0 numeric
## Date Date 0 0 0 0 0 0 Date
## Months Months 0 0 0 0 0 0 character
## Days Days 0 0 0 0 0 0 character
## Time Time 0 0 0 0 0 0 hms-difftime
## Payment Payment 0 0 0 0 0 0 character
## unique
## Branch 3
## City 3
## Customer_type 2
## Gender 2
## Product_line 6
## Quantity 10
## Total 990
## Date 89
## Months 3
## Days 7
## Time 506
## Payment 3
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(RColorBrewer)
#2- Analyzing categorical variables:
freq(SS_df_cleaned, input = "Branch")
#3- Analyzing numerical variables:
plot_num(SS_df_cleaned)
## Warning: attributes are not identical across measure variables; they will be
## dropped
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

#4- Analyzing categorical & numerical variables:
#Analysis by Customer type-Normal
SS_df_analysed<-subset(SS_df_cleaned,SS_df_cleaned$Customer_type=="Normal")
normal<-ggplot(data=SS_df_analysed, aes(x=Product_line,y=Total, color=Product_line)) +
geom_boxplot()+
scale_color_brewer(palette="Dark2") +
geom_jitter(shape=16, position=position_jitter(0.2))+
theme(axis.text.x = element_text(angle = 25))+
labs(title = 'In which product lines do non-members spend the most?',
y='Total Sales',x='Product Lines')
normal

#Analysis by Customer type-Member
SS_df_analysed<-subset(SS_df_cleaned,SS_df_cleaned$Customer_type=="Member")
member<-ggplot(data = SS_df_analysed, aes(x=Product_line,y=Total, color=Product_line)) +
geom_boxplot()+
scale_color_brewer(palette="Dark2") +
geom_jitter(shape=16, position=position_jitter(0.2))+
theme(axis.text.x = element_text(angle = 25))+
labs(title = 'In which product lines do members spend the most?',
y='Total Sales',x='Product Lines')
member

#The distribution of sales across categories is more evenly spread for members than for normal (non-member) customers.
#Non-members' maximum spending in most categories is below or close to $500, while members spend the most in the health & beauty line.
#There are large differences between non-members and members in the sales distributions for the fashion accessories, health & beauty, and sports & travel product lines (a numeric summary of this comparison is sketched below).
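#A quick numeric cross-check of the comparison above (a sketch, not part of the original workflow): summarise the spread of Total per customer type and product line instead of reading it off the boxplots. Uses dplyr, which is already loaded; `.groups = "drop"` assumes dplyr >= 1.0.
SS_df_cleaned %>%
  group_by(Customer_type, Product_line) %>%
  summarise(median_total = median(Total),
            iqr_total = IQR(Total),
            max_total = max(Total),
            .groups = "drop") %>%
  arrange(Customer_type, desc(max_total))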
#Data Analysis
#Answer different types of business questions for analysis
#Q1: Descriptive Problem
#What is the top payment method for each product line?
# Plotting the bar graph for the frequency of each payment method.
# count Payment and convert the counts into percentages
plotdata <- SS_df_cleaned %>% count(Payment) %>%
mutate(pct = n / sum(n), # count / sum of count
pctlabel = paste0(round(pct*100), "%")) #percentage form
# plot the bars for Payment in ascending order
ggplot(plotdata,
aes(x = reorder(Payment, pct),
y = pct)) +
geom_bar(stat = "identity",
fill = "#6fd1d1",
color = "black") +
geom_text(aes(label = pctlabel),
vjust=-0.25) +
scale_y_continuous(labels = percent) +
theme_minimal() +
labs(x = "Payment",
y = "Frequency",
title = "Payment Methods")

#Plotting the bar graph for the frequency of each payment method by product line.
# plot the Payment shares for each Product_line, with bars coloured by Payment
plotdata <- SS_df_cleaned %>% count(Payment, Product_line) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
ggplot(plotdata,
aes(x = Product_line,
y = pct,
fill = Payment)) +
geom_bar(stat = "identity",
position=position_dodge()) +
scale_fill_brewer(palette="Dark2") +
theme_minimal() +
theme(legend.position = "bottom") +
geom_text(aes(label = pctlabel),
vjust=0.5,
hjust = 1.5,
position = position_dodge(1.0),
size=3.5,
colour = "white") +
scale_y_continuous(labels = percent) +
labs(x = " ",
y = "Frequency",
title = "Payment methods according to the product lines") +
coord_flip() #horizontal bar graph

#Results:
#34% of customers prefer to pay by e-wallet, another 34% prefer cash, while 31% prefer credit card.
#Customers prefer to pay with cash when buying electronic accessories.
#Customers prefer to use their credit cards when buying food and beverages.
#Customers prefer e-wallet when paying for home and lifestyle or fashion accessories items (the top method per product line is extracted directly in the sketch below).
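#A direct tabular answer to Q1 (a sketch, not part of the original workflow): the most frequent payment method within each product line. slice_max() assumes dplyr >= 1.0.
SS_df_cleaned %>%
  count(Product_line, Payment, name = "n_trans") %>%
  group_by(Product_line) %>%
  slice_max(n_trans, n = 1, with_ties = TRUE) %>%
  ungroup()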
#Q2: Predictive Problem
#When is the best time to offer discounts?
#Plotting the bar graph for the frequency of the days.
# Days frequency
#count the days
Cdays <- SS_df_cleaned %>% count(Days)
#plot
ggplot(Cdays, aes(x = reorder(Days, n), y = n)) +
geom_bar(stat = "identity",
fill = "indianred3",
color = "black") +
labs(x = "Days",
y = "Frequency",
title = "Frequency of Days")+
theme_minimal()

#Plotting the bar graph for the frequency of purchase times using a binning method.
# Time frequency
#count the time
Ctime <- SS_df_cleaned %>% select(Time)
#create tags for binning
tags <- c("10 am - 12 pm", "12 pm - 3 pm", "3 pm - 5 pm", "5 pm - 7 pm",
"7 pm - 9 pm")
#binning the time column (Time is read by readr as seconds since midnight, so convert it to hours first)
time_bins <- as_tibble(Ctime) %>%
mutate(Hour = as.numeric(Time) / 3600,
Time_bin = case_when(
Hour >= 10 & Hour < 12 ~ tags[1],
Hour >= 12 & Hour < 15 ~ tags[2],
Hour >= 15 & Hour < 17 ~ tags[3],
Hour >= 17 & Hour < 19 ~ tags[4],
Hour >= 19 & Hour < 21 ~ tags[5]
))
#count the percentage of the time bins
plotdata <- time_bins %>% count(Time_bin) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
#plot
ggplot(plotdata,
aes(x = reorder(Time_bin, -pct),
y = pct)) +
geom_bar(stat = "identity",
fill = "#6fd1d1",
colour = "black") +
geom_text(aes(label = pctlabel),
vjust = -0.25) +
scale_y_continuous(labels = percent) +
labs(x = "Time Range",
y = "Frequency",
title = "Busy Hour")+
theme_minimal()

#Plotting the bar graph for the frequency of transactions by day and time range.
# days and time
#add the time bins into the new df
new <- SS_df_cleaned %>% mutate(time_bins)
plotdata <- new %>% count(Days, Time_bin) %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
ggplot(plotdata,
aes(x = Time_bin,
y = pct,
fill = Days)) +
geom_bar(stat = "identity",
position=position_dodge()) +
scale_fill_brewer(palette="Set2") +
scale_y_continuous(labels = percent) +
theme_minimal() +
theme(legend.position = "bottom") +
labs(x = "Time Range",
y = "Frequency",
title = "Busy Hours by Day")

#Results:
#The number of customers was highest on Saturdays and lowest on Mondays.
#The busiest time is 12pm to 3pm and the least busy time is 5pm to 7pm.
#Mondays are the least busy day at the supermarket.
#Therefore, the best times to offer discounts are (see the sketch after this list):
#On Mondays
#5pm - 7pm on any day
#5pm - 7pm on Mondays
#3pm - 5pm on Sundays, Tuesdays and Thursdays.
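#A supporting sketch (not part of the original workflow): rank day/time-slot combinations by transaction count using the `new` data frame built above; the slots with the fewest transactions are a rough proxy for the quietest times that the discount suggestions target.
new %>%
  count(Days, Time_bin) %>%
  arrange(n) %>%
  head(10)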
#Q3: Associative Problem
#Is there a relationship between total sales on weekdays and weekends?
#Plotting a heatmap to visualize the relationship between sales on weekdays vs weekends.
library(superheat)
## Warning: package 'superheat' was built under R version 4.1.3
new$Days <- recode(new$Days,
"Monday" = "Weekdays",
"Tuesday" = "Weekdays",
"Wednesday" = "Weekdays",
"Thursday" = "Weekdays",
"Friday" = "Weekdays",
"Saturday" = "Weekends",
"Sunday" = "Weekends")
#sales relationship between weekdays and weekends
sales <- new %>% select(Total)
#create tags for binning
tags <- c("$0-$200", "$200-$400", "$400-$600","$600-$800", ">$800")
#binning the sales column
Total_bins <- as_tibble(sales) %>%
mutate(Total_bins = case_when(
Total >= 0 & Total < 200 ~ tags[1],
Total >= 200 & Total < 400 ~ tags[2],
Total >= 400 & Total < 600 ~ tags[3],
Total >= 600 & Total < 800 ~ tags[4],
Total >= 800 ~ tags[5]
))
# temporary df
temp <- new %>%
mutate(Total_bins) %>%
count(Days, Total_bins) %>%
filter(Days == "Weekends") %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%"))
#plot
new %>%
mutate(Total_bins) %>%
count(Days, Total_bins) %>%
filter(Days == "Weekdays") %>%
mutate(pct = n / sum(n),
pctlabel = paste0(round(pct*100), "%")) %>%
add_row(temp) %>%
ggplot(mapping = aes(x = Days, y = Total_bins)) +
geom_tile(mapping = aes(fill = pct*100)) +
geom_text(aes(label=pctlabel), colour = "#8c9191") +
scale_fill_distiller(palette = "YlGnBu", direction = 1) +
guides(fill = guide_legend(title = "Percent (%)")) +
theme_minimal() +
labs(x = " ",
y = " ",
title = "Customer spending during Weekdays and Weekends")

#Results:
#Whether on weekdays or weekends, the majority of customers spend only around $0 to $200.
#On weekends, however, a larger share of customers spend $200 to $400 and $400 to $600 than on weekdays (a formal check of this association is sketched below).
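#A hedged follow-up sketch (not part of the original analysis): formally check the association seen in the heatmap. chisq.test() compares the spending-bin distribution between weekdays and weekends, and wilcox.test() compares the Total distributions directly; both use `new` (Days already recoded) and the Total_bins column built above.
assoc_df <- new %>% mutate(Total_bins)
chisq.test(table(assoc_df$Days, assoc_df$Total_bins))
wilcox.test(Total ~ Days, data = assoc_df)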
#Conclusion
#Based on the results, we can conclude that:
#Descriptive Problem: The data shows that there is a preferred payment method for each product line.
#Predictive Problem: The data shows a trend in the number of customers on certain days and at certain times, which informs business decisions such as the best times to offer discounts.
#Associative Problem: Only a weak association is observed between sales on weekdays and weekends.
#EDA does not itself fit models to the data, but it helps business owners explore which data analysis models best suit the data, given the business problems and goals.
#In conclusion, we have successfully met our objective by answering the questions posed at the beginning for the different types of problems.