#1.ASK
#1.0 Business task
#.The average sales amount and the number of sales at each branch
#.The average unit price for each product line
#.Do branches make more sale on a specific day of the week
#.distribution of average sales amount by gender
#.distribution of average sales amount by gender per branch
#which is the most popular product line by quantity sold
#which is the most profitable product line
#1.1 Stakeholders
#stakeholders include the following
#.Marketing & sales director
#.Analytics team
#.Sales team
#2.PREPARE
#.The data is located on Kaggle & contains sales data of different branches over a 3 month period.
#.Data is organized in csv files
#.Credibility of data not in question
#.This data has been stripped of all identifying information ensuring its privacy
#3.PROCESS
#.For this project I choose RStudio Desktop in order to prepare, process, clean, analyze and create the visualizations.
#Data review involved the following:
#.Checking column names across all the 12 original files.
#.Checking for missing values.
#.Checking of white spaces.
#.Checking of duplicate records.
#3.1 COLLECT & DATA WRANGLING
#load readr for reading rectangular data
#load dplyr for data wrangling
#load ggplot2 for data visualization
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(ggplot2)
#read data set using the read_csv function
supermarket_sales<-read_csv("C:/Users/user/Desktop/RSTUDIO/supermarket_sales.csv")
## Rows: 1000 Columns: 17
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (8): Invoice_ID, Branch, City, Customer_type, Gender, Product_line, Dat...
## dbl (8): Unit_price, Quantity, Tax _5%, Total, cogs, gross_margin_percentag...
## time (1): Time
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#3.2 Data Validation
head(supermarket_sales,5)
## # A tibble: 5 x 17
## Invoice_ID Branch City Customer_type Gender Product_line Unit_price Quantity
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 750-67-8428 A Yang~ Member Female Health and ~ 74.7 7
## 2 226-31-3081 C Nayp~ Normal Female Electronic ~ 15.3 5
## 3 631-41-3108 A Yang~ Normal Male Home and li~ 46.3 7
## 4 123-19-1176 A Yang~ Member Male Health and ~ 58.2 8
## 5 373-73-7910 A Yang~ Normal Male Sports and ~ 86.3 7
## # ... with 9 more variables: Tax _5% <dbl>, Total <dbl>, Date <chr>,
## # Time <time>, Payment <chr>, cogs <dbl>, gross_margin_percentage <dbl>,
## # gross_income <dbl>, Rating <dbl>
tail(supermarket_sales)
## # A tibble: 6 x 17
## Invoice_ID Branch City Customer_type Gender Product_line Unit_price Quantity
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 652-49-6720 C Nayp~ Member Female Electronic ~ 61.0 1
## 2 233-67-5758 C Nayp~ Normal Male Health and ~ 40.4 1
## 3 303-96-2227 B Mand~ Normal Female Home and li~ 97.4 10
## 4 727-02-1313 A Yang~ Member Male Food and be~ 31.8 1
## 5 347-56-2442 A Yang~ Normal Male Home and li~ 65.8 1
## 6 849-09-3807 A Yang~ Member Female Fashion acc~ 88.3 7
## # ... with 9 more variables: Tax _5% <dbl>, Total <dbl>, Date <chr>,
## # Time <time>, Payment <chr>, cogs <dbl>, gross_margin_percentage <dbl>,
## # gross_income <dbl>, Rating <dbl>
str(supermarket_sales)
## spec_tbl_df [1,000 x 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Invoice_ID : chr [1:1000] "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr [1:1000] "A" "C" "A" "A" ...
## $ City : chr [1:1000] "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
## $ Customer_type : chr [1:1000] "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr [1:1000] "Female" "Female" "Male" "Male" ...
## $ Product_line : chr [1:1000] "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
## $ Unit_price : num [1:1000] 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : num [1:1000] 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax _5% : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
## $ Total : num [1:1000] 549 80.2 340.5 489 634.4 ...
## $ Date : chr [1:1000] "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Time : 'hms' num [1:1000] 13:08:00 10:29:00 13:23:00 20:33:00 ...
## ..- attr(*, "units")= chr "secs"
## $ Payment : chr [1:1000] "Ewallet" "Cash" "Credit card" "Ewallet" ...
## $ cogs : num [1:1000] 522.8 76.4 324.3 465.8 604.2 ...
## $ gross_margin_percentage: num [1:1000] 4.76 4.76 4.76 4.76 4.76 ...
## $ gross_income : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num [1:1000] 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
## - attr(*, "spec")=
## .. cols(
## .. Invoice_ID = col_character(),
## .. Branch = col_character(),
## .. City = col_character(),
## .. Customer_type = col_character(),
## .. Gender = col_character(),
## .. Product_line = col_character(),
## .. Unit_price = col_double(),
## .. Quantity = col_double(),
## .. `Tax _5%` = col_double(),
## .. Total = col_double(),
## .. Date = col_character(),
## .. Time = col_time(format = ""),
## .. Payment = col_character(),
## .. cogs = col_double(),
## .. gross_margin_percentage = col_double(),
## .. gross_income = col_double(),
## .. Rating = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
glimpse(supermarket_sales)
## Rows: 1,000
## Columns: 17
## $ Invoice_ID <chr> "750-67-8428", "226-31-3081", "631-41-3108", "~
## $ Branch <chr> "A", "C", "A", "A", "A", "C", "A", "C", "A", "~
## $ City <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", "Ya~
## $ Customer_type <chr> "Member", "Normal", "Normal", "Member", "Norma~
## $ Gender <chr> "Female", "Female", "Male", "Male", "Male", "M~
## $ Product_line <chr> "Health and beauty", "Electronic accessories",~
## $ Unit_price <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39, 68.8~
## $ Quantity <dbl> 7, 5, 7, 8, 7, 7, 6, 10, 2, 3, 4, 4, 5, 10, 10~
## $ `Tax _5%` <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, 29~
## $ Total <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634.378~
## $ Date <chr> "1/5/2019", "3/8/2019", "3/3/2019", "1/27/2019~
## $ Time <time> 13:08:00, 10:29:00, 13:23:00, 20:33:00, 10:37~
## $ Payment <chr> "Ewallet", "Cash", "Credit card", "Ewallet", "~
## $ cogs <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597.73,~
## $ gross_margin_percentage <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.7619~
## $ gross_income <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, 29~
## $ Rating <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1, 5.8, 8.0, 7.2, 5~
#4Data Cleaning
#Check for missing values
colSums(is.na(supermarket_sales))
## Invoice_ID Branch City
## 0 0 0
## Customer_type Gender Product_line
## 0 0 0
## Unit_price Quantity Tax _5%
## 0 0 0
## Total Date Time
## 0 0 0
## Payment cogs gross_margin_percentage
## 0 0 0
## gross_income Rating
## 0 0
#check for duplicates
distinct(supermarket_sales)
## # A tibble: 1,000 x 17
## Invoice_ID Branch City Customer_type Gender Product_line Unit_price Quantity
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 750-67-84~ A Yang~ Member Female Health and ~ 74.7 7
## 2 226-31-30~ C Nayp~ Normal Female Electronic ~ 15.3 5
## 3 631-41-31~ A Yang~ Normal Male Home and li~ 46.3 7
## 4 123-19-11~ A Yang~ Member Male Health and ~ 58.2 8
## 5 373-73-79~ A Yang~ Normal Male Sports and ~ 86.3 7
## 6 699-14-30~ C Nayp~ Normal Male Electronic ~ 85.4 7
## 7 355-53-59~ A Yang~ Member Female Electronic ~ 68.8 6
## 8 315-22-56~ C Nayp~ Normal Female Home and li~ 73.6 10
## 9 665-32-91~ A Yang~ Member Female Health and ~ 36.3 2
## 10 692-92-55~ B Mand~ Member Female Food and be~ 54.8 3
## # ... with 990 more rows, and 9 more variables: Tax _5% <dbl>, Total <dbl>,
## # Date <chr>, Time <time>, Payment <chr>, cogs <dbl>,
## # gross_margin_percentage <dbl>, gross_income <dbl>, Rating <dbl>
#5.Analysis
#.The average sales amount per branch
avg_sales_amnt<-supermarket_sales%>%
group_by(Branch)%>%
summarise(avg_sales=mean(Total))
avg_sales_amnt
## # A tibble: 3 x 2
## Branch avg_sales
## <chr> <dbl>
## 1 A 312.
## 2 B 320.
## 3 C 337.
#.The average unit price of products in each product line
avg_unit_price<-supermarket_sales%>%
group_by(Product_line)%>%
summarise(avg_unit_price=mean(Unit_price))%>%
arrange(desc(avg_unit_price))
avg_unit_price
## # A tibble: 6 x 2
## Product_line avg_unit_price
## <chr> <dbl>
## 1 Fashion accessories 57.2
## 2 Sports and travel 57.0
## 3 Food and beverages 56.0
## 4 Home and lifestyle 55.3
## 5 Health and beauty 54.9
## 6 Electronic accessories 53.6
#.distribution of average sales amount by gender
avg_sales_by_gender<-supermarket_sales%>%
group_by(Gender)%>%
summarise(avg_sales=mean(Total))
avg_sales_by_gender
## # A tibble: 2 x 2
## Gender avg_sales
## <chr> <dbl>
## 1 Female 335.
## 2 Male 311.
#.distribution of average sales amount by gender per branch
avg_sales_by_gender_branch<-supermarket_sales%>%
group_by(Gender,Branch)%>%
summarise(avg_sales=mean(Total))
## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.
avg_sales_by_gender_branch
## # A tibble: 6 x 3
## # Groups: Gender [2]
## Gender Branch avg_sales
## <chr> <chr> <dbl>
## 1 Female A 331.
## 2 Female B 327.
## 3 Female C 347.
## 4 Male A 296.
## 5 Male B 313.
## 6 Male C 326.
#which is the most popular product line by quantity sold
popular_product<-supermarket_sales%>%
group_by(Product_line)%>%
summarise(avg_qnty=mean(Quantity))%>%
arrange(desc(avg_qnty))
popular_product
## # A tibble: 6 x 2
## Product_line avg_qnty
## <chr> <dbl>
## 1 Electronic accessories 5.71
## 2 Home and lifestyle 5.69
## 3 Health and beauty 5.62
## 4 Sports and travel 5.54
## 5 Food and beverages 5.47
## 6 Fashion accessories 5.07
#which is the most profitable product line
profitable_product<-supermarket_sales%>%
group_by(Product_line)%>%
summarise(avg_gross_income=mean(gross_income))%>%
arrange(desc(avg_gross_income))
profitable_product
## # A tibble: 6 x 2
## Product_line avg_gross_income
## <chr> <dbl>
## 1 Home and lifestyle 16.0
## 2 Sports and travel 15.8
## 3 Health and beauty 15.4
## 4 Food and beverages 15.4
## 5 Electronic accessories 15.2
## 6 Fashion accessories 14.5
#6.VISUALIZATON
#.The average sales amount per branch
ggplot(avg_sales_amnt, aes(x=Branch,y=avg_sales))+
geom_col()

#.The average unit price of products in each product line
ggplot(avg_unit_price,aes(x=Product_line,y=avg_unit_price))+
geom_col()

#.distribution of average sales amount by gender
ggplot(avg_sales_by_gender,aes(x=Gender,y=avg_sales,color=Gender))+
geom_col()

#.distribution of average sales amount by gender per branch
ggplot(avg_sales_by_gender_branch,aes(x=Gender,y=avg_sales))+
geom_col()+
facet_wrap(~Branch)

#which is the most popular product line by quantity sold
ggplot(popular_product,aes(x=Product_line,y=avg_qnty))+
geom_col()

#which is the most profitable product line
ggplot(profitable_product,aes(x=Product_line,y=avg_gross_income,color=Product_line))+
geom_col()

# 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS
#=================================================
# Create a csv file that we will visualize in Tableau
# N.B.: This file location is for a Mac. If you are working on a PC, change the file location accordingly (most likely "C:\Users\YOUR_USERNAME\Desktop\...") to export the data. You can read more here: https://datatofish.com/export-dataframe-to-csv-in-r/
write.csv(profitable_product,'C:\\Users\\user\\Desktop\\RSTUDIO\\profitable_product.csv')
write.csv(popular_product,'C:\\Users\\user\\Desktop\\RSTUDIO\\popular_product.csv')
write.csv(avg_sales_by_gender_branch,'C:\\Users\\user\\Desktop\\RSTUDIO\\avg_sales_by_gender_branch.csv')
write.csv(avg_sales_by_gender,'C:\\Users\\user\\Desktop\\RSTUDIO\\avg_sales_by_gender.csv')
write.csv(avg_sales_amnt,'C:\\Users\\user\\Desktop\\RSTUDIO\\avg_sales_amnt.csv')
write.csv(avg_unit_price,'C:\\Users\\user\\Desktop\\RSTUDIO\\avg_unit_price.csv')
#Tableau visualisation
#https://public.tableau.com/views/supermarketsales_analysis/supermarket-sales_analysis?:language=en-US&publish=yes&:display_count=n&:origin=viz_share_link