library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
list.files(path = "../input")
## character(0)
df <- read.csv("supermarket.csv")
head(df)
## Invoice_ID Branch City Customer_type Gender Product_line
## 1 750-67-8428 A Yangon Member Female Health_and_beauty
## 2 226-31-3081 C Naypyitaw Normal Female Electronic_accessories
## 3 631-41-3108 A Yangon Normal Male Home_and_lifestyle
## 4 123-19-1176 A Yangon Member Male Health_and_beauty
## 5 373-73-7910 A Yangon Normal Male Sports_and_travel
## 6 699-14-3026 C Naypyitaw Normal Male Electronic_accessories
## Unit_price Quantity Tax_0.05 Total Date Time Payment
## 1 74.69 7 26.1415 548.9715 1/5/2019 1:08:00_PM Ewallet
## 2 15.28 5 3.8200 80.2200 3/8/2019 10:29:00_AM Cash
## 3 46.33 7 16.2155 340.5255 3/3/2019 1:23:00_PM Credit_card
## 4 58.22 8 23.2880 489.0480 1/27/2019 8:33:00_PM Ewallet
## 5 86.31 7 30.2085 634.3785 2/8/2019 10:37:00_AM Ewallet
## 6 85.39 7 29.8865 627.6165 3/25/2019 6:30:00_PM Ewallet
## cogs gross_margin_percentage gross_income Rating
## 1 522.83 4.761905 26.1415 9.1
## 2 76.40 4.761905 3.8200 9.6
## 3 324.31 4.761905 16.2155 7.4
## 4 465.76 4.761905 23.2880 8.4
## 5 604.17 4.761905 30.2085 5.3
## 6 597.73 4.761905 29.8865 4.1
str(df)
## 'data.frame': 1000 obs. of 17 variables:
## $ Invoice_ID : chr "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr "A" "C" "A" "A" ...
## $ City : chr "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
## $ Customer_type : chr "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Product_line : chr "Health_and_beauty" "Electronic_accessories" "Home_and_lifestyle" "Health_and_beauty" ...
## $ Unit_price : num 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : int 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax_0.05 : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Total : num 549 80.2 340.5 489 634.4 ...
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Time : chr "1:08:00_PM" "10:29:00_AM" "1:23:00_PM" "8:33:00_PM" ...
## $ Payment : chr "Ewallet" "Cash" "Credit_card" "Ewallet" ...
## $ cogs : num 522.8 76.4 324.3 465.8 604.2 ...
## $ gross_margin_percentage: num 4.76 4.76 4.76 4.76 4.76 ...
## $ gross_income : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
summary(df)
## Invoice_ID Branch City Customer_type
## Length:1000 Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Gender Product_line Unit_price Quantity
## Length:1000 Length:1000 Min. :12.45 Min. : 1.00
## Class :character Class :character 1st Qu.:35.84 1st Qu.: 3.00
## Mode :character Mode :character Median :59.55 Median : 6.00
## Mean :58.89 Mean : 5.82
## 3rd Qu.:83.24 3rd Qu.: 8.00
## Max. :99.42 Max. :10.00
## NA's :900 NA's :900
## Tax_0.05 Total Date Time
## Min. : 0.7715 Min. : 16.2 Length:1000 Length:1000
## 1st Qu.: 7.5892 1st Qu.:159.4 Class :character Class :character
## Median :16.1438 Median :339.0 Mode :character Mode :character
## Mean :17.4416 Mean :366.3
## 3rd Qu.:25.6683 3rd Qu.:539.0
## Max. :44.7400 Max. :939.5
## NA's :900 NA's :900
## Payment cogs gross_margin_percentage gross_income
## Length:1000 Min. : 15.43 Min. :4.762 Min. : 0.7715
## Class :character 1st Qu.:151.78 1st Qu.:4.762 1st Qu.: 7.5892
## Mode :character Median :322.88 Median :4.762 Median :16.1438
## Mean :348.83 Mean :4.762 Mean :17.4416
## 3rd Qu.:513.37 3rd Qu.:4.762 3rd Qu.:25.6683
## Max. :894.80 Max. :4.762 Max. :44.7400
## NA's :900 NA's :900 NA's :900
## Rating
## Min. : 4.000
## 1st Qu.: 5.675
## Median : 6.800
## Mean : 6.893
## 3rd Qu.: 8.200
## Max. :10.000
## NA's :900
#Show if there are duplicates
sum(duplicated(df))
## [1] 899
#Show if there are na values
sum(is.na(df))
## [1] 7200
#Obtain all branches, their cities, and the number of all invoices for each branch.
branches <- df %>% #Pipe operator
group_by(City, Branch) %>%
summarise(No.Transaction = n())
## `summarise()` has grouped output by 'City'. You can override using the
## `.groups` argument.
branches
## # A tibble: 4 × 3
## # Groups: City [4]
## City Branch No.Transaction
## <chr> <chr> <int>
## 1 "" "" 900
## 2 "Mandalay" "B" 30
## 3 "Naypyitaw" "C" 35
## 4 "Yangon" "A" 35
#Get the avg tax of each Product_line
df %>%
group_by(Product_line) %>%
summarise("AVG Tax" = mean(Tax_0.05))
## # A tibble: 7 × 2
## Product_line `AVG Tax`
## <chr> <dbl>
## 1 "" NA
## 2 "Electronic_accessories" 15.6
## 3 "Fashion_accessories" 16.1
## 4 "Food_and_beverages" 19.2
## 5 "Health_and_beauty" 17.9
## 6 "Home_and_lifestyle" 15.3
## 7 "Sports_and_travel" 19.5
#Get the smallest item in tax value
filter(df, Tax_0.05 == min(df$Tax_0.05))
## [1] Invoice_ID Branch City
## [4] Customer_type Gender Product_line
## [7] Unit_price Quantity Tax_0.05
## [10] Total Date Time
## [13] Payment cogs gross_margin_percentage
## [16] gross_income Rating
## <0 rows> (or 0-length row.names)
#Get the biggest item in tax value
filter(df, Tax_0.05 == max(df$Tax_0.05))
## [1] Invoice_ID Branch City
## [4] Customer_type Gender Product_line
## [7] Unit_price Quantity Tax_0.05
## [10] Total Date Time
## [13] Payment cogs gross_margin_percentage
## [16] gross_income Rating
## <0 rows> (or 0-length row.names)
library(dplyr)
# Remove duplicates
df_unique <- distinct(df)
# Now you can proceed with your data visualizations using df_unique
#Ù‹ What are payment methods available
payment_methods <- table(df_unique$Payment)
payment_methods
##
## Cash Credit_card Ewallet
## 1 35 25 40
#What is the best-selling Product_line for males and females?
gender_category <- df_unique %>%
group_by(Gender, Product_line) %>%
summarise(Transactions = n())
## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.
gender_category
## # A tibble: 13 × 3
## # Groups: Gender [3]
## Gender Product_line Transactions
## <chr> <chr> <int>
## 1 "" "" 1
## 2 "Female" "Electronic_accessories" 9
## 3 "Female" "Fashion_accessories" 6
## 4 "Female" "Food_and_beverages" 10
## 5 "Female" "Health_and_beauty" 10
## 6 "Female" "Home_and_lifestyle" 6
## 7 "Female" "Sports_and_travel" 8
## 8 "Male" "Electronic_accessories" 7
## 9 "Male" "Fashion_accessories" 6
## 10 "Male" "Food_and_beverages" 7
## 11 "Male" "Health_and_beauty" 12
## 12 "Male" "Home_and_lifestyle" 9
## 13 "Male" "Sports_and_travel" 10
library(ggrepel)
ggplot(data = gender_category, aes(x = Product_line, y= Transactions)) + geom_bar(stat = "identity", width = 0.6,aes(fill = Gender)) + coord_flip() +
geom_text_repel(aes(label = Transactions))

#The avg of goods sold by product line in the 3 branches.
goods_per_product_line <- df_unique %>%
group_by(Product_line) %>%
summarise(avg_goods_sold = mean(cogs))
goods_per_product_line
## # A tibble: 7 × 2
## Product_line avg_goods_sold
## <chr> <dbl>
## 1 "" NA
## 2 "Electronic_accessories" 311.
## 3 "Fashion_accessories" 322.
## 4 "Food_and_beverages" 384.
## 5 "Health_and_beauty" 359.
## 6 "Home_and_lifestyle" 307.
## 7 "Sports_and_travel" 390.
ggplot(data = goods_per_product_line,
aes(x = Product_line, y = avg_goods_sold, fill = Product_line, width = 0.6)) +
geom_bar(stat = "identity") +
labs(title = "Cost Per Category", y = "COGS", x = "Category") +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))# Customize x-axis labels alignment and title aligment
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

#The avg of goods sold by product line per branch
goods_per_branch <- df_unique %>%
group_by(Branch, Product_line) %>%
summarise(count = n(), avg_good_sold = mean(cogs))
## `summarise()` has grouped output by 'Branch'. You can override using the
## `.groups` argument.
labs()
## named list()
## attr(,"class")
## [1] "labels"
ggplot(data = goods_per_branch, aes(x = Branch, y = count, fill = Product_line)) +
geom_bar(stat = "identity") +
labs(title = "Branch wise sale of categories", x = "Branch", y = "Units Sold") +
theme(axis.text.x = element_text(angle = 0)) + scale_fill_brewer(palette="Spectral")

#Get quantity of goods sold per branch
quantity_per_branch <- df_unique %>%
group_by(Branch) %>%
summarise("Quantity of goods sold per branch" = sum(Quantity))
quantity_per_branch
## # A tibble: 4 × 2
## Branch `Quantity of goods sold per branch`
## <chr> <int>
## 1 "" NA
## 2 "A" 214
## 3 "B" 157
## 4 "C" 211
#which supermarket makes the highest profit
Totalrevenue <- df_unique %>%
group_by(City) %>%
summarise("Gross_revenue" = sum(Total))
Totalrevenue
## # A tibble: 4 × 2
## City Gross_revenue
## <chr> <dbl>
## 1 "" NA
## 2 "Mandalay" 10372.
## 3 "Naypyitaw" 13967.
## 4 "Yangon" 12288.
ggplot(data= Totalrevenue, aes(x= City, y= Gross_revenue))+ geom_bar(stat= "identity", width = 0.5, fill = "#75ae97", colour ="White")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

#get review distribution
ggplot(df_unique, aes(x = Rating, fill = factor(Rating))) +
geom_bar(width = 0.06) +
ggtitle("Distribution of Ratings")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_count()`).
