library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
list.files(path = "../input") 
## character(0)
df <- read.csv("supermarket.csv") 

head(df) 
##    Invoice_ID Branch      City Customer_type Gender           Product_line
## 1 750-67-8428      A    Yangon        Member Female      Health_and_beauty
## 2 226-31-3081      C Naypyitaw        Normal Female Electronic_accessories
## 3 631-41-3108      A    Yangon        Normal   Male     Home_and_lifestyle
## 4 123-19-1176      A    Yangon        Member   Male      Health_and_beauty
## 5 373-73-7910      A    Yangon        Normal   Male      Sports_and_travel
## 6 699-14-3026      C Naypyitaw        Normal   Male Electronic_accessories
##   Unit_price Quantity Tax_0.05    Total      Date        Time     Payment
## 1      74.69        7  26.1415 548.9715  1/5/2019  1:08:00_PM     Ewallet
## 2      15.28        5   3.8200  80.2200  3/8/2019 10:29:00_AM        Cash
## 3      46.33        7  16.2155 340.5255  3/3/2019  1:23:00_PM Credit_card
## 4      58.22        8  23.2880 489.0480 1/27/2019  8:33:00_PM     Ewallet
## 5      86.31        7  30.2085 634.3785  2/8/2019 10:37:00_AM     Ewallet
## 6      85.39        7  29.8865 627.6165 3/25/2019  6:30:00_PM     Ewallet
##     cogs gross_margin_percentage gross_income Rating
## 1 522.83                4.761905      26.1415    9.1
## 2  76.40                4.761905       3.8200    9.6
## 3 324.31                4.761905      16.2155    7.4
## 4 465.76                4.761905      23.2880    8.4
## 5 604.17                4.761905      30.2085    5.3
## 6 597.73                4.761905      29.8865    4.1
str(df) 
## 'data.frame':    1000 obs. of  17 variables:
##  $ Invoice_ID             : chr  "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
##  $ Branch                 : chr  "A" "C" "A" "A" ...
##  $ City                   : chr  "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
##  $ Customer_type          : chr  "Member" "Normal" "Normal" "Member" ...
##  $ Gender                 : chr  "Female" "Female" "Male" "Male" ...
##  $ Product_line           : chr  "Health_and_beauty" "Electronic_accessories" "Home_and_lifestyle" "Health_and_beauty" ...
##  $ Unit_price             : num  74.7 15.3 46.3 58.2 86.3 ...
##  $ Quantity               : int  7 5 7 8 7 7 6 10 2 3 ...
##  $ Tax_0.05               : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Total                  : num  549 80.2 340.5 489 634.4 ...
##  $ Date                   : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Time                   : chr  "1:08:00_PM" "10:29:00_AM" "1:23:00_PM" "8:33:00_PM" ...
##  $ Payment                : chr  "Ewallet" "Cash" "Credit_card" "Ewallet" ...
##  $ cogs                   : num  522.8 76.4 324.3 465.8 604.2 ...
##  $ gross_margin_percentage: num  4.76 4.76 4.76 4.76 4.76 ...
##  $ gross_income           : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Rating                 : num  9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
summary(df) 
##   Invoice_ID           Branch              City           Customer_type     
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Gender          Product_line         Unit_price       Quantity    
##  Length:1000        Length:1000        Min.   :12.45   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.:35.84   1st Qu.: 3.00  
##  Mode  :character   Mode  :character   Median :59.55   Median : 6.00  
##                                        Mean   :58.89   Mean   : 5.82  
##                                        3rd Qu.:83.24   3rd Qu.: 8.00  
##                                        Max.   :99.42   Max.   :10.00  
##                                        NA's   :900     NA's   :900    
##     Tax_0.05           Total           Date               Time          
##  Min.   : 0.7715   Min.   : 16.2   Length:1000        Length:1000       
##  1st Qu.: 7.5892   1st Qu.:159.4   Class :character   Class :character  
##  Median :16.1438   Median :339.0   Mode  :character   Mode  :character  
##  Mean   :17.4416   Mean   :366.3                                        
##  3rd Qu.:25.6683   3rd Qu.:539.0                                        
##  Max.   :44.7400   Max.   :939.5                                        
##  NA's   :900       NA's   :900                                          
##    Payment               cogs        gross_margin_percentage  gross_income    
##  Length:1000        Min.   : 15.43   Min.   :4.762           Min.   : 0.7715  
##  Class :character   1st Qu.:151.78   1st Qu.:4.762           1st Qu.: 7.5892  
##  Mode  :character   Median :322.88   Median :4.762           Median :16.1438  
##                     Mean   :348.83   Mean   :4.762           Mean   :17.4416  
##                     3rd Qu.:513.37   3rd Qu.:4.762           3rd Qu.:25.6683  
##                     Max.   :894.80   Max.   :4.762           Max.   :44.7400  
##                     NA's   :900      NA's   :900             NA's   :900      
##      Rating      
##  Min.   : 4.000  
##  1st Qu.: 5.675  
##  Median : 6.800  
##  Mean   : 6.893  
##  3rd Qu.: 8.200  
##  Max.   :10.000  
##  NA's   :900
#Show if there are duplicates 

sum(duplicated(df)) 
## [1] 899
#Show if there are na values 

sum(is.na(df)) 
## [1] 7200
#Obtain all branches, their cities, and the number of all invoices for each branch. 

branches <- df %>% #Pipe operator 

group_by(City, Branch) %>% 

summarise(No.Transaction = n()) 
## `summarise()` has grouped output by 'City'. You can override using the
## `.groups` argument.
branches 
## # A tibble: 4 × 3
## # Groups:   City [4]
##   City        Branch No.Transaction
##   <chr>       <chr>           <int>
## 1 ""          ""                900
## 2 "Mandalay"  "B"                30
## 3 "Naypyitaw" "C"                35
## 4 "Yangon"    "A"                35
#Get the avg tax of each Product_line 

df %>%  

group_by(Product_line) %>% 

summarise("AVG Tax" = mean(Tax_0.05)) 
## # A tibble: 7 × 2
##   Product_line             `AVG Tax`
##   <chr>                        <dbl>
## 1 ""                            NA  
## 2 "Electronic_accessories"      15.6
## 3 "Fashion_accessories"         16.1
## 4 "Food_and_beverages"          19.2
## 5 "Health_and_beauty"           17.9
## 6 "Home_and_lifestyle"          15.3
## 7 "Sports_and_travel"           19.5
#Get the smallest item in tax value 

filter(df, Tax_0.05 == min(df$Tax_0.05)) 
##  [1] Invoice_ID              Branch                  City                   
##  [4] Customer_type           Gender                  Product_line           
##  [7] Unit_price              Quantity                Tax_0.05               
## [10] Total                   Date                    Time                   
## [13] Payment                 cogs                    gross_margin_percentage
## [16] gross_income            Rating                 
## <0 rows> (or 0-length row.names)
#Get the biggest item in tax value 

filter(df, Tax_0.05 == max(df$Tax_0.05)) 
##  [1] Invoice_ID              Branch                  City                   
##  [4] Customer_type           Gender                  Product_line           
##  [7] Unit_price              Quantity                Tax_0.05               
## [10] Total                   Date                    Time                   
## [13] Payment                 cogs                    gross_margin_percentage
## [16] gross_income            Rating                 
## <0 rows> (or 0-length row.names)
library(dplyr)

# Remove duplicates
df_unique <- distinct(df)

# Now you can proceed with your data visualizations using df_unique


#Ù‹ What are payment methods available  

payment_methods <- table(df_unique$Payment) 

payment_methods 
## 
##                    Cash Credit_card     Ewallet 
##           1          35          25          40
#What is the best-selling Product_line for males and females? 

gender_category <- df_unique %>% 

group_by(Gender, Product_line) %>% 

summarise(Transactions = n())  
## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.
gender_category 
## # A tibble: 13 × 3
## # Groups:   Gender [3]
##    Gender   Product_line             Transactions
##    <chr>    <chr>                           <int>
##  1 ""       ""                                  1
##  2 "Female" "Electronic_accessories"            9
##  3 "Female" "Fashion_accessories"               6
##  4 "Female" "Food_and_beverages"               10
##  5 "Female" "Health_and_beauty"                10
##  6 "Female" "Home_and_lifestyle"                6
##  7 "Female" "Sports_and_travel"                 8
##  8 "Male"   "Electronic_accessories"            7
##  9 "Male"   "Fashion_accessories"               6
## 10 "Male"   "Food_and_beverages"                7
## 11 "Male"   "Health_and_beauty"                12
## 12 "Male"   "Home_and_lifestyle"                9
## 13 "Male"   "Sports_and_travel"                10
library(ggrepel) 

ggplot(data = gender_category, aes(x = Product_line, y= Transactions)) + geom_bar(stat = "identity", width = 0.6,aes(fill = Gender)) + coord_flip() + 

geom_text_repel(aes(label = Transactions)) 

#The avg of goods sold by product line in the 3 branches. 

goods_per_product_line <- df_unique %>% 

group_by(Product_line) %>% 

summarise(avg_goods_sold = mean(cogs)) 

goods_per_product_line 
## # A tibble: 7 × 2
##   Product_line             avg_goods_sold
##   <chr>                             <dbl>
## 1 ""                                  NA 
## 2 "Electronic_accessories"           311.
## 3 "Fashion_accessories"              322.
## 4 "Food_and_beverages"               384.
## 5 "Health_and_beauty"                359.
## 6 "Home_and_lifestyle"               307.
## 7 "Sports_and_travel"                390.
ggplot(data = goods_per_product_line, 

aes(x = Product_line, y = avg_goods_sold, fill = Product_line, width = 0.6)) +  

geom_bar(stat = "identity") + 

labs(title = "Cost Per Category", y = "COGS", x = "Category") +  

theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))# Customize x-axis labels alignment and title aligment  
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

#The avg of goods sold by product line per branch 

goods_per_branch <- df_unique %>% 

group_by(Branch, Product_line) %>% 

summarise(count = n(), avg_good_sold = mean(cogs)) 
## `summarise()` has grouped output by 'Branch'. You can override using the
## `.groups` argument.
labs() 
## named list()
## attr(,"class")
## [1] "labels"
ggplot(data = goods_per_branch, aes(x = Branch, y = count, fill = Product_line)) +  

geom_bar(stat = "identity") + 

labs(title = "Branch wise sale of categories", x = "Branch", y = "Units Sold") +  

theme(axis.text.x = element_text(angle = 0)) +  scale_fill_brewer(palette="Spectral") 

#Get quantity of goods sold per branch 

quantity_per_branch <- df_unique %>% 

group_by(Branch) %>% 

summarise("Quantity of goods sold per branch" = sum(Quantity)) 

quantity_per_branch 
## # A tibble: 4 × 2
##   Branch `Quantity of goods sold per branch`
##   <chr>                                <int>
## 1 ""                                      NA
## 2 "A"                                    214
## 3 "B"                                    157
## 4 "C"                                    211
#which supermarket makes the highest profit 

Totalrevenue <- df_unique %>% 

group_by(City) %>% 

summarise("Gross_revenue" = sum(Total)) 

Totalrevenue 
## # A tibble: 4 × 2
##   City        Gross_revenue
##   <chr>               <dbl>
## 1 ""                    NA 
## 2 "Mandalay"         10372.
## 3 "Naypyitaw"        13967.
## 4 "Yangon"           12288.
ggplot(data= Totalrevenue, aes(x= City, y= Gross_revenue))+ geom_bar(stat= "identity", width = 0.5, fill = "#75ae97", colour ="White") 
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

#get review distribution 

ggplot(df_unique, aes(x = Rating, fill = factor(Rating))) + 

  geom_bar(width = 0.06) +  

  ggtitle("Distribution of Ratings") 
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_count()`).