library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

list.files(path = "../input")
## character(0)
#Load our Data Into A Data Frame
df <- read.csv("supermarket1.csv")
library(dplyr)

# Load our Data Into A Data Frame
df <- read.csv("supermarket.csv", nrows = 100)
head(df)
##    Invoice_ID Branch      City Customer_type Gender           Product_line
## 1 750-67-8428      A    Yangon        Member Female      Health_and_beauty
## 2 226-31-3081      C Naypyitaw        Normal Female Electronic_accessories
## 3 631-41-3108      A    Yangon        Normal   Male     Home_and_lifestyle
## 4 123-19-1176      A    Yangon        Member   Male      Health_and_beauty
## 5 373-73-7910      A    Yangon        Normal   Male      Sports_and_travel
## 6 699-14-3026      C Naypyitaw        Normal   Male Electronic_accessories
##   Unit_price Quantity Tax_0.05    Total      Date        Time     Payment
## 1      74.69        7  26.1415 548.9715  1/5/2019  1:08:00_PM     Ewallet
## 2      15.28        5   3.8200  80.2200  3/8/2019 10:29:00_AM        Cash
## 3      46.33        7  16.2155 340.5255  3/3/2019  1:23:00_PM Credit_card
## 4      58.22        8  23.2880 489.0480 1/27/2019  8:33:00_PM     Ewallet
## 5      86.31        7  30.2085 634.3785  2/8/2019 10:37:00_AM     Ewallet
## 6      85.39        7  29.8865 627.6165 3/25/2019  6:30:00_PM     Ewallet
##     cogs gross_margin_percentage gross_income Rating
## 1 522.83                4.761905      26.1415    9.1
## 2  76.40                4.761905       3.8200    9.6
## 3 324.31                4.761905      16.2155    7.4
## 4 465.76                4.761905      23.2880    8.4
## 5 604.17                4.761905      30.2085    5.3
## 6 597.73                4.761905      29.8865    4.1
str(df)
## 'data.frame':    100 obs. of  17 variables:
##  $ Invoice_ID             : chr  "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
##  $ Branch                 : chr  "A" "C" "A" "A" ...
##  $ City                   : chr  "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
##  $ Customer_type          : chr  "Member" "Normal" "Normal" "Member" ...
##  $ Gender                 : chr  "Female" "Female" "Male" "Male" ...
##  $ Product_line           : chr  "Health_and_beauty" "Electronic_accessories" "Home_and_lifestyle" "Health_and_beauty" ...
##  $ Unit_price             : num  74.7 15.3 46.3 58.2 86.3 ...
##  $ Quantity               : int  7 5 7 8 7 7 6 10 2 3 ...
##  $ Tax_0.05               : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Total                  : num  549 80.2 340.5 489 634.4 ...
##  $ Date                   : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Time                   : chr  "1:08:00_PM" "10:29:00_AM" "1:23:00_PM" "8:33:00_PM" ...
##  $ Payment                : chr  "Ewallet" "Cash" "Credit_card" "Ewallet" ...
##  $ cogs                   : num  522.8 76.4 324.3 465.8 604.2 ...
##  $ gross_margin_percentage: num  4.76 4.76 4.76 4.76 4.76 ...
##  $ gross_income           : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Rating                 : num  9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
summary(df)
##   Invoice_ID           Branch              City           Customer_type     
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     Gender          Product_line         Unit_price       Quantity    
##  Length:100         Length:100         Min.   :12.45   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.:35.84   1st Qu.: 3.00  
##  Mode  :character   Mode  :character   Median :59.55   Median : 6.00  
##                                        Mean   :58.89   Mean   : 5.82  
##                                        3rd Qu.:83.24   3rd Qu.: 8.00  
##                                        Max.   :99.42   Max.   :10.00  
##     Tax_0.05           Total           Date               Time          
##  Min.   : 0.7715   Min.   : 16.2   Length:100         Length:100        
##  1st Qu.: 7.5892   1st Qu.:159.4   Class :character   Class :character  
##  Median :16.1438   Median :339.0   Mode  :character   Mode  :character  
##  Mean   :17.4416   Mean   :366.3                                        
##  3rd Qu.:25.6683   3rd Qu.:539.0                                        
##  Max.   :44.7400   Max.   :939.5                                        
##    Payment               cogs        gross_margin_percentage  gross_income    
##  Length:100         Min.   : 15.43   Min.   :4.762           Min.   : 0.7715  
##  Class :character   1st Qu.:151.78   1st Qu.:4.762           1st Qu.: 7.5892  
##  Mode  :character   Median :322.88   Median :4.762           Median :16.1438  
##                     Mean   :348.83   Mean   :4.762           Mean   :17.4416  
##                     3rd Qu.:513.37   3rd Qu.:4.762           3rd Qu.:25.6683  
##                     Max.   :894.80   Max.   :4.762           Max.   :44.7400  
##      Rating      
##  Min.   : 4.000  
##  1st Qu.: 5.675  
##  Median : 6.800  
##  Mean   : 6.893  
##  3rd Qu.: 8.200  
##  Max.   :10.000
#Show if there are duplicates
sum(duplicated(df))
## [1] 0
#Show if there are na values
sum(is.na(df))
## [1] 0
#Obtain all branches, their cities, and the number of all invoices for each branch.
branches <- df %>% #Pipe operator
group_by(City, Branch) %>%
summarise(Transaction = n())
## `summarise()` has grouped output by 'City'. You can override using the
## `.groups` argument.
branches
## # A tibble: 3 × 3
## # Groups:   City [3]
##   City      Branch Transaction
##   <chr>     <chr>        <int>
## 1 Mandalay  B               30
## 2 Naypyitaw C               35
## 3 Yangon    A               35
#Get the avg tax of each Product_line
df %>% 
group_by(Product_line) %>%
summarise("AVG Tax" = mean(Tax_0.05))
## # A tibble: 6 × 2
##   Product_line           `AVG Tax`
##   <chr>                      <dbl>
## 1 Electronic_accessories      15.6
## 2 Fashion_accessories         16.1
## 3 Food_and_beverages          19.2
## 4 Health_and_beauty           17.9
## 5 Home_and_lifestyle          15.3
## 6 Sports_and_travel           19.5
#Get the smallest item in tax value
filter(df, Tax_0.05 == min(df$Tax_0.05))
##    Invoice_ID Branch      City Customer_type Gender        Product_line
## 1 778-71-5554      C Naypyitaw        Member   Male Fashion_accessories
##   Unit_price Quantity Tax_0.05   Total      Date       Time     Payment  cogs
## 1      15.43        1   0.7715 16.2015 1/25/2019 3:46:00_PM Credit_card 15.43
##   gross_margin_percentage gross_income Rating
## 1                4.761905       0.7715    6.1
#Get the biggest item in tax value
filter(df, Tax_0.05 == max(df$Tax_0.05))
##    Invoice_ID Branch      City Customer_type Gender       Product_line
## 1 393-65-2792      C Naypyitaw        Normal   Male Food_and_beverages
##   Unit_price Quantity Tax_0.05  Total     Date        Time     Payment  cogs
## 1      89.48       10    44.74 939.54 1/6/2019 12:46:00_PM Credit_card 894.8
##   gross_margin_percentage gross_income Rating
## 1                4.761905        44.74    9.6
#Ù‹What are payment methods avaliable 
payment_methods <- table(df$Payment)
payment_methods
## 
##        Cash Credit_card     Ewallet 
##          35          25          40
#What is the best-selling Product_line for males and females?
gender_category <- df %>%
group_by(Gender, Product_line) %>%
summarise(Transactions = n()) 
## `summarise()` has grouped output by 'Gender'. You can override using the
## `.groups` argument.
gender_category
## # A tibble: 12 × 3
## # Groups:   Gender [2]
##    Gender Product_line           Transactions
##    <chr>  <chr>                         <int>
##  1 Female Electronic_accessories            9
##  2 Female Fashion_accessories               6
##  3 Female Food_and_beverages               10
##  4 Female Health_and_beauty                10
##  5 Female Home_and_lifestyle                6
##  6 Female Sports_and_travel                 8
##  7 Male   Electronic_accessories            7
##  8 Male   Fashion_accessories               6
##  9 Male   Food_and_beverages                7
## 10 Male   Health_and_beauty                12
## 11 Male   Home_and_lifestyle                9
## 12 Male   Sports_and_travel                10
library(ggrepel)

ggplot(data = gender_category, aes(x = Product_line, y= Transactions)) + geom_bar(stat = "identity", width = 0.6,aes(fill = Gender)) + coord_flip() +
geom_text(aes(label = Transactions), size = 3)

#The avg of goods sold by product line in the 3 branches.
goods_per_product_line <- df %>%
group_by(Product_line) %>%
summarise(avg_goods_sold = mean(cogs))
goods_per_product_line
## # A tibble: 6 × 2
##   Product_line           avg_goods_sold
##   <chr>                           <dbl>
## 1 Electronic_accessories           311.
## 2 Fashion_accessories              322.
## 3 Food_and_beverages               384.
## 4 Health_and_beauty                359.
## 5 Home_and_lifestyle               307.
## 6 Sports_and_travel                390.
ggplot(data = goods_per_product_line,
aes(x = Product_line, y = avg_goods_sold, fill = Product_line, width = 0.6)) + 
geom_bar(stat = "identity") +
labs(title = "Cost Per Category", y = "COGS", x = "Category") + 
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))# Customize x-axis labels alignment and title aligment 

#The avg of goods sold by product line per branch
goods_per_branch <- df %>%
group_by(Branch, Product_line) %>%
summarise(count = n(), avg_good_sold = mean(cogs))
## `summarise()` has grouped output by 'Branch'. You can override using the
## `.groups` argument.
labs()
## named list()
## attr(,"class")
## [1] "labels"
ggplot(data = goods_per_branch, aes(x = Branch, y = count, fill = Product_line)) + 
geom_bar(stat = "identity") +
labs(title = "Branch wise sale of categories", x = "Branch", y = "Units Sold") + 
theme(axis.text.x = element_text(angle = 0)) +  scale_fill_brewer(palette="Spectral")      

#Get quantity of goods sold per branch
quantity_per_branch <- df %>%
group_by(Branch) %>%
summarise("Quantity of goods sold per branch" = sum(Quantity))
quantity_per_branch
## # A tibble: 3 × 2
##   Branch `Quantity of goods sold per branch`
##   <chr>                                <int>
## 1 A                                      214
## 2 B                                      157
## 3 C                                      211
#which supermarket makes the highest profit
Totalrevenue <- df %>%
group_by(City) %>%
summarise("Gross_revenue" = sum(Total))
Totalrevenue
## # A tibble: 3 × 2
##   City      Gross_revenue
##   <chr>             <dbl>
## 1 Mandalay         10372.
## 2 Naypyitaw        13967.
## 3 Yangon           12288.
ggplot(data= Totalrevenue, aes(x= City, y= Gross_revenue))+ geom_bar(stat= "identity", width = 0.5, fill = "#75ae97", colour ="White")

#get review distribution
ggplot(df, aes(x = Rating, fill = factor(Rating))) +
  geom_bar(width = 0.06) + 
  ggtitle("Distribution of Ratings")