library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
supermarket_sales_Sheet1 <- read_csv("C:/Users/USER/Desktop/supermarket_sales - Sheet1.csv")
## Rows: 1000 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): Invoice ID, Branch, City, Customer type, Gender, Product line, Dat...
## dbl  (8): Unit price, Quantity, Tax _five_perc, Total, cogs, gross margin pe...
## time (1): Time
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(supermarket_sales_Sheet1)<-str_to_title(str_replace_all(names(supermarket_sales_Sheet1),"\\s","_"))

EXPLORE THE DATA AND CHECK MISSING VALUES AND DUPLICATES

supermarket<-supermarket_sales_Sheet1%>% distinct()
supermarket$Date<-dmy(supermarket$Date)
## Warning: 587 failed to parse.
str(supermarket)
## tibble [1,000 × 17] (S3: tbl_df/tbl/data.frame)
##  $ Invoice_id             : chr [1:1000] "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
##  $ Branch                 : chr [1:1000] "A" "C" "A" "A" ...
##  $ City                   : chr [1:1000] "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
##  $ Customer_type          : chr [1:1000] "Member" "Normal" "Normal" "Member" ...
##  $ Gender                 : chr [1:1000] "Female" "Female" "Male" "Male" ...
##  $ Product_line           : chr [1:1000] "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
##  $ Unit_price             : num [1:1000] 74.7 15.3 46.3 58.2 86.3 ...
##  $ Quantity               : num [1:1000] 7 5 7 8 7 7 6 10 2 3 ...
##  $ Tax__five_perc         : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
##  $ Total                  : num [1:1000] 549 80.2 340.5 489 634.4 ...
##  $ Date                   : Date[1:1000], format: "2019-05-01" "2019-08-03" ...
##  $ Time                   : 'hms' num [1:1000] 13:08:00 10:29:00 13:23:00 20:33:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ Payment                : chr [1:1000] "Ewallet" "Cash" "Credit card" "Ewallet" ...
##  $ Cogs                   : num [1:1000] 522.8 76.4 324.3 465.8 604.2 ...
##  $ Gross_margin_percentage: num [1:1000] 4.76 4.76 4.76 4.76 4.76 ...
##  $ Gross_income           : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
##  $ Rating                 : num [1:1000] 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
names(supermarket)
##  [1] "Invoice_id"              "Branch"                 
##  [3] "City"                    "Customer_type"          
##  [5] "Gender"                  "Product_line"           
##  [7] "Unit_price"              "Quantity"               
##  [9] "Tax__five_perc"          "Total"                  
## [11] "Date"                    "Time"                   
## [13] "Payment"                 "Cogs"                   
## [15] "Gross_margin_percentage" "Gross_income"           
## [17] "Rating"
dim(supermarket)
## [1] 1000   17
summary(supermarket)
##   Invoice_id           Branch              City           Customer_type     
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Gender          Product_line         Unit_price       Quantity    
##  Length:1000        Length:1000        Min.   :10.08   Min.   : 1.00  
##  Class :character   Class :character   1st Qu.:32.88   1st Qu.: 3.00  
##  Mode  :character   Mode  :character   Median :55.23   Median : 5.00  
##                                        Mean   :55.67   Mean   : 5.51  
##                                        3rd Qu.:77.94   3rd Qu.: 8.00  
##                                        Max.   :99.96   Max.   :10.00  
##                                                                       
##  Tax__five_perc        Total              Date                Time         
##  Min.   : 0.5085   Min.   :  10.68   Min.   :2019-01-01   Length:1000      
##  1st Qu.: 5.9249   1st Qu.: 124.42   1st Qu.:2019-03-03   Class1:hms       
##  Median :12.0880   Median : 253.85   Median :2019-06-03   Class2:difftime  
##  Mean   :15.3794   Mean   : 322.97   Mean   :2019-06-15   Mode  :numeric   
##  3rd Qu.:22.4453   3rd Qu.: 471.35   3rd Qu.:2019-09-03                    
##  Max.   :49.6500   Max.   :1042.65   Max.   :2019-12-03                    
##                                      NA's   :587                           
##    Payment               Cogs        Gross_margin_percentage  Gross_income    
##  Length:1000        Min.   : 10.17   Min.   :4.762           Min.   : 0.5085  
##  Class :character   1st Qu.:118.50   1st Qu.:4.762           1st Qu.: 5.9249  
##  Mode  :character   Median :241.76   Median :4.762           Median :12.0880  
##                     Mean   :307.59   Mean   :4.762           Mean   :15.3794  
##                     3rd Qu.:448.90   3rd Qu.:4.762           3rd Qu.:22.4453  
##                     Max.   :993.00   Max.   :4.762           Max.   :49.6500  
##                                                                               
##      Rating      
##  Min.   : 4.000  
##  1st Qu.: 5.500  
##  Median : 7.000  
##  Mean   : 6.973  
##  3rd Qu.: 8.500  
##  Max.   :10.000  
## 

THERE ARE MISSING DATES ;WE PLACE THEM WITH MEAN DATE

m<-mean(supermarket$Date,na.rm = T)
m
## [1] "2019-06-15"
supermarket$Date[is.na(supermarket$Date)]<-m

QUANTITY SOLD PER CITY

number_City<-sqldf("select City, count(*) as Quantity_Sold from supermarket group by City order by Quantity_Sold desc")
number_City
##        City Quantity_Sold
## 1    Yangon           340
## 2  Mandalay           332
## 3 Naypyitaw           328

QUATITY SOLD PER BRANCH

number_Branch<-sqldf("select Branch, count(*) from supermarket group by Branch")
number_Branch
##   Branch count(*)
## 1      A      340
## 2      B      332
## 3      C      328

QUANTITY SOLD PER PRODUCT_LINE

number_Product_line<-sqldf("select Product_line, count(*) as Quantity_sold from supermarket group by Product_line order by Quantity_sold desc")
number_Product_line
##             Product_line Quantity_sold
## 1    Fashion accessories           178
## 2     Food and beverages           174
## 3 Electronic accessories           170
## 4      Sports and travel           166
## 5     Home and lifestyle           160
## 6      Health and beauty           152

QUANTITY SOLD PER CUSTOMER TYPE

cus<-sqldf("select Customer_type,count(*) as Quantity_Sold from supermarket group by Customer_type  order by Quantity_Sold desc")
cus
##   Customer_type Quantity_Sold
## 1        Member           501
## 2        Normal           499

QUANTITY SOLD PER PAYMENT MODE

dfg<-sqldf("select Payment,count(*) as Quantity_sold from supermarket
           group by Payment order by Quantity_sold desc")
dfg
##       Payment Quantity_sold
## 1     Ewallet           345
## 2        Cash           344
## 3 Credit card           311

RATING PER CITY

city<-sqldf("select City,avg(Rating)as avg_rating from supermarket
            group by City ORDER BY avg_rating desc")
city
##        City avg_rating
## 1 Naypyitaw   7.072866
## 2    Yangon   7.027059
## 3  Mandalay   6.818072

RATING PER BRANCH

b<-sqldf("select Branch,avg(Rating) as avg_rating from supermarket
         group by Branch order by avg_rating desc")
b
##   Branch avg_rating
## 1      C   7.072866
## 2      A   7.027059
## 3      B   6.818072

RATING PER PRODUCT_LINE

c<-sqldf("select Product_line,avg(Rating) as avg_rating from supermarket group by Product_line order by avg_rating desc")
c
##             Product_line avg_rating
## 1     Food and beverages   7.113218
## 2    Fashion accessories   7.029213
## 3      Health and beauty   7.003289
## 4 Electronic accessories   6.924706
## 5      Sports and travel   6.916265
## 6     Home and lifestyle   6.837500

RATING PER CUSTOMER

d<-sqldf("select Customer_type,avg(Rating) as avg_rating from supermarket group by Customer_type order by avg_rating desc")
d
##   Customer_type avg_rating
## 1        Normal   7.005210
## 2        Member   6.940319

RATING PER PAYMENT MODE

f<-sqldf("select Payment,avg(Rating) as avg_rating from supermarket group by Payment order by avg_rating desc")
f
##       Payment avg_rating
## 1 Credit card   7.003215
## 2        Cash   6.970058
## 3     Ewallet   6.947826
1.0 WHICH CITY HAD HIGHEST INCOME

```r
City_income<-sqldf("select city,sum(Gross_income) as income from supermarket
                   group by city order by income")
City_income
##        City   income
## 1  Mandalay 5057.032
## 2    Yangon 5057.160
## 3 Naypyitaw 5265.176
City_income$City<-fct_reorder(City_income$City,City_income$income)
city_incomes<-City_income%>%ggplot(aes(City,income))+ geom_col(colour="purple",fill="orange",size=2)+geom_text(aes(label = income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
city_incomes

2.0 WHICH PRODUCT_LINE BROUGHT HIGHEST INCOME TO THE COMPANY

income<-sqldf("select Product_line,sum(Gross_income) as income from supermarket
                   group by city order by income")
income
##             Product_line   income
## 1     Food and beverages 5057.032
## 2      Health and beauty 5057.160
## 3 Electronic accessories 5265.176
income$City<-fct_reorder(income$Product_line,income$income)
incomes<-income%>%ggplot(aes(Product_line,income))+ geom_col(colour="purple",fill="orange",size=2)+geom_text(aes(label = income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
incomes

3.0 WHICH GENDER OF THE CUSTOMERS BROUGHT HIGHEST INCOME TO THE COMPANY

Gender_income<-sqldf("select Gender,sum(Gross_income) as income from supermarket
                   group by Gender order by income")
Gender_income
##   Gender   income
## 1   Male 7384.944
## 2 Female 7994.425
number_gender<-sqldf("select Gender, count(*) from supermarket group by Gender")
number_gender
##   Gender count(*)
## 1 Female      501
## 2   Male      499
Gender_income$Gender<-fct_reorder(Gender_income$Gender,Gender_income$income)
Gender_incomes<-Gender_income%>%ggplot(aes(Gender,income))+ geom_col(colour="purple",fill="orange",size=2)+geom_text(aes(label = income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
Gender_incomes

sales<-sqldf("select strftime('%m',Date) as month ,sum(Gross_income)as Total_income,sum(Quantity)as Quantity_sold from supermarket group by month order by Total_income desc")

INCOME GENERATED IN EACH MONTH

dd<-sqldf("select Total_income,case
          when month='01'then 'Jan' 
          when month='02'then 'Feb'
          when month='03'then 'Marc'
          when month='04'then 'April'
          when month='05'then 'May'
          when month='06'then 'Jun'
          when month='07'then 'July'
          when month='08'then 'Aug'
          when month='09'then 'Sept'
          when month='10'then 'Oct'
          when month='11' then 'Nov'
          when month='12'then 'Dec'
          else 0 end as month from sales")
dd
##    Total_income month
## 1     9481.5335   May
## 2      655.5850  July
## 3      643.0370   Jun
## 4      609.4615  Marc
## 5      602.2265   Dec
## 6      590.4395   Jan
## 7      564.8240   Oct
## 8      469.7715   Aug
## 9      467.8135   Nov
## 10     458.0175  Sept
## 11     457.7250 April
## 12     378.9345   Feb
dd$month<-fct_reorder(dd$month,dd$Total_income)
x<-dd%>%ggplot(aes(month,Total_income))+geom_col()+geom_text(aes(label = Total_income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
x

QUANTITY SOLD IN EACH MONTH

q<-sqldf("select strftime('%m',Date) as month ,sum(Quantity)as Total_income,sum(Quantity)as Quantity_sold from supermarket group by month order by Total_income desc")
q
##    month Total_income Quantity_sold
## 1     05         3383          3383
## 2     12          226           226
## 3     07          226           226
## 4     06          225           225
## 5     01          214           214
## 6     03          212           212
## 7     04          188           188
## 8     10          183           183
## 9     08          176           176
## 10    11          166           166
## 11    02          157           157
## 12    09          154           154
d<-sqldf("select Quantity_sold,case
          when month='01'then 'Jan' 
          when month='02'then 'Feb'
          when month='03'then 'Marc'
          when month='04'then 'April'
          when month='05'then 'May'
          when month='06'then 'Jun'
          when month='07'then 'July'
          when month='08'then 'Aug'
          when month='09'then 'Sept'
          when month='10'then 'Oct'
          when month='11' then 'Nov'
          when month='12'then 'Dec'
          else 0 end as month from q")
d
##    Quantity_sold month
## 1           3383   May
## 2            226   Dec
## 3            226  July
## 4            225   Jun
## 5            214   Jan
## 6            212  Marc
## 7            188 April
## 8            183   Oct
## 9            176   Aug
## 10           166   Nov
## 11           157   Feb
## 12           154  Sept
d$month<-fct_reorder(d$month,d$Quantity_sold)
d<-d%>%ggplot(aes(Quantity_sold,month))+geom_col()
d

HIGHLY RATED MONTHS

qs<-sqldf("select strftime('%m',Date) as month ,sum(Rating)as avg_rating from supermarket group by month order by avg_rating desc")
qs
##    month avg_rating
## 1     05     4355.9
## 2     03      296.2
## 3     06      293.8
## 4     01      257.2
## 5     12      255.7
## 6     04      244.1
## 7     07      242.4
## 8     10      222.4
## 9     02      218.2
## 10    08      218.1
## 11    11      186.1
## 12    09      182.6
d<-sqldf("select avg_rating,case
          when month='01'then 'Jan' 
          when month='02'then 'Feb'
          when month='03'then 'Marc'
          when month='04'then 'April'
          when month='05'then 'May'
          when month='06'then 'Jun'
          when month='07'then 'July'
          when month='08'then 'Aug'
          when month='09'then 'Sept'
          when month='10'then 'Oct'
          when month='11' then 'Nov'
          when month='12'then 'Dec'
          else 0 end as month from qs")
d
##    avg_rating month
## 1      4355.9   May
## 2       296.2  Marc
## 3       293.8   Jun
## 4       257.2   Jan
## 5       255.7   Dec
## 6       244.1 April
## 7       242.4  July
## 8       222.4   Oct
## 9       218.2   Feb
## 10      218.1   Aug
## 11      186.1   Nov
## 12      182.6  Sept
d$month<-fct_reorder(d$month,d$avg_rating)
d<-d%>%ggplot(aes(avg_rating,month))+geom_col()
d