library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
supermarket_sales_Sheet1 <- read_csv("C:/Users/USER/Desktop/supermarket_sales - Sheet1.csv")
## Rows: 1000 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Invoice ID, Branch, City, Customer type, Gender, Product line, Dat...
## dbl (8): Unit price, Quantity, Tax _five_perc, Total, cogs, gross margin pe...
## time (1): Time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(supermarket_sales_Sheet1)<-str_to_title(str_replace_all(names(supermarket_sales_Sheet1),"\\s","_"))
EXPLORE THE DATA AND CHECK MISSING VALUES AND DUPLICATES
supermarket<-supermarket_sales_Sheet1%>% distinct()
supermarket$Date<-dmy(supermarket$Date)
## Warning: 587 failed to parse.
str(supermarket)
## tibble [1,000 × 17] (S3: tbl_df/tbl/data.frame)
## $ Invoice_id : chr [1:1000] "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr [1:1000] "A" "C" "A" "A" ...
## $ City : chr [1:1000] "Yangon" "Naypyitaw" "Yangon" "Yangon" ...
## $ Customer_type : chr [1:1000] "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr [1:1000] "Female" "Female" "Male" "Male" ...
## $ Product_line : chr [1:1000] "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
## $ Unit_price : num [1:1000] 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : num [1:1000] 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax__five_perc : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
## $ Total : num [1:1000] 549 80.2 340.5 489 634.4 ...
## $ Date : Date[1:1000], format: "2019-05-01" "2019-08-03" ...
## $ Time : 'hms' num [1:1000] 13:08:00 10:29:00 13:23:00 20:33:00 ...
## ..- attr(*, "units")= chr "secs"
## $ Payment : chr [1:1000] "Ewallet" "Cash" "Credit card" "Ewallet" ...
## $ Cogs : num [1:1000] 522.8 76.4 324.3 465.8 604.2 ...
## $ Gross_margin_percentage: num [1:1000] 4.76 4.76 4.76 4.76 4.76 ...
## $ Gross_income : num [1:1000] 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num [1:1000] 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
names(supermarket)
## [1] "Invoice_id" "Branch"
## [3] "City" "Customer_type"
## [5] "Gender" "Product_line"
## [7] "Unit_price" "Quantity"
## [9] "Tax__five_perc" "Total"
## [11] "Date" "Time"
## [13] "Payment" "Cogs"
## [15] "Gross_margin_percentage" "Gross_income"
## [17] "Rating"
dim(supermarket)
## [1] 1000 17
summary(supermarket)
## Invoice_id Branch City Customer_type
## Length:1000 Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Gender Product_line Unit_price Quantity
## Length:1000 Length:1000 Min. :10.08 Min. : 1.00
## Class :character Class :character 1st Qu.:32.88 1st Qu.: 3.00
## Mode :character Mode :character Median :55.23 Median : 5.00
## Mean :55.67 Mean : 5.51
## 3rd Qu.:77.94 3rd Qu.: 8.00
## Max. :99.96 Max. :10.00
##
## Tax__five_perc Total Date Time
## Min. : 0.5085 Min. : 10.68 Min. :2019-01-01 Length:1000
## 1st Qu.: 5.9249 1st Qu.: 124.42 1st Qu.:2019-03-03 Class1:hms
## Median :12.0880 Median : 253.85 Median :2019-06-03 Class2:difftime
## Mean :15.3794 Mean : 322.97 Mean :2019-06-15 Mode :numeric
## 3rd Qu.:22.4453 3rd Qu.: 471.35 3rd Qu.:2019-09-03
## Max. :49.6500 Max. :1042.65 Max. :2019-12-03
## NA's :587
## Payment Cogs Gross_margin_percentage Gross_income
## Length:1000 Min. : 10.17 Min. :4.762 Min. : 0.5085
## Class :character 1st Qu.:118.50 1st Qu.:4.762 1st Qu.: 5.9249
## Mode :character Median :241.76 Median :4.762 Median :12.0880
## Mean :307.59 Mean :4.762 Mean :15.3794
## 3rd Qu.:448.90 3rd Qu.:4.762 3rd Qu.:22.4453
## Max. :993.00 Max. :4.762 Max. :49.6500
##
## Rating
## Min. : 4.000
## 1st Qu.: 5.500
## Median : 7.000
## Mean : 6.973
## 3rd Qu.: 8.500
## Max. :10.000
##
THERE ARE MISSING DATES ;WE PLACE THEM WITH MEAN DATE
m<-mean(supermarket$Date,na.rm = T)
m
## [1] "2019-06-15"
supermarket$Date[is.na(supermarket$Date)]<-m
QUANTITY SOLD PER CITY
number_City<-sqldf("select City, count(*) as Quantity_Sold from supermarket group by City order by Quantity_Sold desc")
number_City
## City Quantity_Sold
## 1 Yangon 340
## 2 Mandalay 332
## 3 Naypyitaw 328
QUATITY SOLD PER BRANCH
number_Branch<-sqldf("select Branch, count(*) from supermarket group by Branch")
number_Branch
## Branch count(*)
## 1 A 340
## 2 B 332
## 3 C 328
QUANTITY SOLD PER PRODUCT_LINE
number_Product_line<-sqldf("select Product_line, count(*) as Quantity_sold from supermarket group by Product_line order by Quantity_sold desc")
number_Product_line
## Product_line Quantity_sold
## 1 Fashion accessories 178
## 2 Food and beverages 174
## 3 Electronic accessories 170
## 4 Sports and travel 166
## 5 Home and lifestyle 160
## 6 Health and beauty 152
QUANTITY SOLD PER CUSTOMER TYPE
cus<-sqldf("select Customer_type,count(*) as Quantity_Sold from supermarket group by Customer_type order by Quantity_Sold desc")
cus
## Customer_type Quantity_Sold
## 1 Member 501
## 2 Normal 499
QUANTITY SOLD PER PAYMENT MODE
dfg<-sqldf("select Payment,count(*) as Quantity_sold from supermarket
group by Payment order by Quantity_sold desc")
dfg
## Payment Quantity_sold
## 1 Ewallet 345
## 2 Cash 344
## 3 Credit card 311
RATING PER CITY
city<-sqldf("select City,avg(Rating)as avg_rating from supermarket
group by City ORDER BY avg_rating desc")
city
## City avg_rating
## 1 Naypyitaw 7.072866
## 2 Yangon 7.027059
## 3 Mandalay 6.818072
RATING PER BRANCH
b<-sqldf("select Branch,avg(Rating) as avg_rating from supermarket
group by Branch order by avg_rating desc")
b
## Branch avg_rating
## 1 C 7.072866
## 2 A 7.027059
## 3 B 6.818072
RATING PER PRODUCT_LINE
c<-sqldf("select Product_line,avg(Rating) as avg_rating from supermarket group by Product_line order by avg_rating desc")
c
## Product_line avg_rating
## 1 Food and beverages 7.113218
## 2 Fashion accessories 7.029213
## 3 Health and beauty 7.003289
## 4 Electronic accessories 6.924706
## 5 Sports and travel 6.916265
## 6 Home and lifestyle 6.837500
RATING PER CUSTOMER
d<-sqldf("select Customer_type,avg(Rating) as avg_rating from supermarket group by Customer_type order by avg_rating desc")
d
## Customer_type avg_rating
## 1 Normal 7.005210
## 2 Member 6.940319
RATING PER PAYMENT MODE
f<-sqldf("select Payment,avg(Rating) as avg_rating from supermarket group by Payment order by avg_rating desc")
f
## Payment avg_rating
## 1 Credit card 7.003215
## 2 Cash 6.970058
## 3 Ewallet 6.947826
1.0 WHICH CITY HAD HIGHEST INCOME
```r
City_income<-sqldf("select city,sum(Gross_income) as income from supermarket
group by city order by income")
City_income
## City income
## 1 Mandalay 5057.032
## 2 Yangon 5057.160
## 3 Naypyitaw 5265.176
City_income$City<-fct_reorder(City_income$City,City_income$income)
city_incomes<-City_income%>%ggplot(aes(City,income))+ geom_col(colour="purple",fill="orange",size=2)+geom_text(aes(label = income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
city_incomes
2.0 WHICH PRODUCT_LINE BROUGHT HIGHEST INCOME TO THE COMPANY
income<-sqldf("select Product_line,sum(Gross_income) as income from supermarket
group by city order by income")
income
## Product_line income
## 1 Food and beverages 5057.032
## 2 Health and beauty 5057.160
## 3 Electronic accessories 5265.176
income$City<-fct_reorder(income$Product_line,income$income)
incomes<-income%>%ggplot(aes(Product_line,income))+ geom_col(colour="purple",fill="orange",size=2)+geom_text(aes(label = income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
incomes
3.0 WHICH GENDER OF THE CUSTOMERS BROUGHT HIGHEST INCOME TO THE COMPANY
Gender_income<-sqldf("select Gender,sum(Gross_income) as income from supermarket
group by Gender order by income")
Gender_income
## Gender income
## 1 Male 7384.944
## 2 Female 7994.425
number_gender<-sqldf("select Gender, count(*) from supermarket group by Gender")
number_gender
## Gender count(*)
## 1 Female 501
## 2 Male 499
Gender_income$Gender<-fct_reorder(Gender_income$Gender,Gender_income$income)
Gender_incomes<-Gender_income%>%ggplot(aes(Gender,income))+ geom_col(colour="purple",fill="orange",size=2)+geom_text(aes(label = income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
Gender_incomes
sales<-sqldf("select strftime('%m',Date) as month ,sum(Gross_income)as Total_income,sum(Quantity)as Quantity_sold from supermarket group by month order by Total_income desc")
INCOME GENERATED IN EACH MONTH
dd<-sqldf("select Total_income,case
when month='01'then 'Jan'
when month='02'then 'Feb'
when month='03'then 'Marc'
when month='04'then 'April'
when month='05'then 'May'
when month='06'then 'Jun'
when month='07'then 'July'
when month='08'then 'Aug'
when month='09'then 'Sept'
when month='10'then 'Oct'
when month='11' then 'Nov'
when month='12'then 'Dec'
else 0 end as month from sales")
dd
## Total_income month
## 1 9481.5335 May
## 2 655.5850 July
## 3 643.0370 Jun
## 4 609.4615 Marc
## 5 602.2265 Dec
## 6 590.4395 Jan
## 7 564.8240 Oct
## 8 469.7715 Aug
## 9 467.8135 Nov
## 10 458.0175 Sept
## 11 457.7250 April
## 12 378.9345 Feb
dd$month<-fct_reorder(dd$month,dd$Total_income)
x<-dd%>%ggplot(aes(month,Total_income))+geom_col()+geom_text(aes(label = Total_income), vjust = -0.2,colour="blue",size=3)+theme_minimal()+coord_flip()
x
QUANTITY SOLD IN EACH MONTH
q<-sqldf("select strftime('%m',Date) as month ,sum(Quantity)as Total_income,sum(Quantity)as Quantity_sold from supermarket group by month order by Total_income desc")
q
## month Total_income Quantity_sold
## 1 05 3383 3383
## 2 12 226 226
## 3 07 226 226
## 4 06 225 225
## 5 01 214 214
## 6 03 212 212
## 7 04 188 188
## 8 10 183 183
## 9 08 176 176
## 10 11 166 166
## 11 02 157 157
## 12 09 154 154
d<-sqldf("select Quantity_sold,case
when month='01'then 'Jan'
when month='02'then 'Feb'
when month='03'then 'Marc'
when month='04'then 'April'
when month='05'then 'May'
when month='06'then 'Jun'
when month='07'then 'July'
when month='08'then 'Aug'
when month='09'then 'Sept'
when month='10'then 'Oct'
when month='11' then 'Nov'
when month='12'then 'Dec'
else 0 end as month from q")
d
## Quantity_sold month
## 1 3383 May
## 2 226 Dec
## 3 226 July
## 4 225 Jun
## 5 214 Jan
## 6 212 Marc
## 7 188 April
## 8 183 Oct
## 9 176 Aug
## 10 166 Nov
## 11 157 Feb
## 12 154 Sept
d$month<-fct_reorder(d$month,d$Quantity_sold)
d<-d%>%ggplot(aes(Quantity_sold,month))+geom_col()
d
HIGHLY RATED MONTHS
qs<-sqldf("select strftime('%m',Date) as month ,sum(Rating)as avg_rating from supermarket group by month order by avg_rating desc")
qs
## month avg_rating
## 1 05 4355.9
## 2 03 296.2
## 3 06 293.8
## 4 01 257.2
## 5 12 255.7
## 6 04 244.1
## 7 07 242.4
## 8 10 222.4
## 9 02 218.2
## 10 08 218.1
## 11 11 186.1
## 12 09 182.6
d<-sqldf("select avg_rating,case
when month='01'then 'Jan'
when month='02'then 'Feb'
when month='03'then 'Marc'
when month='04'then 'April'
when month='05'then 'May'
when month='06'then 'Jun'
when month='07'then 'July'
when month='08'then 'Aug'
when month='09'then 'Sept'
when month='10'then 'Oct'
when month='11' then 'Nov'
when month='12'then 'Dec'
else 0 end as month from qs")
d
## avg_rating month
## 1 4355.9 May
## 2 296.2 Marc
## 3 293.8 Jun
## 4 257.2 Jan
## 5 255.7 Dec
## 6 244.1 April
## 7 242.4 July
## 8 222.4 Oct
## 9 218.2 Feb
## 10 218.1 Aug
## 11 186.1 Nov
## 12 182.6 Sept
d$month<-fct_reorder(d$month,d$avg_rating)
d<-d%>%ggplot(aes(avg_rating,month))+geom_col()
d