#importing Libraries
library(tidyverse)
library(readxl)
library(magrittr)
library(dplyr)
library(RColorBrewer)
library(kableExtra)
library(forcats)
#importing dataset
supermarket_data <-read_csv("C:/Users/LENOVO/Documents/ALY6010 poject/Data/supermarket_sales.csv")
r=nrow(supermarket_data)
n=ncol(supermarket_data)
dplyr::glimpse(supermarket_data[1:6,])
## Rows: 6
## Columns: 17
## $ `Invoice ID` <chr> "750-67-8428", "226-31-3081", "631-41-3108",…
## $ Branch <chr> "A", "C", "A", "A", "A", "C"
## $ City <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", "…
## $ `Customer type` <chr> "Member", "Normal", "Normal", "Member", "Nor…
## $ Gender <chr> "Female", "Female", "Male", "Male", "Male", …
## $ `Product line` <chr> "Health and beauty", "Electronic accessories…
## $ `Unit price` <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39
## $ Quantity <dbl> 7, 5, 7, 8, 7, 7
## $ `Tax 5%` <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, …
## $ Total <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634.3…
## $ Date <chr> "01-05-2019", "03-08-2019", "03-03-2019", "1…
## $ Time <time> 13:08:00, 10:29:00, 13:23:00, 20:33:00, 10:3…
## $ Payment <chr> "Ewallet", "Cash", "Credit card", "Ewallet",…
## $ cogs <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597.73
## $ `gross margin percentage` <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.76…
## $ `gross income` <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085,…
## $ Rating <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1
The Dataset contains 1000
number of rows and 17 number of columns
supermarket_renamed<-supermarket_data%>% rename(`Invoice_ID`=`Invoice ID`)%>%
rename(`Customer_type`=`Customer type`)%>%
rename(`Product_line`=`Product line`)%>%
rename(`Unit_price`=`Unit price`)%>%
rename(`Tax_5%`=`Tax 5%`)%>%
rename(`gross_margin_percentage`=`gross margin percentage`)%>%
rename(`gross_income`=`gross income`)
dplyr::glimpse(supermarket_renamed[1:2,])
## Rows: 2
## Columns: 17
## $ Invoice_ID <chr> "750-67-8428", "226-31-3081"
## $ Branch <chr> "A", "C"
## $ City <chr> "Yangon", "Naypyitaw"
## $ Customer_type <chr> "Member", "Normal"
## $ Gender <chr> "Female", "Female"
## $ Product_line <chr> "Health and beauty", "Electronic accessories"
## $ Unit_price <dbl> 74.69, 15.28
## $ Quantity <dbl> 7, 5
## $ `Tax_5%` <dbl> 26.1415, 3.8200
## $ Total <dbl> 548.9715, 80.2200
## $ Date <chr> "01-05-2019", "03-08-2019"
## $ Time <time> 13:08:00, 10:29:00
## $ Payment <chr> "Ewallet", "Cash"
## $ cogs <dbl> 522.83, 76.40
## $ gross_margin_percentage <dbl> 4.761905, 4.761905
## $ gross_income <dbl> 26.1415, 3.8200
## $ Rating <dbl> 9.1, 9.6
Data Cleaning and taking relevant columns
of the dataset
supermarket_renamed$Product_category<-recode(as.character(supermarket_renamed$Product_line),"Health and beauty"="Beauty","Electronic accessories"="Electronics",
"Sports and travel"="Sports",
"Fashion accessories"="Fashion",
"Food and beverages"="Food",
"Home and lifestyle"="lifestyle")
#dplyr::glimpse(supermarket_renamed[1:6,])
supermarket_new<- subset.data.frame(supermarket_renamed,select = -c(`Date`,`Time`,`Product_line`))
dplyr::glimpse(supermarket_new[1:6,])
## Rows: 6
## Columns: 15
## $ Invoice_ID <chr> "750-67-8428", "226-31-3081", "631-41-3108", "…
## $ Branch <chr> "A", "C", "A", "A", "A", "C"
## $ City <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", "Ya…
## $ Customer_type <chr> "Member", "Normal", "Normal", "Member", "Norma…
## $ Gender <chr> "Female", "Female", "Male", "Male", "Male", "M…
## $ Unit_price <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39
## $ Quantity <dbl> 7, 5, 7, 8, 7, 7
## $ `Tax_5%` <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, 29…
## $ Total <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634.378…
## $ Payment <chr> "Ewallet", "Cash", "Credit card", "Ewallet", "…
## $ cogs <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597.73
## $ gross_margin_percentage <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.7619…
## $ gross_income <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, 29…
## $ Rating <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1
## $ Product_category <chr> "Beauty", "Electronics", "lifestyle", "Beauty"…
The Dataset now has1000
number of rows and 15 number of columns
#showing summary of numerical variables
supermarket_new%>% select(`Unit_price`,Quantity,`Tax_5%`,`Total`,`cogs`,`gross_margin_percentage`,`gross_income`,`Rating`)%>%summary()%>% kable(align = "c",
format = "markdown",
caption = "Summary of numerical columns",
table.attr="style='width:30%;'")
| Unit_price | Quantity | Tax_5% | Total | cogs | gross_margin_percentage | gross_income | Rating | |
|---|---|---|---|---|---|---|---|---|
| Min. :10.08 | Min. : 1.00 | Min. : 0.5085 | Min. : 10.68 | Min. : 10.17 | Min. :4.762 | Min. : 0.5085 | Min. : 4.000 | |
| 1st Qu.:32.88 | 1st Qu.: 3.00 | 1st Qu.: 5.9249 | 1st Qu.: 124.42 | 1st Qu.:118.50 | 1st Qu.:4.762 | 1st Qu.: 5.9249 | 1st Qu.: 5.500 | |
| Median :55.23 | Median : 5.00 | Median :12.0880 | Median : 253.85 | Median :241.76 | Median :4.762 | Median :12.0880 | Median : 7.000 | |
| Mean :55.67 | Mean : 5.51 | Mean :15.3794 | Mean : 322.97 | Mean :307.59 | Mean :4.762 | Mean :15.3794 | Mean : 6.973 | |
| 3rd Qu.:77.94 | 3rd Qu.: 8.00 | 3rd Qu.:22.4453 | 3rd Qu.: 471.35 | 3rd Qu.:448.90 | 3rd Qu.:4.762 | 3rd Qu.:22.4453 | 3rd Qu.: 8.500 | |
| Max. :99.96 | Max. :10.00 | Max. :49.6500 | Max. :1042.65 | Max. :993.00 | Max. :4.762 | Max. :49.6500 | Max. :10.000 |
##summary of categorical variables
supermarket_new%>%select(`Product_category`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
caption = "Summary of Product Line",
table.attr="style='width:30%;'")
| Product_category | Freq |
|---|---|
| Fashion | 178 |
| Food | 174 |
| Electronics | 170 |
| Sports | 166 |
| lifestyle | 160 |
| Beauty | 152 |
supermarket_new%>%select(`City`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
caption = "Summary of City",
table.attr="style='width:30%;'")
| City | Freq |
|---|---|
| Yangon | 340 |
| Mandalay | 332 |
| Naypyitaw | 328 |
supermarket_new%>%select(`Customer_type`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
caption = "Summary of Customer type",
table.attr="style='width:30%;'")
| Customer_type | Freq |
|---|---|
| Member | 501 |
| Normal | 499 |
supermarket_new%>%select(`Branch`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
caption = "Summary of Branch",
table.attr="style='width:30%;'")
| Branch | Freq |
|---|---|
| A | 340 |
| B | 332 |
| C | 328 |
supermarket_new%>%select(`Payment`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
caption = "Summary of Payment",
table.attr="style='width:30%;'")
| Payment | Freq |
|---|---|
| Ewallet | 345 |
| Cash | 344 |
| Credit card | 311 |
supermarket_new%>%select(`Gender`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
caption = "Summary of Gender",
table.attr="style='width:30%;'")
| Gender | Freq |
|---|---|
| Female | 501 |
| Male | 499 |
supermarket_new[is.na(supermarket_new)]
## <unspecified> [0]
Creating Barplors for Categorical
variables
table1<-supermarket_new[c("Gender","Product_category")]%>%table()
table2<-supermarket_new[c("City","Product_category")]%>% table()
#kable(table2)
table3<-supermarket_new[c("Customer_type","Branch")]%>%table()
#kable(table3)
table4<-supermarket_new[c("Customer_type","Product_category")]%>%table()
#kable(table4)
par(mfrow=c(1,1))
par(mai=c(1,1,0.2,0.2))
plot1=barplot(table1,xlim=c(0,130),col=c("gray","blue"),legend.text = rownames(table1),las=1,args.legend = "Topleft",beside=TRUE,
main="Count of Records Gender vs Product Line",horiz = TRUE,xlab="Frequency")
text(x=table1,plot1,table1,pos=2,cex=0.8)
#grapgh2
par(mfrow=c(1,1))
par(mai=c(1,1,0.2,0.2))
plot2=barplot(table2,xlim=c(0,100),col=c("gray","green","yellow"),legend.text = rownames(table2),las=1,args.legend = "Topleft",beside=TRUE,
main="Count of Records City vs Product Line",horiz = TRUE,xlab="Frequency")
text(x=table2,plot2,table2,pos=2,cex=0.8)
#graph3
par(mfrow=c(2,1))
par(mai=c(1,0.8,0.2,0.2))
plot3=barplot(table3,ylim=c(0,200),col=c("gray","orange"),legend.text = rownames(table3),las=1,args.legend = "Topleft",
main="Count of Records Customer type vs Branch Line",ylab="Frequency",beside=TRUE,cex.names=0.8)
text(y=table3,plot3,table3,pos=3,cex=0.8)
plot4=barplot(table4,ylim=c(0,300),col=brewer.pal(2,"Pastel1"),legend.text = rownames(table4),las=1,args.legend = "Topleft",
main="Count of Records Customer type vs Branch Line",ylab="Frequency",beside=TRUE,cex.names=0.8)
text(y=table4,plot4,table4,pos=3,cex=0.8)
par(mfrow=c(2,2))
par(mai=c(1,1,0.2,0.2))
supermarket_new$`Total`%>%hist(main="Histogram of Total",
col="LightYellow",breaks=50,
xlab="Total",xlim=c(0,1200),
ylab="Frequency")
abline(v=mean(supermarket_new$`Total`),col="red",lwd=3)
text(y=60,
x=mean(supermarket_new$`Total`),col="red",
paste("mean=",mean(supermarket_new$`Total`)))
supermarket_new$`Quantity`%>%hist(main="Histogram of Quantity",
col="LightPink",breaks=50,
xlab="Quantity",xlim=c(0,12),
ylab="Frequency")
abline(v=mean(supermarket_new$`Quantity`),col="red",lwd=3)
text(y=100,
x=mean(supermarket_new$`Quantity`),col="red",
paste("mean=",mean(supermarket_new$`Quantity`)))
supermarket_new$`gross_income`%>%hist(main="Histogram of gross_income",
col="Lightgreen",breaks=40,
xlab="gross income",xlim=c(0,65),
ylab="Frequency")
abline(v=mean(supermarket_new$`gross_income`),col="red",lwd=3)
text(y=60,
x=mean(supermarket_new$`gross_income`),col="red",
paste("mean=",mean(supermarket_new$`gross_income`)))
supermarket_new$`Rating`%>%hist(main="Histogram of Rating",
col="Lightblue",breaks=40,
xlab="Rating",xlim=c(0,12),
ylab="Frequency")
abline(v=mean(supermarket_new$`Rating`),col="red",lwd=3)
text(y=40,
x=mean(supermarket_new$`Rating`),col="red",
paste("mean=",mean(supermarket_new$`Rating`)))
mean_tbl1<-tapply(supermarket_new$`Total`,supermarket_new$Product_category,mean) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% as.matrix()
kable(mean_tbl1,align = "c",format = "markdown",
caption = "Average Sales per Product category",
table.attr="style='width:30%;'")
| lifestyle | 336.64 |
| Sports | 332.07 |
| Beauty | 323.64 |
| Food | 322.67 |
| Electronics | 319.63 |
| Fashion | 305.09 |
mean_tbl2<-tapply(supermarket_new$`gross_income`,supermarket_new$Product_category,mean) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% as.matrix()
kable(mean_tbl2,align = "c",format = "markdown",
caption = "Average gross income per Product category",
table.attr="style='width:30%;'")
| lifestyle | 16.03 |
| Sports | 15.81 |
| Beauty | 15.41 |
| Food | 15.37 |
| Electronics | 15.22 |
| Fashion | 14.53 |
mean_tbl3<-tapply(supermarket_new$`Total`,supermarket_new$Product_category,sum) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% prop.table()%>% as.matrix()
kable(mean_tbl3*100,align = "c",format = "markdown",
caption = "% total sales per Product category",
table.attr="style='width:30%;'")
| Food | 17.38409 |
| Sports | 17.06765 |
| Electronics | 16.82450 |
| Fashion | 16.81470 |
| lifestyle | 16.67723 |
| Beauty | 15.23183 |
mean_tbl4<-tapply(supermarket_new$`Total`,supermarket_new$City,sum) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% prop.table()%>% as.matrix()
kable(mean_tbl4*100,align = "c",format = "markdown",
caption = "% total sales per City",
table.attr="style='width:30%;'")
| Naypyitaw | 34.23532 |
| Yangon | 32.88276 |
| Mandalay | 32.88192 |
par(mfrow=c(2,1))
par(mai=c(1,1,0.2,0.2))
attach(supermarket_new)
boxplot(`Total`~`Gender`,col=brewer.pal(2,"Pastel1"),
horizontal = TRUE,main="Boxplot showing total sales vs Gender",las=1,ylab="")
boxplot(`Total`~`Product_category`,col=brewer.pal(7,"Set1"),
main="Boxplot showing Total sales vs Product category",
xlab="Product category",ylab="",las=1,ylim=c(0,1200))
par(mfrow=c(2,1))
par(mai=c(1,1,0.2,0.2))
boxplot(`cogs`~`Product_category`,col=brewer.pal(7,"Pastel1"),main="Boxplot showing Cogs vs Product category",las=1,ylab="",ylim=c(0,1200),
xlab="Product category")
boxplot(`Rating`~`Product_category`,col=brewer.pal(7,"Set3"),
main="Boxplot showing Rating vs Product category",
xlab="Product category",ylim=c(0,15))
#boxplot(`Total`~`Customer type`,col=brewer.pal(7,"Set2"))
REFERENCES
Barplots
https://rpubs.com/Dee_Chiluiza/barplots
Histograms
https://rpubs.com/Dee_Chiluiza/816756
Group
BY
https://sparkbyexamples.com/r-programming/group-by-sum-in-r/
https://www.marsja.se/how-to-add-a-column-to-dataframe-in-r-with-tibble-dplyr/
https://www.r-bloggers.com/2022/08/r-program-to-concatenate-two-strings/
https://digitaschools.com/data-types-in-r-tutorial/