ALY6010 20914 - Probability Theory and Introductory Statistics
NORTHEASTERN UNIVERSITY
SANA YASIN
Date :25 February, 2023
Module 1
Professor: Tom Breur

Importing the Dataset and Libraries

#importing Libraries
library(tidyverse)
library(readxl)
library(magrittr)
library(dplyr)
library(RColorBrewer)
library(kableExtra)
library(forcats)
#importing dataset
supermarket_data <-read_csv("C:/Users/LENOVO/Documents/ALY6010 poject/Data/supermarket_sales.csv")
r=nrow(supermarket_data)
n=ncol(supermarket_data)
dplyr::glimpse(supermarket_data[1:6,])
## Rows: 6
## Columns: 17
## $ `Invoice ID`              <chr> "750-67-8428", "226-31-3081", "631-41-3108",…
## $ Branch                    <chr> "A", "C", "A", "A", "A", "C"
## $ City                      <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", "…
## $ `Customer type`           <chr> "Member", "Normal", "Normal", "Member", "Nor…
## $ Gender                    <chr> "Female", "Female", "Male", "Male", "Male", …
## $ `Product line`            <chr> "Health and beauty", "Electronic accessories…
## $ `Unit price`              <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39
## $ Quantity                  <dbl> 7, 5, 7, 8, 7, 7
## $ `Tax 5%`                  <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, …
## $ Total                     <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634.3…
## $ Date                      <chr> "01-05-2019", "03-08-2019", "03-03-2019", "1…
## $ Time                      <time> 13:08:00, 10:29:00, 13:23:00, 20:33:00, 10:3…
## $ Payment                   <chr> "Ewallet", "Cash", "Credit card", "Ewallet",…
## $ cogs                      <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597.73
## $ `gross margin percentage` <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.76…
## $ `gross income`            <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085,…
## $ Rating                    <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1

The Dataset contains 1000 number of rows and 17 number of columns

Renaming the Columns

supermarket_renamed<-supermarket_data%>% rename(`Invoice_ID`=`Invoice ID`)%>%
  rename(`Customer_type`=`Customer type`)%>%
  rename(`Product_line`=`Product line`)%>%
  rename(`Unit_price`=`Unit price`)%>%
  rename(`Tax_5%`=`Tax 5%`)%>%
  rename(`gross_margin_percentage`=`gross margin percentage`)%>%
  rename(`gross_income`=`gross income`)
dplyr::glimpse(supermarket_renamed[1:2,])
## Rows: 2
## Columns: 17
## $ Invoice_ID              <chr> "750-67-8428", "226-31-3081"
## $ Branch                  <chr> "A", "C"
## $ City                    <chr> "Yangon", "Naypyitaw"
## $ Customer_type           <chr> "Member", "Normal"
## $ Gender                  <chr> "Female", "Female"
## $ Product_line            <chr> "Health and beauty", "Electronic accessories"
## $ Unit_price              <dbl> 74.69, 15.28
## $ Quantity                <dbl> 7, 5
## $ `Tax_5%`                <dbl> 26.1415, 3.8200
## $ Total                   <dbl> 548.9715, 80.2200
## $ Date                    <chr> "01-05-2019", "03-08-2019"
## $ Time                    <time> 13:08:00, 10:29:00
## $ Payment                 <chr> "Ewallet", "Cash"
## $ cogs                    <dbl> 522.83, 76.40
## $ gross_margin_percentage <dbl> 4.761905, 4.761905
## $ gross_income            <dbl> 26.1415, 3.8200
## $ Rating                  <dbl> 9.1, 9.6
Data Cleaning and taking relevant columns of the dataset

supermarket_renamed$Product_category<-recode(as.character(supermarket_renamed$Product_line),"Health and beauty"="Beauty","Electronic accessories"="Electronics",
           "Sports and travel"="Sports",
           "Fashion accessories"="Fashion",
           "Food and beverages"="Food",
           "Home and lifestyle"="lifestyle")
#dplyr::glimpse(supermarket_renamed[1:6,])
supermarket_new<- subset.data.frame(supermarket_renamed,select = -c(`Date`,`Time`,`Product_line`))
dplyr::glimpse(supermarket_new[1:6,])
## Rows: 6
## Columns: 15
## $ Invoice_ID              <chr> "750-67-8428", "226-31-3081", "631-41-3108", "…
## $ Branch                  <chr> "A", "C", "A", "A", "A", "C"
## $ City                    <chr> "Yangon", "Naypyitaw", "Yangon", "Yangon", "Ya…
## $ Customer_type           <chr> "Member", "Normal", "Normal", "Member", "Norma…
## $ Gender                  <chr> "Female", "Female", "Male", "Male", "Male", "M…
## $ Unit_price              <dbl> 74.69, 15.28, 46.33, 58.22, 86.31, 85.39
## $ Quantity                <dbl> 7, 5, 7, 8, 7, 7
## $ `Tax_5%`                <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, 29…
## $ Total                   <dbl> 548.9715, 80.2200, 340.5255, 489.0480, 634.378…
## $ Payment                 <chr> "Ewallet", "Cash", "Credit card", "Ewallet", "…
## $ cogs                    <dbl> 522.83, 76.40, 324.31, 465.76, 604.17, 597.73
## $ gross_margin_percentage <dbl> 4.761905, 4.761905, 4.761905, 4.761905, 4.7619…
## $ gross_income            <dbl> 26.1415, 3.8200, 16.2155, 23.2880, 30.2085, 29…
## $ Rating                  <dbl> 9.1, 9.6, 7.4, 8.4, 5.3, 4.1
## $ Product_category        <chr> "Beauty", "Electronics", "lifestyle", "Beauty"…

The Dataset now has1000 number of rows and 15 number of columns

Descriptive Statistics measures of Numerical Variables

#showing summary of numerical variables
supermarket_new%>% select(`Unit_price`,Quantity,`Tax_5%`,`Total`,`cogs`,`gross_margin_percentage`,`gross_income`,`Rating`)%>%summary()%>% kable(align = "c",
               format = "markdown",
               caption = "Summary of numerical columns",
               table.attr="style='width:30%;'")
Summary of numerical columns
Unit_price Quantity Tax_5% Total cogs gross_margin_percentage gross_income Rating
Min. :10.08 Min. : 1.00 Min. : 0.5085 Min. : 10.68 Min. : 10.17 Min. :4.762 Min. : 0.5085 Min. : 4.000
1st Qu.:32.88 1st Qu.: 3.00 1st Qu.: 5.9249 1st Qu.: 124.42 1st Qu.:118.50 1st Qu.:4.762 1st Qu.: 5.9249 1st Qu.: 5.500
Median :55.23 Median : 5.00 Median :12.0880 Median : 253.85 Median :241.76 Median :4.762 Median :12.0880 Median : 7.000
Mean :55.67 Mean : 5.51 Mean :15.3794 Mean : 322.97 Mean :307.59 Mean :4.762 Mean :15.3794 Mean : 6.973
3rd Qu.:77.94 3rd Qu.: 8.00 3rd Qu.:22.4453 3rd Qu.: 471.35 3rd Qu.:448.90 3rd Qu.:4.762 3rd Qu.:22.4453 3rd Qu.: 8.500
Max. :99.96 Max. :10.00 Max. :49.6500 Max. :1042.65 Max. :993.00 Max. :4.762 Max. :49.6500 Max. :10.000
Frequency Distribution of Categorical Variables

##summary of categorical variables
supermarket_new%>%select(`Product_category`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
               caption = "Summary of Product Line",
               table.attr="style='width:30%;'")
Summary of Product Line
Product_category Freq
Fashion 178
Food 174
Electronics 170
Sports 166
lifestyle 160
Beauty 152
supermarket_new%>%select(`City`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
               caption = "Summary of City",
               table.attr="style='width:30%;'")
Summary of City
City Freq
Yangon 340
Mandalay 332
Naypyitaw 328
supermarket_new%>%select(`Customer_type`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
               caption = "Summary of Customer type",
               table.attr="style='width:30%;'")
Summary of Customer type
Customer_type Freq
Member 501
Normal 499
supermarket_new%>%select(`Branch`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
               caption = "Summary of Branch",
               table.attr="style='width:30%;'")
Summary of Branch
Branch Freq
A 340
B 332
C 328
supermarket_new%>%select(`Payment`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
               caption = "Summary of Payment",
               table.attr="style='width:30%;'")
Summary of Payment
Payment Freq
Ewallet 345
Cash 344
Credit card 311
supermarket_new%>%select(`Gender`)%>%table()%>%sort(decreasing = TRUE)%>% kable(align = "c",format = "markdown",
               caption = "Summary of Gender",
               table.attr="style='width:30%;'")
Summary of Gender
Gender Freq
Female 501
Male 499
Checking for NULL values

supermarket_new[is.na(supermarket_new)]
## <unspecified> [0]
Creating Barplors for Categorical variables

table1<-supermarket_new[c("Gender","Product_category")]%>%table()
table2<-supermarket_new[c("City","Product_category")]%>% table()
#kable(table2)
table3<-supermarket_new[c("Customer_type","Branch")]%>%table()
#kable(table3)
table4<-supermarket_new[c("Customer_type","Product_category")]%>%table()
#kable(table4)
par(mfrow=c(1,1))
par(mai=c(1,1,0.2,0.2))
plot1=barplot(table1,xlim=c(0,130),col=c("gray","blue"),legend.text = rownames(table1),las=1,args.legend = "Topleft",beside=TRUE,
              main="Count of Records Gender vs Product Line",horiz = TRUE,xlab="Frequency")

text(x=table1,plot1,table1,pos=2,cex=0.8)

#grapgh2
par(mfrow=c(1,1))
par(mai=c(1,1,0.2,0.2))
plot2=barplot(table2,xlim=c(0,100),col=c("gray","green","yellow"),legend.text = rownames(table2),las=1,args.legend = "Topleft",beside=TRUE,
              main="Count of Records City vs Product Line",horiz = TRUE,xlab="Frequency")

text(x=table2,plot2,table2,pos=2,cex=0.8)

#graph3
par(mfrow=c(2,1))
par(mai=c(1,0.8,0.2,0.2))
plot3=barplot(table3,ylim=c(0,200),col=c("gray","orange"),legend.text = rownames(table3),las=1,args.legend = "Topleft",
              main="Count of Records Customer type vs Branch Line",ylab="Frequency",beside=TRUE,cex.names=0.8)

text(y=table3,plot3,table3,pos=3,cex=0.8)

plot4=barplot(table4,ylim=c(0,300),col=brewer.pal(2,"Pastel1"),legend.text = rownames(table4),las=1,args.legend = "Topleft",
              main="Count of Records Customer type vs Branch Line",ylab="Frequency",beside=TRUE,cex.names=0.8)

text(y=table4,plot4,table4,pos=3,cex=0.8)

Creating Histograms for Numerical variables

par(mfrow=c(2,2))
par(mai=c(1,1,0.2,0.2))
supermarket_new$`Total`%>%hist(main="Histogram of Total",
                               col="LightYellow",breaks=50,
                               xlab="Total",xlim=c(0,1200),
                               ylab="Frequency")
abline(v=mean(supermarket_new$`Total`),col="red",lwd=3)
text(y=60,
     x=mean(supermarket_new$`Total`),col="red",
     paste("mean=",mean(supermarket_new$`Total`)))
supermarket_new$`Quantity`%>%hist(main="Histogram of Quantity",
                               col="LightPink",breaks=50,
                               xlab="Quantity",xlim=c(0,12),
                               ylab="Frequency")
abline(v=mean(supermarket_new$`Quantity`),col="red",lwd=3)
text(y=100,
     x=mean(supermarket_new$`Quantity`),col="red",
     paste("mean=",mean(supermarket_new$`Quantity`)))
supermarket_new$`gross_income`%>%hist(main="Histogram of gross_income",
                               col="Lightgreen",breaks=40,
                               xlab="gross income",xlim=c(0,65),
                               ylab="Frequency")
abline(v=mean(supermarket_new$`gross_income`),col="red",lwd=3)
text(y=60,
     x=mean(supermarket_new$`gross_income`),col="red",
     paste("mean=",mean(supermarket_new$`gross_income`)))
supermarket_new$`Rating`%>%hist(main="Histogram of Rating",
                               col="Lightblue",breaks=40,
                               xlab="Rating",xlim=c(0,12),
                               ylab="Frequency")
abline(v=mean(supermarket_new$`Rating`),col="red",lwd=3)
text(y=40,
     x=mean(supermarket_new$`Rating`),col="red",
     paste("mean=",mean(supermarket_new$`Rating`)))

Creating Tables for Numerical variables showing Average sales and % sales

mean_tbl1<-tapply(supermarket_new$`Total`,supermarket_new$Product_category,mean) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% as.matrix()

kable(mean_tbl1,align = "c",format = "markdown",
               caption = "Average Sales per Product category",
               table.attr="style='width:30%;'")
Average Sales per Product category
lifestyle 336.64
Sports 332.07
Beauty 323.64
Food 322.67
Electronics 319.63
Fashion 305.09
mean_tbl2<-tapply(supermarket_new$`gross_income`,supermarket_new$Product_category,mean) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% as.matrix()

kable(mean_tbl2,align = "c",format = "markdown",
               caption = "Average gross income per Product category",
               table.attr="style='width:30%;'")
Average gross income per Product category
lifestyle 16.03
Sports 15.81
Beauty 15.41
Food 15.37
Electronics 15.22
Fashion 14.53
mean_tbl3<-tapply(supermarket_new$`Total`,supermarket_new$Product_category,sum) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% prop.table()%>% as.matrix()

kable(mean_tbl3*100,align = "c",format = "markdown",
               caption = "% total sales per Product category",
               table.attr="style='width:30%;'")
% total sales per Product category
Food 17.38409
Sports 17.06765
Electronics 16.82450
Fashion 16.81470
lifestyle 16.67723
Beauty 15.23183
mean_tbl4<-tapply(supermarket_new$`Total`,supermarket_new$City,sum) %>% round(digits=2)%>% sort(decreasing=TRUE)%>% prop.table()%>% as.matrix()

kable(mean_tbl4*100,align = "c",format = "markdown",
               caption = "% total sales per City",
               table.attr="style='width:30%;'")
% total sales per City
Naypyitaw 34.23532
Yangon 32.88276
Mandalay 32.88192
Creating Boxplots for Numerical variables

par(mfrow=c(2,1))
par(mai=c(1,1,0.2,0.2))
attach(supermarket_new)
boxplot(`Total`~`Gender`,col=brewer.pal(2,"Pastel1"),
        horizontal = TRUE,main="Boxplot showing total sales vs Gender",las=1,ylab="")
boxplot(`Total`~`Product_category`,col=brewer.pal(7,"Set1"),
        main="Boxplot showing Total sales vs Product category",
        xlab="Product category",ylab="",las=1,ylim=c(0,1200))

par(mfrow=c(2,1))
par(mai=c(1,1,0.2,0.2))
boxplot(`cogs`~`Product_category`,col=brewer.pal(7,"Pastel1"),main="Boxplot showing Cogs vs Product category",las=1,ylab="",ylim=c(0,1200),
        xlab="Product category")

boxplot(`Rating`~`Product_category`,col=brewer.pal(7,"Set3"),
        main="Boxplot showing Rating vs Product category",
        xlab="Product category",ylim=c(0,15))

#boxplot(`Total`~`Customer type`,col=brewer.pal(7,"Set2"))

REFERENCES
Barplots
https://rpubs.com/Dee_Chiluiza/barplots
Histograms
https://rpubs.com/Dee_Chiluiza/816756
Group BY
https://sparkbyexamples.com/r-programming/group-by-sum-in-r/
https://www.marsja.se/how-to-add-a-column-to-dataframe-in-r-with-tibble-dplyr/
https://www.r-bloggers.com/2022/08/r-program-to-concatenate-two-strings/
https://digitaschools.com/data-types-in-r-tutorial/