Author: Hui-Ju Huang
mydata <- read.table("./supermarket_sales.csv", header = TRUE, sep = ",", dec = ".")
head(mydata)
## Invoice.ID Branch City Customer.type Gender Product.line
## 1 750-67-8428 A Yangon Member Female Health and beauty
## 2 226-31-3081 C Naypyitaw Normal Female Electronic accessories
## 3 631-41-3108 A Yangon Normal Male Home and lifestyle
## 4 123-19-1176 A Yangon Member Male Health and beauty
## 5 373-73-7910 A Yangon Normal Male Sports and travel
## 6 699-14-3026 C Naypyitaw Normal Male Electronic accessories
## Unit.price Quantity Tax.5. Total Date Time Payment cogs
## 1 74.69 7 26.1415 548.9715 1/5/2019 13:08 Ewallet 522.83
## 2 15.28 5 3.8200 80.2200 3/8/2019 10:29 Cash 76.40
## 3 46.33 7 16.2155 340.5255 3/3/2019 13:23 Credit card 324.31
## 4 58.22 8 23.2880 489.0480 1/27/2019 20:33 Ewallet 465.76
## 5 86.31 7 30.2085 634.3785 2/8/2019 10:37 Ewallet 604.17
## 6 85.39 7 29.8865 627.6165 3/25/2019 18:30 Ewallet 597.73
## gross.margin.percentage gross.income Rating
## 1 4.761905 26.1415 9.1
## 2 4.761905 3.8200 9.6
## 3 4.761905 16.2155 7.4
## 4 4.761905 23.2880 8.4
## 5 4.761905 30.2085 5.3
## 6 4.761905 29.8865 4.1
Unit of Observation: Each row represents a sales transaction in the supermarket.
Sample Size: Total number of rows in the dataset (1000).
Definition of Variables
Summary of the Total
summary(mydata$Total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.68 124.42 253.85 322.97 471.35 1042.65
Convert categorical variables to factors
mydata$Customer.type <- factor(mydata$Customer.type)
Determine sample sizes
sample_size_member <- sum(mydata$Customer.type == "Member")
sample_size_normal <- sum(mydata$Customer.type == "Normal")
sample_size_member
## [1] 501
sample_size_normal
## [1] 499
#install.packages("psych")
library(psych)
sales_member <- mydata$Total[mydata$Customer.typ == "Member"]
describe(sales_member)
## vars n mean sd median trimmed mad min max range skew
## X1 1 501 327.79 247.99 266.03 299.64 251.8 10.68 1042.65 1031.97 0.84
## kurtosis se
## X1 -0.22 11.08
sales_normal <- mydata$Total[mydata$Customer.typ == "Normal"]
describe(sales_normal)
## vars n mean sd median trimmed mad min max range skew
## X1 1 499 318.12 243.9 237.43 288.7 217.29 12.69 1039.29 1026.6 0.94
## kurtosis se
## X1 0.04 10.92
Check whether a variable is normally distributed.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
Member <- ggplot(mydata[mydata$Customer.type == "Member", ], aes(x = Total)) +
geom_histogram(position = "identity", binwidth = 100, colour = "black") +
ylab("Count") +
xlab("Total") +
ggtitle("Member")
Normal <- ggplot(mydata[mydata$Customer.type == "Normal", ], aes(x = Total)) +
geom_histogram(position = "identity", binwidth = 100, colour = "black") +
ylab("Count") +
xlab("Total") +
ggtitle("Normal")
#install.packages("ggpubr")
library(ggpubr)
ggarrange(Member, Normal,ncol = 2, nrow=1)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
mydata %>%
group_by(Customer.type) %>%
shapiro_test(Total)
## # A tibble: 2 × 4
## Customer.type variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Member Total 0.912 2.02e-16
## 2 Normal Total 0.904 3.49e-17
wilcox.test(mydata$Total ~ mydata$Customer.type,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon rank sum test
##
## data: mydata$Total by mydata$Customer.type
## W = 127453, p-value = 0.5911
## alternative hypothesis: true location shift is not equal to 0
library(effectsize)
##
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
##
## cohens_d, eta_squared
## The following object is masked from 'package:psych':
##
## phi
effectsize(wilcox.test(mydata$Total ~ mydata$Customer.type,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided"))
## r (rank biserial) | 95% CI
## ---------------------------------
## 0.02 | [-0.05, 0.09]
library(effectsize)
interpret_rank_biserial(0.02 )
## [1] "tiny"
## (Rules: funder2019)