# Read the retail sales data: comma-separated file with a header row.
# The decimal mark is "." because "," is already used as the field separator;
# using "," for both would prevent any decimal value from being parsed as a number.
mydata <- read.table("./retail_sales_dataset.csv", header = TRUE, sep = ",", dec = ".")
head(mydata) # Show the first 6 rows of the data
## Transaction.ID Date Customer.ID Gender Age Product.Category
## 1 1 2023-11-24 CUST001 Male 34 Beauty
## 2 2 2023-02-27 CUST002 Female 26 Clothing
## 3 3 2023-01-13 CUST003 Male 50 Electronics
## 4 4 2023-05-21 CUST004 Male 37 Clothing
## 5 5 2023-05-06 CUST005 Male 30 Beauty
## 6 6 2023-04-25 CUST006 Female 45 Beauty
## Quantity Price.per.Unit Total.Amount
## 1 3 50 150
## 2 2 500 1000
## 3 1 30 30
## 4 1 500 500
## 5 2 50 100
## 6 1 30 30
Unit of observation: one retail transaction made by one customer (place and time of data collection are not specified; the data are synthetic)
Sample size: 1000
summary(mydata) # Descriptive statistics for every column of the raw data
## Transaction.ID Date Customer.ID
## Min. : 1.0 Length:1000 Length:1000
## 1st Qu.: 250.8 Class :character Class :character
## Median : 500.5 Mode :character Mode :character
## Mean : 500.5
## 3rd Qu.: 750.2
## Max. :1000.0
## Gender Age Product.Category
## Length:1000 Min. :18.00 Length:1000
## Class :character 1st Qu.:29.00 Class :character
## Mode :character Median :42.00 Mode :character
## Mean :41.39
## 3rd Qu.:53.00
## Max. :64.00
## Quantity Price.per.Unit Total.Amount
## Min. :1.000 Min. : 25.0 Min. : 25
## 1st Qu.:1.000 1st Qu.: 30.0 1st Qu.: 60
## Median :3.000 Median : 50.0 Median : 135
## Mean :2.514 Mean :179.9 Mean : 456
## 3rd Qu.:4.000 3rd Qu.:300.0 3rd Qu.: 900
## Max. :4.000 Max. :500.0 Max. :2000
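Explanation of the descriptive statistics:
- Age, mean = 41.39: the average age of the customers is 41.39 years.
- Total.Amount, median = 135: half of the transactions had a total amount of up to 135 monetary units, the other half 135 or more.
- Quantity, max = 4: the largest number of units bought in a single transaction was 4.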
# Keep only the variables needed for the analysis, selected by column name
mydata3 <- mydata[, c("Gender", "Age", "Product.Category", "Total.Amount")]
head(mydata3) # First 6 rows of the reduced data
## Gender Age Product.Category Total.Amount
## 1 Male 34 Beauty 150
## 2 Female 26 Clothing 1000
## 3 Male 50 Electronics 30
## 4 Male 37 Clothing 500
## 5 Male 30 Beauty 100
## 6 Female 45 Beauty 30
# Product category as a factor with a fixed level order
# (labels default to the level names, so they are not repeated)
mydata3[["Product.CategoryFactor"]] <- factor(
  mydata3[["Product.Category"]],
  levels = c("Beauty", "Clothing", "Electronics")
)
# Gender as a factor with a fixed level order
mydata3[["GenderFactor"]] <- factor(
  mydata3[["Gender"]],
  levels = c("Female", "Male")
)
head(mydata3, n = 6)
## Gender Age Product.Category Total.Amount Product.CategoryFactor
## 1 Male 34 Beauty 150 Beauty
## 2 Female 26 Clothing 1000 Clothing
## 3 Male 50 Electronics 30 Electronics
## 4 Male 37 Clothing 500 Clothing
## 5 Male 30 Beauty 100 Beauty
## 6 Female 45 Beauty 30 Beauty
## GenderFactor
## 1 Male
## 2 Female
## 3 Male
## 4 Male
## 5 Male
## 6 Female
summary(mydata3) # Descriptive statistics; for the two factor columns this gives counts per level
## Gender Age Product.Category Total.Amount
## Length:1000 Min. :18.00 Length:1000 Min. : 25
## Class :character 1st Qu.:29.00 Class :character 1st Qu.: 60
## Mode :character Median :42.00 Mode :character Median : 135
## Mean :41.39 Mean : 456
## 3rd Qu.:53.00 3rd Qu.: 900
## Max. :64.00 Max. :2000
## Product.CategoryFactor GenderFactor
## Beauty :307 Female:510
## Clothing :351 Male :490
## Electronics:342
##
##
##
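Explanation of the descriptive statistics:
- Product.CategoryFactor: 307 transactions were in the Beauty category, 351 in Clothing and 342 in Electronics.
- GenderFactor: 510 transactions were made by women and 490 by men.
- Total.Amount, 3rd Qu. = 900: 75% of the transactions had a total amount of up to 900 monetary units, 25% had 900 or more.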
# install.packages("car") # run once if the car package is not installed
library(car) # used below for scatterplot()
## Loading required package: carData
# Scatterplot of the total transaction amount against age, drawn without a smoother
scatterplot(
  x = mydata3$Age,
  y = mydata3$Total.Amount,
  xlab = "Age",
  ylab = "Total amount of a transaction",
  xlim = c(15, 70),
  ylim = c(20, 2005),
  col = "darkgreen",
  smooth = FALSE
)
shapiro.test(mydata3$Total.Amount) # Shapiro-Wilk normality test for the total transaction amount
##
## Shapiro-Wilk normality test
##
## data: mydata3$Total.Amount
## W = 0.74891, p-value < 2.2e-16
H0: The amount of total transaction is normally distributed
H1: The amount of total transaction is not normally distributed
We reject H0 (p < 0.001): the total amount of a transaction is not normally distributed.
shapiro.test(mydata3$Age) # Shapiro-Wilk normality test for the variable age
##
## Shapiro-Wilk normality test
##
## data: mydata3$Age
## W = 0.95241, p-value < 2.2e-16
H0: Age is normally distributed
H1: Age is not normally distributed
We reject H0 (p < 0.001): age is not normally distributed.
The assumption of normality is violated for both variables, so I need to use the non-parametric alternative, the Spearman rank correlation coefficient.
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Pearson correlation coefficient between age and total transaction amount
cor(mydata3$Age, mydata3$Total.Amount, method = "pearson")
## [1] -0.06056802
# Significance test for the Pearson correlation between age and total transaction amount
cor.test(mydata3$Age, mydata3$Total.Amount, method = "pearson", exact = FALSE)
##
## Pearson's product-moment correlation
##
## data: mydata3$Age and mydata3$Total.Amount
## t = -1.9169, df = 998, p-value = 0.05553
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12210264 0.00143043
## sample estimates:
## cor
## -0.06056802
As noted above, the Pearson correlation coefficient is not appropriate because the normality assumption is violated, so we use the Spearman correlation coefficient instead.
# Spearman rank correlation coefficient between age and total transaction amount
cor(mydata3$Age, mydata3$Total.Amount, method = "spearman")
## [1] -0.03786403
# Significance test for Spearman's rho; exact = FALSE skips the exact p-value computation
cor.test(mydata3$Age, mydata3$Total.Amount, method = "spearman", exact = FALSE)
##
## Spearman's rank correlation rho
##
## data: mydata3$Age and mydata3$Total.Amount
## S = 172977165, p-value = 0.2316
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.03786403
H0: There is no correlation between age and the amount of total transaction.
H1: There is a correlation between age and the amount of total transaction.
We cannot reject H0 (p = 0.232). We cannot say that there is a statistically significant correlation between age and the total amount of a transaction.
The monotonic relationship between age and the total amount of a transaction is negative and very weak (Spearman correlation coefficient is -0.038).
# Pearson chi-squared test of independence between gender and product category.
# correct = TRUE asks for a continuity correction, which chisq.test() applies only
# to 2-by-2 tables; this table is 2-by-3, and the printed header shows no correction.
results <- chisq.test(mydata3$GenderFactor, mydata3$Product.CategoryFactor,
correct = TRUE) # Pearson Chi2 test
results
##
## Pearson's Chi-squared test
##
## data: mydata3$GenderFactor and mydata3$Product.CategoryFactor
## X-squared = 1.6738, df = 2, p-value = 0.433
H0:There is no association between the category of a product and gender.
H1:There is association between the category of a product and gender.
We cannot reject H0 (p = 0.433). Based on the sample data we cannot say that there is an association between gender and the category of a product.
addmargins(results$observed) # Observed (empirical) frequencies with row and column totals
## mydata3$Product.CategoryFactor
## mydata3$GenderFactor Beauty Clothing Electronics Sum
## Female 166 174 170 510
## Male 141 177 172 490
## Sum 307 351 342 1000
round(results$expected) # Expected frequencies from the chi-squared test, rounded to whole numbers
## mydata3$Product.CategoryFactor
## mydata3$GenderFactor Beauty Clothing Electronics
## Female 157 179 174
## Male 150 172 168
# Pearson residuals, (observed - expected) / sqrt(expected), rounded to 2 decimals.
# The component name is written in full rather than relying on partial matching by `$`.
round(results$residuals, 2)
## mydata3$Product.CategoryFactor
## mydata3$GenderFactor Beauty Clothing Electronics
## Female 0.75 -0.37 -0.33
## Male -0.77 0.38 0.34
All Pearson residuals are below the critical value of 1.96 in absolute value, so in no cell does the observed frequency deviate significantly from the expected frequency.
# Share of each gender/category cell in all transactions, with row and column totals
addmargins(round(prop.table(results$observed), digits = 3))
## mydata3$Product.CategoryFactor
## mydata3$GenderFactor Beauty Clothing Electronics Sum
## Female 0.166 0.174 0.170 0.510
## Male 0.141 0.177 0.172 0.490
## Sum 0.307 0.351 0.342 1.000
Explanation of Female/Beauty = 0.166: 16.6% of all 1000 transactions were Beauty purchases made by women.
# Distribution of product categories within each gender (row proportions), with row totals
addmargins(round(prop.table(results$observed, margin = 1), digits = 3), margin = 2)
## mydata3$Product.CategoryFactor
## mydata3$GenderFactor Beauty Clothing Electronics Sum
## Female 0.325 0.341 0.333 0.999
## Male 0.288 0.361 0.351 1.000
Explanation of Male/Electronics = 0.351: among the transactions made by men, 35.1% were in the Electronics category.
# Distribution of gender within each product category (column proportions), with column totals
addmargins(round(prop.table(results$observed, margin = 2), digits = 3), margin = 1)
## mydata3$Product.CategoryFactor
## mydata3$GenderFactor Beauty Clothing Electronics
## Female 0.541 0.496 0.497
## Male 0.459 0.504 0.503
## Sum 1.000 1.000 1.000
Explanation of Female/Beauty = 0.541: among the transactions in the Beauty category, 54.1% were made by women.
library(effectsize)
effectsize::cramers_v(mydata3$Product.CategoryFactor, mydata3$GenderFactor) # Effect size of the association: Cramer's V (the output reports the adjusted version)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.00 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
interpret_cramers_v(0.00) # Verbal label for V = 0.00; the value is typed in by hand from the output above
## [1] "tiny"
## (Rules: funder2019)
fisher.test(mydata3$Product.CategoryFactor, mydata3$GenderFactor) # Fisher's exact test as a robustness check of the chi-squared result
##
## Fisher's Exact Test for Count Data
##
## data: mydata3$Product.CategoryFactor and mydata3$GenderFactor
## p-value = 0.4355
## alternative hypothesis: two.sided
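H0: There is no association between gender and the category of a product.
H1: There is an association between gender and the category of a product.
Based on the sample data we cannot reject H0 (p = 0.4355). We cannot say that the distribution of product categories differs between the genders, which agrees with the result of the chi-squared test. (The table is 2x3, so the hypotheses are stated in terms of association rather than a single odds ratio.)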
RQ1: We cannot reject H0 (p = 0.232). We cannot say that there is a statistically significant correlation between age and the total amount of a transaction.
The monotonic relationship between age and the total amount of a transaction is negative and very weak (Spearman correlation coefficient is -0.038).
RQ2: Based on the sample data we cannot say that there is a relationship between gender and the category of a product. The effect size is tiny (Cramér's V = 0.00).