Danielle Ormandy
15 August 2017
The Ta Feng Grocery Dataset is a Supermarket Dataset containining 817741 transactions from November 2000 until the end of February 2001.
The dataset contains information about 119578 shopping baskets, belonging to 32266 users, where 1129939 items were purchased from a range of 23812 products.
Fields of the Dataset are:
- Transaction date and time (time invalid and useless)
- Customer ID
- Age: 10 possible values
- Residence Area: 8 possible values
- Product subclass
- Product ID
- Amount
- Asset
- Sales price
Top 3 Products are
Product IDs 4714981010038, 4710421090059, 4711271000014
Bar Chart of Product Sales by ProdID
Time Series Decomposition of Product Sales by ProdID
Sales Trend for Top 3 Products
The R libraries used include those for data manipulation (reshape2,data.table), plotting (ggplot, grid, gridExtra), and forecasting (xts, forecast)
library(reshape2)
library(ggplot2)
library(grid)
library(gridExtra)
library(data.table)##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
library(forecast)
library(xts)## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
##
## first, last
The Dataset is loaded from the four input files and combined after adding column descriptors and formatting some columns.
df.TFD11 <- read.csv("D11", header = T, sep = ";",
stringsAsFactors = FALSE)
df.TFD12 <- read.csv("D12", header = T, sep = ";",
stringsAsFactors = FALSE)
df.TFD01 <- read.csv("D01", header = T, sep = ";",
stringsAsFactors = FALSE)
df.TFD02 <- read.csv("D02", header = T, sep = ";",
stringsAsFactors = FALSE)
D11cols <- colnames(df.TFD11)
D12cols <- colnames(df.TFD12)
D01cols <- colnames(df.TFD01)
D02cols <- colnames(df.TFD02)
identical(D11cols, D12cols)
identical(D11cols, D01cols)
identical(D11cols, D02cols)
df.TFD0 <- rbind(df.TFD01, df.TFD02)
df.TFD1 <- rbind(df.TFD11, df.TFD12)
df.TFDall <- rbind(df.TFD0, df.TFD1)Dataset Columns are defined and fields converted to required format.
Data Table is created for query analysis
TFcols <- c("DateTime", "CustID", "Age_cat", "ResArea",
"ProdSub", "ProdID", "Amount", "Asset", "SalesPrice")
colnames(df.TFDall) <- TFcols
dt.TF <- as.data.table(df.TFDall)
dt.TF$DateTime <- as.POSIXct(dt.TF$DateTime)
dt.TF$ProdID <- as.factor(dt.TF$ProdID)
df.TFDall$DateTime <- as.POSIXlt(df.TFDall$DateTime)
df.TFDall$Age_cat = as.factor(df.TFDall$Age_cat)
df.TFDall$ResArea = as.factor(df.TFDall$ResArea)
df.TFDall$CustID = as.factor(df.TFDall$CustID)
df.TFDall$ProdSub = as.factor(df.TFDall$ProdSub)From the summary of the dataset it is observed that the max values of colums CustID, Amount, Asset and Sales Price is much higher that the 3rd Quartile, suggesting outliers. This can be explored graphically.
The dataset was also checked for NULL values (with none found)
summary(df.TFDall)
str(df.TFDall)
sapply(df.TFDall, function(x) sum(is.na(x)))## DateTime CustID Age_cat
## Min. :2000-11-01 00:00:00 20459 : 1246 D :181213
## 1st Qu.:2000-11-28 00:00:00 2112589: 879 E :151023
## Median :2001-01-01 00:00:00 2112596: 822 C :140805
## Mean :2000-12-30 16:40:45 2113579: 776 F : 99719
## 3rd Qu.:2001-01-30 00:00:00 1847994: 754 B : 66432
## Max. :2001-02-28 00:00:00 426053 : 596 G : 53719
## (Other):812668 (Other):124830
## ResArea ProdSub ProdID
## E :312501 100205 : 20400 Min. : 20008819
## F :245213 120103 : 15360 1st Qu.:4710085127020
## G : 72092 110401 : 13099 Median :4710421090060
## C : 71640 110411 : 12794 Mean :4461639280530
## H : 40666 130206 : 12008 3rd Qu.:4712500125130
## D : 38674 130315 : 11874 Max. :9789579967620
## (Other): 36955 (Other):732206
## Amount Asset SalesPrice
## Min. : 1.000 Min. : 0.0 Min. : 1.0
## 1st Qu.: 1.000 1st Qu.: 35.0 1st Qu.: 42.0
## Median : 1.000 Median : 62.0 Median : 76.0
## Mean : 1.382 Mean : 112.1 Mean : 131.9
## 3rd Qu.: 1.000 3rd Qu.: 112.0 3rd Qu.: 132.0
## Max. :1200.000 Max. :432000.0 Max. :444000.0
##
## 'data.frame': 817741 obs. of 9 variables:
## $ DateTime : POSIXlt, format: "2001-01-01" "2001-01-01" ...
## $ CustID : Factor w/ 32266 levels "1069","1113",..: 1163 10676 14476 18272 30090 21913 8144 18759 7565 7013 ...
## $ Age_cat : Factor w/ 11 levels "A ","B ","C ",..: 6 5 5 5 1 10 4 8 6 5 ...
## $ ResArea : Factor w/ 8 levels "A ","B ","C ",..: 6 5 7 6 2 5 3 1 3 5 ...
## $ ProdSub : Factor w/ 2012 levels "100101","100102",..: 317 180 14 962 462 135 1489 1169 1014 577 ...
## $ ProdID : num 4710105011011 4710265849066 4712019100607 4710168702901 4710431339148 ...
## $ Amount : int 2 1 1 1 1 1 1 1 1 1 ...
## $ Asset : int 44 150 35 94 100 144 740 676 170 36 ...
## $ SalesPrice: int 52 129 39 119 159 190 969 849 219 59 ...
## DateTime CustID Age_cat ResArea ProdSub ProdID
## 0 0 0 0 0 0
## Amount Asset SalesPrice
## 0 0 0
Questions for the dataset:
How many Customers in the dataset?
How many different products?
How many Shopping Baskets?
How many items are purchased?
ncustomers = length(unique(dt.TF[, CustID]))
print(sprintf("Total Number of Customers= %10.0f",
ncustomers))
nproducts = length(unique(dt.TF[, ProdID]))
print(sprintf("Total Number of Products= %10.0f", nproducts))
nbaskets = nrow(unique(dt.TF[, list(DateTime, CustID,
ResArea)]))
print(sprintf("Total Number of Shopping Baskets= %10.0f",
nbaskets))
nitems = sum(dt.TF[, Amount])
print(sprintf("Total Number of Items= %10.0f", nitems))
nvalue = dt.TF[, list(sum(Amount * SalesPrice))]
print(sprintf("Total Value of Transactions = %10.0f",
nvalue))The distributions of variable ‘Amount’ show almost all transactions are low value transactions (less than 2), which a small number of high value transactions.
From this it can be assumed that the number of transactions can be assumed to be of more interest than the size of the transactions.
p1 = ggplot(df.TFDall, aes(x = df.TFDall$Amount)) +
geom_histogram(binwidth = 1, colour = "black",
fill = "#56B4D9", na.rm = TRUE) + ylim(c(0,
15000)) + xlim(c(1, 100)) + ggtitle("Histogram of Amount") +
labs(y = "Count", x = "")
p2 = ggplot(df.TFDall, aes(x = "", y = df.TFDall$Amount)) +
geom_boxplot(fill = "#56B4D9", outlier.colour = "black",
outlier.shape = 1, outlier.size = 2, na.rm = TRUE) +
ggtitle("Boxplot of Amount") + theme(axis.text.x = element_blank()) +
labs(y = "Amount Value") + labs(y = "Count", x = "")
grid.arrange(p1, p2, ncol = 2, nrow = 1, widths = c(0.8,
0.8))TaFeng Variable ‘Amount’ Distribution
The distributions of variable ‘Asset’ show almost all values are less than 1000, and all except 2 values being less that 10000.
This sheds no light on this variable and it is not used further in the analysis.
p1 = ggplot(df.TFDall, aes(x = df.TFDall$Asset)) +
geom_histogram(binwidth = 100, colour = "black",
fill = "#56B4D9", na.rm = TRUE) + ylim(c(0,
100000)) + xlim(c(0, 10000)) + ggtitle("Histogram of TaFeng Asset") +
labs(y = "Count", x = "")
p2 = ggplot(df.TFDall, aes(x = "", y = df.TFDall$Asset)) +
geom_boxplot(fill = "#56B4D9", outlier.colour = "black",
outlier.shape = 1, outlier.size = 2, na.rm = TRUE) +
ggtitle("Boxplot of TaFeng Asset") + labs(y = "Asset Value") +
theme(axis.text.x = element_blank()) + labs(y = "Count",
x = "")
grid.arrange(p1, p2, ncol = 2, nrow = 1, widths = c(0.8,
0.8))TaFeng Variable ‘Asset’ Distribution
The distributions of variable ‘Sales Price’ show almost all values are less than 2000, with only a small number of samples with Sales Prices above 10000.
These values could be outliers, however they are not removed from the dataset.
p1 = ggplot(df.TFDall, aes(x = df.TFDall$SalesPrice)) +
geom_histogram(binwidth = 20, colour = "black",
fill = "#56B4D9", na.rm = TRUE) + ylim(c(0,
75000)) + xlim(c(0, 2000)) + ggtitle("Histogram of TaFeng Sales Price") +
labs(y = "Count", x = "")
p2 = ggplot(df.TFDall, aes(x = "", y = df.TFDall$SalesPrice)) +
geom_boxplot(fill = "#56B4D9", outlier.colour = "black",
outlier.shape = 1, outlier.size = 2, na.rm = TRUE) +
ggtitle("Boxplot of TaFeng Sales Price") + labs(y = "Sales Price",
x = "")
grid.arrange(p1, p2, ncol = 2, nrow = 1, widths = c(0.8,
0.8))‘Sales Price’ Distribution
Defining a Transaction to be a unique combination of Customer ID, DateTime, and ResArea, it is possible to visualise the number of transactions per day.
dt.TF_trans <- unique(dt.TF[, list(DateTime, CustID,
ResArea)])
df.trans <- dt.TF_trans[, list(num_trans = length(CustID)),
by = DateTime]
ggplot(dt.TF_trans[, list(num_trans = length(CustID)),
by = DateTime]) + geom_bar(aes(x = DateTime, y = num_trans),
stat = "identity", alpha = 0.8) + labs(y = "# Transactions",
x = "Date")Splitting the data into regions shows the difference in the number of transactions in each Region, with Region E being the busiest region with the highest number of transactions in January.
Regions A and B can be seen to have low numbers of transactions.
ggplot(dt.TF_trans[, list(num_trans = length(CustID)),
by = list(DateTime, ResArea)]) + geom_bar(aes(x = DateTime,
y = num_trans, color = ResArea), stat = "identity",
alpha = 0.8) + facet_wrap(~ResArea) + labs(y = "# Transactions",
x = "Date")Looking at the amount of each ProdID sold per day, it is possible to determine the top selling product using ‘Amount’.
It is possible to show the amount of each ProdID sold each day.
dt.TF_prodid <- dt.TF[, list(TAmount = sum(Amount)),
by = list(DateTime, ProdID, ResArea)]
topPr <- dt.TF_prodid[, list(Total = sum(TAmount)),
by = .(ProdID)]
topPr <- topPr[order(-Total)]
head(topPr)## ProdID Total
## 1: 4714981010038 14537
## 2: 4710421090059 11790
## 3: 4711271000014 10615
## 4: 4711663700010 3810
## 5: 4710114128038 3322
## 6: 4710032501791 2865
ggplot(topPr[Total >= 2700, ]) + geom_bar(aes(x = ProdID,
y = Total, fill = ProdID), stat = "identity", alpha = 0.8) +
theme(axis.text.x = element_text(angle = 25)) +
scale_fill_hue(l = 40)A more detailed analysis of the sales of the Top selling Product shows significant increases at some times in the series.
dt.TF_prodid <- dt.TF[, list(TAmount = sum(Amount)),
by = list(DateTime, ProdID, ResArea)]
dt.TF_prodidtop3 <- subset(dt.TF_prodid, ProdID %in%
c(4714981010038, 4710421090059, 4711271000014))
dt.TF_prodid_nores <- dt.TF[, list(TAmount = sum(Amount)),
by = list(DateTime, ProdID)]
dt.TF_prodid1 <- subset(dt.TF_prodid_nores, ProdID ==
4714981010038)
dt.TF_prodid2 <- subset(dt.TF_prodid_nores, ProdID ==
4710421090059)
dt.TF_prodid3 <- subset(dt.TF_prodid_nores, ProdID ==
4711271000014)
ggplot(dt.TF_prodidtop3[, list(TAmount), by = list(DateTime,
ProdID)]) + geom_bar(aes(x = DateTime, y = TAmount,
fill = ProdID), stat = "identity", alpha = 0.8) +
scale_fill_manual(values = c("#007F85", "#0073B9",
"#AB00B6")) + labs(y = "Sales", x = "")Splitting the data into regions shows the difference in the number of transactions in each Region, with Region E being the busiest region with the highest number of sales. Regions A and B can be seen to have low numbers of sales. Most Regions show a similar sales profile for all three top products however Region G shows the highest number of sales for Product 4710421090059.
ggplot(dt.TF_prodidtop3[, list(num_trans = sum(TAmount)),
by = list(ProdID, ResArea)]) + geom_bar(aes(x = ProdID,
y = num_trans, fill = ProdID), stat = "identity",
alpha = 0.8) + scale_fill_manual(values = c("#007F85",
"#0073B9", "#AB00B6")) + facet_wrap(~ResArea) +
labs(y = "Sales", x = "") + theme(axis.ticks = element_blank(),
axis.text.x = element_blank())ts_trans <- xts(df.trans$num_trans, as.Date(df.trans$DateTime))
attr(ts_trans, "frequency") <- 7
trans_decom <- stl(as.ts(ts_trans), s.window = "periodic",
t.window = 7)
plot(trans_decom)ts_trans1 <- xts(dt.TF_prodid1$TAmount, as.Date(dt.TF_prodid1$DateTime))
attr(ts_trans1, "frequency") <- 7
trans_decom1 <- stl(as.ts(ts_trans1), s.window = "periodic",
t.window = 7)
plot(trans_decom1)ts_trans2 <- xts(dt.TF_prodid2$TAmount, as.Date(dt.TF_prodid2$DateTime))
attr(ts_trans2, "frequency") <- 7
trans_decom2 <- stl(as.ts(ts_trans2), s.window = "periodic",
t.window = 7)
plot(trans_decom2)ts_trans3 <- xts(dt.TF_prodid3$TAmount, as.Date(dt.TF_prodid3$DateTime))
attr(ts_trans3, "frequency") <- 7
trans_decom3 <- stl(as.ts(ts_trans3), s.window = "periodic",
t.window = 7)
plot(trans_decom3)par(mfrow = c(3, 1))
plot(trans_decom1$time.series[, "trend"], ylab = "",
main = "ProdID: 4714981010038")
plot(trans_decom2$time.series[, "trend"], ylab = "",
main = "ProdID: 4710421090059")
plot(trans_decom3$time.series[, "trend"], ylab = "",
main = "ProdID: 4711271000014")Top 3 Prod ID Sales Trend by Time