library(data.table)
library(ggplot2)
library(tidyr)
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
filepath <- "C:/Users/USER/OneDrive/Documents/R"
QVI_data <- read.csv("C:/Users/USER/OneDrive/Documents/R/QVI_data.csv")
data <- as.data.table(QVI_data)
# Monthly overall sales revenue
# Monthly number of customers
# Monthly number of transactions per customer
Calculate these measures overtime for each store ## Create a month ID
data[, YEARMONTH := year(DATE)* 100 + month(DATE)]
measureOverTime <- data[, .(totSales = sum(TOT_SALES),
nCustomers = uniqueN(LYLTY_CARD_NBR),
nTxnPerCust = uniqueN(TXN_ID) / uniqueN(LYLTY_CARD_NBR),
nChipsPerTxn = sum(PROD_QTY) / uniqueN(TXN_ID),
avgPricePerUnit = sum(TOT_SALES) / sum(PROD_QTY)),
by = c("STORE_NBR", "YEARMONTH")][order(STORE_NBR, YEARMONTH)]
storeWithFullObs <- unique(measureOverTime[, .N, STORE_NBR][N == 12, STORE_NBR])
preTrialMeasure <- measureOverTime[YEARMONTH < 201902 &
STORE_NBR %in% storeWithFullObs,]
Now we need to work out a way of ranking how similar each potential control store is to the trial store. We can calculate how correlated the performance of each store is to the trial store.
Let’s write a function for this so that we don’t have to calculate this for each trial store and control store pair.
calculateCorrelation <- function(inputTable, metricCol, storeComparison) {
calcCorrTable = data.table(Store1 = numeric(),
Store2 = numeric(),
corr_measure = numeric())
storeNumbers <- unique(inputTable[, STORE_NBR])
for(i in storeNumbers) {
calculatedMeasure = data.table("Store1" = storeComparison,
"Store2" = i,
"corr_measure" = cor(inputTable[STORE_NBR == storeComparison, eval(metricCol)],
inputTable[STORE_NBR == i, eval(metricCol)]))
calcCorrTable <- rbind(calcCorrTable, calculatedMeasure)
}
return(calcCorrTable)
}
Apart from correlation, we can also calculate a standardized metric based on the absolute difference between the trial store’s performance and each control store’s performance.Let’s write a function for this.
calculateMagnitudeDistance <- function(inputTable, metricCol, storeComparison) {
calcDistTable = data.table(Store1 = numeric(),
Store2 = numeric(),
YEARMONTH = numeric(),
measure = numeric())
storeNumbers <- unique(inputTable[, STORE_NBR])
for(i in storeNumbers) {
calculatedMeasure = data.table("Store1" = storeComparison,
"Store2" = i,
"YEARMONTH" = inputTable[STORE_NBR == storeComparison, YEARMONTH],
"measure" = abs(inputTable[STORE_NBR == storeComparison, eval(metricCol)] -
inputTable[STORE_NBR == i, eval(metricCol)]))
calcDistTable <- rbind(calcDistTable, calculatedMeasure)
}
#### Standardize the magnitude distance so that the measure ranges from 0 to 1
minMaxDist <- calcDistTable[, .(minDist = min(measure),
maxDist = max(measure)),
by = c("Store1", "YEARMONTH")]
distTable <- merge(calcDistTable, minMaxDist, by = c("Store1", "YEARMONTH"))
distTable[, magnitudeMeasure := 1 - (measure - minDist)/(maxDist - minDist)]
finalDistTable <- distTable[, .(mag_measure = mean(magnitudeMeasure)),
by = .(Store1, Store2)]
return(finalDistTable)
}
Now let’s use the functions to find the control stores. We’ll select the control stores based on how similar Monthly Total Sales and Monthly Number of Customers are to the trial stores. We’ll use the above functions to get 4 scores: 2 of total sales and 2 of total number of customers for each of the trial and control stores.
#### Set up trial store
trial_store77 <- 77
#### Calculate correlation for sales and customers
corr_nSales_77 <- calculateCorrelation(preTrialMeasure, quote(totSales), trial_store77)
corr_nCustomers_77 <- calculateCorrelation(preTrialMeasure, quote(nCustomers), trial_store77)
#### Calculate absolute difference magnitude for sales and customers
magnitude_nSales_77 <- calculateMagnitudeDistance(preTrialMeasure, quote(totSales), trial_store77)
magnitude_nCustomers_77 <- calculateMagnitudeDistance(preTrialMeasure, quote(nCustomers), trial_store77)
We’ll need to combine all the scores calculated using our function to create a composite score to rank on.
Let’s take a simple average of the Correlation and Magnitude scores for each driver. Note that if we consider it more important for the trend of the drivers to be similar, we can increase the weight of the correlation score, or if we consider the absolute difference of the drivers to be more important, we can lower the weight of the correlation scores.
corr_weight <- 0.5
score_nSales_77 <- merge(corr_nSales_77, magnitude_nSales_77,
by = c("Store1", "Store2"))[, scoreNSales := corr_measure * corr_weight +
mag_measure * (1 - corr_weight)]
score_nCustomers_77 <- merge(corr_nCustomers_77, magnitude_nCustomers_77,
by = c("Store1", "Store2"))[, scoreNCust := corr_measure * corr_weight +
mag_measure * (1 - corr_weight)]
Now we have a score for each of the 2 metrics: total sales and total number of customers.We’ll then select the store with the highest score as the control store, since it is the most similar to the trial store.
score_Control <- merge(score_nSales_77,
score_nCustomers_77,
by = c("Store1", "Store2"))[, finalControlScore := scoreNSales * 0.5 +
scoreNCust * 0.5]
control_store <- score_Control[Store1 == trial_store77][order(-finalControlScore)][2, Store2]
score_Control[order(-finalControlScore)][2, Store2]
## [1] 233
Now that we have found a control store, let’s check visually if the drivers are indeed similar in the period before the trial. Start with sales
measureOverTimeSales <- measureOverTime
pastSales <- measureOverTimeSales[,
Store_type := ifelse(STORE_NBR == trial_store77, "Trial",
ifelse(STORE_NBR == control_store, "Control", "Other Stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100,
YEARMONTH %% 100, 1, sep = '-'),
"%Y-%m-%d")][YEARMONTH < 201903, ]
pastSales[Store_type %in% c("Control", "Trial"), c("TransactionMonth",
"STORE_NBR", "totSales", "Store_type")]
## TransactionMonth STORE_NBR totSales Store_type
## <Date> <int> <num> <char>
## 1: 2018-07-01 77 296.8 Trial
## 2: 2018-08-01 77 255.5 Trial
## 3: 2018-09-01 77 225.2 Trial
## 4: 2018-10-01 77 204.5 Trial
## 5: 2018-11-01 77 245.3 Trial
## 6: 2018-12-01 77 267.3 Trial
## 7: 2019-01-01 77 204.4 Trial
## 8: 2019-02-01 77 235.0 Trial
## 9: 2018-07-01 233 290.7 Control
## 10: 2018-08-01 233 285.9 Control
## 11: 2018-09-01 233 228.6 Control
## 12: 2018-10-01 233 185.7 Control
## 13: 2018-11-01 233 211.6 Control
## 14: 2018-12-01 233 279.8 Control
## 15: 2019-01-01 233 177.5 Control
## 16: 2019-02-01 233 244.0 Control
ggplot(pastSales, aes(x = TransactionMonth, y = totSales, color = Store_type)) +
geom_line() +
labs(x = "Month of Operation", y = "Total Sales", title = "Total Sales by Month")
Next, number of customers
measureOverTimeCusts <- measureOverTime
pastCustomers <- measureOverTimeCusts[, Store_type := ifelse(STORE_NBR == trial_store77, "Trial",
ifelse(STORE_NBR == control_store, "Control", "Other Stores"))
][, numberCustomers := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100,
YEARMONTH %% 100, 1, sep = '-'),
"%Y-%m-%d")][YEARMONTH < 201903, ]
ggplot(pastCustomers, aes(x = TransactionMonth, y = numberCustomers, color = Store_type)) +
geom_line() + labs(x = "Month of Operation", y = "Total No. of Customers", title =
"Total No. of Customers by Month")
The trial period goes from the start of March 2019 to June 2019. We now want to see an uplift in overall chip sales.We’ll start with scaling the control store’s sales to a level similar to control for any differences between 2 stores outside of the trial period.
scalingFactoForControlSales <- preTrialMeasure[STORE_NBR == trial_store77 &
YEARMONTH < 201902,sum(totSales)]/
preTrialMeasure[STORE_NBR == control_store & YEARMONTH < 201902, sum(totSales)]
measureOverTimeSales <- measureOverTime
scaledControlSales <- measureOverTimeSales[STORE_NBR == control_store,
][, controlSales := totSales * scalingFactoForControlSales]
Now that we have comparable sales figures for the control store, we can calculate the percentage difference between the scaled control sales, and the trial store’s sales during the trial period.
percentageDiff <- merge(scaledControlSales[, c("YEARMONTH", "controlSales")],
measureOverTime[STORE_NBR == trial_store77, c("YEARMONTH", "totSales")],
by = "YEARMONTH")[, percentageDiff := abs(controlSales - totSales)/controlSales]
Let’s check if the difference is significant
stdDev <- sd(percentageDiff[YEARMONTH < 201902, percentageDiff])
degreesOfFreedom <- 7
percentageDiff[, tValue := (percentageDiff - 0)/stdDev
][,TransactionMonth := as.Date(paste(YEARMONTH %/% 100,
YEARMONTH %% 100, 1,
sep = "-"), "%Y-%m-%d")
][YEARMONTH < 201905 & YEARMONTH > 201901, .(TransactionMonth, tValue)]
## TransactionMonth tValue
## <Date> <num>
## 1: 2019-02-01 1.183534
## 2: 2019-03-01 7.339116
## 3: 2019-04-01 12.476373
Find the 95th percentile of the t distribution with appropriate degrees of freedom to compare against
qt(0.95, df = degreesOfFreedom) # 1.89
## [1] 1.894579
We observe that the t-value is much larger than the 95th percentile value of the distribution, for March and April ~ the increase in sales in the trial store in March and April is statistically greater than in the control store.Let’s create a more visual version of this by plotting the sales of the control store, the sales of the trial stores and the 95th percentile value of sales of the control store.
measureOverTimeSales <- measureOverTime
pastSales <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store77, "Trial",
ifelse(STORE_NBR == control_store, "Control", "Other stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100, YEARMONTH %% 100, 1,
sep = "-"), "%Y-%m-%d")][Store_type %in% c("Trial", "Control")]
pastSales_Conrtols95 <- pastSales[Store_type == "Control",
][, totSales := totSales * (1 + stdDev * 2)
][, Store_type := "Conrol 95th % confidence interval"]
pastSales_Controls5 <- pastSales[Store_type == "Control",
][, totSales := totSales * (1 - stdDev * 2)
][, Store_type := "Control 5th % confidence interval"]
trialAssessment <- rbind(pastSales, pastSales_Conrtols95, pastSales_Controls5)
ggplot(trialAssessment, aes(x = TransactionMonth, totSales, color = Store_type)) +
geom_rect(data = trialAssessment[YEARMONTH < 201905 & YEARMONTH > 201901, ],
aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
geom_line() +
labs(x = "Month of Operation", y = "Total Sales", title = "Total Sales by Month")
The results show that the trial store 77 is significantly different to its control store in the trial period, as the trial store performance lies outside the 5% to 95% CI of the control store in 2 of the 3 trial months. Let’s have a look at assessing this for number of customers as well.
scalingFactoForControlCust <- preTrialMeasure[STORE_NBR == trial_store77 &
YEARMONTH < 201902, sum(nCustomers)]/preTrialMeasure[
STORE_NBR == control_store & YEARMONTH < 201902, sum(nCustomers)]
measureOverTimeCusts <- measureOverTime
scaledControlCustomers <- measureOverTimeCusts[STORE_NBR == control_store,
][, controlCustomers := nCustomers*scalingFactoForControlCust
][, Store_type := ifelse(STORE_NBR == trial_store77, "Trial",
ifelse(STORE_NBR == control_store,"Control", "Other stores"))]
percentageDiffCust <- merge(scaledControlCustomers[, c("YEARMONTH", "controlCustomers")],
measureOverTimeCusts[STORE_NBR == trial_store77, c("nCustomers", "YEARMONTH")],
by = "YEARMONTH")[, percentageDiff := abs(controlCustomers - nCustomers)/ controlCustomers]
Let’s again see if the difference is significant visually
As our null hypothesis is that the trial period is the same as the pre-trial period, let’s take the standard deviation based on the scaled percentage difference in the pre-trial period
stdDev2 <- sd(percentageDiffCust[YEARMONTH < 201902, percentageDiff])
degreesOfFreedom <- 7
pastCustomers <- measureOverTimeCusts[, nCusts := mean(nCustomers),
by = c("YEARMONTH", "Store_type")
][Store_type %in% c("Trial", "Control")]
pastCustomers_Controls95 <- pastCustomers[Store_type == "Control", ][,
nCusts := nCusts * (1 + stdDev2*2)][, Store_type := "Control 95th % CI"]
pastCustomers_Controls5 <- pastCustomers[Store_type == "Control",][,
nCusts := nCusts * (1 - stdDev2*2)][, Store_type := "Control 5th % CI"]
trialAssessmentCust <- rbind(pastCustomers, pastCustomers_Controls95, pastCustomers_Controls5)
ggplot(trialAssessmentCust, aes(x = TransactionMonth, y = nCusts, color = Store_type)) +
geom_rect(data = trialAssessmentCust[YEARMONTH < 201905 & YEARMONTH > 201901, ],
aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
geom_line() + labs(x = "Month of Operation", y = "Total Number of Customers",
title = "Total no. of Customers Per Month")
Let’s repeat finding the control store and assessing the impact of the trial for each of the other 2 trial stores.
Store 86
measureOverTime <- data[, .(totSales = sum(TOT_SALES),
nCustomers = uniqueN(LYLTY_CARD_NBR),
nTxnPerCust = uniqueN(TXN_ID)/uniqueN(LYLTY_CARD_NBR),
nChipsPerTxn = sum(PROD_QTY)/uniqueN(TXN_ID),
avgPricePerUnit = sum(TOT_SALES)/sum(PROD_QTY)
), by = c("STORE_NBR", "YEARMONTH")
][order(STORE_NBR, YEARMONTH)]
trial_store86 <- 86
corr_nSales86 <- calculateCorrelation(preTrialMeasure, quote(totSales), trial_store86)
corr_nCustomers86 <- calculateCorrelation(preTrialMeasure, quote(nCustomers), trial_store86)
magnitude_nSales86 <- calculateMagnitudeDistance(preTrialMeasure, quote(totSales), trial_store86)
magnitude_nCustomer86 <- calculateMagnitudeDistance(preTrialMeasure, quote(nCustomers), trial_store86)
corr_weight <- 0.5
score_nSales86 <- merge(corr_nSales86,
magnitude_nCustomer86,
by = c("Store1", "Store2"))[, scoreNSales := (corr_measure * corr_weight) +
(mag_measure * (1-corr_weight))]
score_nCustomer86 <- merge(corr_nCustomers86, magnitude_nCustomer86,
by = c("Store1", "Store2"))[, scoreNCust := (corr_measure * corr_weight) +
(mag_measure * (1-corr_weight))]
score_Control86 <- merge(score_nSales86,
score_nCustomer86,
by = c("Store1", "Store2"))[, finalControlScore := scoreNSales * 0.5 + scoreNCust * 0.5]
Select control stores based on the highest matching store (closest to 1 but not the store itself, i.e. the second ranked highest store). Select control store for trial store 86
control_store86 <- score_Control86[Store1 == trial_store86, ][order(-finalControlScore)][2, Store2]
control_store86
## [1] 155
Looks like store 155 will be a control store for trial store 86. Again, let’s check visually if the drivers are indeed similar in the period before the trial. We’ll look at sales first.
measureOverTimeSales <- measureOverTime
pastSales86 <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store86, "Trial",
ifelse(STORE_NBR == control_store86, "Control", "Other stores"))
][, totSales := mean(totSales),
by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100, YEARMONTH %% 100, 1, sep = "-"),
"%Y-%m-%d")][YEARMONTH < 201903, ]
ggplot(pastSales86, aes(TransactionMonth, totSales, color = Store_type)) +
geom_line(aes(linetype = Store_type)) +
labs(x = "Month", y = "Total sales",
title = "Average total sales by month for stores 86/155 pre-trial")
Let’s check number of customers
measureOverTimeCusts <- measureOverTime
pastCustomers86 <- measureOverTimeCusts[, Store_type := ifelse(STORE_NBR == trial_store86, "Trial",
ifelse(STORE_NBR == control_store86, "Control", "Other stores"))
][, numberCustomers := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100, YEARMONTH %% 100, 1, sep = "-"),
"%Y-%m-%d")][YEARMONTH < 201903, ]
ggplot(pastCustomers86, aes(TransactionMonth, numberCustomers, color = Store_type)) +
geom_line(aes(linetype = Store_type)) +
labs(x = "Month", y = "Number of Customers", title = "Average number of customers by month for stores 86/155")
Good, the trend in number of customers is also similar. Let’s now assess the impact of the trial on sales
scalingFactorForControlSales_86 <- preTrialMeasure[STORE_NBR == trial_store86 &
YEARMONTH < 201902,
sum(totSales)
]/preTrialMeasure[
STORE_NBR == control_store86 &
YEARMONTH < 201902, sum(totSales)]
measureOverTimeSales <- measureOverTime
scaledControlSales86 <- measureOverTimeSales[STORE_NBR == control_store86,
][, controlSales := totSales * scalingFactorForControlSales_86]
percentageDiff86 <- merge(scaledControlSales86[, c("YEARMONTH", "controlSales")],
measureOverTime[STORE_NBR == trial_store86, c("YEARMONTH", "totSales")],
by = "YEARMONTH"
)[, percentageDiff := abs(controlSales - totSales)/controlSales]
Since our null hypothesis is that the trial period is the same as the pre-trial-period,let’s take the standard deviation based on the scaled percentage difference in the pre-trial period.
stdDevSales_86 <- sd(percentageDiff86[, percentageDiff])
degreesOfFreedom <- 7
measureOverTimeSales <- measureOverTime
pastSales86 <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == trial_store86, "Trial",
ifelse(STORE_NBR == control_store86, "Control", "Other stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100,
YEARMONTH %% 100, 1, sep = "-"), "%Y-%m-%d")
][Store_type %in% c("Trial", "Control"), ]
pastSales86_Controls95 <- pastSales86[Store_type == "Control",
][, totSales := totSales * (1 + stdDevSales_86 * 2)
][, Store_type := "Control 95th % confidence interval"]
pastSales86_Controls5 <- pastSales86[Store_type == "Control",
][, totSales := totSales * (1 - stdDevSales_86 * 2)
][, Store_type := "Control 5th % confidence interval"]
trialAssessmentSales86 <- rbind(pastSales86, pastSales86_Controls5, pastSales86_Controls95)
ggplot(trialAssessmentSales86, aes(TransactionMonth, totSales, color = Store_type)) +
geom_rect(data = trialAssessmentSales86[YEARMONTH < 201905 & YEARMONTH > 201901, ],
aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
geom_line(aes(linetype = Store_type)) +
labs(x = "Month", y = "Average Total Sales", title = "Average Total Sales Per Month")
The results show that the trial in store 86 is not significantly different to its control store in the trial period, as the trial store performance lies inside the 5% and 95% confidence interval of the control store in 2 of the 3 trial months.
Let’s look at assessing this for number of customers as well
scalingFactorForControlCust_86 <- preTrialMeasure[STORE_NBR == trial_store86 &
YEARMONTH < 201902, sum(nCustomers)
]/preTrialMeasure[STORE_NBR == control_store86 & YEARMONTH < 201902,
sum(nCustomers)]
measureOverTimeCusts <- measureOverTime
scaledControlCustomers86 <- measureOverTimeCusts[STORE_NBR == control_store86,
][, controlCustomers := nCustomers * scalingFactorForControlCust_86
][, Store_type := ifelse(Store_type == trial_store86, "Trial",
ifelse(Store_type == control_store86,
"Control", "Other stores"))]
percentageDiffCust86 <- merge(scaledControlCustomers86[, c("YEARMONTH", "controlCustomers")],
measureOverTime[STORE_NBR == trial_store77, c("YEARMONTH", "nCustomers")],
by = "YEARMONTH"
)[, percentageDiff := abs(controlCustomers - nCustomers) / controlCustomers]
As our null hypothesis is that the trial period is the same as the pre-trial period, let’s take the standard deviation based on the scaled percentage difference in the pre-trial period
stdDevCust_86 <- sd(percentageDiffCust86[YEARMONTH < 201902, percentageDiff])
degreesOfFreedom <- 7
pastCustomers86 <- measureOverTimeCusts[, nCusts := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][Store_type %in% c("Trial", "Control"), ]
pastCustomers86_Controls95 <- pastCustomers86[Store_type == "Control",
][, nCusts := nCusts * (1 + (stdDevCust_86 * 2))
][, Store_type := "Control 95th % confidence interval"]
pastCustomers86_Controls5 <- pastCustomers86[Store_type == "Control",
][, nCusts := nCusts * (1 - (stdDevCust_86 * 2))
][, Store_type := "Control 5th % confidence interval"]
trialAssessmentCust86 <- rbind(pastCustomers86, pastCustomers86_Controls5, pastCustomers86_Controls95)
ggplot(trialAssessmentCust86, aes(TransactionMonth, nCusts, color = Store_type)) +
geom_rect(data = trialAssessmentCust86[YEARMONTH < 201905 & YEARMONTH > 201901, ],
aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
geom_line() +
labs(x = "Month", y = "Average no. of customers", title = "Average no. of customers per month")
It looks like the number of customers is significantly higher in all of the 3 months. This seems to suggest that the trial had a significant impact on increasing the number of customers in trial store 86, but as we saw earlier, sales were not significantly higher.We should check with the Category Manager if there were special deals in the trial store that may have resulted in lower prices, impacting the results.
measureOverTime <- data[, .(totSales = sum(TOT_SALES),
nCustomers = uniqueN(LYLTY_CARD_NBR),
nTxnPerCust = uniqueN(TXN_ID) / uniqueN(LYLTY_CARD_NBR),
nChipsPerTxn = sum(PROD_QTY) / uniqueN(TXN_ID),
avgPricePerUnit = sum(TOT_SALES) / sum(PROD_QTY)),
by = c("STORE_NBR", "YEARMONTH")][order(STORE_NBR, YEARMONTH)]
trial_store <- 88
corr_nSales <- calculateCorrelation(preTrialMeasure, quote(totSales), trial_store)
corr_nCustomers <- calculateCorrelation(preTrialMeasure, quote(nCustomers), trial_store)
magnitude_nSales <- calculateMagnitudeDistance(preTrialMeasure, quote(totSales), trial_store)
magnitude_nCustomers <- calculateMagnitudeDistance(preTrialMeasure, quote(nCustomers), trial_store)
corr_weight <- 0.5
score_NSales88 <- merge(corr_nSales, magnitude_nSales,
by = c("Store1", "Store2")
)[, scoreNSales := (corr_measure*corr_weight) +
(mag_measure * (1 - corr_weight))]
score_NCustomers88 <- merge(corr_nCustomers, magnitude_nCustomers,
by = c("Store1", "Store2")
)[, scoreNCust :=(corr_measure * corr_weight) +
(mag_measure * (1 - corr_weight))]
score_Control88 <- merge(score_NSales88, score_NCustomers88, by = c("Store1", "Store2"))
score_Control88[, finalControlScore := (scoreNSales * 0.5) + (scoreNCust * 0.5)]
#### Select control stores based on the highest matching store (closest to 1 but not exactly 1)
#### Select control store for trial store 88
control_store88 <- score_Control88[order(-finalControlScore)][2, Store2]
control_store88 # 237
## [1] 237
measureOverTimeSales <- measureOverTime
pastSales88 <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == 88, "Trial",
ifelse(STORE_NBR == control_store88, "Control","Other Stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100,YEARMONTH %% 100, 1,
sep = '-'), "%Y-%m-%d")
][YEARMONTH < 201903, ]
ggplot(pastSales88, aes(TransactionMonth, totSales, color = Store_type)) +
geom_line(aes(linetype = Store_type)) +
labs(x = "Transaction Month", y = "Total sales per month",
title = "Total sales per month Stores 88/237")
measureOverTimeCusts <- measureOverTime
pastCustomers88 <- measureOverTimeCusts[, Store_type := ifelse(STORE_NBR == 88, "Trial",
ifelse(STORE_NBR == control_store88, "Control", "Other Stores"))
][, numberCustomers := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][, TrsancationMonth := as.Date(paste(YEARMONTH %/% 100,
YEARMONTH %% 100, 1, sep = "-"), "%Y-%m-%d")
][YEARMONTH < 201903]
ggplot(pastCustomers88, aes(TransactionMonth, numberCustomers, color = Store_type)) +
geom_line() +
labs(x = "Transaction Month", y = "Total Customers Per Month",
title = "Total Customers Per Month Stores 88/237")
Total number of customers of the control (88) and trial (237) are also similar. Let’s now assess the impact of the trial on sales
scalingFactorForControlSales_88 <- preTrialMeasure[STORE_NBR == 88 &
YEARMONTH < 201902, sum(totSales)]/preTrialMeasure[
STORE_NBR == control_store88 & YEARMONTH < 201902,
sum(totSales)]
measureOverTimeSales <- measureOverTime
scaledControlSales88 <- measureOverTimeSales[STORE_NBR == control_store88,
][, controlSales := totSales * scalingFactorForControlSales_88]
percentageDiff88 <- merge(scaledControlSales88[, c("YEARMONTH", "controlSales")],
measureOverTime[STORE_NBR == 88, c("YEARMONTH", "totSales")], by = "YEARMONTH")
percentageDiff88[, percentageDiff := abs(controlSales - totSales)/controlSales]
As our null hypothesis is that the trial period is the same as the pre-trial period, let’s take the standard deviation based on the scaled percentage difference in the pre-trial period.
stdDevSales_88 <- sd(percentageDiff88[YEARMONTH < 201902, percentageDiff])
degreesOfFreedom <- 7
measureOverTimeSales <- measureOverTime
pastSales88 <- measureOverTimeSales[, Store_type := ifelse(STORE_NBR == 88, "Trial",
ifelse(STORE_NBR == control_store88, "Control", "Other stores"))
][, totSales := mean(totSales), by = c("YEARMONTH", "Store_type")
][, TransactionMonth := as.Date(paste(YEARMONTH %/% 100,
YEARMONTH %% 100, 1, sep = "-"),
"%Y-%m-%d")
][Store_type %in% c("Trial", "Control")]
pastSales88_Controls95 <- pastSales88[Store_type == "Control",
][, totSales := totSales * (1 + stdDevSales_88*2)
][, Store_type := "Control 95th % confidence interval"]
pastSales88_Controls5 <- pastSales88[Store_type == "Control",
][, totSales := totSales * (1 - stdDevSales_88*2)
][, Store_type := "Control 5th % confidence interval"]
trialAssessment88 <- rbind(pastSales88, pastSales88_Controls95, pastSales88_Controls5)
ggplot(trialAssessment88, aes(TransactionMonth, totSales, color = Store_type)) +
geom_rect(data = trialAssessment88[YEARMONTH < 201905 & YEARMONTH > 201901, ],
aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
geom_line(aes(linetype = Store_type)) +
labs(x = "Month of Operation", y = "Total Sales", title = "Total Sales by Month")
The results show that the trial in store 88 is significantly different to its control store in the trial period as the trial store performance lies outside of the 5% and 95% confidence interval of the control store in 2 out of the 3 trial months.Let’s have a look at assessing this for number of customers as well
ScalingFactorForControlCust_88 <- preTrialMeasure[STORE_NBR == 88 &
YEARMONTH < 201902, sum(nCustomers)]/preTrialMeasure[
STORE_NBR == control_store88 & YEARMONTH < 201902,
sum(nCustomers)
]
measureOverTimeCusts <- measureOverTime
scaledControlCustomers88 <- measureOverTimeCusts[STORE_NBR == control_store88,
][, controlCustomers := nCustomers * ScalingFactorForControlCust_88
][, Store_type := ifelse(STORE_NBR == 88, "Trial",
ifelse(STORE_NBR == control_store88, "Control", "Other stores"))]
percentageDiffCust88 <- merge(scaledControlCustomers88[, c("YEARMONTH", "controlCustomers")],
measureOverTime[STORE_NBR == 88, c("YEARMONTH", "nCustomers")],
by = "YEARMONTH")[, percentageDiff :=
abs(controlCustomers - nCustomers) /
controlCustomers]
As our null hypothesis is that the trial period is the same as the pre-trial period, let’s take the standard deviation based on the scaled percentage difference in the pre-trial period
stdDevCust_88 <- sd(percentageDiffCust88[YEARMONTH < 201902, percentageDiff])
degreesOfFreedom <- 7
pastCustomers88 <- measureOverTimeCusts[, nCusts := mean(nCustomers), by = c("YEARMONTH", "Store_type")
][Store_type %in% c("Trial", "Control"), ]
pastCustomers88_Controls95 <- pastCustomers88[Store_type == "Control",
][, nCusts := nCusts * (1 + stdDevCust_88 * 2)
][, Store_type := "Control 95th % confidence interval"]
pastCustomers88_Controls5 <- pastCustomers88[Store_type == "Control",
][, nCusts := nCusts * (1 - stdDevCust_88 * 2)
][, Store_type := "Control 5th % confidence interval"]
trialAssessmentCust88 <- rbind(pastCustomers88, pastCustomers88_Controls5, pastCustomers88_Controls95)
ggplot(trialAssessmentCust88, aes(TransactionMonth, nCusts, color = Store_type)) +
geom_rect(data = trialAssessmentCust88[YEARMONTH < 201905 & YEARMONTH > 201901, ],
aes(xmin = min(TransactionMonth), xmax = max(TransactionMonth),
ymin = 0, ymax = Inf, color = NULL), show.legend = FALSE) +
geom_line() +
labs(x = "Month of Operation", y = "Total Number of Customers", title = "Total
Number of Customers by Month Stores 88/237")
Total number of customers in the trial period for the trial store is significantly high than the control store, for 2 out of 3 months. This indicates a positive trial effect.
We found that for the trial stores 77, 86, 88, the following are the most suitable control stores: 233, 155, 237 respectively.
The results for trial stores 77 and 88 during the trial period, show significant difference in at least 2 of the 3 trial months but this is not the case for trial store 86. We can check with the client if the implementation of the trial was different in trial store 86, but overall, the trial shows a significant increase in sales.
Now that we have finished our analysis, we can prepare our presentation to the Category Manager.