setwd("C:/Users/emeka/Documents/Anthonydata")
#Loading neccessary libraries
library(data.table)
library(zoo)
library(forecast)
library(ggplot2)
train <- fread("train.csv")
test <- fread("test.csv")
store <- fread("store.csv")
This is an exploratory analysis of the Rossmann Store Sales data, available here. Although the data is small, utilising data.table significantly improves performance. It is beneficial to have unmasked data because it allows for some interpretations.
str(train)
## Classes 'data.table' and 'data.frame': 1017209 obs. of 9 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Date : IDate, format: "2015-07-31" "2015-07-31" ...
## $ Sales : int 5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
## $ Customers : int 555 625 821 1498 559 589 1414 833 687 681 ...
## $ Open : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Promo : int 1 1 1 1 1 1 1 1 1 1 ...
## $ StateHoliday : chr "0" "0" "0" "0" ...
## $ SchoolHoliday: int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(test)
## Classes 'data.table' and 'data.frame': 41088 obs. of 8 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Store : int 1 3 7 8 9 10 11 12 13 14 ...
## $ DayOfWeek : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Date : IDate, format: "2015-09-17" "2015-09-17" ...
## $ Open : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Promo : int 1 1 1 1 1 1 1 1 1 1 ...
## $ StateHoliday : chr "0" "0" "0" "0" ...
## $ SchoolHoliday: int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
str(store)
## Classes 'data.table' and 'data.frame': 1115 obs. of 10 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ StoreType : chr "c" "a" "a" "c" ...
## $ Assortment : chr "a" "a" "a" "c" ...
## $ CompetitionDistance : int 1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
## $ CompetitionOpenSinceMonth: int 9 11 12 9 4 12 4 10 8 9 ...
## $ CompetitionOpenSinceYear : int 2008 2007 2006 2009 2015 2013 2013 2014 2000 2009 ...
## $ Promo2 : int 0 1 1 0 0 0 0 0 0 0 ...
## $ Promo2SinceWeek : int NA 13 14 NA NA NA NA NA NA NA ...
## $ Promo2SinceYear : int NA 2010 2011 NA NA NA NA NA NA NA ...
## $ PromoInterval : chr "" "Jan,Apr,Jul,Oct" "Jan,Apr,Jul,Oct" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
By analysing sales data over time, I noticed that Rossmann stores have higher sales at various times of the year, particularly around holidays and promotional events. This information enables us to better organise our inventory and staffing to match customer demand during peak times, resulting in a good shopping experience. Furthermore, understanding our consumers’ preferences depending on shop type and product assortment allows us to modify our products to better fit their requirements and preferences, resulting in increased sales and customer happiness.
train[, Date := as.Date(Date)]
test[, Date := as.Date(Date)]
This code converts the “Date” column in both the training and testing datasets to the Date format.
test[is.na(Open), Open := 1]
This code fills in missing values in the “Open” column of the test dataset with the value 1.
# First histogram: Distribution of sales in the "train" dataset
hist(train$Sales, breaks = 100, main = "Distribution of Sales", xlab = "Sales Amount", ylab = "Frequency")
# Second histogram: Average sales per store when the store was not closed
closed_sales <- train[Sales != 0]$Sales
mean_sales_per_store <- aggregate(closed_sales, by = list(train[Sales != 0]$Store), mean)$x
hist(mean_sales_per_store, breaks = 100, main = "Mean Sales per Store (When Open)", xlab = "Average Sales", ylab = "Frequency")
The sales distribution histogram depicts how sales are distributed across different stores, revealing that most stores have sales ranging from 0 to 10,000, with around 15,000 occurrences. Meanwhile, the histogram for average sales per store when open shows diverse store performance, with some continually generating high sales frequency and others declining, indicating areas for improvement or investment.
# Analyze customer distribution
hist(train$Customers, 100)
hist(aggregate(train[Sales != 0]$Customers, by = list(train[Sales != 0]$Store), mean)$x, 100, main = "Mean customers per store when store was not closed")
# Analyze promotional events
ggplot(train, aes(x = factor(Promo), y = Sales)) +
geom_jitter(alpha = 0.1) +
geom_boxplot(color = "red", outlier.colour = NA, fill = NA)
The analysis of promotional events at Rossmann stores reveals that while promotional events are active, the median sales value is roughly $5,744, with less fluctuation than during non-promotional periods. This shows that promotional activities help stores achieve more consistent and high levels of sales performance.
# Analyze holiday occurrences
table(train$StateHoliday) / nrow(train)
##
## 0 a b c
## 0.969475300 0.019917244 0.006576820 0.004030637
table(test$StateHoliday) / nrow(test)
##
## 0 a
## 0.995619159 0.004380841
The examination of holiday occurrences reveals that the majority of days in the training dataset (about 97%) are non-holiday days (coded as “0” in the StateHoliday column), while the remaining days are divided into three sorts of holidays (“a,” “b,” and “c”). Similarly, in the test sample, almost all days (99.6%) are non-holidays, with only a small number classified as holiday days. This suggests that the dataset is primarily made up of non-holiday days, with holiday occurrences being rather rare.
# Analyze school holiday occurrences
table(train$SchoolHoliday) / nrow(train)
##
## 0 1
## 0.8213533 0.1786467
table(test$SchoolHoliday) / nrow(test)
##
## 0 1
## 0.5565129 0.4434871
In the training dataset, about 82% of the days are not school holidays, while roughly 18% are school holidays. Similarly, in the test dataset, around 55.7% of the days are not school holidays, and approximately 44.3% are school holidays. This suggests that school holidays have a significant presence in the dataset, potentially influencing store sales and customer behavior.
# Analyze store types
table(store$StoreType)
##
## a b c d
## 602 17 148 348
The analysis of store types indicates that the majority of stores belong to type “a,” followed by type “d,” “c,” and “b.” Store type “a” appears to be the most common, suggesting it might represent a specific category or size of store in the Rossmann dataset. The relatively lower counts of types “b” and “c” imply that these types might correspond to specialized or less common store configurations, while type “d” also exhibits significant representation, possibly indicating another prevalent category within the dataset.
# Analyze assortment types
table(store$Assortment)
##
## a b c
## 593 9 513
The analysis of assortment types reveals that the majority of stores have assortment type “c,” followed by type “a” and “b.” Assortment type “c” appears to be the most common, indicating it might represent a specific product assortment strategy adopted by the majority of Rossmann stores in the dataset. Types “a” and “b” exhibit relatively lower counts, suggesting they might represent different or less common assortment strategies within the dataset.
# Analyze Competition Distance
hist(store$CompetitionDistance, breaks = 100, main = "Distribution of Competition Distances", xlab = "Competition Distance", ylab = "Frequency")
# Analyze Promo2
table(store$Promo2)
##
## 0 1
## 544 571
It shows that 544 stores do not have Promo2, while 571 stores have Promo2.
# Analyze PromoInterval
table(store$PromoInterval)
##
## Feb,May,Aug,Nov Jan,Apr,Jul,Oct Mar,Jun,Sept,Dec
## 544 130 335 106
It indicates that 544 stores have promotions in February, May, August, and November; 130 stores have promotions in January, April, July, and October; and 335 stores have promotions in March, June, September, and December.
# Plot time series for sales
plot(train$Date, train$Sales, type = "l")
plot(test$Date, type = "l")
The plot shows a general tendency of diminishing sales near the end of the examined period, especially after September 15th. This sales reduction could be attributed to a variety of variables, including seasonal changes, alterations in customer behaviour, or specific events affecting store performance.
# Check column names
print(colnames(train))
## [1] "Store" "DayOfWeek" "Date" "Sales"
## [5] "Customers" "Open" "Promo" "StateHoliday"
## [9] "SchoolHoliday"
# Verify data loading
print(head(train))
## Store DayOfWeek Date Sales Customers Open Promo StateHoliday
## <int> <int> <Date> <int> <int> <int> <int> <char>
## 1: 1 5 2015-07-31 5263 555 1 1 0
## 2: 2 5 2015-07-31 6064 625 1 1 0
## 3: 3 5 2015-07-31 8314 821 1 1 0
## 4: 4 5 2015-07-31 13995 1498 1 1 0
## 5: 5 5 2015-07-31 4822 559 1 1 0
## 6: 6 5 2015-07-31 5651 589 1 1 0
## SchoolHoliday
## <int>
## 1: 1
## 2: 1
## 3: 1
## 4: 1
## 5: 1
## 6: 1
# Inspect data structure
str(train)
## Classes 'data.table' and 'data.frame': 1017209 obs. of 9 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Date : Date, format: "2015-07-31" "2015-07-31" ...
## $ Sales : int 5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
## $ Customers : int 555 625 821 1498 559 589 1414 833 687 681 ...
## $ Open : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Promo : int 1 1 1 1 1 1 1 1 1 1 ...
## $ StateHoliday : chr "0" "0" "0" "0" ...
## $ SchoolHoliday: int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
train <- merge(train, store[, c("Store", "CompetitionOpenSinceYear", "CompetitionOpenSinceMonth")], by = "Store", all.x = TRUE)
train$CompetitionOpenSince <- as.Date(paste(train$CompetitionOpenSinceYear, train$CompetitionOpenSinceMonth, "01", sep = "-"))
train$DaysWithCompetition <- train$Date - train$CompetitionOpenSince
stores_with_competition <- train[!is.na(train$DaysWithCompetition), unique(Store)]
for (store_id in stores_with_competition) {
store_data <- train[train$Store == store_id, ]
competition_opening_day <- min(store_data$DaysWithCompetition[!is.na(store_data$DaysWithCompetition)])
sales_before <- store_data[store_data$DaysWithCompetition < competition_opening_day, "Sales"]
sales_after <- store_data[store_data$DaysWithCompetition >= competition_opening_day, "Sales"]
# Perform analysis or visualization here
}
ggplot(train, aes(x = Date, y = Sales)) +
geom_line() +
labs(title = "Sales Over Time", x = "Date", y = "Sales")
merged_train <- merge(train, store, by = "Store")
str(merged_train)
## Classes 'data.table' and 'data.frame': 1017209 obs. of 22 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayOfWeek : int 5 4 3 2 1 7 6 5 4 3 ...
## $ Date : Date, format: "2015-07-31" "2015-07-30" ...
## $ Sales : int 5263 5020 4782 5011 6102 0 4364 3706 3769 3464 ...
## $ Customers : int 555 546 523 560 612 0 500 459 503 463 ...
## $ Open : int 1 1 1 1 1 0 1 1 1 1 ...
## $ Promo : int 1 1 1 1 1 0 0 0 0 0 ...
## $ StateHoliday : chr "0" "0" "0" "0" ...
## $ SchoolHoliday : int 1 1 1 1 1 0 0 0 0 0 ...
## $ CompetitionOpenSinceYear.x : int 2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
## $ CompetitionOpenSinceMonth.x: int 9 9 9 9 9 9 9 9 9 9 ...
## $ CompetitionOpenSince : Date, format: "2008-09-01" "2008-09-01" ...
## $ DaysWithCompetition : 'difftime' num 2524 2523 2522 2521 ...
## ..- attr(*, "units")= chr "days"
## $ StoreType : chr "c" "c" "c" "c" ...
## $ Assortment : chr "a" "a" "a" "a" ...
## $ CompetitionDistance : int 1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
## $ CompetitionOpenSinceMonth.y: int 9 9 9 9 9 9 9 9 9 9 ...
## $ CompetitionOpenSinceYear.y : int 2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
## $ Promo2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Promo2SinceWeek : int NA NA NA NA NA NA NA NA NA NA ...
## $ Promo2SinceYear : int NA NA NA NA NA NA NA NA NA NA ...
## $ PromoInterval : chr "" "" "" "" ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "Store"
ggplot(merged_train, aes(x = factor(StoreType), y = Sales)) +
geom_boxplot() +
labs(title = "Sales Distribution by Store Type", x = "Store Type", y = "Sales")
ggplot(train, aes(x = Customers, y = Sales)) +
geom_point() +
labs(title = "Sales vs. Number of Customers", x = "Number of Customers", y = "Sales")
Perform seasonal decomposition of time series (STL)
sales_ts <- ts(train$Sales, start = 1, frequency = 7) # Assuming weekly frequency, adjust if needed
decomposed_sales <- stl(sales_ts, s.window = "periodic")
plot(decomposed_sales)
acf(train$Sales)
pacf(train$Sales)
arima_model <- auto.arima(train$Sales)
summary(arima_model)
## Series: train$Sales
## ARIMA(4,1,2)
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2
## -0.8333 -0.0814 -0.0381 -0.1059 -0.1125 -0.8287
## s.e. 0.0013 0.0013 0.0013 0.0010 0.0008 0.0008
##
## sigma^2 = 10348895: log likelihood = -9658525
## AIC=19317063 AICc=19317063 BIC=19317146
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.003970651 3216.959 2353.461 -Inf Inf 0.7911909 0.007341639
The ARIMA(4,1,2) model is critical for making sound decisions about sales forecasting and resource allocation. Here’s how to understand the observations in a business context:
Model Coefficients: The coefficients (ar1, ar2, ar3, ar4, ma1, ma2) represent the intensity and direction of the association between previous sales and future sales projections. This data assists in determining which elements have the greatest influence on sales patterns and can be used to guide strategic decision-making, such as marketing campaigns and inventory management.
Understanding the variability of residuals (sigma^2) offers insight into the stability and consistency of sales data. A lower sigma^2 indicates a more reliable forecasting tool since the model’s forecasts closely align with actual sales data.
AIC and BIC measure model fit and complexity. The AIC, AICc, and BIC metrics are used to assess the trade-off between model complexity and goodness of fit. Lower numbers correspond to better-fitting models with fewer parameters, which are preferable for precise and efficient forecasting.
Training Set Error Measures: The error metrics (ME, RMSE, MAE, MPE, MAPE, MASE, and ACF1) evaluate the model’s accuracy and performance on the training dataset. These measures assist quantify the magnitude and direction of prediction mistakes, giving insights into the model’s reliability and potential areas for improvement.
sales_ts <- ts(train$Sales, start = 1, frequency = 7)
holt_winters_model <- HoltWinters(sales_ts)
print(holt_winters_model)
## Holt-Winters exponential smoothing with trend and additive seasonal component.
##
## Call:
## HoltWinters(x = sales_ts)
##
## Smoothing parameters:
## alpha: 0.04267568
## beta : 0.001436316
## gamma: 0.1548731
##
## Coefficients:
## [,1]
## a -50244.273162
## b -4.343859
## s1 54850.591511
## s2 49771.944175
## s3 55170.443286
## s4 54758.380219
## s5 54557.144938
## s6 54256.196728
## s7 53798.492008
The Holt-Winters exponential smoothing approach provides vital information about the underlying trends and seasonal patterns in the sales data.
The smoothing parameters (alpha, beta, and gamma) describe the degree of responsiveness to new data points, trend changes, and seasonal variations, respectively. The coefficients (a, b, s1–s7) reflect the baseline level, trend component, and seasonal components for each time period (e.g., week) in the data. These insights provide a more detailed knowledge of how sales fluctuate over time, including both short-term trends and longer-term seasonal patterns that can be useful for forecasting and decision-making.
Feature Engineering (Lagged sales)
train$lagged_sales <- lag(train$Sales, k = 7) # Lagged by one week (7 days)
Rolling averages
train$rolling_mean <- rollmean(train$Sales, k = 7, fill = NA)
Seasonality indicators
train$month <- month(train$Date)
train$quarter <- quarter(train$Date)
customer_segment <- ifelse(train$Sales >= 5000, "High Value", "Low Value")
ggplot(data = train, aes(x = customer_segment)) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Customer Segmentation based on Average Transaction Value", x = "Customer Segment", y = "Count") +
scale_y_continuous(labels = scales::comma)
Promotion Effectiveness (sales during promotion periods)
promo_sales <- subset(train, Promo == 1)$Sales
non_promo_sales <- subset(train, Promo == 0)$Sales
promo_data <- data.frame(Sales = promo_sales, Period = "Promotion")
non_promo_data <- data.frame(Sales = non_promo_sales, Period = "Non-Promotion")
combined_data <- rbind(promo_data, non_promo_data)
ggplot(combined_data, aes(x = Sales, fill = Period)) +
geom_density(alpha = 0.5) +
labs(title = "Comparison of Sales Density During Promotion and Non-Promotion Periods", x = "Sales", y = "Density") +
scale_fill_manual(values = c("Promotion" = "skyblue", "Non-Promotion" = "pink"))+
scale_y_continuous(labels = scales::comma)
diff_sales <- diff(train$Sales)
lagged_sales <- train$lagged_sales[-length(train$lagged_sales)]
print(length(diff_sales))
## [1] 1017208
print(length(lagged_sales))
## [1] 1017208
sales_growth_rate <- diff_sales / lagged_sales
sales_growth_rate <- diff(train$Sales) / train$lagged_sales[-1]
missing_values <- is.na(train$Date[-1]) | is.na(sales_growth_rate)
cleaned_date <- train$Date[-1][!missing_values]
cleaned_growth_rate <- sales_growth_rate[!missing_values]
train$Date <- as.Date(train$Date)
invalid_dates <- train$Date < as.Date("2013-01-01") | train$Date > as.Date("2015-07-26")
train$Date[invalid_dates] <- NA
train$Date[is.na(train$Date)] <- mean(train$Date, na.rm = TRUE)
remaining_invalid_dates <- train$Date < as.Date("2013-01-01") | train$Date > as.Date("2015-07-26")
if (any(remaining_invalid_dates)) {
warning("There are still invalid dates remaining after cleaning.")
}
sales_growth_ts <- ts(sales_growth_rate, start = 1, end = length(sales_growth_rate))
plot(train$Date[-1], sales_growth_rate, type = "l", xlab = "Date", ylab = "Sales Growth Rate",
main = "Sales Growth Rate Over Time", col = "blue", lwd = 2)
abline(h = 0, col = "red", lty = 2)
cleaned_date <- cleaned_date[!is.na(cleaned_growth_rate) & is.finite(cleaned_growth_rate)]
cleaned_growth_rate <- cleaned_growth_rate[!is.na(cleaned_growth_rate) & is.finite(cleaned_growth_rate)]
if (length(cleaned_date) > 0) {
lines(loess.smooth(cleaned_date, cleaned_growth_rate), col = "orange", lwd = 2)
} else {
warning("No valid data points for LOESS smoothing.")
}
legend("topright", legend = c("Sales Growth Rate", "Trend Line", "Zero Growth"),
col = c("blue", "orange", "red"), lty = c(1, 1, 2), lwd = c(2, 2, 1))
title(main = "Sales Growth Rate Over Time", sub = "Trend line represents smoothed growth trend",
cex.main = 1.2, cex.sub = 0.8)