setwd("C:/Users/emeka/Documents/Anthonydata")

#Loading neccessary libraries

library(data.table)
library(zoo)
library(forecast)
library(ggplot2)

Load CSV data

train <- fread("train.csv")
test <- fread("test.csv")
store <- fread("store.csv")

This is an exploratory analysis of the Rossmann Store Sales data, available here. Although the data is small, utilising data.table significantly improves performance. It is beneficial to have unmasked data because it allows for some interpretations.

View the structure of the data

str(train)
## Classes 'data.table' and 'data.frame':   1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Date         : IDate, format: "2015-07-31" "2015-07-31" ...
##  $ Sales        : int  5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
##  $ Customers    : int  555 625 821 1498 559 589 1414 833 687 681 ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(test)
## Classes 'data.table' and 'data.frame':   41088 obs. of  8 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Store        : int  1 3 7 8 9 10 11 12 13 14 ...
##  $ DayOfWeek    : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ Date         : IDate, format: "2015-09-17" "2015-09-17" ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
str(store)
## Classes 'data.table' and 'data.frame':   1115 obs. of  10 variables:
##  $ Store                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ StoreType                : chr  "c" "a" "a" "c" ...
##  $ Assortment               : chr  "a" "a" "a" "c" ...
##  $ CompetitionDistance      : int  1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
##  $ CompetitionOpenSinceMonth: int  9 11 12 9 4 12 4 10 8 9 ...
##  $ CompetitionOpenSinceYear : int  2008 2007 2006 2009 2015 2013 2013 2014 2000 2009 ...
##  $ Promo2                   : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ Promo2SinceWeek          : int  NA 13 14 NA NA NA NA NA NA NA ...
##  $ Promo2SinceYear          : int  NA 2010 2011 NA NA NA NA NA NA NA ...
##  $ PromoInterval            : chr  "" "Jan,Apr,Jul,Oct" "Jan,Apr,Jul,Oct" "" ...
##  - attr(*, ".internal.selfref")=<externalptr>

By analysing sales data over time, I noticed that Rossmann stores have higher sales at various times of the year, particularly around holidays and promotional events. This information enables us to better organise our inventory and staffing to match customer demand during peak times, resulting in a good shopping experience. Furthermore, understanding our consumers’ preferences depending on shop type and product assortment allows us to modify our products to better fit their requirements and preferences, resulting in increased sales and customer happiness.

train[, Date := as.Date(Date)]
test[, Date := as.Date(Date)]

This code converts the “Date” column in both the training and testing datasets to the Date format.

test[is.na(Open), Open := 1]

This code fills in missing values in the “Open” column of the test dataset with the value 1.

Data Exploration

# First histogram: Distribution of sales in the "train" dataset
hist(train$Sales, breaks = 100, main = "Distribution of Sales", xlab = "Sales Amount", ylab = "Frequency")

# Second histogram: Average sales per store when the store was not closed
closed_sales <- train[Sales != 0]$Sales
mean_sales_per_store <- aggregate(closed_sales, by = list(train[Sales != 0]$Store), mean)$x
hist(mean_sales_per_store, breaks = 100, main = "Mean Sales per Store (When Open)", xlab = "Average Sales", ylab = "Frequency")

The sales distribution histogram depicts how sales are distributed across different stores, revealing that most stores have sales ranging from 0 to 10,000, with around 15,000 occurrences. Meanwhile, the histogram for average sales per store when open shows diverse store performance, with some continually generating high sales frequency and others declining, indicating areas for improvement or investment.

# Analyze customer distribution
hist(train$Customers, 100)

hist(aggregate(train[Sales != 0]$Customers, by = list(train[Sales != 0]$Store), mean)$x, 100, main = "Mean customers per store when store was not closed")

# Analyze promotional events
ggplot(train, aes(x = factor(Promo), y = Sales)) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(color = "red", outlier.colour = NA, fill = NA)

The analysis of promotional events at Rossmann stores reveals that while promotional events are active, the median sales value is roughly $5,744, with less fluctuation than during non-promotional periods. This shows that promotional activities help stores achieve more consistent and high levels of sales performance.

# Analyze holiday occurrences
table(train$StateHoliday) / nrow(train)
## 
##           0           a           b           c 
## 0.969475300 0.019917244 0.006576820 0.004030637
table(test$StateHoliday) / nrow(test)
## 
##           0           a 
## 0.995619159 0.004380841

The examination of holiday occurrences reveals that the majority of days in the training dataset (about 97%) are non-holiday days (coded as “0” in the StateHoliday column), while the remaining days are divided into three sorts of holidays (“a,” “b,” and “c”). Similarly, in the test sample, almost all days (99.6%) are non-holidays, with only a small number classified as holiday days. This suggests that the dataset is primarily made up of non-holiday days, with holiday occurrences being rather rare.

# Analyze school holiday occurrences
table(train$SchoolHoliday) / nrow(train)
## 
##         0         1 
## 0.8213533 0.1786467
table(test$SchoolHoliday) / nrow(test)
## 
##         0         1 
## 0.5565129 0.4434871

In the training dataset, about 82% of the days are not school holidays, while roughly 18% are school holidays. Similarly, in the test dataset, around 55.7% of the days are not school holidays, and approximately 44.3% are school holidays. This suggests that school holidays have a significant presence in the dataset, potentially influencing store sales and customer behavior.

# Analyze store types
table(store$StoreType)
## 
##   a   b   c   d 
## 602  17 148 348

The analysis of store types indicates that the majority of stores belong to type “a,” followed by type “d,” “c,” and “b.” Store type “a” appears to be the most common, suggesting it might represent a specific category or size of store in the Rossmann dataset. The relatively lower counts of types “b” and “c” imply that these types might correspond to specialized or less common store configurations, while type “d” also exhibits significant representation, possibly indicating another prevalent category within the dataset.

# Analyze assortment types
table(store$Assortment)
## 
##   a   b   c 
## 593   9 513

The analysis of assortment types reveals that the majority of stores have assortment type “c,” followed by type “a” and “b.” Assortment type “c” appears to be the most common, indicating it might represent a specific product assortment strategy adopted by the majority of Rossmann stores in the dataset. Types “a” and “b” exhibit relatively lower counts, suggesting they might represent different or less common assortment strategies within the dataset.

# Analyze Competition Distance
hist(store$CompetitionDistance, breaks = 100, main = "Distribution of Competition Distances", xlab = "Competition Distance", ylab = "Frequency")

# Analyze Promo2
table(store$Promo2)
## 
##   0   1 
## 544 571

It shows that 544 stores do not have Promo2, while 571 stores have Promo2.

# Analyze PromoInterval
table(store$PromoInterval)
## 
##                   Feb,May,Aug,Nov  Jan,Apr,Jul,Oct Mar,Jun,Sept,Dec 
##              544              130              335              106

It indicates that 544 stores have promotions in February, May, August, and November; 130 stores have promotions in January, April, July, and October; and 335 stores have promotions in March, June, September, and December.

# Plot time series for sales
plot(train$Date, train$Sales, type = "l")

plot(test$Date, type = "l")

The plot shows a general tendency of diminishing sales near the end of the examined period, especially after September 15th. This sales reduction could be attributed to a variety of variables, including seasonal changes, alterations in customer behaviour, or specific events affecting store performance.

# Check column names
print(colnames(train))
## [1] "Store"         "DayOfWeek"     "Date"          "Sales"        
## [5] "Customers"     "Open"          "Promo"         "StateHoliday" 
## [9] "SchoolHoliday"
# Verify data loading
print(head(train))
##    Store DayOfWeek       Date Sales Customers  Open Promo StateHoliday
##    <int>     <int>     <Date> <int>     <int> <int> <int>       <char>
## 1:     1         5 2015-07-31  5263       555     1     1            0
## 2:     2         5 2015-07-31  6064       625     1     1            0
## 3:     3         5 2015-07-31  8314       821     1     1            0
## 4:     4         5 2015-07-31 13995      1498     1     1            0
## 5:     5         5 2015-07-31  4822       559     1     1            0
## 6:     6         5 2015-07-31  5651       589     1     1            0
##    SchoolHoliday
##            <int>
## 1:             1
## 2:             1
## 3:             1
## 4:             1
## 5:             1
## 6:             1
# Inspect data structure
str(train)
## Classes 'data.table' and 'data.frame':   1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Date         : Date, format: "2015-07-31" "2015-07-31" ...
##  $ Sales        : int  5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
##  $ Customers    : int  555 625 821 1498 559 589 1414 833 687 681 ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday: int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Merge, Convert, Calculate. and Filter

train <- merge(train, store[, c("Store", "CompetitionOpenSinceYear", "CompetitionOpenSinceMonth")], by = "Store", all.x = TRUE)


train$CompetitionOpenSince <- as.Date(paste(train$CompetitionOpenSinceYear, train$CompetitionOpenSinceMonth, "01", sep = "-"))


train$DaysWithCompetition <- train$Date - train$CompetitionOpenSince


stores_with_competition <- train[!is.na(train$DaysWithCompetition), unique(Store)]

Analyze sales before and after competitor opening for each store

for (store_id in stores_with_competition) {
  store_data <- train[train$Store == store_id, ]
  competition_opening_day <- min(store_data$DaysWithCompetition[!is.na(store_data$DaysWithCompetition)])
  sales_before <- store_data[store_data$DaysWithCompetition < competition_opening_day, "Sales"]
  sales_after <- store_data[store_data$DaysWithCompetition >= competition_opening_day, "Sales"]
  # Perform analysis or visualization here
}

Sales over time

ggplot(train, aes(x = Date, y = Sales)) +
  geom_line() +
  labs(title = "Sales Over Time", x = "Date", y = "Sales")

merged_train <- merge(train, store, by = "Store")


str(merged_train)
## Classes 'data.table' and 'data.frame':   1017209 obs. of  22 variables:
##  $ Store                      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayOfWeek                  : int  5 4 3 2 1 7 6 5 4 3 ...
##  $ Date                       : Date, format: "2015-07-31" "2015-07-30" ...
##  $ Sales                      : int  5263 5020 4782 5011 6102 0 4364 3706 3769 3464 ...
##  $ Customers                  : int  555 546 523 560 612 0 500 459 503 463 ...
##  $ Open                       : int  1 1 1 1 1 0 1 1 1 1 ...
##  $ Promo                      : int  1 1 1 1 1 0 0 0 0 0 ...
##  $ StateHoliday               : chr  "0" "0" "0" "0" ...
##  $ SchoolHoliday              : int  1 1 1 1 1 0 0 0 0 0 ...
##  $ CompetitionOpenSinceYear.x : int  2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
##  $ CompetitionOpenSinceMonth.x: int  9 9 9 9 9 9 9 9 9 9 ...
##  $ CompetitionOpenSince       : Date, format: "2008-09-01" "2008-09-01" ...
##  $ DaysWithCompetition        : 'difftime' num  2524 2523 2522 2521 ...
##   ..- attr(*, "units")= chr "days"
##  $ StoreType                  : chr  "c" "c" "c" "c" ...
##  $ Assortment                 : chr  "a" "a" "a" "a" ...
##  $ CompetitionDistance        : int  1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
##  $ CompetitionOpenSinceMonth.y: int  9 9 9 9 9 9 9 9 9 9 ...
##  $ CompetitionOpenSinceYear.y : int  2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
##  $ Promo2                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Promo2SinceWeek            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Promo2SinceYear            : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ PromoInterval              : chr  "" "" "" "" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "Store"

Boxplot sales by store type

ggplot(merged_train, aes(x = factor(StoreType), y = Sales)) +
  geom_boxplot() +
  labs(title = "Sales Distribution by Store Type", x = "Store Type", y = "Sales")

Scatter sales against number of customers

ggplot(train, aes(x = Customers, y = Sales)) +
  geom_point() +
  labs(title = "Sales vs. Number of Customers", x = "Number of Customers", y = "Sales")

Time Series Analysis

Perform seasonal decomposition of time series (STL)

sales_ts <- ts(train$Sales, start = 1, frequency = 7)  # Assuming weekly frequency, adjust if needed

decomposed_sales <- stl(sales_ts, s.window = "periodic")

Decomposed components

plot(decomposed_sales)

Autocorrelation and Partial Autocorrelation Analysis

acf(train$Sales)

pacf(train$Sales)

Time Series Forecasting (Fit ARIMA model)

arima_model <- auto.arima(train$Sales)
summary(arima_model)
## Series: train$Sales 
## ARIMA(4,1,2) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4      ma1      ma2
##       -0.8333  -0.0814  -0.0381  -0.1059  -0.1125  -0.8287
## s.e.   0.0013   0.0013   0.0013   0.0010   0.0008   0.0008
## 
## sigma^2 = 10348895:  log likelihood = -9658525
## AIC=19317063   AICc=19317063   BIC=19317146
## 
## Training set error measures:
##                        ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set -0.003970651 3216.959 2353.461 -Inf  Inf 0.7911909 0.007341639

The ARIMA(4,1,2) model is critical for making sound decisions about sales forecasting and resource allocation. Here’s how to understand the observations in a business context:

sales_ts <- ts(train$Sales, start = 1, frequency = 7)  


holt_winters_model <- HoltWinters(sales_ts)


print(holt_winters_model)
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = sales_ts)
## 
## Smoothing parameters:
##  alpha: 0.04267568
##  beta : 0.001436316
##  gamma: 0.1548731
## 
## Coefficients:
##             [,1]
## a  -50244.273162
## b      -4.343859
## s1  54850.591511
## s2  49771.944175
## s3  55170.443286
## s4  54758.380219
## s5  54557.144938
## s6  54256.196728
## s7  53798.492008

The Holt-Winters exponential smoothing approach provides vital information about the underlying trends and seasonal patterns in the sales data.

The smoothing parameters (alpha, beta, and gamma) describe the degree of responsiveness to new data points, trend changes, and seasonal variations, respectively. The coefficients (a, b, s1–s7) reflect the baseline level, trend component, and seasonal components for each time period (e.g., week) in the data. These insights provide a more detailed knowledge of how sales fluctuate over time, including both short-term trends and longer-term seasonal patterns that can be useful for forecasting and decision-making.

Feature Engineering (Lagged sales)

train$lagged_sales <- lag(train$Sales, k = 7)  # Lagged by one week (7 days)

Rolling averages

train$rolling_mean <- rollmean(train$Sales, k = 7, fill = NA)

Seasonality indicators

train$month <- month(train$Date)
train$quarter <- quarter(train$Date)

Customer segmentation based on average transaction value

customer_segment <- ifelse(train$Sales >= 5000, "High Value", "Low Value")

Distribution of customer segments

ggplot(data = train, aes(x = customer_segment)) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(title = "Customer Segmentation based on Average Transaction Value", x = "Customer Segment", y = "Count") +
  scale_y_continuous(labels = scales::comma)

Promotion Effectiveness (sales during promotion periods)

promo_sales <- subset(train, Promo == 1)$Sales
non_promo_sales <- subset(train, Promo == 0)$Sales

Comparison of Sales Density During Promotion and Non-Promotion Periods

promo_data <- data.frame(Sales = promo_sales, Period = "Promotion")
non_promo_data <- data.frame(Sales = non_promo_sales, Period = "Non-Promotion")
combined_data <- rbind(promo_data, non_promo_data)

ggplot(combined_data, aes(x = Sales, fill = Period)) +
  geom_density(alpha = 0.5) +
  labs(title = "Comparison of Sales Density During Promotion and Non-Promotion Periods", x = "Sales", y = "Density") +
  scale_fill_manual(values = c("Promotion" = "skyblue", "Non-Promotion" = "pink"))+
  scale_y_continuous(labels = scales::comma)

diff_sales <- diff(train$Sales)
lagged_sales <- train$lagged_sales[-length(train$lagged_sales)]
print(length(diff_sales))
## [1] 1017208
print(length(lagged_sales))
## [1] 1017208
sales_growth_rate <- diff_sales / lagged_sales

Store Performance Analysis (sales growth rate for each store)

sales_growth_rate <- diff(train$Sales) / train$lagged_sales[-1]

Sales growth rate over time

missing_values <- is.na(train$Date[-1]) | is.na(sales_growth_rate)


cleaned_date <- train$Date[-1][!missing_values]
cleaned_growth_rate <- sales_growth_rate[!missing_values]


train$Date <- as.Date(train$Date)


invalid_dates <- train$Date < as.Date("2013-01-01") | train$Date > as.Date("2015-07-26")
train$Date[invalid_dates] <- NA


train$Date[is.na(train$Date)] <- mean(train$Date, na.rm = TRUE)


remaining_invalid_dates <- train$Date < as.Date("2013-01-01") | train$Date > as.Date("2015-07-26")

if (any(remaining_invalid_dates)) {
  warning("There are still invalid dates remaining after cleaning.")
}


sales_growth_ts <- ts(sales_growth_rate, start = 1, end = length(sales_growth_rate))


plot(train$Date[-1], sales_growth_rate, type = "l", xlab = "Date", ylab = "Sales Growth Rate", 
     main = "Sales Growth Rate Over Time", col = "blue", lwd = 2)


abline(h = 0, col = "red", lty = 2)


cleaned_date <- cleaned_date[!is.na(cleaned_growth_rate) & is.finite(cleaned_growth_rate)]
cleaned_growth_rate <- cleaned_growth_rate[!is.na(cleaned_growth_rate) & is.finite(cleaned_growth_rate)]

if (length(cleaned_date) > 0) {

  lines(loess.smooth(cleaned_date, cleaned_growth_rate), col = "orange", lwd = 2)
} else {
  warning("No valid data points for LOESS smoothing.")
}

legend("topright", legend = c("Sales Growth Rate", "Trend Line", "Zero Growth"), 
       col = c("blue", "orange", "red"), lty = c(1, 1, 2), lwd = c(2, 2, 1))



title(main = "Sales Growth Rate Over Time", sub = "Trend line represents smoothed growth trend", 
      cex.main = 1.2, cex.sub = 0.8)