This is a solution to the Kaggle competition titled Rossmann Store Sales - Forecast sales using store, promotion, and competitor data.
“Rossmann operates over 3,000 drug stores in 7 European countries. Currently, Rossmann store managers are tasked with predicting their daily sales for up to six weeks in advance. Store sales are influenced by many factors, including promotions, competition, school and state holidays, seasonality, and locality. With thousands of individual managers predicting sales based on their unique circumstances, the accuracy of results can be quite varied.”
“In their first Kaggle competition, Rossmann is challenging you to predict 6 weeks of daily sales for 1,115 stores located across Germany. Reliable sales forecasts enable store managers to create effective staff schedules that increase productivity and motivation. By helping Rossmann create a robust prediction model, you will help store managers stay focused on what’s most important to them: their customers and their teams!”
trainingData <- read.csv("train.csv")
testingData <- read.csv("test.csv")
storeData <- read.csv("store.csv")
Training data and testing data are processed to be able to analyze the data more easily.
The following fields have been added to training and testing data.
Training and testing data had some inconsistencies and missing values that have been resolved.
# Training Data
## Transformation
trainingData$Date <- as.Date(trainingData$Date)
trainingData$Sales <- as.numeric(trainingData$Sales)
trainingData$Customers <- as.numeric(trainingData$Customers)
trainingData$StateHoliday <- as.factor(trainingData$StateHoliday)
trainingData$SchoolHoliday <- as.factor(trainingData$SchoolHoliday)
## New fields
trainingData$WeekOfYear <- as.numeric(strftime(trainingData$Date, format="%W"))
trainingData$Weekend <- ifelse(trainingData$DayOfWeek %in% 1:5, 0, 1)
trainingData$DateDiff <- as.numeric(trainingData$Date - as.Date("2013-01-01"))
## Data Cleanup - Set Open = 0 when sales <= 0
trainingData[trainingData$Open == 1 & trainingData$Sales <= 0.00, 'Open'] <- 0
# Testing Data
## Transformation
testingData$Date <- as.Date(testingData$Date)
testingData$StateHoliday <- as.factor(testingData$StateHoliday)
testingData$SchoolHoliday <- as.factor(testingData$SchoolHoliday)
## New fields
testingData$WeekOfYear <- as.numeric(strftime(testingData$Date, format="%W"))
testingData$Weekend <- ifelse(testingData$DayOfWeek %in% 1:5, 0, 1)
testingData$DateDiff <- as.numeric(testingData$Date - as.Date("2013-01-01"))
## Data Cleanup
### There are 11 record with an empty Open flag - Set the flag based on the other data
### if more than half of the stores are open on a date for a missing Open flag record
### assume the store is open, otherwise assume it is closed on that day.
missingOpenFlags <- testingData %>%
group_by(Date) %>%
summarize(total.count = n(),
count.open = sum(!is.na(Open) & Open == 1))
testingData <- testingData %>%
inner_join(missingOpenFlags, by = "Date")
testingData$Open <- ifelse(!is.na(testingData$Open), testingData$Open,
ifelse(testingData$count.open >
(testingData$total.count / 2.0), 1, 0))
# Remove the fields that are no longer needed.
testingData <- testingData[, -(12:13)]
The training data is missing data for some stores for some periods of time. In this section the missing data is estimated and using the method described below.
# Get all training dates
allTrainingDates <- unique(trainingData$Date)
# Get all stores
allStores <- unique(trainingData$Store)
# Find all combiniations of dates and stores
allStoresAndDates <- expand.grid(allStores, allTrainingDates)
#assign column names
names(allStoresAndDates) <- c("Store", "Date")
# Find missing Stores and Dates data
missingDatesForStores <- anti_join(allStoresAndDates, trainingData, by = c("Store", "Date"))
# Initialize the imputed data
imputedData <- missingDatesForStores
# Add day of Week and Week of Year
imputedData$DayOfWeek <- as.POSIXlt(imputedData$Date)$wday
# Since the standard for Day of Week does not match what is used in Data we need to adjust the values
# as.POSIXlt(imputedData$Date)$wday returns 0 for Sundays but the data uses 7 for Sundays
imputedData[imputedData$DayOfWeek == 0, 'DayOfWeek'] <- 7
imputedData$WeekOfYear <- as.numeric(strftime(imputedData$Date, format="%W"))
imputedData$Weekend <- ifelse(imputedData$DayOfWeek %in% 1:5, 0, 1)
# Infer the Open / Promo flags from other data
distinctMissingDates <- unique(missingDatesForStores$Date)
missingDateFlags <- trainingData %>%
filter(Date %in% distinctMissingDates) %>%
group_by(Date) %>%
summarize(total.count = n(),
count.open = sum(Open == 1),
count.promo = sum(Promo == 1))
imputedData <- imputedData %>% inner_join(missingDateFlags, by = "Date")
# if more then half stores have the Open and Promo flags set to TRUE
# then set the flag to TRUE for the store.
imputedData$Open <- ifelse(imputedData$count.open > (imputedData$total.count / 2.0), 1, 0)
imputedData$Promo <- ifelse(imputedData$count.promo > (imputedData$total.count / 2.0), 1, 0)
imputedData$Sales <- NA
imputedData$Customers <- NA
imputedData$StateHoliday <- NA
imputedData$SchoolHoliday <- NA
imputedData$DateDiff <- as.numeric(imputedData$Date - as.Date("2013-01-01"))
imputedData$Imputed <- 1
# Remove Extra Columns
imputedData <- imputedData[, -(6:8)]
trainingData$Imputed <- 0
regularSales <-
trainingData[trainingData$Promo == 0 & trainingData$Open == 1 & trainingData$Sales > 0, ]
# Calculate Regular Sales Trend to use in imputing missing data
for(store in storeData$Store) {
coeff <- lm(data = regularSales[regularSales$Store == store, ],
Sales ~ DateDiff)$coefficients
storeData[storeData$Store == store, 'reg_intercept'] <- coeff[1]
storeData[storeData$Store == store, 'reg_slope'] <- coeff[2]
trainingData[trainingData$Store == store, 'LinearRegressionForecast'] <-
coeff[1] + coeff[2] * trainingData[trainingData$Store == store, 'DateDiff']
imputedData[imputedData$Store == store, 'LinearRegressionForecast'] <-
coeff[1] + coeff[2] * imputedData[imputedData$Store == store, 'DateDiff']
}
predictors <- c('Store', 'WeekOfYear', 'DayOfWeek', 'Promo')
modelImput <- trainingData[trainingData$Open == 1, ] %>%
group_by_(.dots=predictors) %>%
summarize(salesMinusForecast=mean(Sales - LinearRegressionForecast)) %>%
ungroup()
saveImputedData <- imputedData
imputedData <- imputedData %>%
left_join(modelImput, by=predictors) %>%
mutate(Sales=salesMinusForecast + LinearRegressionForecast) %>%
select(Store, DayOfWeek, Date, Sales, Customers, Open, Promo,
StateHoliday, SchoolHoliday, WeekOfYear, Weekend, Imputed,
DateDiff, LinearRegressionForecast)
# Set Sales to zero for the dates that the stores are closed
imputedData[imputedData$Open == 0, 'Sales'] <- 0.0
imputedData[!is.na(imputedData$Sales) & imputedData$Sales < 0, 'Sales'] <- 0.0
# There are still many null values for Sales
# This is because we don't have Sales data for all different
# combinations of our predictors 'Store', 'WeekOfYear', 'Weekend', 'Promo'
# To estimate the missing sales we calculate average Promo to regular sales
# for each store and use it to imput for missing store/date sales
# Stores that did not have promotion on a missing sale date but the sale that
# we are trying to estimate is on a promo date
averageSales <- trainingData[trainingData$Open == 1, ] %>%
group_by(Store, Promo) %>%
summarize(meanSales=mean(Sales))
avgSalesRatios <- averageSales %>%
filter(Promo == 0) %>%
select(Store, regSales = meanSales)
avgSalesRatios <- avgSalesRatios %>%
inner_join(averageSales[averageSales$Promo == 1, ], by=c('Store')) %>%
select(Store, regSales, promoSales=meanSales)
avgSalesRatios$Ratio <- avgSalesRatios$promoSales / avgSalesRatios$regSales
avgSalesRatios <- avgSalesRatios[, -(2:3)]
missingRegImputed <- imputedData[is.na(imputedData$Sales) & imputedData$Promo == 0, ]
missingPromoImputed <- imputedData[is.na(imputedData$Sales) & imputedData$Promo == 1, ]
# Estimate the sales - Regular dates
predictors <- c('Store', 'WeekOfYear', 'DayOfWeek') #### Removed Promo
missingRegImputed <- missingRegImputed %>%
left_join(modelImput[modelImput$Promo == 1, ], by=predictors) %>%
inner_join(avgSalesRatios, by=c('Store')) %>%
mutate(Sales=(salesMinusForecast + LinearRegressionForecast) / Ratio) %>%
select(Store, DayOfWeek, Date, Sales, Customers, Open,
Promo=Promo.x, StateHoliday, SchoolHoliday, WeekOfYear,
Weekend, Imputed, DateDiff, LinearRegressionForecast)
missingPromoImputed <- missingPromoImputed %>%
left_join(modelImput[modelImput$Promo == 0, ], by=predictors) %>%
inner_join(avgSalesRatios, by=c('Store')) %>%
mutate(Sales=(salesMinusForecast + LinearRegressionForecast) * Ratio) %>%
select(Store, DayOfWeek, Date, Sales, Customers, Open,
Promo=Promo.x, StateHoliday, SchoolHoliday, WeekOfYear,
Weekend, Imputed, DateDiff, LinearRegressionForecast)
# There is still missing Sales data on 12-31-2014
# This happens because 12-31-2014 was day 3 of week 52.
# we don't have any match for this in our data.
# to imput the missing Sales data for 12-31-2014 just use the Sales figures from
# previous year and adjust it using the sales trend
missingRegImputedException <- missingRegImputed %>%
filter(is.na(Sales)) %>%
inner_join(storeData, by='Store') %>%
select(Store, DayOfWeek, Date, Sales, Customers, Open, Promo,
StateHoliday, SchoolHoliday, WeekOfYear, Weekend, Imputed,
DateDiff, LinearRegressionForecast, reg_slope)
missingRegImputedException <- missingRegImputedException %>%
inner_join(trainingData[, c('Store', 'Sales', 'DateDiff')],
by=c('Store')) %>%
filter(DateDiff.x == 729 & DateDiff.y == 364 ) %>%
mutate(Sales = Sales.y + reg_slope * 365.0) %>%
select(Store, DayOfWeek, Date, Sales, Customers, Open, Promo,
StateHoliday, SchoolHoliday, WeekOfYear, Weekend, Imputed,
DateDiff=DateDiff.x, LinearRegressionForecast)
missingRegImputed <- missingRegImputed %>% filter(!is.na(Sales))
############################################
# There are still 437 records in missingPromoImputed with NA Sales.
# Estimate these missing values based on previous day OR previous week Sales
# Previous day
index <- which(is.na(missingPromoImputed$Sales))
for(i in index) {
iStore <- missingPromoImputed[i, 'Store']
iWeekOfYear <- missingPromoImputed[i, 'WeekOfYear']
iDayOfWeek <- missingPromoImputed[i, 'DayOfWeek']
# 1 - Check to see if we have data for the previous day
iDayOfWeek <- ifelse(iDayOfWeek %in% 2:5, iDayOfWeek - 1, iDayOfWeek)
match = filter(modelImput,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
if(dim(match)[1] <= 0)
{
iDayOfWeek <- missingPromoImputed[i, 'DayOfWeek']
# 2 - Check to see if we have data for the next day
iDayOfWeek <- ifelse(iDayOfWeek %in% 1:4, iDayOfWeek + 1, iDayOfWeek)
match = filter(modelImput,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iDayOfWeek <- missingPromoImputed[i, 'DayOfWeek']
if(dim(match)[1] <= 0)
{
# 3 - Check to see if we have data for the previous Week
iWeekOfYear <- ifelse(iWeekOfYear > 1, iWeekOfYear - 1, iWeekOfYear)
match = filter(modelImput,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- missingPromoImputed[i, 'WeekOfYear']
if(dim(match)[1] <= 0)
{
# 4 - Check to see if we have data for the next Week
iWeekOfYear <- ifelse(iWeekOfYear < 51, iWeekOfYear + 1, iWeekOfYear)
match = filter(modelImput,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- missingPromoImputed[i, 'WeekOfYear']
if(dim(match)[1] <= 0)
{
# 5 - Check to see if we have data for two weeks ago
iWeekOfYear <- ifelse(iWeekOfYear > 2, iWeekOfYear - 2, iWeekOfYear)
match = filter(modelImput,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- missingPromoImputed[i, 'WeekOfYear']
if(dim(match)[1] <= 0)
{
# 6 - Check to see if we have data for two Weeks later
iWeekOfYear <- ifelse(iWeekOfYear < 50, iWeekOfYear + 2, iWeekOfYear)
match = filter(modelImput,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- missingPromoImputed[i, 'WeekOfYear']
if(dim(match)[1] > 0)
{
missingPromoImputed[i, 'Sales'] <-
match[1, 'salesMinusForecast'] +
missingPromoImputed[i, 'LinearRegressionForecast']
if(match[1, 'Promo'] == 0){
missingPromoImputed[i, 'Sales'] <-
missingPromoImputed[i, 'Sales'] *
avgSalesRatios[avgSalesRatios$Store == iStore, 'Ratio']
}
}
}
#
############################################
imputedData <- imputedData %>%
filter(!is.na(Sales)) %>%
select(Store, DayOfWeek, Date, Sales, Customers, Open, Promo,
StateHoliday, SchoolHoliday, WeekOfYear, Weekend, Imputed,
DateDiff, LinearRegressionForecast)
imputedData <- rbind(imputedData, missingRegImputed,
missingPromoImputed,
missingRegImputedException)
combinedTrainingData <- rbind(trainingData, imputedData)
store <- combinedTrainingData[combinedTrainingData$Store == 13, ]
ggplot(store) + aes(Date, Sales, color=Imputed) + geom_line()
store <- combinedTrainingData[combinedTrainingData$Store == 100, ]
ggplot(store) + aes(Date, Sales, color=Imputed) + geom_line()
store <- combinedTrainingData[combinedTrainingData$Store == 920, ]
ggplot(store) + aes(Date, Sales, color=Imputed) + geom_line()
Use a similar approach as in imputing missing data to forecast data for the six weeks period after 2015-07-31. The combined training data and imputed data is used to forecast the sales into the future.
regularSales <- combinedTrainingData[combinedTrainingData$Promo == 0 &
combinedTrainingData$Open == 1, ]
testingForecast <- testingData
# Calculate Regular Sales Trend to use in imputing missing data
for(store in storeData$Store) {
coeff <- lm(data = regularSales[regularSales$Store == store, ],
Sales ~ DateDiff)$coefficients
storeData[storeData$Store == store, 'reg_intercept'] <- coeff[1]
storeData[storeData$Store == store, 'reg_slope'] <- coeff[2]
combinedTrainingData[combinedTrainingData$Store == store, 'LinearRegressionForecast'] <-
coeff[1] + coeff[2] * combinedTrainingData[combinedTrainingData$Store == store, 'DateDiff']
testingForecast[testingForecast$Store == store, 'LinearRegressionForecast'] <-
coeff[1] + coeff[2] * testingForecast[testingForecast$Store == store, 'DateDiff']
}
predictors <- c('Store', 'WeekOfYear', 'DayOfWeek', 'Promo')
modelForecast <- combinedTrainingData[combinedTrainingData$Open == 1, ] %>%
group_by_(.dots=predictors) %>%
summarize(salesMinusForecast=mean(Sales - LinearRegressionForecast)) %>%
ungroup()
testingForecast <- testingForecast %>%
left_join(modelForecast, by=predictors) %>%
mutate(Sales=salesMinusForecast + LinearRegressionForecast) %>%
select(Id, Store, DayOfWeek, Date, Sales, Open, Promo, StateHoliday,
SchoolHoliday, WeekOfYear, Weekend, DateDiff, LinearRegressionForecast)
testingForecast$Imputed <- 0
testingForecast$Forecast <- 1
combinedTrainingData$Forecast <- 0
testingForecast[!is.na(testingForecast$Open) & testingForecast$Open == 0, 'Sales'] <- 0.0
index <- which(is.na(testingForecast$Sales))
for(i in index) {
iStore <- testingForecast[i, 'Store']
iWeekOfYear <- testingForecast[i, 'WeekOfYear']
iDayOfWeek <- testingForecast[i, 'DayOfWeek']
# 1 - Check to see if we have data for a previous day
iDayOfWeek <- ifelse(iDayOfWeek %in% 2:5, iDayOfWeek - 1, iDayOfWeek)
match = filter(modelForecast,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
if(dim(match)[1] <= 0)
{
iDayOfWeek <- testingForecast[i, 'DayOfWeek']
# 2 - Check to see if we have data for a previous day
iDayOfWeek <- ifelse(iDayOfWeek %in% 1:4, iDayOfWeek + 1, iDayOfWeek)
match = filter(modelForecast,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iDayOfWeek <- testingForecast[i, 'DayOfWeek']
if(dim(match)[1] <= 0)
{
# 3 - Check to see if we have data for a previous Week
iWeekOfYear <- ifelse(iWeekOfYear > 1, iWeekOfYear - 1, iWeekOfYear)
match = filter(modelForecast,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- testingForecast[i, 'WeekOfYear']
if(dim(match)[1] <= 0)
{
# 4 - Check to see if we have data for a next Week
iWeekOfYear <- ifelse(iWeekOfYear < 51, iWeekOfYear + 1, iWeekOfYear)
match = filter(modelForecast,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- testingForecast[i, 'WeekOfYear']
if(dim(match)[1] <= 0)
{
# 5 - Check to see if we have data for two weeks ago
iWeekOfYear <- ifelse(iWeekOfYear > 2, iWeekOfYear - 2, iWeekOfYear)
match = filter(modelForecast,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- testingForecast[i, 'WeekOfYear']
if(dim(match)[1] <= 0)
{
# 6 - Check to see if we have data for two Weeks later
iWeekOfYear <- ifelse(iWeekOfYear < 50, iWeekOfYear + 2, iWeekOfYear)
match = filter(modelForecast,
Store == iStore &
WeekOfYear == iWeekOfYear &
DayOfWeek == iDayOfWeek )
}
iWeekOfYear <- testingForecast[i, 'WeekOfYear']
if(dim(match)[1] > 0)
{
testingForecast[i, 'Sales'] <-
match[1, 'salesMinusForecast'] +
testingForecast[i, 'LinearRegressionForecast']
if(match[1, 'Promo'] == 0){
testingForecast[i, 'Sales'] <-
testingForecast[i, 'Sales'] *
avgSalesRatios[avgSalesRatios$Store == iStore, 'Ratio']
}
}
}
combinedTrainingTestingData <- rbind(combinedTrainingData[, c(1:4, 6:15)],
testingForecast[, 2:15])
combinedTrainingTestingData[combinedTrainingTestingData$Imputed == 1,
'Type'] <- "Imputed"
combinedTrainingTestingData[combinedTrainingTestingData$Forecast == 1,
'Type'] <- "Forecast"
combinedTrainingTestingData[combinedTrainingTestingData$Imputed == 0 &
combinedTrainingTestingData$Forecast == 0,
'Type'] <- "Observed"
# Write the forecast into a .csv file
finalForecast <- data.frame(Id=testingForecast$Id, Sales=testingForecast$Sales)
write.csv(finalForecast, "forecast.csv",row.names=F)
store <- combinedTrainingTestingData[combinedTrainingTestingData$Store == 13, ]
ggplot(store) + aes(Date, Sales, color=Type) + geom_line()
store <- combinedTrainingTestingData[combinedTrainingTestingData$Store == 100, ]
ggplot(store) + aes(Date, Sales, color=Type) + geom_line()
store <- combinedTrainingTestingData[combinedTrainingTestingData$Store == 920, ]
ggplot(store) + aes(Date, Sales, color=Type) + geom_line()
store <- combinedTrainingTestingData[combinedTrainingTestingData$Store == 744, ]
ggplot(store) + aes(Date, Sales, color=Type) + geom_line()
store <- combinedTrainingTestingData[combinedTrainingTestingData$Store == 644, ]
ggplot(store) + aes(Date, Sales, color=Type) + geom_line()