INTRODUCTION
Rossmann is a Germany drug store chain with over 3790 stores in Europe. In this problem set obtained from Kaggle.com, participants are given records of sales of each store on different days, from 01/01/2013 to 31/07/2015. Other business factors such as holidays, promotions and competitions. Our goal is to forecast the sales of the store 6 weeks in advance, from 08/01/2015 to 09/17/2015 based on the data fields provided. We apply the Random Forest method to build a model that predict sales for these 6 weeks. Our work is based on Darius Barušauskas’s script: https://www.kaggle.com/raddar/random-forest-example.
library(readr)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(678)
Read the train, test and store data
train <- read_csv("train.csv")
## Parsed with column specification:
## cols(
## Store = col_double(),
## DayOfWeek = col_double(),
## Date = col_character(),
## Sales = col_double(),
## Customers = col_double(),
## Open = col_double(),
## Promo = col_double(),
## StateHoliday = col_double(),
## SchoolHoliday = col_double()
## )
## Warning: 31050 parsing failures.
## row col expected actual file
## 63556 StateHoliday a double a 'train.csv'
## 63558 StateHoliday a double a 'train.csv'
## 63560 StateHoliday a double a 'train.csv'
## 63561 StateHoliday a double a 'train.csv'
## 63564 StateHoliday a double a 'train.csv'
## ..... ............ ........ ...... ...........
## See problems(...) for more details.
test <- read_csv("test.csv")
## Parsed with column specification:
## cols(
## Id = col_double(),
## Store = col_double(),
## DayOfWeek = col_double(),
## Date = col_date(format = ""),
## Open = col_double(),
## Promo = col_double(),
## StateHoliday = col_double(),
## SchoolHoliday = col_double()
## )
## Warning: 180 parsing failures.
## row col expected actual file
## 28257 StateHoliday a double a 'test.csv'
## 28262 StateHoliday a double a 'test.csv'
## 28264 StateHoliday a double a 'test.csv'
## 28272 StateHoliday a double a 'test.csv'
## 28275 StateHoliday a double a 'test.csv'
## ..... ............ ........ ...... ..........
## See problems(...) for more details.
store <- read_csv("store.csv")
## Parsed with column specification:
## cols(
## Store = col_double(),
## StoreType = col_character(),
## Assortment = col_character(),
## CompetitionDistance = col_double(),
## CompetitionOpenSinceMonth = col_double(),
## CompetitionOpenSinceYear = col_double(),
## Promo2 = col_double(),
## Promo2SinceWeek = col_double(),
## Promo2SinceYear = col_double(),
## PromoInterval = col_character()
## )
Merge train and test data with store
train <- merge(train,store)
test <- merge(test,store)
Change all NAs to zeros
train[is.na(train)] <- 0
test[is.na(test)] <- 0
Look at train data’s column names and details
names(train)
## [1] "Store" "DayOfWeek"
## [3] "Date" "Sales"
## [5] "Customers" "Open"
## [7] "Promo" "StateHoliday"
## [9] "SchoolHoliday" "StoreType"
## [11] "Assortment" "CompetitionDistance"
## [13] "CompetitionOpenSinceMonth" "CompetitionOpenSinceYear"
## [15] "Promo2" "Promo2SinceWeek"
## [17] "Promo2SinceYear" "PromoInterval"
str(train)
## 'data.frame': 1017209 obs. of 18 variables:
## $ Store : num 1 1 1 1 1 1 1 1 1 1 ...
## $ DayOfWeek : num 5 6 5 3 3 7 3 1 5 1 ...
## $ Date : chr "7/31/2015" "1/12/2013" "1/3/2014" "12/3/2014" ...
## $ Sales : num 5263 4952 4190 6454 3310 ...
## $ Customers : num 555 646 552 695 464 0 453 542 466 480 ...
## $ Open : num 1 1 1 1 1 0 1 1 1 1 ...
## $ Promo : num 1 0 0 1 0 0 0 1 0 0 ...
## $ StateHoliday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SchoolHoliday : num 1 0 1 0 0 0 0 0 0 0 ...
## $ StoreType : chr "c" "c" "c" "c" ...
## $ Assortment : chr "a" "a" "a" "a" ...
## $ CompetitionDistance : num 1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
## $ CompetitionOpenSinceMonth: num 9 9 9 9 9 9 9 9 9 9 ...
## $ CompetitionOpenSinceYear : num 2008 2008 2008 2008 2008 ...
## $ Promo2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Promo2SinceWeek : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Promo2SinceYear : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PromoInterval : chr "0" "0" "0" "0" ...
summary(train)
## Store DayOfWeek Date Sales
## Min. : 1.0 Min. :1.000 Length:1017209 Min. : 0
## 1st Qu.: 280.0 1st Qu.:2.000 Class :character 1st Qu.: 3727
## Median : 558.0 Median :4.000 Mode :character Median : 5744
## Mean : 558.4 Mean :3.998 Mean : 5774
## 3rd Qu.: 838.0 3rd Qu.:6.000 3rd Qu.: 7856
## Max. :1115.0 Max. :7.000 Max. :41551
## Customers Open Promo StateHoliday
## Min. : 0.0 Min. :0.0000 Min. :0.0000 Min. :0
## 1st Qu.: 405.0 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0
## Median : 609.0 Median :1.0000 Median :0.0000 Median :0
## Mean : 633.1 Mean :0.8301 Mean :0.3815 Mean :0
## 3rd Qu.: 837.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0
## Max. :7388.0 Max. :1.0000 Max. :1.0000 Max. :0
## SchoolHoliday StoreType Assortment CompetitionDistance
## Min. :0.0000 Length:1017209 Length:1017209 Min. : 0
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.: 700
## Median :0.0000 Mode :character Mode :character Median : 2320
## Mean :0.1786 Mean : 5416
## 3rd Qu.:0.0000 3rd Qu.: 6880
## Max. :1.0000 Max. :75860
## CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2
## Min. : 0.000 Min. : 0 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.: 0 1st Qu.:0.0000
## Median : 4.000 Median :2006 Median :1.0000
## Mean : 4.927 Mean :1370 Mean :0.5006
## 3rd Qu.: 9.000 3rd Qu.:2011 3rd Qu.:1.0000
## Max. :12.000 Max. :2015 Max. :1.0000
## Promo2SinceWeek Promo2SinceYear PromoInterval
## Min. : 0.00 Min. : 0 Length:1017209
## 1st Qu.: 0.00 1st Qu.: 0 Class :character
## Median : 1.00 Median :2009 Mode :character
## Mean :11.65 Mean :1007
## 3rd Qu.:22.00 3rd Qu.:2012
## Max. :50.00 Max. :2015
Look at test data’s column names and details
names(test)
## [1] "Store" "Id"
## [3] "DayOfWeek" "Date"
## [5] "Open" "Promo"
## [7] "StateHoliday" "SchoolHoliday"
## [9] "StoreType" "Assortment"
## [11] "CompetitionDistance" "CompetitionOpenSinceMonth"
## [13] "CompetitionOpenSinceYear" "Promo2"
## [15] "Promo2SinceWeek" "Promo2SinceYear"
## [17] "PromoInterval"
str(test)
## 'data.frame': 41088 obs. of 17 variables:
## $ Store : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Id : num 1 24825 5993 37665 18833 ...
## $ DayOfWeek : num 4 3 4 2 3 3 2 3 7 1 ...
## $ Date : Date, format: "2015-09-17" "2015-08-19" ...
## $ Open : num 1 1 1 1 1 1 1 1 0 1 ...
## $ Promo : num 1 1 0 1 0 1 0 1 0 1 ...
## $ StateHoliday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SchoolHoliday : num 0 1 0 1 1 1 1 0 1 1 ...
## $ StoreType : chr "c" "c" "c" "c" ...
## $ Assortment : chr "a" "a" "a" "a" ...
## $ CompetitionDistance : num 1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
## $ CompetitionOpenSinceMonth: num 9 9 9 9 9 9 9 9 9 9 ...
## $ CompetitionOpenSinceYear : num 2008 2008 2008 2008 2008 ...
## $ Promo2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Promo2SinceWeek : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Promo2SinceYear : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PromoInterval : chr "0" "0" "0" "0" ...
summary(test)
## Store Id DayOfWeek Date
## Min. : 1.0 Min. : 1 Min. :1.000 Min. :2015-08-01
## 1st Qu.: 279.8 1st Qu.:10273 1st Qu.:2.000 1st Qu.:2015-08-12
## Median : 553.5 Median :20545 Median :4.000 Median :2015-08-24
## Mean : 555.9 Mean :20545 Mean :3.979 Mean :2015-08-24
## 3rd Qu.: 832.2 3rd Qu.:30816 3rd Qu.:6.000 3rd Qu.:2015-09-05
## Max. :1115.0 Max. :41088 Max. :7.000 Max. :2015-09-17
## Open Promo StateHoliday SchoolHoliday
## Min. :0.0000 Min. :0.0000 Min. :0 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0 Median :0.0000
## Mean :0.8541 Mean :0.3958 Mean :0 Mean :0.4435
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :0 Max. :1.0000
## StoreType Assortment CompetitionDistance
## Length:41088 Length:41088 Min. : 0
## Class :character Class :character 1st Qu.: 710
## Mode :character Mode :character Median : 2410
## Mean : 5077
## 3rd Qu.: 6435
## Max. :75860
## CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2
## Min. : 0.00 Min. : 0 Min. :0.0000
## 1st Qu.: 0.00 1st Qu.: 0 1st Qu.:0.0000
## Median : 4.00 Median :2005 Median :1.0000
## Mean : 4.43 Mean :1265 Mean :0.5806
## 3rd Qu.: 9.00 3rd Qu.:2011 3rd Qu.:1.0000
## Max. :12.00 Max. :2015 Max. :1.0000
## Promo2SinceWeek Promo2SinceYear PromoInterval
## Min. : 0.00 Min. : 0 Length:41088
## 1st Qu.: 0.00 1st Qu.: 0 Class :character
## Median : 9.00 Median :2010 Mode :character
## Mean :14.18 Mean :1168
## 3rd Qu.:31.00 3rd Qu.:2012
## Max. :49.00 Max. :2015
Look at stores that are opened in train data only
train <- train[ which(train$Open=='1'),]
Seperate out the elements of the date column for the train set
train$Date <- as.Date(train$Date,format = "%m/%d/%y")
train$month <- as.integer(format(train$Date, "%m"))
train$year <- as.integer(format(train$Date, "%y"))
train$day <- as.integer(format(train$Date, "%d"))
Remove the date column (since elements are extracted)
train <- train[,-3]
Seperate out the elements of the date column for the test set
test$Date <- as.Date(test$Date,format = "%m/%d/%y")
test$month <- as.integer(format(test$Date, "%m"))
test$year <- as.integer(format(test$Date, "%y"))
test$day <- as.integer(format(test$Date, "%d"))
Remove the date column (since elements are extracted)
test <- test[,-4]
Now we finished data cleaning before applying the Random Forest model to this dataset. However, we will first test it with the train data first. We will split the train data so that 70% is used for modeling and 30% is used for testing:
index = sample(2,nrow(train),replace = TRUE, prob=c(0.7,0.3))
train70 <- train[index==1,]
train30 <- train[index==2,]
Only select the columns in train data that are strong predictor values for sales
feature.names <- names(train70)[c(1,2,5:20)]
feature.names
## [1] "Store" "DayOfWeek"
## [3] "Open" "Promo"
## [5] "StateHoliday" "SchoolHoliday"
## [7] "StoreType" "Assortment"
## [9] "CompetitionDistance" "CompetitionOpenSinceMonth"
## [11] "CompetitionOpenSinceYear" "Promo2"
## [13] "Promo2SinceWeek" "Promo2SinceYear"
## [15] "PromoInterval" "month"
## [17] "year" "day"
for (f in feature.names) {
if (class(train70[[f]])=="character") {
levels <- unique(c(train70[[f]], train30[[f]]))
train70[[f]] <- as.integer(factor(train70[[f]], levels=levels))
train30[[f]] <- as.integer(factor(train30[[f]], levels=levels))
}
}
Apply the Random Forest model
model_demo <- randomForest(train70[,feature.names],
log(train70$Sales+1),
ntree=17,
sampsize=100000,
do.trace=TRUE)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 1 | 0.05752 30.99 |
## 2 | 0.04527 24.40 |
## 3 | 0.03895 20.99 |
## 4 | 0.03512 18.93 |
## 5 | 0.03409 18.37 |
## 6 | 0.03352 18.06 |
## 7 | 0.03247 17.50 |
## 8 | 0.03156 17.00 |
## 9 | 0.03095 16.68 |
## 10 | 0.03098 16.69 |
## 11 | 0.03063 16.50 |
## 12 | 0.03042 16.39 |
## 13 | 0.03006 16.19 |
## 14 | 0.02992 16.12 |
## 15 | 0.02964 15.97 |
## 16 | 0.02943 15.86 |
## 17 | 0.02937 15.83 |
The mean squared error is low, indicating that the model might have a high accuracy.
Predict sales and test the accuracy of the model on the train dataset
pred_demo <- exp(predict(model_demo, train30)) -1
1 - mean(abs(train30$Sales - pred_demo)/pred_demo)
## [1] 0.8847371
The accuracy of this model is 88.47%. Thus, we apply this model to predict the sales for the test data.
feature.names <- names(train)[c(1,2,5:20)]
feature.names
## [1] "Store" "DayOfWeek"
## [3] "Open" "Promo"
## [5] "StateHoliday" "SchoolHoliday"
## [7] "StoreType" "Assortment"
## [9] "CompetitionDistance" "CompetitionOpenSinceMonth"
## [11] "CompetitionOpenSinceYear" "Promo2"
## [13] "Promo2SinceWeek" "Promo2SinceYear"
## [15] "PromoInterval" "month"
## [17] "year" "day"
for (f in feature.names) {
if (class(train[[f]])=="character") {
levels <- unique(c(train[[f]], test[[f]]))
train[[f]] <- as.integer(factor(train[[f]], levels=levels))
test[[f]] <- as.integer(factor(test[[f]], levels=levels))
}
}
Apply the Random Forest model
model <- randomForest(train[,feature.names],
log(train$Sales+1),
ntree=17,
sampsize=100000,
do.trace=TRUE)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 1 | 0.06367 34.29 |
## 2 | 0.04764 25.66 |
## 3 | 0.04044 21.78 |
## 4 | 0.0375 20.19 |
## 5 | 0.03632 19.56 |
## 6 | 0.03554 19.14 |
## 7 | 0.03489 18.79 |
## 8 | 0.03361 18.10 |
## 9 | 0.03312 17.84 |
## 10 | 0.03253 17.52 |
## 11 | 0.03214 17.31 |
## 12 | 0.03178 17.11 |
## 13 | 0.03137 16.89 |
## 14 | 0.03129 16.85 |
## 15 | 0.03113 16.77 |
## 16 | 0.0308 16.59 |
## 17 | 0.03038 16.36 |
Predict Sales and save the results in a .csv document
pred <- exp(predict(model,test)) -1
sales_forecast <- data.frame(Id=test$Id, Sales=pred)
write_csv(sales_forecast,"SalesForecastRossman.csv")