INTRODUCTION

Rossmann is a Germany drug store chain with over 3790 stores in Europe. In this problem set obtained from Kaggle.com, participants are given records of sales of each store on different days, from 01/01/2013 to 31/07/2015. Other business factors such as holidays, promotions and competitions. Our goal is to forecast the sales of the store 6 weeks in advance, from 08/01/2015 to 09/17/2015 based on the data fields provided. We apply the Random Forest method to build a model that predict sales for these 6 weeks. Our work is based on Darius Barušauskas’s script: https://www.kaggle.com/raddar/random-forest-example.

library(readr)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(678)

Read the train, test and store data

train <- read_csv("train.csv")
## Parsed with column specification:
## cols(
##   Store = col_double(),
##   DayOfWeek = col_double(),
##   Date = col_character(),
##   Sales = col_double(),
##   Customers = col_double(),
##   Open = col_double(),
##   Promo = col_double(),
##   StateHoliday = col_double(),
##   SchoolHoliday = col_double()
## )
## Warning: 31050 parsing failures.
##   row          col expected actual        file
## 63556 StateHoliday a double      a 'train.csv'
## 63558 StateHoliday a double      a 'train.csv'
## 63560 StateHoliday a double      a 'train.csv'
## 63561 StateHoliday a double      a 'train.csv'
## 63564 StateHoliday a double      a 'train.csv'
## ..... ............ ........ ...... ...........
## See problems(...) for more details.
test  <- read_csv("test.csv")
## Parsed with column specification:
## cols(
##   Id = col_double(),
##   Store = col_double(),
##   DayOfWeek = col_double(),
##   Date = col_date(format = ""),
##   Open = col_double(),
##   Promo = col_double(),
##   StateHoliday = col_double(),
##   SchoolHoliday = col_double()
## )
## Warning: 180 parsing failures.
##   row          col expected actual       file
## 28257 StateHoliday a double      a 'test.csv'
## 28262 StateHoliday a double      a 'test.csv'
## 28264 StateHoliday a double      a 'test.csv'
## 28272 StateHoliday a double      a 'test.csv'
## 28275 StateHoliday a double      a 'test.csv'
## ..... ............ ........ ...... ..........
## See problems(...) for more details.
store <- read_csv("store.csv")
## Parsed with column specification:
## cols(
##   Store = col_double(),
##   StoreType = col_character(),
##   Assortment = col_character(),
##   CompetitionDistance = col_double(),
##   CompetitionOpenSinceMonth = col_double(),
##   CompetitionOpenSinceYear = col_double(),
##   Promo2 = col_double(),
##   Promo2SinceWeek = col_double(),
##   Promo2SinceYear = col_double(),
##   PromoInterval = col_character()
## )

Merge train and test data with store

train <- merge(train,store)
test <- merge(test,store)

Change all NAs to zeros

train[is.na(train)] <- 0
test[is.na(test)] <- 0

Look at train data’s column names and details

names(train)
##  [1] "Store"                     "DayOfWeek"                
##  [3] "Date"                      "Sales"                    
##  [5] "Customers"                 "Open"                     
##  [7] "Promo"                     "StateHoliday"             
##  [9] "SchoolHoliday"             "StoreType"                
## [11] "Assortment"                "CompetitionDistance"      
## [13] "CompetitionOpenSinceMonth" "CompetitionOpenSinceYear" 
## [15] "Promo2"                    "Promo2SinceWeek"          
## [17] "Promo2SinceYear"           "PromoInterval"
str(train)
## 'data.frame':    1017209 obs. of  18 variables:
##  $ Store                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayOfWeek                : num  5 6 5 3 3 7 3 1 5 1 ...
##  $ Date                     : chr  "7/31/2015" "1/12/2013" "1/3/2014" "12/3/2014" ...
##  $ Sales                    : num  5263 4952 4190 6454 3310 ...
##  $ Customers                : num  555 646 552 695 464 0 453 542 466 480 ...
##  $ Open                     : num  1 1 1 1 1 0 1 1 1 1 ...
##  $ Promo                    : num  1 0 0 1 0 0 0 1 0 0 ...
##  $ StateHoliday             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SchoolHoliday            : num  1 0 1 0 0 0 0 0 0 0 ...
##  $ StoreType                : chr  "c" "c" "c" "c" ...
##  $ Assortment               : chr  "a" "a" "a" "a" ...
##  $ CompetitionDistance      : num  1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
##  $ CompetitionOpenSinceMonth: num  9 9 9 9 9 9 9 9 9 9 ...
##  $ CompetitionOpenSinceYear : num  2008 2008 2008 2008 2008 ...
##  $ Promo2                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Promo2SinceWeek          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Promo2SinceYear          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PromoInterval            : chr  "0" "0" "0" "0" ...
summary(train)
##      Store          DayOfWeek         Date               Sales      
##  Min.   :   1.0   Min.   :1.000   Length:1017209     Min.   :    0  
##  1st Qu.: 280.0   1st Qu.:2.000   Class :character   1st Qu.: 3727  
##  Median : 558.0   Median :4.000   Mode  :character   Median : 5744  
##  Mean   : 558.4   Mean   :3.998                      Mean   : 5774  
##  3rd Qu.: 838.0   3rd Qu.:6.000                      3rd Qu.: 7856  
##  Max.   :1115.0   Max.   :7.000                      Max.   :41551  
##    Customers           Open            Promo         StateHoliday
##  Min.   :   0.0   Min.   :0.0000   Min.   :0.0000   Min.   :0    
##  1st Qu.: 405.0   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0    
##  Median : 609.0   Median :1.0000   Median :0.0000   Median :0    
##  Mean   : 633.1   Mean   :0.8301   Mean   :0.3815   Mean   :0    
##  3rd Qu.: 837.0   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0    
##  Max.   :7388.0   Max.   :1.0000   Max.   :1.0000   Max.   :0    
##  SchoolHoliday     StoreType          Assortment        CompetitionDistance
##  Min.   :0.0000   Length:1017209     Length:1017209     Min.   :    0      
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:  700      
##  Median :0.0000   Mode  :character   Mode  :character   Median : 2320      
##  Mean   :0.1786                                         Mean   : 5416      
##  3rd Qu.:0.0000                                         3rd Qu.: 6880      
##  Max.   :1.0000                                         Max.   :75860      
##  CompetitionOpenSinceMonth CompetitionOpenSinceYear     Promo2      
##  Min.   : 0.000            Min.   :   0             Min.   :0.0000  
##  1st Qu.: 0.000            1st Qu.:   0             1st Qu.:0.0000  
##  Median : 4.000            Median :2006             Median :1.0000  
##  Mean   : 4.927            Mean   :1370             Mean   :0.5006  
##  3rd Qu.: 9.000            3rd Qu.:2011             3rd Qu.:1.0000  
##  Max.   :12.000            Max.   :2015             Max.   :1.0000  
##  Promo2SinceWeek Promo2SinceYear PromoInterval     
##  Min.   : 0.00   Min.   :   0    Length:1017209    
##  1st Qu.: 0.00   1st Qu.:   0    Class :character  
##  Median : 1.00   Median :2009    Mode  :character  
##  Mean   :11.65   Mean   :1007                      
##  3rd Qu.:22.00   3rd Qu.:2012                      
##  Max.   :50.00   Max.   :2015

Look at test data’s column names and details

names(test)
##  [1] "Store"                     "Id"                       
##  [3] "DayOfWeek"                 "Date"                     
##  [5] "Open"                      "Promo"                    
##  [7] "StateHoliday"              "SchoolHoliday"            
##  [9] "StoreType"                 "Assortment"               
## [11] "CompetitionDistance"       "CompetitionOpenSinceMonth"
## [13] "CompetitionOpenSinceYear"  "Promo2"                   
## [15] "Promo2SinceWeek"           "Promo2SinceYear"          
## [17] "PromoInterval"
str(test)
## 'data.frame':    41088 obs. of  17 variables:
##  $ Store                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Id                       : num  1 24825 5993 37665 18833 ...
##  $ DayOfWeek                : num  4 3 4 2 3 3 2 3 7 1 ...
##  $ Date                     : Date, format: "2015-09-17" "2015-08-19" ...
##  $ Open                     : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ Promo                    : num  1 1 0 1 0 1 0 1 0 1 ...
##  $ StateHoliday             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SchoolHoliday            : num  0 1 0 1 1 1 1 0 1 1 ...
##  $ StoreType                : chr  "c" "c" "c" "c" ...
##  $ Assortment               : chr  "a" "a" "a" "a" ...
##  $ CompetitionDistance      : num  1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
##  $ CompetitionOpenSinceMonth: num  9 9 9 9 9 9 9 9 9 9 ...
##  $ CompetitionOpenSinceYear : num  2008 2008 2008 2008 2008 ...
##  $ Promo2                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Promo2SinceWeek          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Promo2SinceYear          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ PromoInterval            : chr  "0" "0" "0" "0" ...
summary(test)
##      Store              Id          DayOfWeek          Date           
##  Min.   :   1.0   Min.   :    1   Min.   :1.000   Min.   :2015-08-01  
##  1st Qu.: 279.8   1st Qu.:10273   1st Qu.:2.000   1st Qu.:2015-08-12  
##  Median : 553.5   Median :20545   Median :4.000   Median :2015-08-24  
##  Mean   : 555.9   Mean   :20545   Mean   :3.979   Mean   :2015-08-24  
##  3rd Qu.: 832.2   3rd Qu.:30816   3rd Qu.:6.000   3rd Qu.:2015-09-05  
##  Max.   :1115.0   Max.   :41088   Max.   :7.000   Max.   :2015-09-17  
##       Open            Promo         StateHoliday SchoolHoliday   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0     Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0     1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0     Median :0.0000  
##  Mean   :0.8541   Mean   :0.3958   Mean   :0     Mean   :0.4435  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0     3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :0     Max.   :1.0000  
##   StoreType          Assortment        CompetitionDistance
##  Length:41088       Length:41088       Min.   :    0      
##  Class :character   Class :character   1st Qu.:  710      
##  Mode  :character   Mode  :character   Median : 2410      
##                                        Mean   : 5077      
##                                        3rd Qu.: 6435      
##                                        Max.   :75860      
##  CompetitionOpenSinceMonth CompetitionOpenSinceYear     Promo2      
##  Min.   : 0.00             Min.   :   0             Min.   :0.0000  
##  1st Qu.: 0.00             1st Qu.:   0             1st Qu.:0.0000  
##  Median : 4.00             Median :2005             Median :1.0000  
##  Mean   : 4.43             Mean   :1265             Mean   :0.5806  
##  3rd Qu.: 9.00             3rd Qu.:2011             3rd Qu.:1.0000  
##  Max.   :12.00             Max.   :2015             Max.   :1.0000  
##  Promo2SinceWeek Promo2SinceYear PromoInterval     
##  Min.   : 0.00   Min.   :   0    Length:41088      
##  1st Qu.: 0.00   1st Qu.:   0    Class :character  
##  Median : 9.00   Median :2010    Mode  :character  
##  Mean   :14.18   Mean   :1168                      
##  3rd Qu.:31.00   3rd Qu.:2012                      
##  Max.   :49.00   Max.   :2015

Look at stores that are opened in train data only

train <- train[ which(train$Open=='1'),]

Seperate out the elements of the date column for the train set

train$Date <- as.Date(train$Date,format = "%m/%d/%y")
train$month <- as.integer(format(train$Date, "%m"))
train$year <- as.integer(format(train$Date, "%y"))
train$day <- as.integer(format(train$Date, "%d"))

Remove the date column (since elements are extracted)

train <- train[,-3]

Seperate out the elements of the date column for the test set

test$Date <- as.Date(test$Date,format = "%m/%d/%y")
test$month <- as.integer(format(test$Date, "%m"))
test$year <- as.integer(format(test$Date, "%y"))
test$day <- as.integer(format(test$Date, "%d"))

Remove the date column (since elements are extracted)

test <- test[,-4]

Now we finished data cleaning before applying the Random Forest model to this dataset. However, we will first test it with the train data first. We will split the train data so that 70% is used for modeling and 30% is used for testing:

index = sample(2,nrow(train),replace = TRUE, prob=c(0.7,0.3))
train70 <- train[index==1,]
train30 <- train[index==2,]

Only select the columns in train data that are strong predictor values for sales

feature.names <- names(train70)[c(1,2,5:20)]
feature.names
##  [1] "Store"                     "DayOfWeek"                
##  [3] "Open"                      "Promo"                    
##  [5] "StateHoliday"              "SchoolHoliday"            
##  [7] "StoreType"                 "Assortment"               
##  [9] "CompetitionDistance"       "CompetitionOpenSinceMonth"
## [11] "CompetitionOpenSinceYear"  "Promo2"                   
## [13] "Promo2SinceWeek"           "Promo2SinceYear"          
## [15] "PromoInterval"             "month"                    
## [17] "year"                      "day"
for (f in feature.names) {
  if (class(train70[[f]])=="character") {
    levels <- unique(c(train70[[f]], train30[[f]]))
    train70[[f]] <- as.integer(factor(train70[[f]], levels=levels))
    train30[[f]]  <- as.integer(factor(train30[[f]],  levels=levels))
  }
}

Apply the Random Forest model

model_demo <- randomForest(train70[,feature.names], 
                    log(train70$Sales+1), 
                    ntree=17,
                    sampsize=100000, 
                    do.trace=TRUE)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##    1 |  0.05752    30.99 |
##    2 |  0.04527    24.40 |
##    3 |  0.03895    20.99 |
##    4 |  0.03512    18.93 |
##    5 |  0.03409    18.37 |
##    6 |  0.03352    18.06 |
##    7 |  0.03247    17.50 |
##    8 |  0.03156    17.00 |
##    9 |  0.03095    16.68 |
##   10 |  0.03098    16.69 |
##   11 |  0.03063    16.50 |
##   12 |  0.03042    16.39 |
##   13 |  0.03006    16.19 |
##   14 |  0.02992    16.12 |
##   15 |  0.02964    15.97 |
##   16 |  0.02943    15.86 |
##   17 |  0.02937    15.83 |

The mean squared error is low, indicating that the model might have a high accuracy.

Predict sales and test the accuracy of the model on the train dataset

pred_demo <- exp(predict(model_demo, train30)) -1
1 - mean(abs(train30$Sales - pred_demo)/pred_demo)
## [1] 0.8847371

The accuracy of this model is 88.47%. Thus, we apply this model to predict the sales for the test data.

feature.names <- names(train)[c(1,2,5:20)]
feature.names
##  [1] "Store"                     "DayOfWeek"                
##  [3] "Open"                      "Promo"                    
##  [5] "StateHoliday"              "SchoolHoliday"            
##  [7] "StoreType"                 "Assortment"               
##  [9] "CompetitionDistance"       "CompetitionOpenSinceMonth"
## [11] "CompetitionOpenSinceYear"  "Promo2"                   
## [13] "Promo2SinceWeek"           "Promo2SinceYear"          
## [15] "PromoInterval"             "month"                    
## [17] "year"                      "day"
for (f in feature.names) {
  if (class(train[[f]])=="character") {
    levels <- unique(c(train[[f]], test[[f]]))
    train[[f]] <- as.integer(factor(train[[f]], levels=levels))
    test[[f]]  <- as.integer(factor(test[[f]],  levels=levels))
  }
}

Apply the Random Forest model

model <- randomForest(train[,feature.names], 
                    log(train$Sales+1), 
                    ntree=17,
                    sampsize=100000, 
                    do.trace=TRUE)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##    1 |  0.06367    34.29 |
##    2 |  0.04764    25.66 |
##    3 |  0.04044    21.78 |
##    4 |   0.0375    20.19 |
##    5 |  0.03632    19.56 |
##    6 |  0.03554    19.14 |
##    7 |  0.03489    18.79 |
##    8 |  0.03361    18.10 |
##    9 |  0.03312    17.84 |
##   10 |  0.03253    17.52 |
##   11 |  0.03214    17.31 |
##   12 |  0.03178    17.11 |
##   13 |  0.03137    16.89 |
##   14 |  0.03129    16.85 |
##   15 |  0.03113    16.77 |
##   16 |   0.0308    16.59 |
##   17 |  0.03038    16.36 |

Predict Sales and save the results in a .csv document

pred <- exp(predict(model,test)) -1
sales_forecast <- data.frame(Id=test$Id, Sales=pred)
write_csv(sales_forecast,"SalesForecastRossman.csv")