#Libraries for data analysis
library(RPostgreSQL)
library(stringr)
library(ggplot2)
library(plyr)
library(dygraphs)
library(plotly)
library(gridExtra)
library(zoo)
library(xts)
library(tsoutliers)
library(DT)
library(tseries)
library(forecast)
library(reshape2)
library(dtw)

Loading the dataset from the PostgreSQL database

  1. First, let's read the data from the database:
#Establishing connection to db
con <- dbConnect(PostgreSQL(), 
                 user= "datascience", 
                 password="datasciencepassword", 
                 dbname="grubhub", 
                 host = "opsinterviews.cn6vkbc8fhom.us-east-1.rds.amazonaws.com", 
                 port = "5432")

#Reading all data from public.problem5 table
orders <- dbGetQuery(con, statement = "select * from public.problem5")
  2. Explore the size of the orders data.frame:
dim(orders)
[1] 20613     7
  3. Explore the types of the variables in the orders data.frame:
str(orders)
'data.frame':   20613 obs. of  7 variables:
 $ order_year : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
 $ order_month: int  9 9 9 9 9 9 9 9 9 9 ...
 $ order_day  : int  2 2 2 2 2 2 2 2 2 2 ...
 $ order_hour : int  0 1 2 3 4 5 6 7 8 9 ...
 $ order_count: int  40 36 30 25 21 20 20 21 23 26 ...
 $ order_week : chr  "Monday                                                                                                                         "| __truncated__ "Monday                                                                                                                         "| __truncated__ "Monday                                                                                                                         "| __truncated__ "Monday                                                                                                                         "| __truncated__ ...
 $ order_date : Date, format: "2013-09-02" "2013-09-02" ...

Let's trim the padding spaces from the order_week variable and convert it to a factor:

#Deleting spaces from order_week column
orders$order_week <- str_trim(orders$order_week)

#Transforming order_week variable to factor type
orders$order_week <- factor(orders$order_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
  4. Check for NAs:
sum(!complete.cases(orders))
[1] 0

There are no NAs in the dataset.

  5. Missing rows in the dataset:
sum(table(orders$order_date) < 24)
[1] 49

So, there are 49 days with missing hours. There are two possible explanations:
- there were 0 orders during these hours;
- the rows are simply missing.
These values need to be imputed to make the time series complete. I made the assumption that there were no orders during these hours and imputed them with 0.

List of days with missing hours:

names(which(table(orders$order_date) < 24))
 [1] "2013-09-04" "2013-09-07" "2013-09-09" "2013-09-16" "2013-09-30"
 [6] "2013-10-01" "2013-10-02" "2013-10-04" "2013-10-10" "2013-10-21"
[11] "2013-10-22" "2013-10-31" "2013-11-05" "2013-11-11" "2013-11-12"
[16] "2013-11-15" "2013-12-05" "2013-12-25" "2014-01-22" "2014-02-04"
[21] "2014-02-11" "2014-02-18" "2014-02-19" "2014-02-20" "2014-03-12"
[26] "2014-03-25" "2014-04-01" "2014-04-07" "2014-04-21" "2014-04-25"
[31] "2014-04-28" "2014-05-01" "2014-05-07" "2014-05-09" "2014-05-12"
[36] "2014-05-13" "2014-05-15" "2014-05-29" "2014-06-01" "2014-06-05"
[41] "2014-08-19" "2014-09-11" "2014-09-15" "2014-09-25" "2014-10-01"
[46] "2014-11-03" "2014-12-25" "2015-02-02" "2015-12-25"

Adding rows to the data.frame with the missing hours set to 0:

#While at least one date has fewer than 24 hourly rows, add the first
#missing hour of the first incomplete date with order_count = 0
while (any(table(orders$order_date) < 24)) {
        d <- as.Date(names(which(table(orders$order_date) < 24))[1])
        orders <- rbind(orders, data.frame(order_year = as.numeric(format(d, "%Y")),
                                           order_month = as.numeric(format(d, "%m")),
                                           order_day = as.numeric(format(d, "%d")),
                                           order_hour = setdiff(0:23, orders$order_hour[orders$order_date == d])[1],
                                           order_count = 0,
                                           order_week = format(d, "%A"),
                                           order_date = d))
}

#Sort rows chronologically by date and hour
orders <- orders[order(orders$order_date, orders$order_hour),]
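For reference, a vectorized alternative to the loop above (a sketch, not the original approach; the grid and full names are illustrative). Applied to the raw table before the loop, it builds the complete date-hour grid and left-joins it, filling the gaps with 0:

#Sketch: one row per (date, hour) combination over the whole range
grid <- expand.grid(order_date = seq(min(orders$order_date), max(orders$order_date), 1),
                    order_hour = 0:23)
#Left-join: combinations absent from orders get NA counts, which we set to 0
full <- merge(grid, orders, all.x = TRUE)
full$order_count[is.na(full$order_count)] <- 0
#(The remaining columns of the added rows would still need to be filled in)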

Now the data is ready for analysis and we can take a look at it:

datatable(orders)


Statistical exploration of the dataset and identifying patterns in the data

  1. First, let's plot the data:
#Add a full date-time column for convenience
orders$full_date <- as.POSIXlt(with(orders, paste(order_date, order_hour)), format = '%Y-%m-%d %H')

#Correcting time points that don't exist (due to the daylight saving time change)
orders$full_date[is.na(orders$full_date)]$hour <- orders$full_date[is.na(orders$full_date)]$hour + 1

#Plot the series, with the range selector set to the last month
ord <- as.xts(orders$order_count, order.by = orders$full_date) #time object

dygraph(ord, main = "Orders") %>%
dyRangeSelector(dateWindow = c("2015-12-01", "2015-12-31"))

As can be seen, the data is periodic, with the smallest period being 1 day. We can also observe a drop in the number of orders around Christmas. Every day is characterized by two peaks: the first around lunch time, the second in the evening.
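As a quick numeric check of the two peaks (a sketch using base R's aggregate, not part of the original analysis), we can compute the median hourly profile and list the busiest hours:

#Median number of orders for each hour of the day
hourly_profile <- aggregate(order_count ~ order_hour, data = orders, FUN = median)
#The five busiest hours; the top entries should cover the lunch and evening peaks
head(hourly_profile$order_hour[order(hourly_profile$order_count, decreasing = TRUE)], 5)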

  2. General statistics of the number of orders:
summary(orders$order_count)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   26.00   53.00   77.32   98.00  440.00 
rng <- range(orders$order_count)
p1 <- qplot(order_count, 
            data=orders,
            geom="histogram", 
            breaks = seq(rng[1], rng[2], by =10),
            main = "Histogram of number of\n orders per hour",
            xlab = "Orders per hour",
            ylab = "Count"
      )
ggplotly(p1)

So, our data is highly right-skewed: most hours see 20 to 30 orders, the median is 53 orders per hour, and the mean (77.3) is pulled upward by the long right tail.
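To quantify the skew (a minimal sketch computing the standard sample skewness by hand, so no extra package is needed; sd's n-1 denominator makes this approximate up to the usual small-sample correction):

#Sample skewness: clearly positive values confirm a right-skewed distribution
x <- orders$order_count
mean((x - mean(x))^3) / sd(x)^3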

  3. Trend in the data:
#Aggregating data to months
months_orders <- ddply(orders, ~format(order_date, "%Y-%m"), summarise, months_orders=sum(order_count))
names(months_orders) <- c("Month", "Orders")

#Deleting the last row as it covers an incomplete month and isn't representative
months_orders <- months_orders[-nrow(months_orders),]
months_orders$Month <- as.Date(as.yearmon(months_orders$Month))

p2 <- ggplot(data = months_orders, 
                aes(x = Month, y = Orders)) +
                geom_line(lwd = 0.1, color = "grey") +
                geom_point() + 
                geom_smooth(lwd = 0.1, method = "lm", formula = y ~ x) +
                labs(title = "Trend in data")

ggplotly(p2)

There is a positive trend in the number of orders per month; on average it has increased by roughly 20k over the two years.
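The slope of the same linear fit that geom_smooth draws can be extracted directly (a sketch; lm treats the Date axis as days since 1970-01-01, so the coefficient is in orders per day):

#Linear trend of the monthly totals
trend_fit <- lm(Orders ~ Month, data = months_orders)
#Approximate increase in monthly orders per year
coef(trend_fit)["Month"] * 365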

  4. Difference between days of the week:
#Aggregating orders per day and labelling each day with its weekday
days_orders <- ddply(orders, ~order_date, summarise, order_count=sum(order_count))
days_orders$order_date <- factor(weekdays(days_orders$order_date), levels = rev(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")))

plot_ly(days_orders, y = order_count, color = order_date, type = "box") %>% 
        layout(title = "Orders per weekdays",  yaxis = list(title = "Number of orders per day"))

The plot suggests there is no large difference in the number of orders across weekdays.
Let's verify this with a statistical test: a one-way ANOVA (Welch's test, which does not assume equal variances):

#Performing Welch's one-way ANOVA
oneway.test(days_orders$order_count~days_orders$order_date)

    One-way analysis of means (not assuming equal variances)

data:  days_orders$order_count and days_orders$order_date
F = 2.996, num df = 6.00, denom df = 355.34, p-value = 0.007215
#Performing pairwise t-test
pairwise.t.test(days_orders$order_count, days_orders$order_date)

    Pairwise comparisons using t tests with pooled SD 

data:  days_orders$order_count and days_orders$order_date 

          Sunday Saturday Friday Thursday Wednesday Tuesday
Saturday  1.0000 -        -      -        -         -      
Friday    1.0000 1.0000   -      -        -         -      
Thursday  0.3768 0.6525   0.2719 -        -         -      
Wednesday 1.0000 1.0000   1.0000 1.0000   -         -      
Tuesday   0.6476 0.7215   1.0000 0.0015   0.1547    -      
Monday    1.0000 1.0000   1.0000 0.9489   1.0000    0.7215 

P value adjustment method: holm 

The test gives p = 0.007215 < 0.05, so at the 95% confidence level the data provide convincing evidence that at least one pair of weekday means differs (we reject the null hypothesis of equal means). Based on the pairwise t-test (with Holm-adjusted p-values), we can conclude at the 95% confidence level that Tuesday has a smaller average number of orders than Thursday (p = 0.0015).
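As a follow-up (a sketch, not part of the original analysis), the Tuesday vs Thursday contrast can also be tested directly with a two-sample Welch t-test; droplevels is needed because t.test requires the grouping factor to have exactly two levels:

#Direct Welch t-test for the Tuesday vs Thursday pair
tue_thu <- droplevels(subset(days_orders, order_date %in% c("Tuesday", "Thursday")))
t.test(order_count ~ order_date, data = tue_thu)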

  5. Difference in the number of orders per day across months:
#Aggregating orders per day and labelling each day with its month name
days_orders <- ddply(orders, ~order_date, summarise, order_count=sum(order_count))
days_orders$order_date <- factor(format(days_orders$order_date, "%B"), levels = rev(month.name))

plot_ly(days_orders, y = order_count, color = order_date, type = "box")  %>% 
        layout(title = "Orders per months",  yaxis = list(title = "Number of orders per day"))

July and September show a smaller number of orders per day than January and February (this could also be verified with a statistical test, as sketched below). A plausible cause is that July and September are vacation months.
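A minimal version of that verification (a sketch, not part of the original analysis; it assumes days_orders still carries the month labels created above):

#Welch two-sample t-test: daily totals in July/September vs January/February
low_months <- days_orders$order_count[days_orders$order_date %in% c("July", "September")]
high_months <- days_orders$order_count[days_orders$order_date %in% c("January", "February")]
t.test(low_months, high_months)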

  6. Difference in the number of orders per hour across hours of the day:
plot_ly(orders, y = order_count, color = factor(order_hour), type = "box")  %>% 
        layout(title = "Orders per hours",  yaxis = list(title = "Number of orders per hour"))

Interesting patterns appear here: hours 0-10 have very few orders per hour, whereas hours 17-20 are very busy.

  7. Difference between working days and weekend days:
#Add a column indicating whether the day is a weekend day (Saturday, Sunday) or a weekday
orders$weekend <- ifelse(orders$order_week %in% c("Saturday", "Sunday"), 'weekend', 'weekday')

#Computing the median number of orders per hour, separately for weekdays and weekends
weekends <- ddply(orders, .(order_hour, weekend), summarise, avr_orders=median(order_count))

p3 <- qplot(order_hour,
      avr_orders,
      color = weekend,
      data=weekends,
      geom=c("point", "path"),
      main = "Median number of\n orders per hour",
      xlab = "Hour",
      ylab = "Median number of orders"
      )
p3

For almost the whole range of hours, the median number of orders per hour is higher on weekends.
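The size of that gap can be read off directly (a sketch using reshape2's dcast, which is already loaded):

#Spread the weekday/weekend medians into columns and compare them per hour
gap <- dcast(weekends, order_hour ~ weekend, value.var = "avr_orders")
summary(gap$weekend - gap$weekday)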


Outlier detection

  1. Outlier detection is performed separately for each day of the week (as we observed some differences between them), in the following way:
#Building a data.frame with one row per day and 24 hourly columns
ord_out <- data.frame(matrix(orders$order_count, byrow = TRUE, ncol = 24))
#The series starts on a Monday and spans whole weeks, so recycling the
#weekday levels assigns the correct weekday to every row
ord_out$Weekday <- levels(orders$order_week)

#Calculating base (median) daily profiles for each day of the week
base_levels <- melt(ord_out)

base_levels <- ddply(base_levels, .(Weekday, variable), summarise, avr_orders=median(value))

base_levels <- dcast(base_levels, Weekday~variable, value.var="avr_orders")

ord_out$date <- seq(min(orders$order_date), max(orders$order_date),1)

#Calculating DTW distances to the median weekday profiles
ord_out$dtw <- NA
for (i in 1:nrow(ord_out)) {
        ord_out$dtw[i] <-  dtw(as.numeric(ord_out[i,1:24]), 
                               as.numeric(base_levels[base_levels$Weekday == ord_out[i,25],2:25]),
                               keep.internals=TRUE, 
                               step = symmetric1)$distance
}
  2. The outliers for each weekday can be visualized with a boxplot:
plot_ly(ord_out, y = dtw, color = Weekday, type = "box")  %>% 
        layout(title = "DTW distances to median value",  yaxis = list(title = "DTW distance"))

All outliers appear as dots in the plot above. For example, one outlier for Friday corresponds to 2014-07-04, Independence Day in the USA (verified in the sketch below).
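That claim is easy to check (a sketch): the Friday with the largest DTW distance should be 2014-07-04.

#Date of the most extreme Friday by DTW distance
fridays <- ord_out[ord_out$Weekday == "Friday",]
fridays$date[which.max(fridays$dtw)]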

  3. Detecting all outliers:
#Collecting dates whose DTW distance is flagged as a boxplot outlier
#within their own weekday group
outliers <- as.Date(character(0))
for (wd in unique(ord_out$Weekday)) {
        data_day <- ord_out[ord_out$Weekday == wd,]
        outliers <- c(outliers, data_day$date[data_day$dtw %in% boxplot(data_day$dtw, plot = FALSE)$out])
}

outliers
 [1] "2015-02-02" "2015-03-23" "2015-05-25" "2015-06-15" "2015-09-07"
 [6] "2015-12-28" "2013-12-24" "2013-12-25" "2014-01-01" "2014-12-24"
[11] "2015-12-30" "2013-11-28" "2014-11-27" "2015-01-01" "2015-02-19"
[16] "2015-11-26" "2014-07-04" "2015-02-14" "2015-10-31" "2015-11-21"
[21] "2014-05-25" "2015-02-15" "2015-04-19" "2015-11-01"

There are 24 outlier days.

  4. Getting data about all federal holidays in the USA from www.opm.gov:
#Creating a temporary file
temp <- tempfile()
#Downloading the iCal file with the official holiday list
#(setInternet2() was only needed by older Windows versions of R and is
#defunct since R 3.3, so it is omitted here)
download.file("http://www.opm.gov/about-us/open-government/Data/Apps/Holidays/holidays.ical", temp)
#Read data from temp file
usa_holidays <- read.csv(temp)
#Delete temp file
unlink(temp)
  5. Parsing the file and creating the list of holiday days:
usa_holidays <- data.frame(start_hol = usa_holidays[grep("DTSTART", usa_holidays[,1]),],
                           end_hol = usa_holidays[grep("DTEND", usa_holidays[,1]),],
                           stringsAsFactors = FALSE)

usa_holidays$start_hol <- as.Date(substring(usa_holidays$start_hol, 9, 17), format = '%Y%m%d')
usa_holidays$end_hol <- as.Date(substring(usa_holidays$end_hol, 7, 14), format = '%Y%m%d')

#Expanding each holiday interval into its individual days
holiday_days <- as.Date(character(0))
for (i in 1:nrow(usa_holidays)){
        holiday_days <- c(holiday_days, seq(usa_holidays$start_hol[i], usa_holidays$end_hol[i], 1))
}

holiday_days <- unique(holiday_days)
length(holiday_days)
[1] 486

There are 486 holiday days in the USA from 1997 to 2020.

  6. Percentage of outliers that fall on holidays:
mean(outliers %in% holiday_days)
[1] 0.375

So, 37.5% of the outliers in our time series are federal holidays.

Many of the other outliers correspond to other holidays and observances in the USA (for example, 2013-12-24 is Christmas Eve and 2015-10-31 is Halloween):

outliers[!(outliers %in% holiday_days)]
 [1] "2015-02-02" "2015-03-23" "2015-06-15" "2015-12-28" "2013-12-24"
 [6] "2014-12-24" "2015-12-30" "2015-02-19" "2015-02-14" "2015-10-31"
[11] "2015-11-21" "2014-05-25" "2015-02-15" "2015-04-19" "2015-11-01"

ARIMA Model

  1. First, we need to find the order of differencing required to make the time series stationary:
#Creating ts objects: the last 7 days (168 hours) are held out for testing
orders_ts_train <- ts(data = orders$order_count[1:20496], frequency = 24)
orders_ts_test <- ts(data = orders$order_count[20497:20664], frequency = 24)

#First, we need to make sure the time series is stationary; if not, we difference it
#Let's perform the KPSS test
kpss.test(orders_ts_train)

    KPSS Test for Level Stationarity

data:  orders_ts_train
KPSS Level = 12.737, Truncation lag parameter = 33, p-value = 0.01

Since the reported p-value is 0.01 (the smallest value kpss.test reports), we reject the null hypothesis of level stationarity at the 99% confidence level: the series is non-stationary. After differencing once:

#First-order differencing
kpss.test(diff(orders_ts_train))

    KPSS Test for Level Stationarity

data:  diff(orders_ts_train)
KPSS Level = 0.0006921, Truncation lag parameter = 33, p-value =
0.1

The reported p-value is 0.1 (the largest value kpss.test reports), so we fail to reject the null hypothesis: the differenced series can be treated as stationary.
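Before calling auto.arima, the ACF and PACF of the differenced series can also be inspected visually (an optional sketch; tsdisplay comes from the already-loaded forecast package):

#ACF and PACF of the once-differenced series, covering three daily cycles
tsdisplay(diff(orders_ts_train), lag.max = 72)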

  2. Finding p and q for the ARIMA model with a seasonal component:
auto.arima(orders_ts_train, d = 1, D = 1)
Series: orders_ts_train 
ARIMA(2,1,1)(2,1,0)[24]                    

Coefficients:
         ar1      ar2      ma1     sar1     sar2
      1.1037  -0.3415  -0.9490  -0.4975  -0.2542
s.e.  0.0071   0.0068   0.0037   0.0069   0.0068

sigma^2 estimated as 140.8:  log likelihood=-79688.93
AIC=159389.9   AICc=159389.9   BIC=159437.4
  3. Fitting the ARIMA model:
order_arima <- arima(orders_ts_train, order=c(2,1,1), seasonal = list(order = c(2,1,0), period = 24))
order_arima

Call:
arima(x = orders_ts_train, order = c(2, 1, 1), seasonal = list(order = c(2, 
    1, 0), period = 24))

Coefficients:
         ar1      ar2      ma1     sar1     sar2
      1.1037  -0.3415  -0.9490  -0.4975  -0.2542
s.e.  0.0071   0.0068   0.0037   0.0069   0.0068

sigma^2 estimated as 140.8:  log likelihood = -79688.93,  aic = 159389.9
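Before trusting the model, it is worth checking the residuals for leftover autocorrelation (a sketch, not part of the original analysis; fitdf = 5 accounts for the five estimated coefficients):

#Ljung-Box test on the ARIMA residuals over two daily cycles
Box.test(residuals(order_arima), lag = 48, type = "Ljung-Box", fitdf = 5)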
  4. Accuracy. Let's measure accuracy with RMSE on the training data and on the testing data (prediction for the last 7 days):
pred <- predict(order_arima, n.ahead = 7*24)

RMSE on training data:

sqrt(mean(order_arima$residuals ^ 2))
[1] 11.85812

RMSE on testing data:

sqrt(mean((as.numeric(pred$pred) - as.numeric(orders_ts_test)) ^ 2))
[1] 41.77625
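The same numbers can be obtained in one call with forecast::accuracy (a sketch; it reports RMSE together with MAE, MAPE and other measures for both the training and the test set):

#Training- and test-set accuracy measures in one table
fc <- forecast(order_arima, h = 7*24)
accuracy(fc, as.numeric(orders_ts_test))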
  5. Plotting the prediction for the last week:
real_orders <- as.xts(orders$order_count, order.by = orders$full_date) #time object
predicted_orders <- as.xts(as.numeric(pred$pred), order.by = orders$full_date[(nrow(orders) - 7*24 + 1):nrow(orders)])

orders_bind <- cbind(real_orders = real_orders, predicted_orders = predicted_orders)

dygraph(orders_bind, main = "Prediction of orders") %>%
dyRangeSelector(dateWindow = c("2015-12-01", "2016-01-10"))

  6. Disadvantages of the ARIMA model: