#Libraries for data analysis
library(RPostgreSQL)
library(stringr)
library(ggplot2)
library(plyr)
library(dygraphs)
library(plotly)
library(gridExtra)
library(zoo)
library(xts)
library(tsoutliers)
library(DT)
library(tseries)
library(forecast)
library(reshape2)
library(dtw)
#Establishing connection to db
con <- dbConnect(PostgreSQL(),
user= "datascience",
password="datasciencepassword",
dbname="grubhub",
host = "opsinterviews.cn6vkbc8fhom.us-east-1.rds.amazonaws.com",
port = "5432")
#Reading all data from public.problem5 table
orders <- dbGetQuery(con, statement = "select * from public.problem5")
dim(orders)
[1] 20613 7
str(orders)
'data.frame': 20613 obs. of 7 variables:
$ order_year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
$ order_month: int 9 9 9 9 9 9 9 9 9 9 ...
$ order_day : int 2 2 2 2 2 2 2 2 2 2 ...
$ order_hour : int 0 1 2 3 4 5 6 7 8 9 ...
$ order_count: int 40 36 30 25 21 20 20 21 23 26 ...
$ order_week : chr "Monday "| __truncated__ "Monday "| __truncated__ "Monday "| __truncated__ "Monday "| __truncated__ ...
$ order_date : Date, format: "2013-09-02" "2013-09-02" ...
Let's trim the spaces from the order_week variable and convert it to a factor:
#Deleting spaces from order_week column
orders$order_week <- str_trim(orders$order_week)
#Transforming order_week variable to factor type
orders$order_week <- factor(orders$order_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
sum(!complete.cases(orders))
[1] 0
There are no NAs in the dataset.
sum(table(orders$order_date) < 24)
[1] 49
So, there are 49 days with missing hours. There are two possible explanations:
- there were 0 orders during these hours;
- the rows are simply missing.
These values need to be imputed to make the time series complete. I assumed that there were no orders during these hours and imputed them with 0.
List of days with missing hours:
names(which(table(orders$order_date) < 24))
[1] "2013-09-04" "2013-09-07" "2013-09-09" "2013-09-16" "2013-09-30"
[6] "2013-10-01" "2013-10-02" "2013-10-04" "2013-10-10" "2013-10-21"
[11] "2013-10-22" "2013-10-31" "2013-11-05" "2013-11-11" "2013-11-12"
[16] "2013-11-15" "2013-12-05" "2013-12-25" "2014-01-22" "2014-02-04"
[21] "2014-02-11" "2014-02-18" "2014-02-19" "2014-02-20" "2014-03-12"
[26] "2014-03-25" "2014-04-01" "2014-04-07" "2014-04-21" "2014-04-25"
[31] "2014-04-28" "2014-05-01" "2014-05-07" "2014-05-09" "2014-05-12"
[36] "2014-05-13" "2014-05-15" "2014-05-29" "2014-06-01" "2014-06-05"
[41] "2014-08-19" "2014-09-11" "2014-09-15" "2014-09-25" "2014-10-01"
[46] "2014-11-03" "2014-12-25" "2015-02-02" "2015-12-25"
Adding rows to the data.frame with the missing hours set to 0:
while (any(table(orders$order_date) < 24)) {
#Take the first date that still has fewer than 24 hourly rows
d <- as.Date(names(which(table(orders$order_date) < 24))[1])
#Append a row with 0 orders for the first missing hour of that date
orders <- rbind(orders, data.frame(order_year = as.numeric(format(d, "%Y")),
order_month = as.numeric(format(d, "%m")),
order_day = as.POSIXlt(d)$mday,
order_hour = setdiff(0:23, orders$order_hour[orders$order_date == d])[1],
order_count = 0,
order_week = format(as.Date(d), "%A"),
order_date = d))
}
orders <- orders[order(orders$order_date, orders$order_hour),]
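As a sanity check (a minimal sketch), every date should now contribute exactly 24 hourly rows:
#Verify that the imputation made the hourly series complete
stopifnot(all(table(orders$order_date) == 24))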
Now the data is ready for analysis and we can take a look at it:
datatable(orders)
#Add a full timestamp column for convenience
orders$full_date <- as.POSIXlt(with(orders, paste(order_date, order_hour)), format = '%Y-%m-%d %H')
#Correcting times that don't exist in local time (due to the daylight saving shift)
dst_gap <- is.na(orders$full_date)
orders$full_date[dst_gap] <- as.POSIXlt(paste(orders$order_date[dst_gap], orders$order_hour[dst_gap] + 1), format = '%Y-%m-%d %H')
#Plot the last month of data
ord <- as.xts(orders$order_count, order.by = orders$full_date) #time object
dygraph(ord, main = "Orders") %>%
dyRangeSelector(dateWindow = c("2015-12-01", "2015-12-31"))
As can be seen, the data is periodic, with the smallest period being one day. We can also observe a drop in the number of orders around Christmas. Each day has two peaks: the first at dinner time and the second in the evening.
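To confirm the daily shape numerically, one can compute the median order count for each hour of the day; a quick sketch using the already-loaded plyr:
#Median number of orders for each hour of the day
hourly_profile <- ddply(orders, ~order_hour, summarise, med_orders = median(order_count))
hourly_profile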
summary(orders$order_count)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 26.00 53.00 77.32 98.00 440.00
rng <- range(orders$order_count)
p1 <- qplot(order_count,
data=orders,
geom="histogram",
breaks = seq(rng[1], rng[2], by =10),
main = "Histogram of number of\n orders per hour",
xlab = "Orders per hour",
ylab = "Count"
)
ggplotly(p1)
Our data is highly right-skewed. The most common bin is 20 to 30 orders per hour, and the median is 53 orders per hour.
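A quick numeric check of the skew, using the sample moment formula directly to avoid extra packages (a sketch):
#Sample skewness: positive values indicate a right skew
x <- orders$order_count
mean((x - mean(x))^3) / sd(x)^3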
#Aggregating data to months
months_orders <- ddply(orders, ~format(order_date, "%Y-%m"), summarise, months_orders=sum(order_count))
names(months_orders) <- c("Month", "Orders")
#Deleting the last row as it isn't representative (the final month is incomplete)
months_orders <- months_orders[-nrow(months_orders),]
months_orders$Month <- as.Date(as.yearmon(months_orders$Month))
p2 <- ggplot(data = months_orders,
aes(x = Month, y = Orders)) +
geom_line(lwd = 0.1, color = "grey") +
geom_point() +
geom_smooth(lwd = 0.1, method = "lm", formula = y ~ x) +
labs(title = "Trend in data")
ggplotly(p2)
There is a positive trend in the number of orders per month: on average, it has increased by about 20k over the two years.
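The trend can be quantified with a simple linear fit; a sketch in which the slope is in orders per day, so multiplying by 365 approximates the annual increase in monthly orders:
#Linear trend of monthly orders; slope scaled to a per-year increase
fit <- lm(Orders ~ as.numeric(Month), data = months_orders)
coef(fit)[2] * 365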
#Aggregating orders per day and labelling each day with its weekday
days_orders <- ddply(orders, ~order_date, summarise, order_count=sum(order_count))
days_orders$order_date <- factor(weekdays(days_orders$order_date), levels = rev(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")))
plot_ly(days_orders, y = order_count, color = order_date, type = "box") %>%
layout(title = "Orders per weekdays", yaxis = list(title = "Number of orders per day"))
The plot suggests that the differences in the number of orders across weekdays are small; let's verify this with a statistical test.
#Performing ANOVA test
oneway.test(days_orders$order_count~days_orders$order_date)
One-way analysis of means (not assuming equal variances)
data: days_orders$order_count and days_orders$order_date
F = 2.996, num df = 6.00, denom df = 355.34, p-value = 0.007215
#Performing pairwise t-test
pairwise.t.test(days_orders$order_count, days_orders$order_date)
Pairwise comparisons using t tests with pooled SD
data: days_orders$order_count and days_orders$order_date
Sunday Saturday Friday Thursday Wednesday Tuesday
Saturday 1.0000 - - - - -
Friday 1.0000 1.0000 - - - -
Thursday 0.3768 0.6525 0.2719 - - -
Wednesday 1.0000 1.0000 1.0000 1.0000 - -
Tuesday 0.6476 0.7215 1.0000 0.0015 0.1547 -
Monday 1.0000 1.0000 1.0000 0.9489 1.0000 0.7215
P value adjustment method: holm
The ANOVA gives p = 0.007215 < 0.05, so at the 95% confidence level the data provide convincing evidence that at least one pair of weekday means differs (we reject the null hypothesis of equal means). Based on the pairwise t-test, we can conclude at the 95% confidence level that Tuesday has a smaller average number of orders than Thursday.
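The group means behind this conclusion can be inspected directly; a sketch:
#Mean daily orders per weekday
ddply(days_orders, ~order_date, summarise, mean_orders = mean(order_count))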
#Aggregating orders per day and labelling each day with its month
days_orders <- ddply(orders, ~order_date, summarise, order_count=sum(order_count))
month_names <- format(days_orders$order_date, "%B")
days_orders$order_date <- factor(month_names, levels = rev(unique(month_names[order(format(days_orders$order_date, "%m"))])))
plot_ly(days_orders, y = order_count, color = order_date, type = "box") %>%
layout(title = "Orders per months", yaxis = list(title = "Number of orders per day"))
July and September show a smaller number of orders per day than January and February (this can also be verified with a statistical test; a sketch follows below). A likely explanation is that July and September are vacation months.
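A sketch of such a verification, reusing the month factor built above with Holm-adjusted pairwise t-tests:
#Pairwise comparisons of daily order counts across months
pairwise.t.test(days_orders$order_count, days_orders$order_date)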
plot_ly(orders, y = order_count, color = factor(order_hour), type = "box") %>%
layout(title = "Orders per hours", yaxis = list(title = "Number of orders per hour"))
Interesting patterns show up here: hours 0-10 see very few orders per hour, while hours 17-20 are the most intensive.
#Add a column indicating whether the day is a weekend (Saturday, Sunday) or a weekday
orders$weekend <- ifelse(orders$order_week %in% c("Saturday", "Sunday"), 'weekend', 'weekday')
#Computing the median number of orders per hour, separately for weekdays and weekends
weekends <- ddply(orders, .(order_hour, weekend), summarise, med_orders=median(order_count))
p3 <- qplot(order_hour,
med_orders,
color = weekend,
data=weekends,
geom=c("point", "path"),
main = "Median number of\n orders per hour",
xlab = "Hour",
ylab = "Median number of orders"
)
p3
For almost the whole range of hours, the median number of orders per hour is higher on weekends.
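As a formal check of the weekend effect, a Wilcoxon rank-sum test on the raw hourly counts avoids normality assumptions; a sketch:
#Comparing hourly order counts between weekends and weekdays
wilcox.test(order_count ~ weekend, data = orders)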
#Building a data.frame with one row per day and 24 hourly columns
ord_out <- data.frame(matrix(orders$order_count, byrow = TRUE, ncol = 24))
#The data starts on a Monday and spans a whole number of weeks, so recycling the weekday levels labels each row correctly
ord_out$Weekday <- levels(orders$order_week)
#Calculating base (median) profiles for each day of the week
base_levels <- melt(ord_out)
base_levels <- ddply(base_levels, .(Weekday, variable), summarise, avr_orders=median(value))
base_levels <- dcast(base_levels, Weekday~variable, value.var="avr_orders")
ord_out$date <- seq(min(orders$order_date), max(orders$order_date),1)
#Calculating DTW distances from each day's hourly profile to its weekday median profile
ord_out$dtw <- NA
for (i in 1:nrow(ord_out)) {
ord_out$dtw[i] <- dtw(as.numeric(ord_out[i, 1:24]),
as.numeric(base_levels[base_levels$Weekday == ord_out$Weekday[i], 2:25]),
keep.internals=TRUE,
step = symmetric1)$distance
}
plot_ly(ord_out, y = dtw, color = Weekday, type = "box") %>%
layout(title = "DTW distances to median value", yaxis = list(title = "DTW distance"))
All outliers are visualized as dots in the plot above. For example, one outlier for Friday corresponds to 2014-07-04, Independence Day in the USA.
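To see why such a day gets flagged, one can plot its DTW alignment against the weekday median profile; a sketch using the dtw package's two-way alignment plot:
#Visualizing the 2014-07-04 profile aligned against the Friday median profile
day_idx <- which(ord_out$date == as.Date("2014-07-04"))
al <- dtw(as.numeric(ord_out[day_idx, 1:24]),
as.numeric(base_levels[base_levels$Weekday == "Friday", 2:25]),
keep.internals = TRUE, step = symmetric1)
plot(al, type = "twoway")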
outliers <- as.Date(character(0))
for (wd in unique(ord_out$Weekday)) {
data_day <- ord_out[ord_out$Weekday == wd,]
#Days whose DTW distance is flagged as an outlier by the boxplot rule
outliers <- c(outliers, data_day$date[data_day$dtw %in% boxplot(data_day$dtw, plot = FALSE)$out])
}
outliers
[1] "2015-02-02" "2015-03-23" "2015-05-25" "2015-06-15" "2015-09-07"
[6] "2015-12-28" "2013-12-24" "2013-12-25" "2014-01-01" "2014-12-24"
[11] "2015-12-30" "2013-11-28" "2014-11-27" "2015-01-01" "2015-02-19"
[16] "2015-11-26" "2014-07-04" "2015-02-14" "2015-10-31" "2015-11-21"
[21] "2014-05-25" "2015-02-15" "2015-04-19" "2015-11-01"
There are 24 outlier days.
#Creating temporary file in memory
temp <- tempfile()
#Download the file into temp
download.file("http://www.opm.gov/about-us/open-government/Data/Apps/Holidays/holidays.ical", temp)
#Read the raw iCal lines from the temp file
usa_holidays <- readLines(temp)
#Delete temp file
unlink(temp)
#Extract the paired DTSTART/DTEND lines into a data.frame
usa_holidays <- data.frame(start_hol = usa_holidays[grep("DTSTART", usa_holidays)],
end_hol = usa_holidays[grep("DTEND", usa_holidays)],
stringsAsFactors = FALSE)
usa_holidays$start_hol <- as.Date(substring(usa_holidays$start_hol, 9, 17), format = '%Y%m%d')
usa_holidays$end_hol <- as.Date(substring(usa_holidays$end_hol, 7, 14), format = '%Y%m%d')
holiday_days <- as.Date(character(0))
for (i in 1:nrow(usa_holidays)){
holiday_days <- c(holiday_days, seq(usa_holidays$start_hol[i], usa_holidays$end_hol[i], 1))
}
holiday_days <- unique(holiday_days)
length(holiday_days)
[1] 486
There are 486 holiday days in the USA from 1997 to 2020.
mean(outliers %in% holiday_days)
[1] 0.375
So, 37.5% of the outliers in our time series are federal holidays.
Many of the remaining outliers correspond to other widely observed dates in the USA (Christmas Eve, Valentine's Day, Halloween):
outliers[!(outliers %in% holiday_days)]
[1] "2015-02-02" "2015-03-23" "2015-06-15" "2015-12-28" "2013-12-24"
[6] "2014-12-24" "2015-12-30" "2015-02-19" "2015-02-14" "2015-10-31"
[11] "2015-11-21" "2014-05-25" "2015-02-15" "2015-04-19" "2015-11-01"
#Creating ts objects: train on all but the last 7 days, test on the last 168 hours (7 days × 24 hours)
orders_ts_train <- ts(data = orders$order_count[1:20496], frequency = 24)
orders_ts_test <- ts(data = orders$order_count[20497:20664], frequency = 24)
#First, we need to make sure the time series is stationary; if not, we difference it
#Let's perform the KPSS test
kpss.test(orders_ts_train)
KPSS Test for Level Stationarity
data: orders_ts_train
KPSS Level = 12.737, Truncation lag parameter = 33, p-value = 0.01
Since the p-value is at most 0.01, we reject the null hypothesis of stationarity at the 99% confidence level and conclude that the time series is non-stationary. After differencing once:
#Differencing, order 1
kpss.test(diff(orders_ts_train))
KPSS Test for Level Stationarity
data: diff(orders_ts_train)
KPSS Level = 0.0006921, Truncation lag parameter = 33, p-value =
0.1
The reported p-value is 0.1 (the largest value the test tabulates), so we fail to reject the null hypothesis that the differenced series is stationary.
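Before settling on the ARIMA orders, it can help to inspect the autocorrelation structure of the differenced series; a sketch using tsdisplay() from the already-loaded forecast package:
#ACF/PACF of the differenced series, to guide the choice of AR/MA orders
tsdisplay(diff(orders_ts_train), main = "Differenced hourly orders")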
auto.arima(orders_ts_train, d = 1, D = 1)
Series: orders_ts_train
ARIMA(2,1,1)(2,1,0)[24]
Coefficients:
ar1 ar2 ma1 sar1 sar2
1.1037 -0.3415 -0.9490 -0.4975 -0.2542
s.e. 0.0071 0.0068 0.0037 0.0069 0.0068
sigma^2 estimated as 140.8: log likelihood=-79688.93
AIC=159389.9 AICc=159389.9 BIC=159437.4
order_arima <- arima(orders_ts_train, order=c(2,1,1), seasonal = list(order = c(2,1,0), period = 24))
order_arima
Call:
arima(x = orders_ts_train, order = c(2, 1, 1), seasonal = list(order = c(2,
1, 0), period = 24))
Coefficients:
ar1 ar2 ma1 sar1 sar2
1.1037 -0.3415 -0.9490 -0.4975 -0.2542
s.e. 0.0071 0.0068 0.0037 0.0069 0.0068
sigma^2 estimated as 140.8: log likelihood = -79688.93, aic = 159389.9
#Forecasting the next 7 days (168 hours)
pred <- predict(order_arima, n.ahead = 7*24)
RMSE on training data:
sqrt(mean(order_arima$residuals ^ 2))
[1] 11.85812
RMSE on testing data:
sqrt(mean((as.numeric(pred$pred) - as.numeric(orders_ts_test)) ^ 2))
[1] 41.77625
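For context, it may be worth comparing against a seasonal naive baseline, which simply repeats the last 24-hour cycle; a sketch using snaive() from the forecast package:
#Seasonal naive forecast for the same 7-day test window
base_pred <- snaive(orders_ts_train, h = 7*24)
#RMSE of the baseline on the test data
sqrt(mean((as.numeric(base_pred$mean) - as.numeric(orders_ts_test)) ^ 2))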
real_orders <- as.xts(orders$order_count, order.by = orders$full_date) #time object
predicted_orders <- as.xts(as.numeric(pred$pred), order.by = orders$full_date[(nrow(orders) - 7*24 + 1):nrow(orders)])
orders_bind <- cbind(real_orders = real_orders, predicted_orders = predicted_orders)
dygraph(orders_bind, main = "Prediction of orders") %>%
dyRangeSelector(dateWindow = c("2015-12-01", "2016-01-10"))