library(dplyr)
library(ggplot2)
library(prophet)
library(Metrics)
# Define our own performance metrics functions, since prophet's default one has no performance metrics
performance.metrics <- function (df, metrics = c("smape", "mape", "rmse", "mad"))
{
metric.values.df <- data.frame(matrix(0, nrow = 1, ncol = length(metrics)))
names(metric.values.df) <- metrics
for (metric in metrics) {
metric.values.df[[metric]] <- get(metric)(df$y, df$yhat)
}
metric.values.df
}
read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv",
stringsAsFactors = F,
nrows=1000)
work.df <- read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", stringsAsFactors = F) %>%
mutate(Hour = gsub("(.+):.+","\\1",Time), Date = as.Date(Date, "%m/%d/%Y")) %>%
select(IncidntNum, Date, Hour) %>%
distinct() %>%
group_by(Date, Hour) %>%
count() %>%
data.frame()
# filter last day as incompleated
work.df <- work.df %>% filter(Date != max(work.df$Date))
work.df %>% head()
summary(work.df)
## Date Hour n
## Min. :2003-01-01 Length:133914 Min. : 1.00
## 1st Qu.:2006-11-04 Class :character 1st Qu.: 8.00
## Median :2010-09-10 Mode :character Median : 13.00
## Mean :2010-09-08 Mean : 13.05
## 3rd Qu.:2014-07-14 3rd Qu.: 18.00
## Max. :2018-05-14 Max. :166.00
monthly.df <- work.df %>%
mutate(Date = gsub("(.+)-(.+)-(.+)","\\1-\\2-01",Date)) %>%
group_by(Date) %>% summarise(n=sum(n))
monthly.df <- monthly.df %>% filter(Date != max(monthly.df$Date))
monthly.df %>% head()
summary(monthly.df)
## Date n
## Length:184 Min. : 7564
## Class :character 1st Qu.: 9042
## Mode :character Median : 9606
## Mean : 9481
## 3rd Qu.: 9996
## Max. :10914
ggplot(monthly.df, aes(x=Date, y = n, group = 1)) + geom_line()
m <- prophet(monthly.df %>% rename(ds = Date, y = n))
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 12 * 2, freq='month')
forecast <- predict(m, future)
plot(m, forecast)
df.cv <- cross_validation(m, period = 365, horizon = 365*2, units = 'days')
## Making 8 forecasts with cutoffs between 2009-04-03 and 2016-04-01
monthly.metrics <- performance.metrics(df.cv %>% data.frame())
monthly.metrics
daily.df <- work.df %>% group_by(Date) %>% summarise(n=sum(n))
daily.df %>% head()
summary(daily.df)
## Date n
## Min. :2003-01-01 Min. : 2.0
## 1st Qu.:2006-11-03 1st Qu.:287.0
## Median :2010-09-07 Median :311.0
## Mean :2010-09-07 Mean :311.3
## 3rd Qu.:2014-07-11 3rd Qu.:336.0
## Max. :2018-05-14 Max. :541.0
ggplot(daily.df, aes(x=Date, y = n, group = 1)) + geom_line()
m <- prophet(daily.df %>% rename(ds = Date, y = n))
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 365 * 2)
forecast <- predict(m, future)
plot(m, forecast)
df.cv <- cross_validation(m, period = 365, horizon = 365*2, units = 'days')
## Making 8 forecasts with cutoffs between 2009-05-16 and 2016-05-14
daily.metrics <- performance.metrics(df.cv, metrics = c("smape", "mape", "rmse", "mad"))
daily.metrics
hourly.df <- work.df %>%
mutate(Date = as.POSIXct(paste0(Date," ", Hour, ":00"), format = "%Y-%m-%d %H:%M")) %>%
group_by(Date) %>%
summarise(n=sum(n))
hourly.df %>% head()
summary(hourly.df)
## Date n
## Min. :2003-01-01 00:00:00 Min. : 1.00
## 1st Qu.:2006-11-04 06:30:00 1st Qu.: 8.00
## Median :2010-09-10 08:00:00 Median : 13.00
## Mean :2010-09-09 00:43:24 Mean : 13.05
## 3rd Qu.:2014-07-14 19:30:00 3rd Qu.: 18.00
## Max. :2018-05-14 23:00:00 Max. :166.00
## NA's :1
ggplot(hourly.df, aes(x=Date, y = n, group = 1)) + geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).
hourly.df <- hourly.df %>%
mutate(ds = as.character(Date)) %>%
rename(y = n) %>%
arrange(Date) %>%
tail(n = 365 * 24) %>%
na.omit()
hourly.df['cap'] <- max(hourly.df$y)
hourly.df['floor'] <- 2
m <- prophet(hourly.df, growth = "logistic")
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 30 * 24 * 2, freq = 3600)
future['cap'] <- max(hourly.df$y)
future['floor'] <- min(hourly.df$y)
forecast <- predict(m, future)
forecast$yhat_lower[forecast$yhat_lower < 0] <- 2
forecast$yhat_upper[forecast$yhat_upper < 0] <- 2
forecast$yhat[forecast$yhat < 0] <- 2
plot(m, forecast)
df.cv <- cross_validation(m, horizon = 24*2, units = 'days')
## Making 8 forecasts with cutoffs between 2017-10-10 23:00:00 and 2018-03-27 23:00:00
df.cv$yhat[df.cv$yhat < 0] <- 2
hourly.metrics <- performance.metrics(df.cv, metrics = c("smape", "mape", "rmse", "mad"))
hourly.metrics
#Metrics
rbind(monthly.metrics, daily.metrics) %>%
rbind(hourly.metrics) %>%
data.frame()
Daily only - didn’t have time for Monthly and Hourly. General idea is the same
#
work.df <- read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", stringsAsFactors = F) %>%
mutate(Hour = gsub("(.+):.+","\\1",Time), Date = as.Date(Date, "%m/%d/%Y")) %>%
select(IncidntNum, Date, Hour, PdDistrict) %>%
distinct() %>%
group_by(Date, Hour, PdDistrict) %>%
count() %>%
data.frame()
# filter last day as incompleated
work.df <- work.df %>% filter(Date != max(work.df$Date))
work.df %>% head()
daily.df <- work.df %>%
group_by(Date, PdDistrict) %>%
summarise(n=sum(n)) %>%
rename(ds = Date, y = n) %>%
filter(PdDistrict != "")
daily.df %>% group_by(PdDistrict) %>% summarise(n())
# The function is not mine originaly it was taken from prophet::cross_validation since it's being crushing on simple row bindings I had to rewrite some parts of it to make it work correctly
cross.validation <- function(model, horizon, units, period = NULL, initial = NULL)
{
df <- model$history
horizon.dt <- as.difftime(horizon, units = units)
if (is.null(period)) {
period <- 0.5 * horizon
}
period.dt <- as.difftime(period, units = units)
period.max <- 0
for (s in model$seasonalities) {
period.max <- max(period.max, s$period)
}
seasonality.dt <- as.difftime(period.max, units = "days")
if (is.null(initial)) {
initial.dt <- max(as.difftime(3 * horizon, units = units),
seasonality.dt)
}
else {
initial.dt <- as.difftime(initial, units = units)
if (initial.dt < seasonality.dt) {
warning(paste0("Seasonality has period of ", period.max,
" days which ", "is larger than initial window. Consider increasing initial."))
}
}
cutoffs <- prophet:::generate_cutoffs(df, horizon.dt, initial.dt, period.dt)
nms <- c("ds", "y", "yhat", "yhat_lower", "yhat_upper", "cutoff")
predicts <- data.frame(t(rep(NA,length(nms))))
names(predicts) <- nms
predicts <- predicts[-1,]
for (i in 1:length(cutoffs)) {
cutoff <- cutoffs[i]
m <- prophet:::prophet_copy(model, cutoff)
history.c <- dplyr::filter(df, ds <= cutoff)
if (nrow(history.c) < 2) {
stop("Less than two datapoints before cutoff. Increase initial window.")
}
m <- fit.prophet(m, history.c)
df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff +
horizon.dt)
columns <- "ds"
if (m$growth == "logistic") {
columns <- c(columns, "cap")
if (m$logistic.floor) {
columns <- c(columns, "floor")
}
}
columns <- c(columns, names(m$extra_regressors))
for (name in names(m$seasonalities)) {
condition.name = m$seasonalities[[name]]$condition.name
if (!is.null(condition.name)) {
columns <- c(columns, condition.name)
}
}
future <- df.predict[columns]
yhat <- stats::predict(m, future)
df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower,
yhat_upper)
df.c$cutoff <- cutoff
predicts <- rbind(predicts, data.frame(df.c))
}
return(predicts)
}
# Function that creates Prophet model and returns set of metrics
get.metrics <- function(df, period = 360, horizon = 360 * 2, units = 'days')
{
df %>%
prophet() %>%
cross.validation(period = period, horizon = horizon, units = units) %>%
performance.metrics( metrics = c("smape", "mape", "rmse", "mad"))
}
nms <- c("smape", "mape", "rmse", "mad", "PdDistrict")
metrics.df <- data.frame(t(rep(NA,length(nms))))
names(metrics.df) <- nms
metrics.df <- metrics.df[-1,]
for (s in unique(daily.df$PdDistrict))
{
tmp.df <- daily.df %>%
filter(PdDistrict == s) %>%
select(ds,y) %>%
data.frame() %>%
get.metrics()
tmp.df[["PdDistrict"]] = s
metrics.df <- rbind(metrics.df, tmp.df)
names(metrics.df) <- nms
}
metrics.df