Data View and Libraires

Work Plan

  • Define metrics, libraries, extra functions
  • Build and measure models for Monthly, Daily and Hourly predictions
  • Build models based on geo (pdDistrict)
  • Next Steps
library(dplyr)
library(ggplot2)
library(prophet)
library(Metrics)


# Define our own performance metrics functions, since prophet's default one has no performance metrics
performance.metrics <- function (df, metrics = c("smape", "mape", "rmse", "mad")) 
{
    metric.values.df <- data.frame(matrix(0, nrow = 1, ncol = length(metrics)))
    names(metric.values.df) <- metrics
    
    for (metric in metrics) {
        metric.values.df[[metric]] <- get(metric)(df$y, df$yhat)
    }
    metric.values.df
}

read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", 
         stringsAsFactors = F, 
         nrows=1000)

Load Data and Clear Base Data

work.df <- read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", stringsAsFactors = F) %>% 
              mutate(Hour = gsub("(.+):.+","\\1",Time), Date = as.Date(Date, "%m/%d/%Y")) %>%
              select(IncidntNum, Date, Hour) %>% 
              distinct() %>% 
              group_by(Date, Hour) %>% 
              count() %>%
              data.frame()

# filter last day as incompleated
work.df <- work.df %>% filter(Date != max(work.df$Date))

work.df %>% head()
summary(work.df)
##       Date                Hour                 n         
##  Min.   :2003-01-01   Length:133914      Min.   :  1.00  
##  1st Qu.:2006-11-04   Class :character   1st Qu.:  8.00  
##  Median :2010-09-10   Mode  :character   Median : 13.00  
##  Mean   :2010-09-08                      Mean   : 13.05  
##  3rd Qu.:2014-07-14                      3rd Qu.: 18.00  
##  Max.   :2018-05-14                      Max.   :166.00

Monthly, Weekly, Daily Forecasts

Monthly

Data Frame and Exploratory Plots

monthly.df <- work.df %>% 
                 mutate(Date = gsub("(.+)-(.+)-(.+)","\\1-\\2-01",Date)) %>%
                 group_by(Date) %>% summarise(n=sum(n))

monthly.df <- monthly.df %>% filter(Date != max(monthly.df$Date))

monthly.df %>% head()
summary(monthly.df)
##      Date                 n        
##  Length:184         Min.   : 7564  
##  Class :character   1st Qu.: 9042  
##  Mode  :character   Median : 9606  
##                     Mean   : 9481  
##                     3rd Qu.: 9996  
##                     Max.   :10914
ggplot(monthly.df, aes(x=Date, y = n, group = 1)) + geom_line()

Monthly Model

m <- prophet(monthly.df %>% rename(ds = Date, y = n))
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 12 * 2, freq='month')
forecast <- predict(m, future)
plot(m, forecast)

Mertrics measurments

df.cv <- cross_validation(m, period = 365, horizon = 365*2, units = 'days')
## Making 8 forecasts with cutoffs between 2009-04-03 and 2016-04-01
monthly.metrics <- performance.metrics(df.cv %>% data.frame())
monthly.metrics

Daily

Data Frame and Exploratory Plots

daily.df <- work.df %>% group_by(Date) %>% summarise(n=sum(n))
daily.df %>% head()
summary(daily.df)
##       Date                  n        
##  Min.   :2003-01-01   Min.   :  2.0  
##  1st Qu.:2006-11-03   1st Qu.:287.0  
##  Median :2010-09-07   Median :311.0  
##  Mean   :2010-09-07   Mean   :311.3  
##  3rd Qu.:2014-07-11   3rd Qu.:336.0  
##  Max.   :2018-05-14   Max.   :541.0
ggplot(daily.df, aes(x=Date, y = n, group = 1)) + geom_line()

Daily Model

m <- prophet(daily.df %>% rename(ds = Date, y = n))
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 365 * 2)
forecast <- predict(m, future)
plot(m, forecast)

Daily Metrics

df.cv <- cross_validation(m, period = 365, horizon = 365*2, units = 'days')
## Making 8 forecasts with cutoffs between 2009-05-16 and 2016-05-14
daily.metrics <- performance.metrics(df.cv, metrics = c("smape", "mape", "rmse", "mad"))
daily.metrics

Hourly

Data Frame and Exploratory Plots

hourly.df <- work.df %>% 
               mutate(Date = as.POSIXct(paste0(Date," ", Hour, ":00"), format = "%Y-%m-%d %H:%M")) %>%
               group_by(Date) %>% 
               summarise(n=sum(n))


hourly.df %>% head()
summary(hourly.df)
##       Date                           n         
##  Min.   :2003-01-01 00:00:00   Min.   :  1.00  
##  1st Qu.:2006-11-04 06:30:00   1st Qu.:  8.00  
##  Median :2010-09-10 08:00:00   Median : 13.00  
##  Mean   :2010-09-09 00:43:24   Mean   : 13.05  
##  3rd Qu.:2014-07-14 19:30:00   3rd Qu.: 18.00  
##  Max.   :2018-05-14 23:00:00   Max.   :166.00  
##  NA's   :1
ggplot(hourly.df, aes(x=Date, y = n, group = 1)) + geom_line()
## Warning: Removed 1 rows containing missing values (geom_path).

Hourly Model

hourly.df <- hourly.df %>% 
          mutate(ds = as.character(Date)) %>% 
          rename(y = n) %>%
          arrange(Date) %>% 
          tail(n = 365 * 24) %>% 
          na.omit()
          
hourly.df['cap']   <- max(hourly.df$y)
hourly.df['floor'] <- 2


m <- prophet(hourly.df, growth = "logistic")
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
future <- make_future_dataframe(m, periods = 30 * 24 * 2, freq = 3600)

future['cap']   <- max(hourly.df$y)
future['floor'] <- min(hourly.df$y)

forecast <- predict(m, future)
forecast$yhat_lower[forecast$yhat_lower < 0] <- 2
forecast$yhat_upper[forecast$yhat_upper < 0] <- 2
forecast$yhat[forecast$yhat < 0] <- 2

plot(m, forecast)

Hourly Metrics

df.cv <- cross_validation(m, horizon = 24*2, units = 'days')
## Making 8 forecasts with cutoffs between 2017-10-10 23:00:00 and 2018-03-27 23:00:00
df.cv$yhat[df.cv$yhat < 0] <- 2
hourly.metrics <- performance.metrics(df.cv, metrics = c("smape", "mape", "rmse", "mad"))
hourly.metrics

#Metrics

  rbind(monthly.metrics, daily.metrics) %>%
  rbind(hourly.metrics) %>% 
  data.frame()

Forecast by district

Daily only - didn’t have time for Monthly and Hourly. General idea is the same

# 
work.df <- read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", stringsAsFactors = F) %>% 
              mutate(Hour = gsub("(.+):.+","\\1",Time), Date = as.Date(Date, "%m/%d/%Y")) %>%
              select(IncidntNum, Date, Hour, PdDistrict) %>% 
              distinct() %>% 
              group_by(Date, Hour, PdDistrict) %>% 
              count() %>%
              data.frame()

# filter last day as incompleated 
work.df <- work.df %>% filter(Date != max(work.df$Date))

work.df %>% head()

Daily

Data Frame and Number of observation for each district

daily.df <- work.df %>% 
                group_by(Date, PdDistrict) %>% 
                summarise(n=sum(n)) %>%
                rename(ds = Date, y = n) %>%
                filter(PdDistrict != "")


daily.df %>% group_by(PdDistrict) %>% summarise(n())

Set of Help Functions

# The function is not mine originaly it was taken from prophet::cross_validation since it's being crushing on simple row bindings I had to rewrite some parts of it to make it work correctly
cross.validation <- function(model, horizon, units, period = NULL, initial = NULL)
{
    df <- model$history
    horizon.dt <- as.difftime(horizon, units = units)
    if (is.null(period)) {
        period <- 0.5 * horizon
    }
    period.dt <- as.difftime(period, units = units)
    period.max <- 0
    for (s in model$seasonalities) {
        period.max <- max(period.max, s$period)
    }
    seasonality.dt <- as.difftime(period.max, units = "days")
    if (is.null(initial)) {
        initial.dt <- max(as.difftime(3 * horizon, units = units), 
            seasonality.dt)
    }
    else {
        initial.dt <- as.difftime(initial, units = units)
        if (initial.dt < seasonality.dt) {
            warning(paste0("Seasonality has period of ", period.max, 
                " days which ", "is larger than initial window. Consider increasing initial."))
        }
    }
    
    cutoffs <- prophet:::generate_cutoffs(df, horizon.dt, initial.dt, period.dt)
    nms <- c("ds", "y", "yhat", "yhat_lower", "yhat_upper", "cutoff")
    predicts <-  data.frame(t(rep(NA,length(nms))))
    names(predicts) <- nms
    predicts <- predicts[-1,]
    
    for (i in 1:length(cutoffs)) {
        cutoff <- cutoffs[i]
        m <- prophet:::prophet_copy(model, cutoff)
        history.c <- dplyr::filter(df, ds <= cutoff)
        if (nrow(history.c) < 2) {
            stop("Less than two datapoints before cutoff. Increase initial window.")
        }
        m <- fit.prophet(m, history.c)
        df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + 
            horizon.dt)
        columns <- "ds"
        if (m$growth == "logistic") {
            columns <- c(columns, "cap")
            if (m$logistic.floor) {
                columns <- c(columns, "floor")
            }
        }
        columns <- c(columns, names(m$extra_regressors))
        for (name in names(m$seasonalities)) {
            condition.name = m$seasonalities[[name]]$condition.name
            if (!is.null(condition.name)) {
                columns <- c(columns, condition.name)
            }
        }
        future <- df.predict[columns]
        yhat <- stats::predict(m, future)
        df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
        df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, 
            yhat_upper)
        df.c$cutoff <- cutoff
        predicts <- rbind(predicts, data.frame(df.c))
    }
    return(predicts)
}


# Function that creates Prophet model and returns set of metrics
get.metrics <- function(df, period = 360, horizon = 360 * 2, units = 'days')
{
  df %>%
    prophet() %>%
    cross.validation(period = period, horizon = horizon, units = units) %>%
    performance.metrics( metrics = c("smape", "mape", "rmse", "mad"))
}

Metrics by District

nms <- c("smape", "mape", "rmse", "mad", "PdDistrict")
metrics.df   <-  data.frame(t(rep(NA,length(nms))))
names(metrics.df) <- nms

metrics.df   <- metrics.df[-1,]

for (s in unique(daily.df$PdDistrict))
{
  
  tmp.df <- daily.df %>% 
             filter(PdDistrict == s) %>% 
             select(ds,y) %>% 
             data.frame() %>% 
             get.metrics()
  
  tmp.df[["PdDistrict"]] = s
  
  metrics.df <- rbind(metrics.df,  tmp.df)
  names(metrics.df) <- nms
}

metrics.df

Next Steps