Data View and Libraires

Work Plan

Define metrics, libraries, extra functions
Build and measure models for Monthly, Daily and Hourly predictions
Build models based on geo (pdDistrict)
Next Steps

library(dplyr)
library(ggplot2)
library(prophet)
library(Metrics)


# Define our own performance metrics functions, since prophet's default one has no performance metrics
performance.metrics <- function (df, metrics = c("smape", "mape", "rmse", "mad")) 
{
    metric.values.df <- data.frame(matrix(0, nrow = 1, ncol = length(metrics)))
    names(metric.values.df) <- metrics
    
    for (metric in metrics) {
        metric.values.df[[metric]] <- get(metric)(df$y, df$yhat)
    }
    metric.values.df
}

read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", 
         stringsAsFactors = F, 
         nrows=1000)

Load Data and Clear Base Data

work.df <- read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", stringsAsFactors = F) %>% 
              mutate(Hour = gsub("(.+):.+","\\1",Time), Date = as.Date(Date, "%m/%d/%Y")) %>%
              select(IncidntNum, Date, Hour) %>% 
              distinct() %>% 
              group_by(Date, Hour) %>% 
              count() %>%
              data.frame()

# filter last day as incompleated
work.df <- work.df %>% filter(Date != max(work.df$Date))

work.df %>% head()

summary(work.df)

##       Date                Hour                 n         
##  Min.   :2003-01-01   Length:133914      Min.   :  1.00  
##  1st Qu.:2006-11-04   Class :character   1st Qu.:  8.00  
##  Median :2010-09-10   Mode  :character   Median : 13.00  
##  Mean   :2010-09-08                      Mean   : 13.05  
##  3rd Qu.:2014-07-14                      3rd Qu.: 18.00  
##  Max.   :2018-05-14                      Max.   :166.00

Monthly, Weekly, Daily Forecasts

Monthly

Data Frame and Exploratory Plots

monthly.df <- work.df %>% 
                 mutate(Date = gsub("(.+)-(.+)-(.+)","\\1-\\2-01",Date)) %>%
                 group_by(Date) %>% summarise(n=sum(n))

monthly.df <- monthly.df %>% filter(Date != max(monthly.df$Date))

monthly.df %>% head()

summary(monthly.df)

##      Date                 n        
##  Length:184         Min.   : 7564  
##  Class :character   1st Qu.: 9042  
##  Mode  :character   Median : 9606  
##                     Mean   : 9481  
##                     3rd Qu.: 9996  
##                     Max.   :10914

ggplot(monthly.df, aes(x=Date, y = n, group = 1)) + geom_line()

Monthly Model

m <- prophet(monthly.df %>% rename(ds = Date, y = n))

## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

future <- make_future_dataframe(m, periods = 12 * 2, freq='month')
forecast <- predict(m, future)
plot(m, forecast)

Mertrics measurments

df.cv <- cross_validation(m, period = 365, horizon = 365*2, units = 'days')

## Making 8 forecasts with cutoffs between 2009-04-03 and 2016-04-01

monthly.metrics <- performance.metrics(df.cv %>% data.frame())
monthly.metrics

Daily

Data Frame and Exploratory Plots

daily.df <- work.df %>% group_by(Date) %>% summarise(n=sum(n))
daily.df %>% head()

summary(daily.df)

##       Date                  n        
##  Min.   :2003-01-01   Min.   :  2.0  
##  1st Qu.:2006-11-03   1st Qu.:287.0  
##  Median :2010-09-07   Median :311.0  
##  Mean   :2010-09-07   Mean   :311.3  
##  3rd Qu.:2014-07-11   3rd Qu.:336.0  
##  Max.   :2018-05-14   Max.   :541.0

ggplot(daily.df, aes(x=Date, y = n, group = 1)) + geom_line()

Daily Model

m <- prophet(daily.df %>% rename(ds = Date, y = n))

## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.

future <- make_future_dataframe(m, periods = 365 * 2)
forecast <- predict(m, future)
plot(m, forecast)

Daily Metrics

df.cv <- cross_validation(m, period = 365, horizon = 365*2, units = 'days')

## Making 8 forecasts with cutoffs between 2009-05-16 and 2016-05-14

daily.metrics <- performance.metrics(df.cv, metrics = c("smape", "mape", "rmse", "mad"))
daily.metrics

Hourly

Data Frame and Exploratory Plots

hourly.df <- work.df %>% 
               mutate(Date = as.POSIXct(paste0(Date," ", Hour, ":00"), format = "%Y-%m-%d %H:%M")) %>%
               group_by(Date) %>% 
               summarise(n=sum(n))


hourly.df %>% head()

summary(hourly.df)

##       Date                           n         
##  Min.   :2003-01-01 00:00:00   Min.   :  1.00  
##  1st Qu.:2006-11-04 06:30:00   1st Qu.:  8.00  
##  Median :2010-09-10 08:00:00   Median : 13.00  
##  Mean   :2010-09-09 00:43:24   Mean   : 13.05  
##  3rd Qu.:2014-07-14 19:30:00   3rd Qu.: 18.00  
##  Max.   :2018-05-14 23:00:00   Max.   :166.00  
##  NA's   :1

ggplot(hourly.df, aes(x=Date, y = n, group = 1)) + geom_line()

## Warning: Removed 1 rows containing missing values (geom_path).

Hourly Model

hourly.df <- hourly.df %>% 
          mutate(ds = as.character(Date)) %>% 
          rename(y = n) %>%
          arrange(Date) %>% 
          tail(n = 365 * 24) %>% 
          na.omit()
          
hourly.df['cap']   <- max(hourly.df$y)
hourly.df['floor'] <- 2


m <- prophet(hourly.df, growth = "logistic")

## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.

future <- make_future_dataframe(m, periods = 30 * 24 * 2, freq = 3600)

future['cap']   <- max(hourly.df$y)
future['floor'] <- min(hourly.df$y)

forecast <- predict(m, future)
forecast$yhat_lower[forecast$yhat_lower < 0] <- 2
forecast$yhat_upper[forecast$yhat_upper < 0] <- 2
forecast$yhat[forecast$yhat < 0] <- 2

plot(m, forecast)

Hourly Metrics

df.cv <- cross_validation(m, horizon = 24*2, units = 'days')

## Making 8 forecasts with cutoffs between 2017-10-10 23:00:00 and 2018-03-27 23:00:00

df.cv$yhat[df.cv$yhat < 0] <- 2
hourly.metrics <- performance.metrics(df.cv, metrics = c("smape", "mape", "rmse", "mad"))
hourly.metrics

#Metrics

  rbind(monthly.metrics, daily.metrics) %>%
  rbind(hourly.metrics) %>% 
  data.frame()

Forecast by district

Daily only - didn’t have time for Monthly and Hourly. General idea is the same

# 
work.df <- read.csv("Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv", stringsAsFactors = F) %>% 
              mutate(Hour = gsub("(.+):.+","\\1",Time), Date = as.Date(Date, "%m/%d/%Y")) %>%
              select(IncidntNum, Date, Hour, PdDistrict) %>% 
              distinct() %>% 
              group_by(Date, Hour, PdDistrict) %>% 
              count() %>%
              data.frame()

# filter last day as incompleated 
work.df <- work.df %>% filter(Date != max(work.df$Date))

work.df %>% head()

Daily

Data Frame and Number of observation for each district

daily.df <- work.df %>% 
                group_by(Date, PdDistrict) %>% 
                summarise(n=sum(n)) %>%
                rename(ds = Date, y = n) %>%
                filter(PdDistrict != "")


daily.df %>% group_by(PdDistrict) %>% summarise(n())

Set of Help Functions

# The function is not mine originaly it was taken from prophet::cross_validation since it's being crushing on simple row bindings I had to rewrite some parts of it to make it work correctly
cross.validation <- function(model, horizon, units, period = NULL, initial = NULL)
{
    df <- model$history
    horizon.dt <- as.difftime(horizon, units = units)
    if (is.null(period)) {
        period <- 0.5 * horizon
    }
    period.dt <- as.difftime(period, units = units)
    period.max <- 0
    for (s in model$seasonalities) {
        period.max <- max(period.max, s$period)
    }
    seasonality.dt <- as.difftime(period.max, units = "days")
    if (is.null(initial)) {
        initial.dt <- max(as.difftime(3 * horizon, units = units), 
            seasonality.dt)
    }
    else {
        initial.dt <- as.difftime(initial, units = units)
        if (initial.dt < seasonality.dt) {
            warning(paste0("Seasonality has period of ", period.max, 
                " days which ", "is larger than initial window. Consider increasing initial."))
        }
    }
    
    cutoffs <- prophet:::generate_cutoffs(df, horizon.dt, initial.dt, period.dt)
    nms <- c("ds", "y", "yhat", "yhat_lower", "yhat_upper", "cutoff")
    predicts <-  data.frame(t(rep(NA,length(nms))))
    names(predicts) <- nms
    predicts <- predicts[-1,]
    
    for (i in 1:length(cutoffs)) {
        cutoff <- cutoffs[i]
        m <- prophet:::prophet_copy(model, cutoff)
        history.c <- dplyr::filter(df, ds <= cutoff)
        if (nrow(history.c) < 2) {
            stop("Less than two datapoints before cutoff. Increase initial window.")
        }
        m <- fit.prophet(m, history.c)
        df.predict <- dplyr::filter(df, ds > cutoff, ds <= cutoff + 
            horizon.dt)
        columns <- "ds"
        if (m$growth == "logistic") {
            columns <- c(columns, "cap")
            if (m$logistic.floor) {
                columns <- c(columns, "floor")
            }
        }
        columns <- c(columns, names(m$extra_regressors))
        for (name in names(m$seasonalities)) {
            condition.name = m$seasonalities[[name]]$condition.name
            if (!is.null(condition.name)) {
                columns <- c(columns, condition.name)
            }
        }
        future <- df.predict[columns]
        yhat <- stats::predict(m, future)
        df.c <- dplyr::inner_join(df.predict, yhat, by = "ds")
        df.c <- dplyr::select(df.c, ds, y, yhat, yhat_lower, 
            yhat_upper)
        df.c$cutoff <- cutoff
        predicts <- rbind(predicts, data.frame(df.c))
    }
    return(predicts)
}


# Function that creates Prophet model and returns set of metrics
get.metrics <- function(df, period = 360, horizon = 360 * 2, units = 'days')
{
  df %>%
    prophet() %>%
    cross.validation(period = period, horizon = horizon, units = units) %>%
    performance.metrics( metrics = c("smape", "mape", "rmse", "mad"))
}

Metrics by District

nms <- c("smape", "mape", "rmse", "mad", "PdDistrict")
metrics.df   <-  data.frame(t(rep(NA,length(nms))))
names(metrics.df) <- nms

metrics.df   <- metrics.df[-1,]

for (s in unique(daily.df$PdDistrict))
{
  
  tmp.df <- daily.df %>% 
             filter(PdDistrict == s) %>% 
             select(ds,y) %>% 
             data.frame() %>% 
             get.metrics()
  
  tmp.df[["PdDistrict"]] = s
  
  metrics.df <- rbind(metrics.df,  tmp.df)
  names(metrics.df) <- nms
}

metrics.df

Next Steps

Parameters tuning
Design and create Software and API
Use version control (for demo purpuses it is only notebook here. In reality it has to be GitHub or GitLab repo with defined software design especially after research stage)
Created set of unittests
Put it inside of Docker container
Depending on product create REST API or UI (can be Shiny UI)
Build better GEO models (Add metrics that includes geo locations and build approximations - didn’t have time for it)
Integrate App in production - Web Serivice, Library, Docker Container, AWS Lambda Container
CD/CI
Add complexity and tune models
Add logging functionality

Incedent Forecasting

Alexey Butyrev

Data View and Libraires

Work Plan

Load Data and Clear Base Data

Monthly, Weekly, Daily Forecasts

Monthly

Data Frame and Exploratory Plots

Monthly Model

Mertrics measurments

Daily

Data Frame and Exploratory Plots

Daily Model

Daily Metrics

Hourly

Data Frame and Exploratory Plots

Hourly Model

Hourly Metrics

Forecast by district

Daily

Data Frame and Number of observation for each district

Set of Help Functions

Metrics by District

Next Steps