The following code shows how to apply time series analysis techniques, ARIMA modelling in particular, to forecast future values of city administration processes.

All data can be freely downloaded from the City of Boston Open Data portal (https://data.cityofboston.gov/), with the exception of the Tax Assessor’s data, which is available at the Boston Area Research Initiative repository at Harvard’s Dataverse (https://dataverse.harvard.edu).

Load libraries and set options

options(scipen = 20)
require(stringr)
## Loading required package: stringr
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidyr)
## Loading required package: tidyr
require(ggplot2)
## Loading required package: ggplot2
library(lubridate)
library(highcharter)

Step 1: Data preparation

Building permits data set

BPdf <- read.csv('/home/havb/data/Boston/Building Permits 2015 - Data.csv', stringsAsFactors = F)

Fix extraneous characters

BPdf$PermitTypeDescr <- gsub("\xe4\xf3\xf1", "-" ,BPdf$PermitTypeDescr)

Flag renovations

# permit types counted as renovations
renovation.types <- c("BFD - Asbestos Removal", "BFD - Cutting-Burning-Welding", "Plumbing Permit",
                      "Gas Permit", "Electrical Fire Alarms", "Electrical Low Voltage",
                      "Electrical Permit", "Electrical Temporary Service", "Fire Protection Permit",
                      "Short Form Bldg Permit", "Temporary Construction Drivewa", "BFD - Bag Smoke Detectors",
                      "BFD - Temporary Dumpster", "BFD Chemist Certificate proces", "BBFD Application Tank Removal",
                      "BFD Approval of Tank Truck", "BFD Install or Modify Undergrd", "BFDAlteration of Fuel Oil Burn",
                      "Sidewalk Deposit")

BPdf$reno <- ifelse(BPdf$PermitTypeDescr %in% renovation.types, 1, 0)

Flag moves. A “Street Occupancy Permit” does not necessarily signify that someone is moving, as it is also used for dumpster placement and construction trucks, but it is still usually related to moves.

BPdf$moving <- ifelse(BPdf$PermitTypeDescr== "Street Occupancy Permit", 1,0)

Drop permits with no issuing date

BPdf <- filter(BPdf, ISSUED_DATE != "")

Convert ISSUED_DATE column into date format, and add a YearMonth column

padmonth <- function(int) return(ifelse(int > 9, int, paste(0,int,sep = "")))

BPdf <- mutate(BPdf, ISSUED_DATE = mdy_hm(ISSUED_DATE))
BPdf <- mutate(BPdf, YEARMONTH = paste(year(ISSUED_DATE), padmonth(month(ISSUED_DATE)), sep = "_"))
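As a side note, the same YEARMONTH key could be built without the padding helper by formatting the date directly; a minimal equivalent sketch:

# Equivalent to the padmonth() approach above: format() zero-pads the month
# BPdf <- mutate(BPdf, YEARMONTH = format(ISSUED_DATE, "%Y_%m"))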

Keep only records from 2010 on (earlier years are too sparse)

BPdf <- filter(BPdf, year(ISSUED_DATE) > 2009 & ISSUED_DATE < "2015-08-01 00:00:00 UTC")

Flag Northeastern University renovation permits

BPdf$NU <- ifelse(grepl("NORTHEASTERN U", BPdf$Owner_Upper) & !BPdf$newcon, TRUE, FALSE)

Aggregate

Aggregate by parcel and yearmonth, and then again by Census tract and by neighborhood, counting issued permits and adding up instances of:

“newcon”, “addition”, “demo”, “reno”, “moving”, “specialevents”, “uppereducation”, “healthcare”, “religious”, “government”, “civic”

Aggregation by parcel

BPdf_by_parcel <- BPdf %>% 
  filter(!is.na(CT_ID_10) & !is.na(YEARMONTH) & !is.na(parcel_num) ) %>%
  group_by(parcel_num, CT_ID_10, YEARMONTH) %>%
  select(DECLARED_VALUATION, TOTAL_FEES, newcon, addition, demo, reno, moving, specialevents,
         uppereducation, healthcare, religious, government, civic, Aesthetics, NU) %>%
  summarise(
    MEAN_DECLARED_VALUATION = mean(DECLARED_VALUATION, na.rm = TRUE),  # computed before DECLARED_VALUATION is overwritten by its sum
    DECLARED_VALUATION = sum(DECLARED_VALUATION, na.rm = TRUE),
    FEES = sum(TOTAL_FEES, na.rm = TRUE),
    NEWCONS = sum(newcon, na.rm = TRUE),
    ADDITIONS = sum(addition, na.rm = TRUE),
    DEMOLITIONS = sum(demo, na.rm = TRUE),
    RENOVATIONS = sum(reno, na.rm = TRUE),
    MOVES = sum(moving, na.rm = TRUE),
    SPECIAL_EVENTS = sum(specialevents, na.rm = TRUE),
    HIGHER_ED = sum(uppereducation, na.rm = TRUE),
    HEALTH_CARE = sum(healthcare, na.rm = TRUE),
    RELIGIOUS = sum(religious, na.rm = TRUE),
    GOVERNMENT = sum(government, na.rm = TRUE),
    CIVIC = sum(civic, na.rm = TRUE),
    NU =  sum(NU, na.rm = TRUE)
  )

Aggregation by Census tract

BPdf_by_tract <- BPdf_by_parcel %>%
  group_by(CT_ID_10, YEARMONTH) %>%
  summarise(
    BUILTPARCELS = n(),
    MEAN_DECLARED_VALUATION = mean(DECLARED_VALUATION, na.rm = TRUE),  # mean of parcel totals, computed before the sum overwrites the column
    DECLARED_VALUATION = sum(DECLARED_VALUATION, na.rm = TRUE),
    FEES = sum(FEES, na.rm = TRUE),
    NEWCONS = sum(NEWCONS, na.rm = TRUE),
    ADDITIONS = sum(ADDITIONS, na.rm = TRUE),
    DEMOLITIONS = sum(DEMOLITIONS, na.rm = TRUE),
    RENOVATIONS = sum(RENOVATIONS, na.rm = TRUE),
    MOVES = sum(MOVES, na.rm = TRUE),
    SPECIAL_EVENTS = sum(SPECIAL_EVENTS, na.rm = TRUE),
    HIGHER_ED = sum(HIGHER_ED, na.rm = TRUE),
    HEALTH_CARE = sum(HEALTH_CARE, na.rm = TRUE),
    RELIGIOUS = sum(RELIGIOUS, na.rm = TRUE),
    GOVERNMENT = sum(GOVERNMENT, na.rm = TRUE),
    CIVIC = sum(CIVIC, na.rm = TRUE),
    NU =  sum(NU, na.rm = TRUE)
  )

# Now include CT/month combinations with zero building permits
BPdf_by_tract$YEARMONTH <- as.factor(BPdf_by_tract$YEARMONTH)
BPdf_by_tract$CT_ID_10 <- as.factor(BPdf_by_tract$CT_ID_10)
  
BPdf_by_tract <- left_join( expand.grid( YEARMONTH = levels(BPdf_by_tract$YEARMONTH),
                                         CT_ID_10 = levels(BPdf_by_tract$CT_ID_10)),
                            BPdf_by_tract)

BPdf_by_tract[is.na(BPdf_by_tract)] <- 0
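The same zero-filling can also be done with tidyr’s complete(); a rough equivalent sketch, assuming the same column names:

# BPdf_by_tract <- BPdf_by_tract %>% ungroup() %>% complete(CT_ID_10, YEARMONTH)
# BPdf_by_tract[is.na(BPdf_by_tract)] <- 0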

Aggregation by neighborhood

# Append neighborhood names

tractsData <- read.csv('/home/havb/data/Boston/Census/Tract Census Data.csv', stringsAsFactors = F)

BPdf_by_tract <- merge(BPdf_by_tract, tractsData[c(2,8)], all.x = T, by = "CT_ID_10")
names(BPdf_by_tract)[19] <- "NB"


BPdf_by_NB <- BPdf_by_tract %>%
  group_by(NB, YEARMONTH) %>%
  summarise(
    BUILTPARCELS = sum(BUILTPARCELS),
    MEAN_DECLARED_VALUATION = mean(DECLARED_VALUATION, na.rm = TRUE),  # mean of tract totals, computed before the sum overwrites the column
    DECLARED_VALUATION = sum(DECLARED_VALUATION, na.rm = TRUE),
    FEES = sum(FEES, na.rm = TRUE),
    NEWCONS = sum(NEWCONS, na.rm = TRUE),
    ADDITIONS = sum(ADDITIONS, na.rm = TRUE),
    DEMOLITIONS = sum(DEMOLITIONS, na.rm = TRUE),
    RENOVATIONS = sum(RENOVATIONS, na.rm = TRUE),
    MOVES = sum(MOVES, na.rm = TRUE),
    SPECIAL_EVENTS = sum(SPECIAL_EVENTS, na.rm = TRUE),
    HIGHER_ED = sum(HIGHER_ED, na.rm = TRUE),
    HEALTH_CARE = sum(HEALTH_CARE, na.rm = TRUE),
    RELIGIOUS = sum(RELIGIOUS, na.rm = TRUE),
    GOVERNMENT = sum(GOVERNMENT, na.rm = TRUE),
    CIVIC = sum(CIVIC, na.rm = TRUE),
    NU =  sum(NU, na.rm = TRUE)
  )

Summarise the whole city

BP_Boston <- BPdf_by_parcel %>%
  group_by(YEARMONTH) %>%
  summarise(
    BUILTPARCELS = n(),
    VALUATION = sum(DECLARED_VALUATION, na.rm = TRUE),
    FEES = sum(FEES, na.rm = TRUE),
    NEWCONS = sum(NEWCONS, na.rm = TRUE),
    ADDITIONS = sum(ADDITIONS, na.rm = TRUE),
    DEMOLITIONS = sum(DEMOLITIONS, na.rm = TRUE),
    RENOVATIONS = sum(RENOVATIONS, na.rm = TRUE),
    MOVES = sum(MOVES, na.rm = TRUE),
    SPECIAL_EVENTS = sum(SPECIAL_EVENTS, na.rm = TRUE),
    HIGHER_ED = sum(HIGHER_ED, na.rm = TRUE),
    HEALTH_CARE = sum(HEALTH_CARE, na.rm = TRUE),
    RELIGIOUS = sum(RELIGIOUS, na.rm = TRUE),
    GOVERNMENT = sum(GOVERNMENT, na.rm = TRUE),
    CIVIC = sum(CIVIC, na.rm = TRUE),
    NU =  sum(NU, na.rm = TRUE)
  )
#write.csv(file = '/home/havb/data/Boston/BP_Boston.csv', BP_Boston, row.names = F)
#write.csv(file = '/home/havb/data/Boston/BPdf_by_NB.csv', BPdf_by_NB, row.names = F)
BP_Boston <- read.csv('/home/havb/data/Boston/BP_Boston.csv', stringsAsFactors = F)
BPdf_by_NB <- read.csv('/home/havb/data/Boston/BPdf_by_NB.csv', stringsAsFactors = F)

311 incidents data set

db311 <- read.table('/home/havb/data/Boston/311/CRM Cases 2010_2014 Unrestricted.tab', sep = "\t",header = T)

db311_2 <- read.table('/home/havb/data/Boston/311/CRM Cases 2015_2019 Unrestricted.tab', sep = "\t",header = T)

db311 <- rbind(db311, db311_2)

# A single pre-combined extract; this read supersedes the two files loaded above
db311 <- read.csv('/home/havb/data/Boston/311/CRM Cases 2010_2016.tab', stringsAsFactors = F )

# Convert OPEN_DT column into date format, and add a YearMonth column

padmonth <- function(int) return(ifelse(int > 9, int, paste(0,int,sep = "")))

db311 <- mutate(db311, OPEN_DT = ymd_hms(OPEN_DT))
db311 <- mutate(db311, YEARMONTH = paste(year(OPEN_DT), padmonth(month(OPEN_DT)), sep = "_"))

#Drop months with incomplete records
db311 <- filter(db311, YEARMONTH != "2010_02" & YEARMONTH != "2016_01")


#Aggregate by yearmonth and count instances of 

#  "SCHEDULE A BULK ITEM PICKUP"
#  "REQUEST FOR SNOW PLOWING"
#  "REQUESTS FOR STREET CLEANING"
#  "STREET LIGHT OUTAGES"              
#  "REQUEST FOR POTHOLE REPAIR"

db311_ag <- db311 %>%
  group_by(YEARMONTH) %>%
  summarise(
    BULK.ITEM.PICKUP = sum(TYPE == "SCHEDULE A BULK ITEM PICKUP", na.rm = TRUE),
    SNOW.PLOWING = sum(TYPE == "REQUEST FOR SNOW PLOWING", na.rm = TRUE),
    STREET.CLEANING = sum(TYPE == "REQUESTS FOR STREET CLEANING", na.rm = TRUE),
    STREETLIGHT.OUTAGE = sum(TYPE == "STREET LIGHT OUTAGES", na.rm = TRUE),
    POTHOLE.REPAIR = sum(TYPE == "REQUEST FOR POTHOLE REPAIR", na.rm = TRUE)
  )

#write.csv(file = '/home/havb/data/Boston/311/CRM_Cases_2010_2015_aggregated.csv', db311_ag, row.names = F)
db311_ag <- read.csv('/home/havb/data/Boston/311/CRM_Cases_2010_2015_aggregated.csv', stringsAsFactors = F)

Tax Assessor’s data set

TAdf <- read.csv('/home/havb/data/Boston/Tax Assessor Longitudinal - Data.csv', stringsAsFactors = F)

The dataframe has a number of duplicated rows. We’ll keep only unique observations.

TAdf <- TAdf[!duplicated(TAdf),]

Tidy the data so that each column is a variable and each row is an observation.

The original dataset is in “wide” format:

2006.VALUE  2007.VALUE  2008.VALUE
     54522       86325       26982

We have to turn it into a “long” dataframe:

YEAR  VALUE
2006  54522
2007  86325
2008  26982

The same problem needs to be fixed for land use (2006.LU, 2007.LU, etc.)

TAdf_subset_av <- select(TAdf, FY2000.AV, FY2001.AV, FY2002.AV, FY2003.AV, FY2004.AV, FY2005.AV, FY2006.AV, FY2007.AV, FY2008.AV, FY2009.AV, FY2010.AV, FY2011.AV, FY2012.AV, FY2013.AV, X2014.AV, X2015.AV, CT_ID_10)

TAdf_subset_lu <- select(TAdf, FY2000.LU, FY2001.LU, FY2002.LU, FY2003.LU, FY2004.LU, FY2005.LU, FY2006.LU, FY2007.LU, FY2008.LU, FY2009.LU, FY2010.LU, FY2011.LU, FY2012.LU, FY2013.LU, X2014.LU, X2015.LU)

We rename columns like “FY2004.AV”, “FY2005.AV”, etc. as “2004”, “2005”, … and then gather them as different observations of a YEAR variable, keeping the Census tract ID alongside each value.

names(TAdf_subset_av)[1:16] <- 2000:2015
TAdf_gathered_av <- TAdf_subset_av %>%
gather(YEAR, ASSESSED_VALUE, 1:16)

head(TAdf_gathered_av)

# Now land use
names(TAdf_subset_lu)[1:16] <- 2000:2015
TAdf_gathered_lu <- TAdf_subset_lu %>%
gather(YEAR, LU)

TAdf_gathered <- cbind(TAdf_gathered_av, LU = TAdf_gathered_lu$LU)

# Fix a single parcel in the entire dataset with LU = "XX",
# a land use code that is not registered in the dataset dictionary.
# Since the parcel is owned by a church, it is assumed to be a tax-exempt parcel.

TAdf_gathered[TAdf_gathered$LU == 'XX',]$LU <- "E"

# Filter out tax exempt parcels

TAdf_gathered <- filter(TAdf_gathered, LU != "E" & LU != "EA")

# Aggregate by year

TA_Boston <- TAdf_gathered %>%
  group_by(YEAR) %>%
  summarise(
    ASSESSED_VALUE = sum(as.numeric(ASSESSED_VALUE), na.rm = TRUE))


#write.csv(file = '/home/havb/data/Boston/TA_Boston', TA_Boston, row.names = F)
TA_Boston <- read.csv('/home/havb/data/Boston/TA_Boston', stringsAsFactors = F)

Step 2: Time Series Analysis and Forecasting

library(forecast)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: timeDate
## This is forecast 7.0

Our forecast methodology

Based on Hyndman, Rob J., and George Athanasopoulos. Forecasting: principles and practice. OTexts, 2014.

1. Plot the data. Identify any unusual observations.
2. If necessary, transform the data to stabilize the variance.
3. If the data are non-stationary: take first differences of the data until the data are stationary.
4. Examine the ACF/PACF: decide if an AR(p) or MA(q) model is appropriate.
5. Try the chosen model(s) (a generic sketch of the full workflow follows this list).
6. Check the residuals from the chosen model by plotting the ACF of the residuals, and observing the distribution of the residuals. If they do not look like white noise, a modified model is required.
7. Once the residuals look like white noise, calculate forecasts.
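A self-contained sketch of this workflow on a simulated monthly series (none of these objects belong to the Boston data; Arima(), tsdisplay() and forecast() come from the forecast package loaded above):

set.seed(1)
y <- ts(arima.sim(model = list(ar = 0.6, ma = 0.3), n = 72), start = c(2010, 1), frequency = 12)  # toy series
tsdisplay(y)                          # steps 1 and 4: inspect the series, its ACF and PACF
fit <- Arima(y, order = c(1, 0, 1))   # step 5: fit a candidate ARIMA model
tsdisplay(residuals(fit))             # step 6: residuals should resemble white noise
forecast(fit, h = 12)                 # step 7: compute forecasts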

Demolitions in Boston’s Downtown

Quick visual analysis of construction vs. demolition activity, per neighborhood:

my.plot.theme <- theme(text = element_text(family = "Futura Bk BT"),
                      #plot.title = element_text(size = 36),
                      axis.line = element_blank(),
                      axis.title.x = element_blank(),
                      axis.title.y = element_blank(),
                      axis.text.x = element_blank(),
                      #axis.text.y = element_blank(),
                      #legend.title = element_text(size = 16),
                      #legend.text = element_text(size = 24),
                      #strip.text = element_text(size = 20),
                      panel.grid.major = element_blank (), # remove major grid
                      panel.grid.minor = element_blank (),  # remove minor grid
                      axis.text = element_blank (), 
                      axis.title = element_blank (),
                      axis.ticks = element_blank ())
                      
                      

ggplot(data = BPdf_by_NB) +
  geom_line(aes(x = YEARMONTH, y = NEWCONS, group = 1, color = "new construction")) +
  geom_line(aes(x = YEARMONTH, y = DEMOLITIONS, group = 1, color = "demolition")) +  
  scale_colour_manual(values=c("new construction"="darkseagreen3","demolition" = "black"),
                      guide = guide_legend(title = NULL)) + 
  theme_minimal()+
  theme  (legend.position = "bottom") + 
  labs(title="New constructions Vs Demolitions (2010 - 2015)\n") +
  my.plot.theme +
  facet_wrap(~ NB) 

We will concentrate on demolitions in Downtown (the Central district), as the comparative plot by neighborhood shows a much higher occurrence there than anywhere else in the city.

Downtown.demolitions.ts <- ts(BPdf_by_NB[BPdf_by_NB$NB == "Central", "DEMOLITIONS"], start = c(2010,1), frequency = 12)

hchart(Downtown.demolitions.ts) %>% 
    hc_title(text = "Approved Demolition permits - Boston Downtown")

Let’s take a look at its first-order differencing:

hchart(diff(Downtown.demolitions.ts))

That took care of the trend. The mean is clearly stable now, but there is a worrying increase in variance toward the end of the series, which may indicate non-stationarity. To be sure, we will run a unit root test to check for non-stationarity:

tseries::adf.test(diff(Downtown.demolitions.ts), k = 12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(Downtown.demolitions.ts)
## Dickey-Fuller = -3.9278, Lag order = 12, p-value = 0.0184
## alternative hypothesis: stationary

With a p-value well below the 0.05 threshold, we can assume that the differenced time series is stationary. We can continue with a look at the autocorrelation plots:

tsdisplay(diff(Downtown.demolitions.ts))

There is significant autocorrelation at lag 1, suggesting an MA(1) process. There is also significant autocorrelation at lags 14 and 15, perhaps as a result of seasonality. There is partial autocorrelation at lags 1 and 2 (suggesting an AR(2) component) and a spike of partial autocorrelation at lag 13, indicating the possibility of yearly AR seasonality.

AR (autoregression) models predict future values of a variable of interest using a linear combination of past values of that variable. The term “autoregression” indicates that it is a regression of the variable against itself.

An autoregressive model of order \(p\) is written as:

\[ y_{t} = c + \phi_{1}y_{t-1} + \phi_{2}y_{t-2} + \dots + \phi_{p}y_{t-p} + e_{t}, \]

where \(c\) is a constant and \(e_t\) is white noise. This is referred to as an AR(\(p\)) model.
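As a quick illustration (simulated data, not part of the Boston analysis), an AR(2) process can be generated with arima.sim() and shows the expected PACF signature:

set.seed(123)
ar2.sim <- arima.sim(model = list(ar = c(0.5, -0.3)), n = 200)  # simulate an AR(2) process
tsdisplay(ar2.sim)  # PACF cuts off after lag 2, the AR(2) signature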

MA (moving average) models, instead of using past values of the forecast variable in a regression, use past forecast errors in a regression-like model.

\[ y_{t} = c + e_t + \theta_{1}e_{t-1} + \theta_{2}e_{t-2} + \dots + \theta_{q}e_{t-q}, \]

where \(e_t\) is white noise. We refer to this as an MA(\(q\)) model.
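The corresponding illustration for an MA(1) process (again simulated, for reference only):

set.seed(123)
ma1.sim <- arima.sim(model = list(ma = 0.8), n = 200)  # simulate an MA(1) process
tsdisplay(ma1.sim)  # ACF cuts off after lag 1, the MA(1) signature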

When a process is modelled with an AR component as well as an MA one, it is called an ARMA model. When integration (time series differencing) is added, it is called an ARIMA model.

ARIMA models can also model seasonal data. A seasonal ARIMA model is formed by including additional seasonal terms, and is written as ARIMA\((p,d,q)(P,D,Q)_m\),

where \(m\) is the number of periods per season. We use uppercase notation for the seasonal parts of the model, and lowercase notation for the non-seasonal parts of the model.

The seasonal part of the model consists of terms that are very similar to the non-seasonal components of the model, but they involve backshifts of the seasonal period. For example, an ARIMA(1,1,1)(1,1,1)\(_4\) model (without a constant) is for quarterly data (\(m=4\)) and can be written, using the backshift operator \(B\), as

\[ (1 - \phi_{1}B)(1 - \Phi_{1}B^{4})(1 - B)(1 - B^{4})y_{t} = (1 + \theta_{1}B)(1 + \Theta_{1}B^{4})e_{t} \]

(See https://www.otexts.org/fpp/8/9 )
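For reference, this notation maps directly onto the arguments of the forecast package’s Arima() function; a hypothetical sketch for the quarterly example above (y.quarterly stands for any quarterly ts object, not a variable in this analysis):

# fit <- Arima(y.quarterly, order = c(1, 1, 1),
#              seasonal = list(order = c(1, 1, 1), period = 4))
# with d = 1 and D = 1, Arima() omits the constant automatically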

We’ll model the process as an ARIMA(2,1,1)(1,1,0). That is, an AR(2), I(1), MA(1) process with seasonal AR and I components:

dd.fit <- Arima(Downtown.demolitions.ts, order = c(2,1,1), seasonal = c(1,1,0))

Now we’ll examine the distribution of the residuals with a custom function (courtesy of http://www.quantlego.com):

PlotForecastErrors <- function(forecasterrors)
{
  # Plot a histogram of the forecast errors (red) with an overlaid normal
  # density of the same standard deviation (blue), to judge normality.
  mybinsize <- IQR(forecasterrors)/4
  mysd <- sd(forecasterrors)
  mymin <- min(forecasterrors)-mysd*5
  mymax <- max(forecasterrors)+mysd*5
  # simulate normally distributed data with mean 0 and the same sd,
  # widening the plotting range if the simulated values exceed it
  mynorm <- rnorm(10000,mean=0,sd=mysd)
  mymin2 <- min(mynorm)
  mymax2 <- max(mynorm)
  if (mymin2<mymin)
  {
    mymin<-mymin2
  }
  if (mymax2>mymax)
  {
    mymax<-mymax2
  }
  # histogram of the observed errors, with the normal density drawn on top
  mybins <- seq(mymin,mymax,mybinsize)
  hist(forecasterrors, col="red", freq=FALSE, breaks=mybins)
  myhist <- hist(mynorm, plot=FALSE, breaks=mybins)
  points(myhist$mids, myhist$density, type="l", col="blue", lwd=2)
}

PlotForecastErrors(dd.fit$residuals)

The histogram shows that the forecast errors are roughly centered on zero and approximately normally distributed, so it is plausible that the forecast errors have mean zero and constant variance.

tsdisplay(dd.fit$residuals)

The residuals don’t display any autocorrelation, so we can assume that our model provides a good approximation.
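A more formal check, not part of the original write-up, is the Ljung-Box test on the residuals; a large p-value is consistent with white noise:

Box.test(dd.fit$residuals, lag = 24, type = "Ljung-Box", fitdf = 4)  # fitdf = p + q + P + Q = 4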

This is our two-year forecast for approved demolitions in the City’s downtown:

hchart(forecast(dd.fit, h=24)) %>% 
    hc_title(text = "Two year forecast: Approved Demolition permits - Boston Downtown")

Maintenance activity (example: Northeastern’s facilities)

The approval of a building permit is required by the city for additions to existing structures, as well as for the execution of general maintenance jobs (electricity, plumbing, hazardous materials removal, etc.). It is possible to use the approved permits database to track the “renovation rhythm” of government agencies, universities, churches and other institutions that own a significant number of built parcels in the city.

For this example, during the data preparation phase we have isolated all renovation and addition permits approved for parcels owned by Northeastern University. We’ll use the information to predict Northeastern’s future maintenance efforts.

NU.maint.ts <- ts(BP_Boston["NU"], start = c(2010,1), frequency = 12)
hchart(NU.maint.ts) %>% 
    hc_title(text = "Northeastern University maintenance permits approved")

Again a seasonal effect, and a trend. Let’s check the differenced series:

tsdisplay(diff(NU.maint.ts))

Clearly more stationary.

The only significant autocorrelation spike is at lag 12, indicating the possibility of yearly seasonality. None of the first lags show significant autocorrelation or partial autocorrelation, so there is no evidence of a non-seasonal AR or MA component. We’ll model the process as an ARIMA(0,1,0) with a seasonal (1,1,0) component:

nm.fit <- Arima(NU.maint.ts, order = c(0,1,0), seasonal = c(1,1,0))

Now examine the distribution of the residuals:

PlotForecastErrors(nm.fit$residuals)

Forecast errors are nicely centered around zero, and normally distributed.

tsdisplay(nm.fit$residuals)

The residuals don’t display any autocorrelation, so we can assume that our model provides a good approximation.

This is our two-year forecast for maintenance activity at Northeastern University facilities:

hchart(forecast(nm.fit, h=24)) %>% 
    hc_title(text = "Two year forecast: Northeastern University requested maintenance permits")

It is worth noting that predicted values fall below 0 at the outer fringes of the confidence interval. Of course, there cannot be a negative number of approved building permits: any value below 0 must be interpreted as 0 for this use case. ARIMA processes cannot be guaranteed to simulate only positive values. See http://stackoverflow.com/questions/27672891/generate-strictly-positive-values-using-arima-sim-in-r
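One possible workaround, sketched here as an assumption rather than something done in the original analysis, is to fit the model on a log scale (a Box-Cox transformation with lambda = 0), so that the back-transformed forecasts stay positive; a constant is added first because the series contains zero counts:

# nm.fit.log <- Arima(NU.maint.ts + 1, order = c(0,1,0), seasonal = c(1,1,0), lambda = 0)
# hchart(forecast(nm.fit.log, h = 24))  # remember to subtract the added constant from the point forecasts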

Demand of City Services

Information extracted from the City of Boston 311 database can be used to estimate seasonal demand for city services (garbage collection, pothole fixing, snow plowing, and so on).

Pothole fix requests

potholes.ts <- ts(db311_ag[6], start = c(2010,3), frequency = 12) 
hchart(potholes.ts)  %>% 
    hc_title(text = "Boston 311 pothole fix requests")

tsdisplay(diff(potholes.ts))

Interestingly, there are spikes at lags 6, 12 and 18, suggesting periodicity.

The lack of autocorrelation spikes at the initial lags indicates neither an AR nor an MA process.

ph.fit <- Arima(potholes.ts, order = c(0,1,0), seasonal = c(0,1,0))
PlotForecastErrors(ph.fit$residuals)

tsdisplay(ph.fit$residuals)

This is our two-year forecast for pothole fix requests:

hchart(forecast(ph.fit, h=24)) %>% 
    hc_title(text = "Two year forecast: Boston 311 pothole fix requests (ARIMA)")

Again, there are negative predictions inside the confidence interval - we’ll consider those equivalent to 0.

An alternative model can be defined using Holt’s exponential smoothing. The basic method, which fits a linear trend, is based on a forecast equation and two smoothing equations (one for the level and one for the trend):

\[ \text{Forecast equation: } \hat{y}_{t+h|t} = \ell_{t} + hb_{t} \]
\[ \text{Level equation: } \ell_{t} = \alpha y_{t} + (1 - \alpha)(\ell_{t-1} + b_{t-1}) \]
\[ \text{Trend equation: } b_{t} = \beta^*(\ell_{t} - \ell_{t-1}) + (1 -\beta^*)b_{t-1} \]

where \(\ell_{t}\) denotes an estimate of the level of the series at time \(t\), \(b_t\) denotes an estimate of the trend (slope) of the series at time \(t\), \(\alpha\) is the smoothing parameter for the level, \(0\le\alpha\le1\), and \(\beta^*\) is the smoothing parameter for the trend, \(0 \le \beta^* \le 1\).

Since the pothole fix request time series shows seasonal behaviour, Holt’s method cannot be applied directly. It has been extended to capture seasonality with the forecast equation and three smoothing equations: one for the level \(\ell_{t}\), one for the trend \(b_t\), and one for the seasonal component \(s_t\), with smoothing parameters \(\alpha\), \(\beta^*\) and \(\gamma\).
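For reference, the forecast package exposes this seasonal extension directly through its hw() function; a sketch of the equivalent call (an alternative to the HoltWinters() function used below):

# hw(potholes.ts, seasonal = "additive", h = 24)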

R’s HoltWinters() function (from the stats package) automatically estimates the smoothing parameters by minimizing the squared one-step prediction errors (fitted values in red):

plot(HoltWinters(potholes.ts))

The resulting forecast is very similar to the ARIMA option, but it offers a more constrained confidence interval, which seems appropriate in this scenario: 

hchart(forecast(HoltWinters(potholes.ts))) %>% 
    hc_title(text = "Two year forecast: Boston 311 pothole fix requests (Holt-Winter)")

However, even though the residuals are normally distributed, they retain some autocorrelation:

PlotForecastErrors(potholes.ts - HoltWinters(potholes.ts)$fitted[,1])

tsdisplay(potholes.ts - HoltWinters(potholes.ts)$fitted[,1])

So the ARIMA model is to be preferred.

Snow plowing requests

snowplow.ts <-ts(db311_ag[3], start = c(2010,3), frequency = 12) 

hchart(snowplow.ts) %>% 
    hc_title(text = "Boston 311 street snow plow requests")

Interestingly, the series looks stationary right from the start. The augmented Dickey–Fuller test agrees:

tseries::adf.test(snowplow.ts)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  snowplow.ts
## Dickey-Fuller = -3.9513, Lag order = 4, p-value = 0.01703
## alternative hypothesis: stationary

If we difference the time series anyway, its strong “rhythmic” structure becomes apparent; it looks like a biological pulse:

hchart(diff(snowplow.ts))

Of course, the rhythm is related to the relatively few days when enough snow accumulates in Boston to make citizens call for help. A smooth exponential model may not be ideal here, as the sudden jumps in values are not picked up by the Holt-Winters equations:

plot(HoltWinters(snowplow.ts))

Let’s try an ARIMA model:

tsdisplay(snowplow.ts)

Autocorrelations are almost nonexistent at lag 1, and absent after that. We can try to model an AR(1), MA(1) process with seasonal differencing:

sp.fit <- Arima(snowplow.ts, order = c(1,0,1), seasonal = c(0,1,0))

hchart(forecast(sp.fit))

The results are not good: the model didn’t pick up the alternating highs and lows in consecutive years.

Let’s go back to the differenced series looking for clues:

tsdisplay(diff(snowplow.ts))

There’s much more information now. The correlations suggest an AR(2), MA(1) process. Direct observation of the series suggests a two-year seasonality at play, confirmed by the PACF spike at lag 23, almost a two-year period. We’ll estimate a new model: AR(2), I(1), MA(1) with a two-year seasonal period:

sp.fit <- Arima(snowplow.ts, order = c(2,1,1), seasonal = list(order=c(1,1,1), period = 24))
PlotForecastErrors(sp.fit$residuals)

tsdisplay(sp.fit$residuals)

Residuals look good, and the prediction is much better now:

hchart(forecast(sp.fit, h = 24))  %>% 
    hc_title(text = "Two year Forecast: Boston 311 street snow plow requests")

Bulk item pickup requests

Bulk item pickups are incidents that reflect a citizen asking the City to dispose of a large discarded object (old refrigerators, furniture, smaller items in bulk, etc).

bulk.pickup.ts  <-ts(db311_ag[2], start = c(2010,3), frequency = 12)

hchart(bulk.pickup.ts) %>% 
    hc_title(text = "Boston 311 bulk item pickup requests")

tsdisplay(diff(bulk.pickup.ts))

Following the autocorrelation plot, we’ll model a first-order integrated AR(2), MA(1) process:

bpick.fit <- Arima(bulk.pickup.ts, order = c(2,1,1), seasonal = c(1,1,1))

PlotForecastErrors(bpick.fit$residuals)

tsdisplay(bpick.fit$residuals)

hchart(forecast(bpick.fit, h = 24))  %>% 
    hc_title(text = "Two year Forecast: Boston 311 bulk item pickup requests")

Once again, a Holt-Winters smoothing approximation yields similar results:

hchart(forecast(HoltWinters(bulk.pickup.ts), h = 24))  %>%
    hc_title(text = "Two year Forecast: Boston 311 bulk item pickup requests (Holt-Winter approximation)")  

But the errors are correlated, so the ARIMA model performs better:

PlotForecastErrors(bulk.pickup.ts - HoltWinters(bulk.pickup.ts)$fitted[,1])

tsdisplay(bulk.pickup.ts - HoltWinters(bulk.pickup.ts)$fitted[,1])

Sources of income

Forecasting the accumulated value of the city’s private property is important because property taxes are one of the main sources of income for the city government. To some extent, the same can be said of building projects (additions, new constructions, etc.), since the city charges a fee based on the declared project values.

Total taxable property value

Monetary inflation increases figures from year to year, even when there are no significant changes in the underlying process (such as real estate value dynamics). To compensate for this, a drift term can be introduced to adjust our predictions.

TaxAssessor.ts  <-ts(TA_Boston[2], start = 2000, frequency = 1)

hchart(TaxAssessor.ts) %>%
    hc_title(text = "Total accumulated taxable property value (USD)")

tsdisplay(diff(TaxAssessor.ts))

tseries::adf.test(diff(TaxAssessor.ts))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(TaxAssessor.ts)
## Dickey-Fuller = -0.91826, Lag order = 2, p-value = 0.9326
## alternative hypothesis: stationary

Once differenced, the lack of lag correlations indicates that the process is neither AR nor MA. As expected, the ADF test indicates non-stationarity, since we have a drift.

We’ll model the process as a random walk, that is, an ARIMA(0,1,0) process, with a drift term:

ta.fit <- Arima(TaxAssessor.ts, order = c(0,1,0), include.drift = TRUE)
PlotForecastErrors(ta.fit$residuals)

tsdisplay(ta.fit$residuals)

hchart(forecast(ta.fit, h = 5))  %>%
    hc_title(text = "Five year forecast: Total accumulated taxable property value (USD)")  

Total declared valuation of construction/addition projects

DeclaredValuation.ts  <- ts(BP_Boston["VALUATION"], start = c(2010,1), frequency = 12)

hchart(DeclaredValuation.ts) %>%
    hc_title(text = "Total declared valuation for construction projects (USD)")

As usual, we difference the series to remove the trend:

tsdisplay(diff(DeclaredValuation.ts))

tseries::adf.test(diff(DeclaredValuation.ts))
## Warning in tseries::adf.test(diff(DeclaredValuation.ts)): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(DeclaredValuation.ts)
## Dickey-Fuller = -4.2699, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary

The lags suggest AR(2), MA(1). The ADF test indicates stationarity, so perhaps a drift term is not necessary.

We’ll compare models with and without a drift term

dv.no_drift.fit <- Arima(DeclaredValuation.ts, order = c(2,1,1))
dv.drift.fit <- Arima(DeclaredValuation.ts, order = c(2,1,1), include.drift = TRUE)

“Error in solve.default(res$hessian * n.used, A) : system is computationally singular: reciprocal condition number = 9.42933e-17”

The Arima algorithm cannot estimate a model of the desired order with a drift term. Based on trial and error, a way was found to include a drift term: moving the differencing to the seasonal component:

dv.drift.fit <- Arima(DeclaredValuation.ts, order = c(2,0,1), seasonal = c(0,1,0), include.drift = TRUE)

Now we compare the models by checking their AIC and BIC values (the lower, the better the fit):

dv.no_drift.fit
## Series: DeclaredValuation.ts 
## ARIMA(2,1,1)                    
## 
## Coefficients:
##           ar1      ar2      ma1
##       -0.4657  -0.2203  -0.4265
## s.e.   0.2419   0.1881   0.2308
## 
## sigma^2 estimated as 40316212831980136:  log likelihood=-1354.33
## AIC=2716.66   AICc=2717.32   BIC=2725.42
dv.drift.fit
## Series: DeclaredValuation.ts 
## ARIMA(2,0,1)(0,1,0)[12] with drift         
## 
## Coefficients:
##          ar1     ar2      ma1    drift
##       0.6212  0.0796  -0.5088  6743728
## s.e.  0.3677  0.1633   0.3502  4675626
## 
## sigma^2 estimated as 71896894484850760:  log likelihood=-1143.4
## AIC=2296.8   AICc=2298.02   BIC=2306.84

The model with drift is clearly better. Now, let’s check the residuals:

PlotForecastErrors(dv.drift.fit$residuals)

tsdisplay(dv.drift.fit$residuals)

There’s clear autocorrelation and partial autocorrelation at lag 12. We need to add AR and MA terms to the seasonal component.

dv.drift.fit <- Arima(DeclaredValuation.ts, order = c(2,0,1), seasonal = c(1,1,1), include.drift = TRUE)
PlotForecastErrors(dv.drift.fit$residuals)

tsdisplay(dv.drift.fit$residuals)

Much better now. There is still some near-significant correlation at a few lags, but it stays below the significance threshold.

hchart(forecast(dv.drift.fit, h = 24))  %>%
    hc_title(text = "Two year forecast: Total declared valuation for construction projects (USD)")