library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.4.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(fable)
## Warning: package 'fable' was built under R version 4.4.3
## Loading required package: fabletools
## Warning: package 'fabletools' was built under R version 4.4.3
library(feasts)
## Warning: package 'feasts' was built under R version 4.4.3
library(lubridate)
library(fabletools)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Load the data
df <- read_csv("https://yichenqin.github.io/dataviz/data/UC_crime.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 33259 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (28): INSTANCEID, INCIDENT_NO, DATE_REPORTED, DATE_FROM, DATE_TO, CLSD, ...
## dbl (10): UCR, DST, BEAT, RPT_AREA, HOUR_FROM, HOUR_TO, LONGITUDE_X, LATITUD...
## lgl (2): ZIP, SNA_NEIGHBORHOOD
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
This dataset contains reported crime data from the University of Cincinnati area, created and shared by Professor Yichen Qin. It includes data such as date of crime report, crime type, time details, and other attributes.
DATE_REPORTEDWe chose this dataset because crime forecasting can help with resource planning and proactive policing around the campus. Forecasting can also uncover seasonal or cyclic behavior that supports smarter operational decisions.
# Convert date
crime <- df %>%
mutate(date = as.Date(DATE_REPORTED, format = "%m/%d/%Y")) %>%
filter(!is.na(date)) %>%
count(date, name = "crime_count") %>%
complete(date = seq.Date(min(date), max(date), by = "day")) %>%
mutate(crime_count = replace_na(crime_count, 0))
# Create weekday variable
crime <- crime %>%
mutate(day_of_week = wday(date, label = TRUE))
# Convert to tsibble
crime_ts <- crime %>%
as_tsibble(index = date)
# Daily plot
autoplot(crime_ts, crime_count) +
labs(title = "Daily Crime Counts", y = "Count")
# Seasonality
gg_season(crime_ts, crime_count) + labs(title = "Seasonal Plot")
gg_subseries(crime_ts, crime_count) + labs(title = "Subseries Plot")
# Heatmap of weekdays
crime %>%
count(day_of_week) %>%
ggplot(aes(x = day_of_week, y = n, fill = day_of_week)) +
geom_col() +
labs(title = "Crimes by Day of Week", y = "Total")
# Add rolling average
crime <- crime %>%
mutate(
roll_7 = zoo::rollmean(crime_count, 7, fill = NA, align = "right"),
roll_30 = zoo::rollmean(crime_count, 30, fill = NA, align = "right")
)
# Plot with smoothing
crime %>%
ggplot(aes(x = date)) +
geom_line(aes(y = crime_count), alpha = 0.5) +
geom_line(aes(y = roll_7), color = "blue") +
geom_line(aes(y = roll_30), color = "red") +
labs(title = "Crime Count with Rolling Averages")
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 29 rows containing missing values or values outside the scale range
## (`geom_line()`).
stl_model <- crime_ts %>% model(STL(crime_count))
components(stl_model) %>% autoplot()
crime_weekly <- crime_ts %>%
index_by(week = ~ yearweek(.)) %>%
summarise(weekly_count = sum(crime_count))
# Use last 12 months as test
train <- crime_weekly %>% filter(week < yearweek("2023 W01"))
test <- crime_weekly %>% filter(week >= yearweek("2023 W01"))
fit_models <- model(
train,
ARIMA = ARIMA(weekly_count),
ETS = ETS(weekly_count),
TSLM = TSLM(weekly_count ~ trend() + season())
)
## Warning: Seasonal periods (`period`) of length greather than 24 are not
## supported by ETS. Seasonality will be ignored.
# Model info
glance(fit_models)
## # A tibble: 3 × 20
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 ARIMA 235. -2727. 5466. 5466. 5493. <cpl [1]> <cpl [55]> NA NA NA
## 2 ETS 237. -3940. 7885. 7885. 7899. <NULL> <NULL> 236. 261. 11.2
## 3 TSLM 492. -2950. 4138. 4148. 4380. <NULL> <NULL> NA NA NA
## # ℹ 9 more variables: r_squared <dbl>, adj_r_squared <dbl>, statistic <dbl>,
## # p_value <dbl>, df <int>, CV <dbl>, deviance <dbl>, df.residual <int>,
## # rank <int>
report(fit_models)
## Warning in report.mdl_df(fit_models): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 3 × 20
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots MSE AMSE MAE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 ARIMA 235. -2727. 5466. 5466. 5493. <cpl [1]> <cpl [55]> NA NA NA
## 2 ETS 237. -3940. 7885. 7885. 7899. <NULL> <NULL> 236. 261. 11.2
## 3 TSLM 492. -2950. 4138. 4148. 4380. <NULL> <NULL> NA NA NA
## # ℹ 9 more variables: r_squared <dbl>, adj_r_squared <dbl>, statistic <dbl>,
## # p_value <dbl>, df <int>, CV <dbl>, deviance <dbl>, df.residual <int>,
## # rank <int>
# Add naive model
fit_models <- model(
train,
ARIMA = ARIMA(weekly_count),
ETS = ETS(weekly_count ~ error("A") + trend("A") + season("N")),
TSLM = TSLM(weekly_count ~ trend()),
Naive = NAIVE(weekly_count)
)
# Merge predictors
daily_full <- df %>%
mutate(date = as.Date(DATE_REPORTED, format = "%m/%d/%Y")) %>%
filter(!is.na(date)) %>%
count(date, name = "crime_count") %>%
complete(date = seq.Date(min(date), max(date), by = "day")) %>%
mutate(
crime_count = replace_na(crime_count, 0),
day_of_week = wday(date, label = TRUE),
weekend = ifelse(day_of_week %in% c("Sat", "Sun"), 1, 0)
) %>%
as_tsibble(index = date)
# Fit TSLM with predictors
predictor_model <- model(daily_full, TSLM = TSLM(crime_count ~ weekend + trend() + season()))
report(predictor_model)
## Series: crime_count
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7535 -3.7010 -0.8473 2.4897 56.8613
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.764e+00 2.597e-01 22.194 < 2e-16 ***
## weekend 7.160e-01 3.073e-01 2.330 0.019852 *
## trend() 4.294e-04 6.179e-05 6.950 4.18e-12 ***
## season()week2 5.421e-01 3.073e-01 1.764 0.077792 .
## season()week3 -5.512e-01 3.073e-01 -1.794 0.072925 .
## season()week4 NA NA NA NA
## season()week5 1.153e+00 3.073e-01 3.753 0.000177 ***
## season()week6 4.419e-01 3.074e-01 1.437 0.150680
## season()week7 2.102e-01 3.073e-01 0.684 0.494114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.574 on 4597 degrees of freedom
## Multiple R-squared: 0.01452, Adjusted R-squared: 0.01302
## F-statistic: 9.674 on 7 and 4597 DF, p-value: 5.2726e-12
# Plot residuals for each model separately
fit_models %>%
select(ARIMA) %>%
gg_tsresiduals()
fit_models %>%
select(ETS) %>%
gg_tsresiduals()
fit_models %>%
select(TSLM) %>%
gg_tsresiduals()
fit_models %>%
select(Naive) %>%
gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
best_model <- fit_models %>% select(ARIMA)
forecast_vals <- forecast(best_model, h = "12 weeks")
forecast_vals %>%
autoplot(crime_weekly) +
labs(title = "12-Week Crime Forecast")
nnet_model <- model(train, NNETAR = NNETAR(weekly_count))
forecast(nnet_model, h = 12) %>% autoplot(train)