1. Load Packages

2. Import Data

EvictionData <- read.csv("C:/Users/schic/OneDrive/Documents/Predictive Analytics and Forecasting/Monthly Eviction Filings by Location.csv")

This data set, from data.world, provides monthly evictions for 10 states and 34 cities from 2020 until 2023. Including the city, the data collection type, the GEOID, the racial majority for the area, the month-year of occurrences, the number of filings, average filings, and the date of last update.

3. Data Exploration/Manipulation

#check for missing values
missmap(EvictionData)

There is no missing data, so we will not need to impute any data points

#filtering data for variables of interest
Evictions <- EvictionData %>%
  select('city', 'racial_majority', 'month', 'filings', 'GEOID')

#convert Month variable to Date format
Evictions$date <- as.Date(paste0("01-", Evictions$month), format = "%d-%b-%y")



#creeate tsibble object to aggregate GEOID and city
Evictions_tsibble <- tsibble(Evictions, key = c(GEOID, city), index = date)
Evictions_tsibble <- mutate(Evictions_tsibble, date = yearmonth(date))
#setting up training and testing datasets
Evictions_train <- Evictions %>%
  filter(date >= "2020-01-01" & date <= "2022-12-01")
Evictions_test <- Evictions %>%
  filter(date >= "2023-01-01" & date <= "2023-09-01")

4. Modeling

a. ETS

#establishing hierarchichal structure
Evictions_hts <- Evictions_tsibble %>%
  aggregate_key(city/GEOID, Value = sum(filings))
Evictions_hts
## # A tsibble: 403,931 x 4 [1M]
## # Key:       city, GEOID [9,124]
##        date city         GEOID        Value
##       <mth> <chr*>       <chr*>       <int>
##  1 2020 Jan <aggregated> <aggregated> 73372
##  2 2020 Feb <aggregated> <aggregated> 65198
##  3 2020 Mar <aggregated> <aggregated> 38233
##  4 2020 Apr <aggregated> <aggregated>  5375
##  5 2020 May <aggregated> <aggregated>  7898
##  6 2020 Jun <aggregated> <aggregated> 15018
##  7 2020 Jul <aggregated> <aggregated> 17535
##  8 2020 Aug <aggregated> <aggregated> 29053
##  9 2020 Sep <aggregated> <aggregated> 35418
## 10 2020 Oct <aggregated> <aggregated> 38641
## # ℹ 403,921 more rows
Evictions_hts |>
  filter(is_aggregated(GEOID)) |>
  autoplot(Value) +
  labs(y = "Filings #",
       title = "Number of Filings for Eviction in Each City (aggregated by GEOID)") +
  facet_wrap(vars(city), scales = "free_y", ncol = 6) +
  theme(legend.position = "none")

Evictions_tsibble2 <- tsibble(Evictions_train, key = c(GEOID, city), index = date)
Evictions_tsibble2 <- mutate(Evictions_tsibble2, date = yearmonth(date))

Evictions_tsibble3 <- tsibble(Evictions_test, key = c(GEOID, city), index = date)
Evictions_tsibble3 <- mutate(Evictions_tsibble3, date = yearmonth(date))
Evictions_hts2 <- Evictions_tsibble2 %>%
  aggregate_key(city/GEOID, Value = sum(filings))
Evictions_hts2
## # A tsibble: 328,464 x 4 [1M]
## # Key:       city, GEOID [9,124]
##        date city         GEOID        Value
##       <mth> <chr*>       <chr*>       <int>
##  1 2020 Jan <aggregated> <aggregated> 73372
##  2 2020 Feb <aggregated> <aggregated> 65198
##  3 2020 Mar <aggregated> <aggregated> 38233
##  4 2020 Apr <aggregated> <aggregated>  5375
##  5 2020 May <aggregated> <aggregated>  7898
##  6 2020 Jun <aggregated> <aggregated> 15018
##  7 2020 Jul <aggregated> <aggregated> 17535
##  8 2020 Aug <aggregated> <aggregated> 29053
##  9 2020 Sep <aggregated> <aggregated> 35418
## 10 2020 Oct <aggregated> <aggregated> 38641
## # ℹ 328,454 more rows
ETS_model <- Evictions_hts2 |>
  filter(is_aggregated(GEOID), is_aggregated(city)) |>
  model(ETS(Value ~ error("A") + trend("A") + season("A")))
fc <- ETS_model |> forecast(h = "9 months")
fc |>
  autoplot(Evictions_hts) +
  labs(y = "# of Filings") +
  facet_wrap(vars(city), scales = "free_y")

print(accuracy(ETS_model))
## # A tibble: 1 × 12
##   city        GEOID      .model .type    ME   RMSE   MAE   MPE  MAPE  MASE RMSSE
##   <chr*>      <chr*>     <chr>  <chr> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <aggregate… <aggregat… "ETS(… Trai…  43.7 10521. 5739. -11.9  25.5 0.262 0.425
## # ℹ 1 more variable: ACF1 <dbl>

b. Neural Nets

#Evictions_hts3 <- Evictions_hts2 |>
  #filter(is_aggregated(GEOID), is_aggregated(city))

fit <- Evictions_hts2 |>
  filter(is_aggregated(GEOID), is_aggregated(city)) |>
  model(NNETAR(sqrt(Value)))
forecasted_values <- fit |>
  forecast(h = "9 months")

forecasted_values |>
  autoplot(Evictions_hts) +
  labs( x = "Time", y = "# of Filings", title = "# of Filings for Eviction (Neural Networks Forecast)")

forecasted_values
## # A fable: 9 x 6 [1M]
## # Key:     city, GEOID, .model [1]
##   city         GEOID        .model                  date        Value  .mean
##   <chr*>       <chr*>       <chr>                  <mth>       <dist>  <dbl>
## 1 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Jan sample[5000] 63929.
## 2 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Feb sample[5000] 63765.
## 3 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Mar sample[5000] 61674.
## 4 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Apr sample[5000] 62864.
## 5 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 May sample[5000] 61783.
## 6 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Jun sample[5000] 60412.
## 7 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Jul sample[5000] 59051.
## 8 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Aug sample[5000] 58239.
## 9 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Sep sample[5000] 57982.
print(accuracy(fit))
## # A tibble: 1 × 12
##   city        GEOID      .model .type    ME  RMSE   MAE    MPE  MAPE  MASE RMSSE
##   <chr*>      <chr*>     <chr>  <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 <aggregate… <aggregat… NNETA… Trai…  93.8 3227. 2805. -0.494  6.41 0.128 0.130
## # ℹ 1 more variable: ACF1 <dbl>

5. Comparisons

a. Accuracy of NNETAR model (test)

Evictions_hts3 <- Evictions_tsibble3 %>%
  aggregate_key(city/GEOID, Value = sum(filings))
Evictions_hts3
## # A tsibble: 75,467 x 4 [1M]
## # Key:       city, GEOID [9,124]
##        date city            GEOID        Value
##       <mth> <chr*>          <chr*>       <int>
##  1 2023 Jan <aggregated>    <aggregated> 73692
##  2 2023 Feb <aggregated>    <aggregated> 64741
##  3 2023 Mar <aggregated>    <aggregated> 67215
##  4 2023 Apr <aggregated>    <aggregated> 57938
##  5 2023 May <aggregated>    <aggregated> 65370
##  6 2023 Jun <aggregated>    <aggregated> 67290
##  7 2023 Jul <aggregated>    <aggregated> 59279
##  8 2023 Aug <aggregated>    <aggregated> 59838
##  9 2023 Sep <aggregated>    <aggregated>  4494
## 10 2023 Jan Albuquerque, NM <aggregated>   795
## # ℹ 75,457 more rows
accuracy_measures1 <- accuracy(forecasted_values, Evictions_hts3)

print(accuracy_measures1)
## # A tibble: 1 × 12
##   .model city       GEOID      .type     ME   RMSE   MAE   MPE  MAPE  MASE RMSSE
##   <chr>  <chr*>     <chr*>     <chr>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NNETA… <aggregat… <aggregat… Test  -3316. 18484. 9665. -129.  138.   NaN   NaN
## # ℹ 1 more variable: ACF1 <dbl>

b. Accuracy of ETS model (test)

accuracy_measures2 <- accuracy(fc, Evictions_hts3)
print(accuracy_measures2)
## # A tibble: 1 × 12
##   .model      city       GEOID      .type     ME   RMSE    MAE   MPE  MAPE  MASE
##   <chr>       <chr*>     <chr*>     <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 "ETS(Value… <aggregat… <aggregat… Test  -8817. 24337. 11733. -178.  183.   NaN
## # ℹ 2 more variables: RMSSE <dbl>, ACF1 <dbl>

c. Conclusions

Based on observations of the ETS and NNETAR models above, I have come to the conclusion that the Neural Network (NNETAR) model has outperformed the ETS model in both fitting the data and predicting future values withheld from training the model. We can come to this conclusion, as the NNETAR model has training error measures of “RMSE, MAE, and MASE” outperfomring those obtained by the ETS model. Additionally, in terms of testing error measures, we have obtained “RMSE and MAE” for the NNETAR model that are also outperforming those obtained by the ETS model. In conclusion, the NNETAR, or NNAR(2,1,2)[12], model outperforms ETS in both goodness of fit and predictability of data withheld from the training models in this case.