EvictionData <- read.csv("C:/Users/schic/OneDrive/Documents/Predictive Analytics and Forecasting/Monthly Eviction Filings by Location.csv")
This data set, from data.world, provides monthly evictions for 10 states and 34 cities from 2020 until 2023. Including the city, the data collection type, the GEOID, the racial majority for the area, the month-year of occurrences, the number of filings, average filings, and the date of last update.
#check for missing values
missmap(EvictionData)
There is no missing data, so we will not need to impute any data points
#filtering data for variables of interest
Evictions <- EvictionData %>%
select('city', 'racial_majority', 'month', 'filings', 'GEOID')
#convert Month variable to Date format
Evictions$date <- as.Date(paste0("01-", Evictions$month), format = "%d-%b-%y")
#creeate tsibble object to aggregate GEOID and city
Evictions_tsibble <- tsibble(Evictions, key = c(GEOID, city), index = date)
Evictions_tsibble <- mutate(Evictions_tsibble, date = yearmonth(date))
#setting up training and testing datasets
Evictions_train <- Evictions %>%
filter(date >= "2020-01-01" & date <= "2022-12-01")
Evictions_test <- Evictions %>%
filter(date >= "2023-01-01" & date <= "2023-09-01")
#establishing hierarchichal structure
Evictions_hts <- Evictions_tsibble %>%
aggregate_key(city/GEOID, Value = sum(filings))
Evictions_hts
## # A tsibble: 403,931 x 4 [1M]
## # Key: city, GEOID [9,124]
## date city GEOID Value
## <mth> <chr*> <chr*> <int>
## 1 2020 Jan <aggregated> <aggregated> 73372
## 2 2020 Feb <aggregated> <aggregated> 65198
## 3 2020 Mar <aggregated> <aggregated> 38233
## 4 2020 Apr <aggregated> <aggregated> 5375
## 5 2020 May <aggregated> <aggregated> 7898
## 6 2020 Jun <aggregated> <aggregated> 15018
## 7 2020 Jul <aggregated> <aggregated> 17535
## 8 2020 Aug <aggregated> <aggregated> 29053
## 9 2020 Sep <aggregated> <aggregated> 35418
## 10 2020 Oct <aggregated> <aggregated> 38641
## # ℹ 403,921 more rows
Evictions_hts |>
filter(is_aggregated(GEOID)) |>
autoplot(Value) +
labs(y = "Filings #",
title = "Number of Filings for Eviction in Each City (aggregated by GEOID)") +
facet_wrap(vars(city), scales = "free_y", ncol = 6) +
theme(legend.position = "none")
Evictions_tsibble2 <- tsibble(Evictions_train, key = c(GEOID, city), index = date)
Evictions_tsibble2 <- mutate(Evictions_tsibble2, date = yearmonth(date))
Evictions_tsibble3 <- tsibble(Evictions_test, key = c(GEOID, city), index = date)
Evictions_tsibble3 <- mutate(Evictions_tsibble3, date = yearmonth(date))
Evictions_hts2 <- Evictions_tsibble2 %>%
aggregate_key(city/GEOID, Value = sum(filings))
Evictions_hts2
## # A tsibble: 328,464 x 4 [1M]
## # Key: city, GEOID [9,124]
## date city GEOID Value
## <mth> <chr*> <chr*> <int>
## 1 2020 Jan <aggregated> <aggregated> 73372
## 2 2020 Feb <aggregated> <aggregated> 65198
## 3 2020 Mar <aggregated> <aggregated> 38233
## 4 2020 Apr <aggregated> <aggregated> 5375
## 5 2020 May <aggregated> <aggregated> 7898
## 6 2020 Jun <aggregated> <aggregated> 15018
## 7 2020 Jul <aggregated> <aggregated> 17535
## 8 2020 Aug <aggregated> <aggregated> 29053
## 9 2020 Sep <aggregated> <aggregated> 35418
## 10 2020 Oct <aggregated> <aggregated> 38641
## # ℹ 328,454 more rows
ETS_model <- Evictions_hts2 |>
filter(is_aggregated(GEOID), is_aggregated(city)) |>
model(ETS(Value ~ error("A") + trend("A") + season("A")))
fc <- ETS_model |> forecast(h = "9 months")
fc |>
autoplot(Evictions_hts) +
labs(y = "# of Filings") +
facet_wrap(vars(city), scales = "free_y")
print(accuracy(ETS_model))
## # A tibble: 1 × 12
## city GEOID .model .type ME RMSE MAE MPE MAPE MASE RMSSE
## <chr*> <chr*> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <aggregate… <aggregat… "ETS(… Trai… 43.7 10521. 5739. -11.9 25.5 0.262 0.425
## # ℹ 1 more variable: ACF1 <dbl>
#Evictions_hts3 <- Evictions_hts2 |>
#filter(is_aggregated(GEOID), is_aggregated(city))
fit <- Evictions_hts2 |>
filter(is_aggregated(GEOID), is_aggregated(city)) |>
model(NNETAR(sqrt(Value)))
forecasted_values <- fit |>
forecast(h = "9 months")
forecasted_values |>
autoplot(Evictions_hts) +
labs( x = "Time", y = "# of Filings", title = "# of Filings for Eviction (Neural Networks Forecast)")
forecasted_values
## # A fable: 9 x 6 [1M]
## # Key: city, GEOID, .model [1]
## city GEOID .model date Value .mean
## <chr*> <chr*> <chr> <mth> <dist> <dbl>
## 1 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Jan sample[5000] 63929.
## 2 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Feb sample[5000] 63765.
## 3 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Mar sample[5000] 61674.
## 4 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Apr sample[5000] 62864.
## 5 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 May sample[5000] 61783.
## 6 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Jun sample[5000] 60412.
## 7 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Jul sample[5000] 59051.
## 8 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Aug sample[5000] 58239.
## 9 <aggregated> <aggregated> NNETAR(sqrt(Value)) 2023 Sep sample[5000] 57982.
print(accuracy(fit))
## # A tibble: 1 × 12
## city GEOID .model .type ME RMSE MAE MPE MAPE MASE RMSSE
## <chr*> <chr*> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 <aggregate… <aggregat… NNETA… Trai… 93.8 3227. 2805. -0.494 6.41 0.128 0.130
## # ℹ 1 more variable: ACF1 <dbl>
Evictions_hts3 <- Evictions_tsibble3 %>%
aggregate_key(city/GEOID, Value = sum(filings))
Evictions_hts3
## # A tsibble: 75,467 x 4 [1M]
## # Key: city, GEOID [9,124]
## date city GEOID Value
## <mth> <chr*> <chr*> <int>
## 1 2023 Jan <aggregated> <aggregated> 73692
## 2 2023 Feb <aggregated> <aggregated> 64741
## 3 2023 Mar <aggregated> <aggregated> 67215
## 4 2023 Apr <aggregated> <aggregated> 57938
## 5 2023 May <aggregated> <aggregated> 65370
## 6 2023 Jun <aggregated> <aggregated> 67290
## 7 2023 Jul <aggregated> <aggregated> 59279
## 8 2023 Aug <aggregated> <aggregated> 59838
## 9 2023 Sep <aggregated> <aggregated> 4494
## 10 2023 Jan Albuquerque, NM <aggregated> 795
## # ℹ 75,457 more rows
accuracy_measures1 <- accuracy(forecasted_values, Evictions_hts3)
print(accuracy_measures1)
## # A tibble: 1 × 12
## .model city GEOID .type ME RMSE MAE MPE MAPE MASE RMSSE
## <chr> <chr*> <chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NNETA… <aggregat… <aggregat… Test -3316. 18484. 9665. -129. 138. NaN NaN
## # ℹ 1 more variable: ACF1 <dbl>
accuracy_measures2 <- accuracy(fc, Evictions_hts3)
print(accuracy_measures2)
## # A tibble: 1 × 12
## .model city GEOID .type ME RMSE MAE MPE MAPE MASE
## <chr> <chr*> <chr*> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "ETS(Value… <aggregat… <aggregat… Test -8817. 24337. 11733. -178. 183. NaN
## # ℹ 2 more variables: RMSSE <dbl>, ACF1 <dbl>
Based on observations of the ETS and NNETAR models above, I have come to the conclusion that the Neural Network (NNETAR) model has outperformed the ETS model in both fitting the data and predicting future values withheld from training the model. We can come to this conclusion, as the NNETAR model has training error measures of “RMSE, MAE, and MASE” outperfomring those obtained by the ETS model. Additionally, in terms of testing error measures, we have obtained “RMSE and MAE” for the NNETAR model that are also outperforming those obtained by the ETS model. In conclusion, the NNETAR, or NNAR(2,1,2)[12], model outperforms ETS in both goodness of fit and predictability of data withheld from the training models in this case.