Anomaly detection on vegetation indexes

We begin by loading the necessary packages

library(tidyverse)
library(anomalize)
library(tidyquant)
library(timetk)
library(sweep)
library(imputeTS)

Then we load the data directly from github

displacement <- read_csv("https://raw.githubusercontent.com/omdena/UNHCR/master/task7_data2heatmap_conversion/displacement_conflict_weekly.csv")

vegetation_indexes <- read_csv("https://raw.githubusercontent.com/omdena/UNHCR/master/task7_data2heatmap_conversion/vegetation_indexes.csv")

We observe some negative values in the weeks column in the vegetation index dataset.

unique(vegetation_indexes$week)

##  [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
## [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
## [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
## [52]  52 -28 -29 -30 -31 -32  -3  -4  -5  -6  -7  -8  -9 -10 -11 -12

So we transform it to have only positive values in that column.

vegetation_indexes <- vegetation_indexes %>%
  mutate(week = abs(week))

We check for data completeness, taking into account we only have 41 weeks for year 2019

n_regions <- n_distinct(vegetation_indexes$region)
n_weeks   <- n_distinct(vegetation_indexes$week)
n_years   <- n_distinct(vegetation_indexes$year) - 1
  
n_dimensions_complete <- n_regions*n_weeks*n_years + n_regions* 41

print(n_dimensions_complete)

## [1] 35370

Now we look at the number of lines of our data set.

nrow(vegetation_indexes)

## [1] 34443

So we have implicit missing data. We will turn implicit missing values into explicit missing values.

vegetation_indexes <- vegetation_indexes %>%
  complete(region, year, week) %>%
  filter(!(year == 2019 & week > 41 ))

nrow(vegetation_indexes)

## [1] 35370

We have now the number of rows than what we should have. We then find out the percentage of missing values.

vegetation_indexes <- vegetation_indexes %>%    
  mutate(is_na = ifelse(is.na(VHI), 1, 0))

round(sum(vegetation_indexes$is_na)/nrow(vegetation_indexes),3)

## [1] 0.026

We create a date column, so dates are easier to manipulate.

n_periods <- nrow(vegetation_indexes)/n_regions

vegetation_indexes <- vegetation_indexes %>%
  arrange(region, year, week) %>%
  group_by(region) %>%
  mutate(date = seq(as.Date("1982/1/1"), by = "week", length.out = n_periods),
         date = ymd(date))

Impute missing values

vegetation_indexes <- vegetation_indexes %>%
  group_by(region) %>%
  mutate(SMN = na_ma(SMN),
         SMT = na_ma(SMT),
         VCI = na_ma(VCI),
         TCI = na_ma(TCI),
         VHI = na_ma(VHI))

We group our data frame by region and then nest it, so it is easier to manipulate. This way, all we do will be done on a per-region basis.

vegetation_indexes_nest <- vegetation_indexes %>%
  group_by(region) %>%
  nest()

vegetation_indexes_nest_ts <- vegetation_indexes_nest %>%
  mutate(data_ts = map(.x = data, 
                       .f = tk_ts,
                       select = c('SMN', 'SMT', 'VCI', 'TCI', 'VHI'),
                       start = 1982,
                       freq = 52))

We then perfom a seasonal decomposition, so that we can get the seasonal and remainder components of our time series.

vegetation_indexes_anomalies <- vegetation_indexes_nest %>%
  mutate(ts_decompose = map(.x = data,
                            ~time_decompose(data = .x,
                                            target = VHI,
                                            method = "stl",
                                            trend = "5 years")))

So our results look like this,

head(vegetation_indexes_anomalies$ts_decompose[[1]], 10)

Then we detect anomalies using the remainder. In short, we will establish lower and upper bounds for the remainder component of our time-series. If the remainder component is out of the established bounds, it will be flagged as an anomaly.

vegetation_indexes_anomalies <- vegetation_indexes_anomalies %>%
  mutate(anomalies = map(.x= ts_decompose,
                            ~anomalize(data = .x,
                                        target = remainder,
                                        method = "iqr",
                                        alpha = 0.2)))

So now we have a column that flags anomalies,

head(vegetation_indexes_anomalies$anomalies[[1]], 10)

Now, we will recompose our time series. If the remainder component of the observed value is flagged as an anomaly, then the observed value will be flagged as an anomaly.

vegetation_indexes_anomalies <- vegetation_indexes_anomalies %>%
  mutate(ts_recompose = map(.x= anomalies,
                            ~time_recompose(data = .x)))

So we now have a data frame that looks like this,

head(vegetation_indexes_anomalies$ts_recompose[[1]], 10)

We can then observe the results for the Bari region,

vegetation_indexes_anomalies$ts_recompose[[4]] %>%
  plot_anomalies(time_recomposed = TRUE, alpha_dots = 0.4, alpha_ribbon = 0.1, fill_ribbon = "grey80")

Now, we unnest all our data frames.

vegetation_indexes_anomalies_unnest <- vegetation_indexes_anomalies %>%
  unnest(ts_recompose) %>%
  select(region,  date, observed, season, trend, remainder, remainder_l1, 
         remainder_l2, anomaly, recomposed_l1, recomposed_l2)

Observe the structure

head(vegetation_indexes_anomalies_unnest)

And we count the number of anomalies

table(vegetation_indexes_anomalies_unnest$anomaly)

## 
##    No   Yes 
## 31417  3953