Code examples are from https://www.r-bloggers.com/anomaly-detection-in-r-2 written by the team at Perceptive Analytics: Madhur Modi, Prudhvi Potuganti, Saneesh Veetil and Chaitanya Sagar.
Check if the libraries needed are already installed and if not install them. Note: the devtools library must be installed before the github packages.
The Rcpp library is used to integrate R with C++ functions. The ggplot2 library is used to create plots.
# Install any CRAN packages that are missing from the local library.
neededPackages <- c("devtools", "Rcpp", "ggplot2", "backports")
for (pkg in neededPackages) {
  # Compare against installed package *names* only. A bare
  # `%in% installed.packages()` searches every cell of the returned matrix
  # (versions, library paths, dependency strings, ...), not just the names.
  #
  # A plain if/else is used instead of ifelse(): ifelse() is a vectorised
  # selector, and it fails with "replacement has length zero" when the
  # selected branch is install.packages(), which returns NULL.
  if (pkg %in% rownames(installed.packages())) {
    print(paste(pkg, "is already installed."))
  } else {
    install.packages(pkg)
  }
}
[1] "devtools is already installed."
[1] "Rcpp is already installed."
[1] "ggplot2 is already installed."
[1] "backports is already installed."
The wikipediatrend library contains the API to access wikipedia trends data on any page in Wikipedia. The AnomalyDetection library detects anomalies in seasonal univariate time series.
library(backports)
library(devtools)
# Install any missing GitHub-hosted packages with install_github().
# gitHubPackages holds the package names to look for locally and
# gitHubLoction holds the matching "owner/repo" locations, in the same order.
gitHubPackages <- c("wikipediatrend", "AnomalyDetection")
gitHubLoction <- c("petermeissner/wikipediatrend", "twitter/AnomalyDetection")
for (i in seq_along(gitHubPackages)) {
  # Compare against installed package *names* only; a bare
  # `%in% installed.packages()` searches every cell of the returned matrix.
  # Plain if/else replaces ifelse(), which is a vectorised selector rather
  # than control flow and tries to store the installer's return value.
  if (gitHubPackages[i] %in% rownames(installed.packages())) {
    print(paste(gitHubPackages[i], "is already installed."))
  } else {
    install_github(gitHubLoction[i])
  }
}
[1] "wikipediatrend is already installed."
[1] "AnomalyDetection is already installed."
library(Rcpp)
library(wikipediatrend)
library(AnomalyDetection)
Use the wikipediatrend library to get trends data from Wikipedia. In the example below, the wp_trend() function retrieves the trends data for the English-language Wikipedia (lang = "en") page for President Michael D. Higgins (Michael_D._Higgins), starting on 1st January 2018 (from = "2018-01-01"), and the result is saved in the variable higginsWikipedia. Use the head() function to view the first 6 rows.
# Fetch the page-view data for the "Michael_D._Higgins" article on the
# English-language Wikipedia, starting from 2018-01-01.
higginsWikipedia <- wp_trend(
  "Michael_D._Higgins",
  from = "2018-01-01",
  lang = "en"
)
# Preview the first rows of the downloaded data.
head(higginsWikipedia)
project language article access agent granularity
1 wikipedia en Michael_D._Higg ... all-access all-agents daily
2 wikipedia en Michael_D._Higg ... all-access all-agents daily
3 wikipedia en Michael_D._Higg ... all-access all-agents daily
4 wikipedia en Michael_D._Higg ... all-access all-agents daily
5 wikipedia en Michael_D._Higg ... all-access all-agents daily
6 wikipedia en Michael_D._Higg ... all-access all-agents daily
date views
1 2018-01-01 785
2 2018-01-02 773
3 2018-01-03 657
4 2018-01-04 668
5 2018-01-05 622
6 2018-01-06 689
Create a line plot of the wikipedia data using ggplot2. Set the x-axis to the Date and the y-axis to the number of views.
library(ggplot2)
# Line plot of views over time; the line colour is also mapped to the
# number of views.
ggplot(
  data = higginsWikipedia,
  mapping = aes(x = date, y = views, color = views)
) +
  geom_line()
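Prepare the data for the AnomalyDetection library by keeping only the date and views variables and discarding all other variables. The input must be a series of (timestamp, count) pairs, i.e. a two-column data frame with the timestamps in the first column and the observed values in the second.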
# Keep only the date and views columns, in that order.
higginsWikipedia <- higginsWikipedia[, c("date", "views")]
# Inspect the structure of the reduced data.
str(higginsWikipedia)
Classes ‘wp_df’ and 'data.frame': 350 obs. of 2 variables:
$ date : POSIXct, format: "2018-01-01" "2018-01-02" ...
$ views: num 785 773 657 668 622 ...
Apply anomaly detection and plot the results.
# Run the AnomalyDetection time-series detector on the (date, views) data.
# direction = "pos" and plot = TRUE are passed through unchanged; the result
# holds both the detected anomalies and a plot titled "Anomaly Detection".
AnomalyDetectionHiggins <- AnomalyDetectionTs(
  higginsWikipedia,
  direction = "pos",
  plot = TRUE,
  title = "Anomaly Detection"
)
# Display the plot stored in the result.
AnomalyDetectionHiggins[["plot"]]
Look at the dates on which the anomalies occurred.
# Print the detected anomalies (timestamp and observed value of each one).
AnomalyDetectionHiggins[["anoms"]]
timestamp anoms
1 2018-01-15 10627
2 2018-01-16 12142
3 2018-01-17 4009
4 2018-01-29 3260
5 2018-02-10 2773
6 2018-03-10 3682
7 2018-05-26 2972
8 2018-07-10 5597
9 2018-07-11 5200
10 2018-07-12 3435
11 2018-07-13 3091
12 2018-07-14 2560
13 2018-08-19 3824
14 2018-08-25 6091
15 2018-10-15 3329
16 2018-10-16 3340
17 2018-10-17 5692
18 2018-10-18 4210
19 2018-10-19 3102
20 2018-10-20 2631
21 2018-10-21 4139
22 2018-10-22 3167
23 2018-10-23 6285
24 2018-10-24 4835
25 2018-10-25 4391
26 2018-10-26 9895
27 2018-10-27 28548
28 2018-10-28 15945
29 2018-10-29 6961
30 2018-10-30 12862
31 2018-10-31 9165
32 2018-11-01 5206
33 2018-11-11 7703
34 2018-11-12 3623
35 2018-11-17 2869
Check if the anomalize, dplyr and tibble libraries are already installed and, if not, install them.
# Install the anomalize package from GitHub if it is not already installed.
# neededPackages holds the package names to look for locally and
# packageLocations holds the matching "owner/repo" locations, in the same order.
neededPackages <- c("anomalize")
packageLocations <- c("business-science/anomalize")
# Iterate over neededPackages itself. Looping over the two-element
# gitHubPackages vector indexed past the end of this one-element vector and
# produced a spurious "NA is already installed." message.
for (i in seq_along(neededPackages)) {
  # Compare against installed package *names* only, and use plain if/else
  # rather than ifelse(), which is a vectorised selector, not control flow.
  if (neededPackages[i] %in% rownames(installed.packages())) {
    print(paste(neededPackages[i], "is already installed."))
  } else {
    install_github(packageLocations[i])
  }
}
[1] "anomalize is already installed."
[1] "NA is already installed."
# Install dplyr and tibble from CRAN if they are missing from the local library.
neededPackages <- c("dplyr", "tibble")
for (pkg in neededPackages) {
  # Compare against installed package *names* only; a bare
  # `%in% installed.packages()` searches every cell of the returned matrix.
  #
  # A plain if/else is used instead of ifelse(): ifelse() is a vectorised
  # selector, and it fails with "replacement has length zero" when the
  # selected branch is install.packages(), which returns NULL.
  if (pkg %in% rownames(installed.packages())) {
    print(paste(pkg, "is already installed."))
  } else {
    install.packages(pkg)
  }
}
[1] "dplyr is already installed."
[1] "tibble is already installed."
Load the anomalize, dplyr and tibble libraries.
library(anomalize)
library(dplyr)
library(tibble)
Decompose the data using the time_decompose() function in the anomalize package. We will use the STL method, which separates the series into seasonal, trend and remainder components.
# Convert the (date, views) data frame to a tibble for use with anomalize.
# as_tibble() replaces as.tibble(), which is deprecated since tibble 2.0.0.
higginsWikipedia_ts <- higginsWikipedia %>%
  as_tibble()

# Decompose the views series with STL (frequency and trend spans chosen
# automatically), flag anomalies in the remainder with the "gesd" method
# (alpha = 0.05, max_anoms = 0.1), and plot the decomposition.
higginsWikipedia_ts %>%
  time_decompose(views, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%
  plot_anomaly_decomposition()
Converting from tbl_df to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 90.5 days
Decompose the data using the time_decompose() function in the anomalize package. We will use the STL method, which separates the series into seasonal, trend and remainder components.
# STL decomposition of the views column, anomaly flagging on the remainder
# with the "gesd" method, then a plot of the decomposed components.
higginsWikipedia_ts %>%
  time_decompose(
    target = views,
    method = "stl",
    frequency = "auto",
    trend = "auto"
  ) %>%
  anomalize(
    target = remainder,
    method = "gesd",
    alpha = 0.05,
    max_anoms = 0.1
  ) %>%
  plot_anomaly_decomposition()
Converting from tbl_df to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 90.5 days
Plot the data again, this time after recomposing it with time_recompose().
# Decompose with the default settings, flag anomalies in the remainder,
# recompose the series, and plot the observations with anomalies marked.
higginsWikipedia_ts %>%
  time_decompose(target = views) %>%
  anomalize(target = remainder) %>%
  time_recompose() %>%
  plot_anomalies(
    time_recomposed = TRUE,
    ncol = 3,
    alpha_dots = 0.5
  )
Converting from tbl_df to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 90.5 days
Extract the anomalies
# Run the default decompose -> anomalize -> recompose pipeline and keep
# only the rows whose anomaly flag is "Yes".
anomalizeHiggins <- higginsWikipedia_ts %>%
  time_decompose(target = views) %>%
  anomalize(target = remainder) %>%
  time_recompose() %>%
  dplyr::filter(anomaly == "Yes")
Converting from tbl_df to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 90.5 days
# Print the extracted anomalies (a time tibble indexed by date).
anomalizeHiggins
# A time tibble: 36 x 10
# Index: date
   date                observed season trend remainder remainder_l1
   <dttm>                 <dbl>  <dbl> <dbl>     <dbl>        <dbl>
 1 2018-01-15 00:00:00   10627    59.7  851.     9716.       -1288.
 2 2018-01-16 00:00:00   12142    39.8  850.    11252.       -1288.
 3 2018-01-17 00:00:00    4009   -15.1  850.     3174.       -1288.
 4 2018-01-29 00:00:00    3260.   59.7  845.     2356.       -1288.
 5 2018-02-10 00:00:00    2773   -46.1  842.     1977.       -1288.
 6 2018-02-21 00:00:00    2364   -15.1  830.     1549.       -1288.
 7 2018-03-10 00:00:00    3682   -46.1  845.     2883.       -1288.
 8 2018-03-17 00:00:00    2307   -46.1  855.     1498.       -1288.
 9 2018-05-26 00:00:00    2972   -46.1  793.     2225.       -1288.
10 2018-07-10 00:00:00    5597    39.8  831.     4726.       -1288.
# ... with 26 more rows, and 4 more variables: remainder_l2 <dbl>,
#   anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>