Code examples are from https://www.r-bloggers.com/anomaly-detection-in-r-2/ written by the team at Perceptive Analytics: Madhur Modi, Prudhvi Potuganti, Saneesh Veetil and Chaitanya Sagar.
Check if the packages needed are already installed and if not install them. First install the devtools package. Then install the github packages.
The wikipediatrend package provides an API for accessing page-view trend data for any Wikipedia page. The AnomalyDetection package detects anomalies in seasonal univariate time series.
# Install every CRAN package listed in `neededPackages` that is not yet
# installed, and report the ones that already are.
#
# A plain if/else replaces ifelse(): ifelse() is meant for selecting values
# element-wise, and it fails when the chosen branch yields NULL, which is what
# install.packages() returns.
neededPackages <- c("devtools", "Rcpp", "ggplot2")
# Compare against package names only (the row names of the matrix) instead of
# against every cell of the installed.packages() matrix.
installedNames <- rownames(installed.packages())
for (pkg in neededPackages) {
  if (pkg %in% installedNames) {
    print(paste(pkg, "is already installed."))
  } else {
    install.packages(pkg)
  }
}
[1] "devtools is already installed."
[1] "Rcpp is already installed."
[1] "ggplot2 is already installed."
library(devtools)
# Install the GitHub-hosted packages that are not yet installed.
# `gitHubPackages` holds the package names; `gitHubLoction` holds the matching
# "owner/repo" locations, in the same order.
gitHubPackages <- c("wikipediatrend", "AnomalyDetection")
gitHubLoction <- c("petermeissner/wikipediatrend", "twitter/AnomalyDetection")
# Compare against package names only (the row names of the matrix) instead of
# against every cell of the installed.packages() matrix.
installedNames <- rownames(installed.packages())
# seq_along() stays correct even if the package vector is ever empty.
for (i in seq_along(gitHubPackages)) {
  # if/else rather than ifelse(): only one branch should run, for its side
  # effect, and no value is being selected.
  if (gitHubPackages[i] %in% installedNames) {
    print(paste(gitHubPackages[i], "is already installed."))
  } else {
    install_github(gitHubLoction[i])
  }
}
[1] "wikipediatrend is already installed."
[1] "AnomalyDetection is already installed."
library(Rcpp)
library(wikipediatrend)
library(AnomalyDetection)
Download the trends data from the English Wikipedia page for President Michael D. Higgins ("Michael_D._Higgins") and save it in the variable higginsWikipedia. View the first 6 rows.
# Fetch the page-view trend for the "Michael_D._Higgins" article from the
# English ("en") Wikipedia, starting on 2018-01-01.
higginsWikipedia <- wp_trend(
  "Michael_D._Higgins",
  from = "2018-01-01",
  lang = "en"
)
# Show the first six rows (head()'s default).
head(higginsWikipedia, n = 6L)
project language article access agent granularity date views
1 wikipedia en Michael_D._Higgins all-access all-agents daily 2018-01-01 785
2 wikipedia en Michael_D._Higgins all-access all-agents daily 2018-01-02 773
3 wikipedia en Michael_D._Higgins all-access all-agents daily 2018-01-03 657
4 wikipedia en Michael_D._Higgins all-access all-agents daily 2018-01-04 668
5 wikipedia en Michael_D._Higgins all-access all-agents daily 2018-01-05 622
6 wikipedia en Michael_D._Higgins all-access all-agents daily 2018-01-06 689
Plot the data using ggplot2. Set the x-axis to the Date and the y-axis to the number of views.
library(ggplot2)
# Line chart of views over time; the line colour is also mapped to the number
# of views.
ggplot(
  data = higginsWikipedia,
  mapping = aes(x = date, y = views, color = views)
) +
  geom_line()
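Prepare the data for the AnomalyDetection package by keeping only the date and page views and discarding all other variables. The input must be a series of <timestamp, count> pairs, i.e. a two-column data frame with the timestamps in the first column and the observed values in the second.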
# Keep only the timestamp and the observation column, then inspect the
# structure of the reduced data frame.
columns_to_keep <- c("date", "views")
higginsWikipedia <- higginsWikipedia[, columns_to_keep]
str(higginsWikipedia)
Classes ‘wp_df’ and 'data.frame': 350 obs. of 2 variables:
$ date : POSIXct, format: "2018-01-01" "2018-01-02" "2018-01-03" ...
$ views: num 785 773 657 668 622 ...
Apply anomaly detection and plot the results.
# Run the detector on the two-column data frame, looking only for positive
# anomalies (direction = "pos") and asking for a plot to be built.
AnomalyDetectionHiggins <- AnomalyDetectionTs(
  higginsWikipedia,
  direction = "pos",
  plot = TRUE,
  title = "Anomaly Detection"
)
# Display the plot stored in the result.
AnomalyDetectionHiggins$plot
Look at the dates on which the anomalies occurred.
# Timestamps of the detected anomalies. The detection result is stored in
# `AnomalyDetectionHiggins`; no object named `anomaliesHiggins` is ever created.
AnomalyDetectionHiggins$anoms$timestamp
Install anomalize package.
#install.packages('anomalize')
#Update from github
#library(devtools)
#install_github("business-science/anomalize")
#Load the package
library(anomalize)
# We will also use the tidyverse package for data processing
library(tidyverse)
Decompose data using time_decompose() function in anomalize package. We will use stl method which extracts seasonality.
# Convert the views data to a tibble for use with anomalize.
# as_tibble() replaces as.tibble(), which is deprecated in the tibble package.
higginsWikipedia_ts <- higginsWikipedia %>%
  as_tibble()
# Decompose `views` with the STL method, flag anomalies in the remainder with
# the GESD method, and plot the decomposition.
higginsWikipedia_ts %>%
  time_decompose(views, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.1) %>%
  plot_anomaly_decomposition()
Converting from tbl_df to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 90.5 days
Decompose data using time_decompose() function in anomalize package. We will use stl method which extracts seasonality.
# STL decomposition of `views`, GESD anomaly flagging on the remainder, then
# a plot of the decomposition.
higginsWikipedia_ts %>%
  time_decompose(
    target = views,
    method = "stl",
    frequency = "auto",
    trend = "auto"
  ) %>%
  anomalize(
    target = remainder,
    method = "gesd",
    alpha = 0.05,
    max_anoms = 0.1
  ) %>%
  plot_anomaly_decomposition()
Converting from tbl_df to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 90.5 days
Plot the data again by recomposing data
# Decompose `views`, flag anomalies in the remainder, recompose the bounds
# around the observed values, and plot the result. The original pipeline
# stopped on a dangling `%>%` after time_decompose(), so it never produced
# a plot.
higginsWikipedia_ts %>%
  time_decompose(views) %>%
  anomalize(remainder) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE)
Extract the anomalies
# Run the decompose / anomalize / recompose pipeline with default settings
# and keep only the rows flagged as anomalies.
anomalizeHiggins <- higginsWikipedia_ts %>%
  time_decompose(target = views) %>%
  anomalize(target = remainder) %>%
  time_recompose() %>%
  filter(anomaly == "Yes")
Converting from tbl_df to tbl_time.
Auto-index message: index = date
frequency = 7 days
trend = 90.5 days
# Show the flagged rows.
print(anomalizeHiggins)
# A time tibble: 36 x 10
# Index: date
date observed season trend remainder remainder_l1 remainder_l2 anomaly recomposed_l1
<dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
 1 2018-01-15 00:00:00 10627 59.7 851. 9716. -1288. 1459. Yes -377.
 2 2018-01-16 00:00:00 12142 39.8 850. 11252. -1288. 1459. Yes -398.
 3 2018-01-17 00:00:00 4009 -15.1 850. 3174. -1288. 1459. Yes -453.
 4 2018-01-29 00:00:00 3260. 59.7 845. 2356. -1288. 1459. Yes -384.
 5 2018-02-10 00:00:00 2773 -46.1 842. 1977. -1288. 1459. Yes -492.
 6 2018-02-21 00:00:00 2364 -15.1 830. 1549. -1288. 1459. Yes -473.
 7 2018-03-10 00:00:00 3682 -46.1 845. 2883. -1288. 1459. Yes -489.
 8 2018-03-17 00:00:00 2307 -46.1 855. 1498. -1288. 1459. Yes -479.
 9 2018-05-26 00:00:00 2972 -46.1 793. 2225. -1288. 1459. Yes -541.
10 2018-07-10 00:00:00 5597 39.8 831. 4726. -1288. 1459. Yes -417.
# ... with 26 more rows, and 1 more variable: recomposed_l2 <dbl>