Part 3: Association Rules

# Loading Libraries

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(arulesViz)
# Loading dataset

url<-"http://bit.ly/SupermarketDatasetII"
sales <- read.transactions(url, sep = ',')
## Warning in asMethod(object): removing duplicated items in transactions
head(sales)
## transactions in sparse format with
##  6 transactions (rows) and
##  119 items (columns)
# Previewing the first 4 transactions

inspect(sales[1:4])
##     items               
## [1] {almonds,           
##      antioxydant juice, 
##      avocado,           
##      cottage cheese,    
##      energy drink,      
##      frozen smoothie,   
##      green grapes,      
##      green tea,         
##      honey,             
##      low fat yogurt,    
##      mineral water,     
##      olive oil,         
##      salad,             
##      salmon,            
##      shrimp,            
##      spinach,           
##      tomato juice,      
##      vegetables mix,    
##      whole weat flour,  
##      yams}              
## [2] {burgers,           
##      eggs,              
##      meatballs}         
## [3] {chutney}           
## [4] {avocado,           
##      turkey}
# Previewing summary statistics

summary(sales)
## transactions as itemMatrix in sparse format with
##  7501 rows (elements/itemsets/transactions) and
##  119 columns (items) and a density of 0.03288973 
## 
## most frequent items:
## mineral water          eggs     spaghetti  french fries     chocolate 
##          1788          1348          1306          1282          1229 
##       (Other) 
##         22405 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1754 1358 1044  816  667  493  391  324  259  139  102   67   40   22   17    4 
##   18   19   20 
##    1    2    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.914   5.000  20.000 
## 
## includes extended item information - examples:
##              labels
## 1           almonds
## 2 antioxydant juice
## 3         asparagus
# Previewing the labels of items

items <- as.data.frame(itemLabels(sales))
colnames(items) <- "Item"
head(items, 4)    
##                Item
## 1           almonds
## 2 antioxydant juice
## 3         asparagus
## 4           avocado
tail(items,4)
##                 Item
## 116 whole wheat rice
## 117             yams
## 118      yogurt cake
## 119         zucchini
# Checking the absolute frequency of the first 10 items

itemFrequency(sales[, 1:10], type = "absolute")
##           almonds antioxydant juice         asparagus           avocado 
##               153                67                36               250 
##       babies food             bacon    barbecue sauce         black tea 
##                34                65                81               107 
##       blueberries        body spray 
##                69                86
# Plotting the 10 most frequent items and all items with a support of at least 0.1

itemFrequencyPlot(sales, topN = 10, col = "green")

itemFrequencyPlot(sales, support = 0.1, col = "blue")

# Generating association rules with a minimum support of 0.001 and a minimum confidence of 0.8

rules <- apriori(sales, parameter = list(supp = 0.001, conf = 0.8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 7 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.01s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [74 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
rules
## set of 74 rules
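# Optional sketch (not part of the original run): the support, confidence and lift of
# every rule can also be pulled out as a plain data frame with arules' quality() accessor

measures <- quality(rules)
head(measures)
summary(measures$lift)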
# Inspecting the first 5 rules

inspect(rules[1:5])
##     lhs                              rhs             support     confidence
## [1] {frozen smoothie, spinach}    => {mineral water} 0.001066524 0.8888889 
## [2] {bacon, pancakes}             => {spaghetti}     0.001733102 0.8125000 
## [3] {nonfat milk, turkey}         => {mineral water} 0.001199840 0.8181818 
## [4] {ground beef, nonfat milk}    => {mineral water} 0.001599787 0.8571429 
## [5] {mushroom cream sauce, pasta} => {escalope}      0.002532996 0.9500000 
##     coverage    lift      count
## [1] 0.001199840  3.729058  8   
## [2] 0.002133049  4.666587 13   
## [3] 0.001466471  3.432428  9   
## [4] 0.001866418  3.595877 12   
## [5] 0.002666311 11.976387 19
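# Optional sketch: lift is usually the most informative of the three measures, so the rules
# can also be ranked by lift, and rules leading to a specific item isolated with arules'
# sort() and subset()

inspect(head(sort(rules, by = "lift"), 5))

mineral_rules <- subset(rules, subset = rhs %in% "mineral water")
inspect(head(mineral_rules, 3))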
# Visualization of the association rules

subrules <- head(rules, n = 10, by = "confidence")
plot(subrules, method = "graph" ,   engine = "htmlwidget")
plot(subrules, method = "graph" ,   engine = "default")

plot(subrules, method = "paracoord")

Part 4: Anomaly Detection

1. Loading libraries and dataset

# Loading libraries

library(tidyverse)
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(dplyr)
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:arules':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyverse)
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter
library(dplyr)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(devtools)
## Loading required package: usethis
library(Rcpp)
library(anomaly)
## 
## Attaching package: 'anomaly'
## The following object is masked from 'package:stats':
## 
##     simulate
library(ggplot2)
# Loading dataset

url <- "http://bit.ly/CarreFourSalesDataset"

sales_forecast <- read.csv(url)

2. Checking the data

# Previewing the top of the dataset

head(sales_forecast)
##        Date    Sales
## 1  1/5/2019 548.9715
## 2  3/8/2019  80.2200
## 3  3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5  2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Previewing the bottom of the dataset

tail(sales_forecast)
##           Date     Sales
## 995  2/18/2019   63.9975
## 996  1/29/2019   42.3675
## 997   3/2/2019 1022.4900
## 998   2/9/2019   33.4320
## 999  2/22/2019   69.1110
## 1000 2/18/2019  649.2990
# Previewing shape

cat("The dataset has", nrow(sales_forecast), "rows", "and", ncol(sales_forecast), "columns")
## The dataset has 1000 rows and 2 columns
# Checking Data types

str(sales_forecast)
## 'data.frame':    1000 obs. of  2 variables:
##  $ Date : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num  549 80.2 340.5 489 634.4 ...
# Converting the Date column from character to Date format

sales_forecast$Date <- as.Date(sales_forecast$Date, format = "%m/%d/%Y")
head(sales_forecast)
##         Date    Sales
## 1 2019-01-05 548.9715
## 2 2019-03-08  80.2200
## 3 2019-03-03 340.5255
## 4 2019-01-27 489.0480
## 5 2019-02-08 634.3785
## 6 2019-03-25 627.6165
# Checking the unique dates in the dataset

unique(sales_forecast$Date)
##  [1] "2019-01-05" "2019-03-08" "2019-03-03" "2019-01-27" "2019-02-08"
##  [6] "2019-03-25" "2019-02-25" "2019-02-24" "2019-01-10" "2019-02-20"
## [11] "2019-02-06" "2019-03-09" "2019-02-12" "2019-02-07" "2019-03-29"
## [16] "2019-01-15" "2019-03-11" "2019-01-01" "2019-01-21" "2019-03-05"
## [21] "2019-03-15" "2019-02-17" "2019-03-02" "2019-03-22" "2019-03-10"
## [26] "2019-01-25" "2019-01-28" "2019-01-07" "2019-03-23" "2019-01-17"
## [31] "2019-02-02" "2019-03-04" "2019-03-16" "2019-02-27" "2019-02-10"
## [36] "2019-03-19" "2019-02-03" "2019-03-07" "2019-02-28" "2019-03-27"
## [41] "2019-01-20" "2019-03-12" "2019-02-15" "2019-03-06" "2019-02-14"
## [46] "2019-03-13" "2019-01-24" "2019-01-06" "2019-02-11" "2019-01-22"
## [51] "2019-01-13" "2019-01-09" "2019-01-12" "2019-01-26" "2019-01-23"
## [56] "2019-02-23" "2019-01-02" "2019-02-09" "2019-03-26" "2019-03-01"
## [61] "2019-02-01" "2019-03-28" "2019-03-24" "2019-02-05" "2019-01-19"
## [66] "2019-01-16" "2019-01-08" "2019-02-18" "2019-01-18" "2019-02-16"
## [71] "2019-02-22" "2019-01-29" "2019-01-04" "2019-03-30" "2019-01-30"
## [76] "2019-01-03" "2019-03-21" "2019-02-13" "2019-01-14" "2019-03-18"
## [81] "2019-03-20" "2019-02-21" "2019-01-31" "2019-01-11" "2019-02-26"
## [86] "2019-03-17" "2019-03-14" "2019-02-04" "2019-02-19"
# Converting the data frame to a tbl_time object indexed by Date

sales_forecast <- as_tbl_time(sales_forecast, index = Date)

3. Data Cleaning

# Checking column names

colnames(sales_forecast)
## [1] "Date"  "Sales"
# Checking for number of missing values

length(which(is.na(sales_forecast)))
## [1] 0
# Checking for duplicates

sum(duplicated(sales_forecast))
## [1] 0
# Checking for outliers

boxplot(sales_forecast$Sales)

# Listing the outliers

boxplot.stats(sales_forecast$Sales)$out
## [1] 1003.590 1039.290 1042.650 1002.120 1020.705 1034.460 1023.750 1022.385
## [9] 1022.490
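# Optional sketch: the values flagged above lie beyond the boxplot's upper whisker; the
# cutoff can be reproduced (approximately, since boxplot.stats() uses Tukey's hinges)
# with the usual 1.5 * IQR rule

q3 <- quantile(sales_forecast$Sales, 0.75)
upper_fence <- q3 + 1.5 * IQR(sales_forecast$Sales)
upper_fence
sales_forecast$Sales[sales_forecast$Sales > upper_fence]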

4. EDA

# Statistical summary of the dataset

describe(sales_forecast$Sales)
##    vars    n   mean     sd median trimmed    mad   min     max   range skew
## X1    1 1000 322.97 245.89 253.85  293.91 233.78 10.68 1042.65 1031.97 0.89
##    kurtosis   se
## X1    -0.09 7.78
# Plotting the sales data

ggplot(sales_forecast, aes(x=Date, y=Sales, color=Sales)) + geom_line()

# Frequency distribution of sales

hist(sales_forecast$Sales, col = 'gold', ylim = c(0, 250))

5. Anomaly Detection

# Performing anomaly detection using STL decomposition and the GESD test on the remainder

anomaly.detect <- sales_forecast %>%
  group_by(Date) %>%
  summarise(totalsales = sum(Sales)) %>%
  ungroup() %>%
  time_decompose(totalsales) %>%
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
  plot_anomaly_decomposition()
## frequency = 7 days
## trend = 30 days
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
anomaly.detect

# Recomposing the series and plotting clearer points to check which dates have anomalies

sales_forecast %>%
  group_by(Date) %>%
  summarise(totalsales = sum(Sales), .groups = "drop") %>%
  time_decompose(totalsales, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd") %>%
  time_recompose() %>%
  # Anomaly visualization
  plot_anomalies(time_recomposed = TRUE, ncol = 6, color_no = "red", color_yes = "green", fill_ribbon = "yellow")
## frequency = 7 days
## trend = 30 days
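# Optional sketch: listing the flagged dates as a table instead of plotting them, by
# filtering on the anomaly column that anomalize() adds

sales_forecast %>%
  group_by(Date) %>%
  summarise(totalsales = sum(Sales), .groups = "drop") %>%
  time_decompose(totalsales, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd") %>%
  time_recompose() %>%
  filter(anomaly == "Yes") %>%
  select(Date, observed, anomaly)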

6. Conclusion

7. Recommendation