Part 3: Association Rules

# Loading Libraries

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(arulesViz)
# Loading dataset

url<-"http://bit.ly/SupermarketDatasetII"
sales <- read.transactions(url, sep = ',')
## Warning in asMethod(object): removing duplicated items in transactions
head(sales)
## transactions in sparse format with
##  6 transactions (rows) and
##  119 items (columns)
# Previewing the first 4 transactions

inspect(sales[1:4])
##     items               
## [1] {almonds,           
##      antioxydant juice, 
##      avocado,           
##      cottage cheese,    
##      energy drink,      
##      frozen smoothie,   
##      green grapes,      
##      green tea,         
##      honey,             
##      low fat yogurt,    
##      mineral water,     
##      olive oil,         
##      salad,             
##      salmon,            
##      shrimp,            
##      spinach,           
##      tomato juice,      
##      vegetables mix,    
##      whole weat flour,  
##      yams}              
## [2] {burgers,           
##      eggs,              
##      meatballs}         
## [3] {chutney}           
## [4] {avocado,           
##      turkey}
# Previewing summary statistics

summary(sales)
## transactions as itemMatrix in sparse format with
##  7501 rows (elements/itemsets/transactions) and
##  119 columns (items) and a density of 0.03288973 
## 
## most frequent items:
## mineral water          eggs     spaghetti  french fries     chocolate 
##          1788          1348          1306          1282          1229 
##       (Other) 
##         22405 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1754 1358 1044  816  667  493  391  324  259  139  102   67   40   22   17    4 
##   18   19   20 
##    1    2    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   3.914   5.000  20.000 
## 
## includes extended item information - examples:
##              labels
## 1           almonds
## 2 antioxydant juice
## 3         asparagus
# Previewing the labels of items

items <- as.data.frame(itemLabels(sales))
colnames(items) <- "Item"
head(items, 4)    
##                Item
## 1           almonds
## 2 antioxydant juice
## 3         asparagus
## 4           avocado
tail(items,4)
##                 Item
## 116 whole wheat rice
## 117             yams
## 118      yogurt cake
## 119         zucchini
# Checking the absolute frequency of the first 10 items

itemFrequency(sales[, 1:10], type = "absolute")
##           almonds antioxydant juice         asparagus           avocado 
##               153                67                36               250 
##       babies food             bacon    barbecue sauce         black tea 
##                34                65                81               107 
##       blueberries        body spray 
##                69                86
# Plotting the 10 most frequent items and all items with a support of at least 0.1

itemFrequencyPlot(sales, topN = 10, col = "green")

itemFrequencyPlot(sales, support = 0.1, col = "blue")

# Generating association rules with a minimum support of 0.001 and a minimum confidence of 0.8

rules <- apriori(sales, parameter = list(supp = 0.001, conf = 0.8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 7 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.01s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [74 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
rules
## set of 74 rules
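# Optional sketch (not part of the original run): the support, confidence and lift of
# every rule can also be pulled out as a plain data frame with arules' quality() accessor

measures <- quality(rules)
head(measures)
summary(measures$lift)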
# Inspecting the first 5 rules

inspect(rules[1:5])
##     lhs                              rhs             support     confidence
## [1] {frozen smoothie, spinach}    => {mineral water} 0.001066524 0.8888889 
## [2] {bacon, pancakes}             => {spaghetti}     0.001733102 0.8125000 
## [3] {nonfat milk, turkey}         => {mineral water} 0.001199840 0.8181818 
## [4] {ground beef, nonfat milk}    => {mineral water} 0.001599787 0.8571429 
## [5] {mushroom cream sauce, pasta} => {escalope}      0.002532996 0.9500000 
##     coverage    lift      count
## [1] 0.001199840  3.729058  8   
## [2] 0.002133049  4.666587 13   
## [3] 0.001466471  3.432428  9   
## [4] 0.001866418  3.595877 12   
## [5] 0.002666311 11.976387 19
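# Optional sketch: lift is usually the most informative of the three measures, so the rules
# can also be ranked by lift, and rules leading to a specific item isolated with arules'
# sort() and subset()

inspect(head(sort(rules, by = "lift"), 5))

mineral_rules <- subset(rules, subset = rhs %in% "mineral water")
inspect(head(mineral_rules, 3))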
# Visualization of the association rules

subrules <- head(rules, n = 10, by = "confidence")
plot(subrules, method = "graph" ,   engine = "htmlwidget")
plot(subrules, method = "graph" ,   engine = "default")

plot(subrules, method = "paracoord")

Part 4: Anomaly Detection

1. Loading libraries and dataset

# Loading libraries

library(tidyverse)
library(anomalize)
## ══ Use anomalize to improve your Forecasts by 50%! ═════════════════════════════
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(dplyr)
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:arules':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyverse)
library(tibbletime)
## 
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
## 
##     filter
library(dplyr)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(devtools)
## Loading required package: usethis
library(Rcpp)
library(anomaly)
## 
## Attaching package: 'anomaly'
## The following object is masked from 'package:stats':
## 
##     simulate
library(ggplot2)
# Loading dataset

url <- "http://bit.ly/CarreFourSalesDataset"

sales_forecast <- read.csv(url)

2. Checking the data

# Previewing the top of the dataset

head(sales_forecast)
##        Date    Sales
## 1  1/5/2019 548.9715
## 2  3/8/2019  80.2200
## 3  3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5  2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Previewing the bottom of the dataset

tail(sales_forecast)
##           Date     Sales
## 995  2/18/2019   63.9975
## 996  1/29/2019   42.3675
## 997   3/2/2019 1022.4900
## 998   2/9/2019   33.4320
## 999  2/22/2019   69.1110
## 1000 2/18/2019  649.2990
# Previewing shape

cat("The dataset has", nrow(sales_forecast), "rows", "and", ncol(sales_forecast), "columns")
## The dataset has 1000 rows and 2 columns
# Checking Data types

str(sales_forecast)
## 'data.frame':    1000 obs. of  2 variables:
##  $ Date : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Sales: num  549 80.2 340.5 489 634.4 ...
# Converting the Date column from character to Date format

sales_forecast$Date <- as.Date(sales_forecast$Date, format = "%m/%d/%Y")
head(sales_forecast)
##         Date    Sales
## 1 2019-01-05 548.9715
## 2 2019-03-08  80.2200
## 3 2019-03-03 340.5255
## 4 2019-01-27 489.0480
## 5 2019-02-08 634.3785
## 6 2019-03-25 627.6165
# Checking the unique dates in the dataset

unique(sales_forecast$Date)
##  [1] "2019-01-05" "2019-03-08" "2019-03-03" "2019-01-27" "2019-02-08"
##  [6] "2019-03-25" "2019-02-25" "2019-02-24" "2019-01-10" "2019-02-20"
## [11] "2019-02-06" "2019-03-09" "2019-02-12" "2019-02-07" "2019-03-29"
## [16] "2019-01-15" "2019-03-11" "2019-01-01" "2019-01-21" "2019-03-05"
## [21] "2019-03-15" "2019-02-17" "2019-03-02" "2019-03-22" "2019-03-10"
## [26] "2019-01-25" "2019-01-28" "2019-01-07" "2019-03-23" "2019-01-17"
## [31] "2019-02-02" "2019-03-04" "2019-03-16" "2019-02-27" "2019-02-10"
## [36] "2019-03-19" "2019-02-03" "2019-03-07" "2019-02-28" "2019-03-27"
## [41] "2019-01-20" "2019-03-12" "2019-02-15" "2019-03-06" "2019-02-14"
## [46] "2019-03-13" "2019-01-24" "2019-01-06" "2019-02-11" "2019-01-22"
## [51] "2019-01-13" "2019-01-09" "2019-01-12" "2019-01-26" "2019-01-23"
## [56] "2019-02-23" "2019-01-02" "2019-02-09" "2019-03-26" "2019-03-01"
## [61] "2019-02-01" "2019-03-28" "2019-03-24" "2019-02-05" "2019-01-19"
## [66] "2019-01-16" "2019-01-08" "2019-02-18" "2019-01-18" "2019-02-16"
## [71] "2019-02-22" "2019-01-29" "2019-01-04" "2019-03-30" "2019-01-30"
## [76] "2019-01-03" "2019-03-21" "2019-02-13" "2019-01-14" "2019-03-18"
## [81] "2019-03-20" "2019-02-21" "2019-01-31" "2019-01-11" "2019-02-26"
## [86] "2019-03-17" "2019-03-14" "2019-02-04" "2019-02-19"
# Converting the data frame to a tbl_time object indexed by Date

sales_forecast <- as_tbl_time(sales_forecast, index = Date)

3. Data Cleaning

# Checking column names

colnames(sales_forecast)
## [1] "Date"  "Sales"
# Checking for number of missing values

length(which(is.na(sales_forecast)))
## [1] 0
# Checking for duplicates

sum(duplicated(sales_forecast))
## [1] 0
# Checking for outliers

boxplot(sales_forecast$Sales)

# Listing the outliers

boxplot.stats(sales_forecast$Sales)$out
## [1] 1003.590 1039.290 1042.650 1002.120 1020.705 1034.460 1023.750 1022.385
## [9] 1022.490
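# Optional sketch: the values flagged above lie beyond the boxplot's upper whisker; the
# cutoff can be reproduced (approximately, since boxplot.stats() uses Tukey's hinges)
# with the usual 1.5 * IQR rule

q3 <- quantile(sales_forecast$Sales, 0.75)
upper_fence <- q3 + 1.5 * IQR(sales_forecast$Sales)
upper_fence
sales_forecast$Sales[sales_forecast$Sales > upper_fence]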

4. EDA

# Statistical summary of the dataset

describe(sales_forecast$Sales)
##    vars    n   mean     sd median trimmed    mad   min     max   range skew
## X1    1 1000 322.97 245.89 253.85  293.91 233.78 10.68 1042.65 1031.97 0.89
##    kurtosis   se
## X1    -0.09 7.78
# Plotting the sales data

ggplot(sales_forecast, aes(x=Date, y=Sales, color=Sales)) + geom_line()

# Frequency distribution of sales

hist(sales_forecast$Sales, col = 'gold', ylim = c(0, 250))

5. Anomaly Detection

# Performing anomaly detection using STL decomposition and the GESD test on the remainder

anomaly.detect <- sales_forecast %>%
  group_by(Date) %>%
  summarise(totalsales = sum(Sales)) %>%
  ungroup() %>%
  time_decompose(totalsales) %>%
  anomalize(remainder, method = "gesd", alpha = 0.05, max_anoms = 0.2) %>%
  plot_anomaly_decomposition()
## frequency = 7 days
## trend = 30 days
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
anomaly.detect

# Recomposing the series and plotting clearer points to check which dates have anomalies

sales_forecast %>%
  group_by(Date) %>%
  summarise(totalsales = sum(Sales), .groups = "drop") %>%
  time_decompose(totalsales, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd") %>%
  time_recompose() %>%
  # Anomaly visualization
  plot_anomalies(time_recomposed = TRUE, ncol = 6, color_no = "red", color_yes = "green", fill_ribbon = "yellow")
## frequency = 7 days
## trend = 30 days
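# Optional sketch: listing the flagged dates as a table instead of plotting them, by
# filtering on the anomaly column that anomalize() adds

sales_forecast %>%
  group_by(Date) %>%
  summarise(totalsales = sum(Sales), .groups = "drop") %>%
  time_decompose(totalsales, method = "stl", frequency = "auto", trend = "auto") %>%
  anomalize(remainder, method = "gesd") %>%
  time_recompose() %>%
  filter(anomaly == "Yes") %>%
  select(Date, observed, anomaly)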

6. Conclusion

7. Recommendation