# Load package arules
library(arules)
## Warning: package 'arules' was built under R version 4.1.3
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.1.3
## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.
library(grid)
library(readxl)
pathname <- "C:\\Users\\katie\\OneDrive\\Documents\\Information Systems\\Data Mining"
setwd(pathname)
fname <- "winequality-both.xlsx"
data <- read_excel(fname, col_names = T)
head(data)
## # A tibble: 6 x 13
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 0.27 0.36 20.7 0.045
## 2 6.3 0.3 0.34 1.6 0.049
## 3 8.1 0.28 0.4 6.9 0.05
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.4 6.9 0.05
## # ... with 8 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>, type <chr>
encode_ordinal <- function(x, order = unique(x)) {
x <- as.numeric(factor(x, levels = order, exclude = NULL))
x}
encoded_classes <- encode_ordinal(data$type)
keep_cols <- c("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density","pH","sulphates","alcohol","quality")
data <- data[keep_cols]
data$wine_type <- encoded_classes
head(data)
## # A tibble: 6 x 13
## fixed_acidity volatile_acidity citric_acid residual_sugar chlorides
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 0.27 0.36 20.7 0.045
## 2 6.3 0.3 0.34 1.6 0.049
## 3 8.1 0.28 0.4 6.9 0.05
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.4 6.9 0.05
## # ... with 8 more variables: free_sulfur_dioxide <dbl>,
## # total_sulfur_dioxide <dbl>, density <dbl>, pH <dbl>, sulphates <dbl>,
## # alcohol <dbl>, quality <dbl>, wine_type <dbl>
rules <- apriori(data, parameter = list(support = 0.01, confidence = 0.3))
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 not logical or
## factor. Applying default discretization (see '? discretizeDF').
## Warning in discretize(x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, : The calculated breaks are: 1, 1, 1, 2
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.3 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 64
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[37 item(s), 6497 transaction(s)] done [0.01s].
## sorting and recoding items ... [37 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(data, parameter = list(support = 0.01, confidence = 0.3)):
## Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!
## done [0.28s].
## writing ... [565728 rule(s)] done [0.13s].
## creating S4 object ... done [0.42s].
summary(rules)
## set of 565728 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8 9 10
## 36 831 10956 75013 168182 168127 96601 36811 8292 879
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 5.724 7.000 10.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01000 Min. :0.3000 Min. :0.01000 Min. :0.4739
## 1st Qu.:0.01185 1st Qu.:0.4440 1st Qu.:0.01816 1st Qu.:1.1234
## Median :0.01478 Median :0.6033 Median :0.02678 Median :1.4931
## Mean :0.01949 Mean :0.6411 Mean :0.03459 Mean :1.6451
## 3rd Qu.:0.02124 3rd Qu.:0.8372 3rd Qu.:0.03940 3rd Qu.:2.0922
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :3.1621
## count
## Min. : 65.0
## 1st Qu.: 77.0
## Median : 96.0
## Mean : 126.7
## 3rd Qu.: 138.0
## Max. :6497.0
##
## mining info:
## data ntransactions support confidence
## data 6497 0.01 0.3
## call
## apriori(data = data, parameter = list(support = 0.01, confidence = 0.3))
inspect(head(sort(rules, by ="lift")))
## lhs rhs support confidence coverage lift count
## [1] {citric_acid=[0,0.27),
## residual_sugar=[0.6,2.1),
## density=[0.987,0.993),
## pH=[3.28,4.01],
## alcohol=[11,14.9],
## quality=[6,9]} => {fixed_acidity=[3.8,6.6)} 0.01000462 0.9558824 0.01046637 3.162102 65
## [2] {citric_acid=[0,0.27),
## residual_sugar=[0.6,2.1),
## density=[0.987,0.993),
## pH=[3.28,4.01],
## alcohol=[11,14.9],
## quality=[6,9],
## wine_type=[1,2]} => {fixed_acidity=[3.8,6.6)} 0.01000462 0.9558824 0.01046637 3.162102 65
## [3] {fixed_acidity=[6.6,7.4),
## volatile_acidity=[0.35,1.58],
## chlorides=[0.055,0.611],
## total_sulfur_dioxide=[6,95),
## density=[0.993,0.996),
## alcohol=[9.7,11)} => {citric_acid=[0,0.27)} 0.01169771 0.9870130 0.01185162 3.152716 76
## [4] {fixed_acidity=[6.6,7.4),
## volatile_acidity=[0.35,1.58],
## chlorides=[0.055,0.611],
## density=[0.993,0.996),
## pH=[3.28,4.01],
## alcohol=[9.7,11)} => {citric_acid=[0,0.27)} 0.01169771 0.9870130 0.01185162 3.152716 76
## [5] {fixed_acidity=[6.6,7.4),
## volatile_acidity=[0.35,1.58],
## chlorides=[0.055,0.611],
## total_sulfur_dioxide=[6,95),
## density=[0.993,0.996),
## alcohol=[9.7,11),
## wine_type=[1,2]} => {citric_acid=[0,0.27)} 0.01169771 0.9870130 0.01185162 3.152716 76
## [6] {fixed_acidity=[6.6,7.4),
## volatile_acidity=[0.35,1.58],
## chlorides=[0.055,0.611],
## density=[0.993,0.996),
## pH=[3.28,4.01],
## alcohol=[9.7,11),
## wine_type=[1,2]} => {citric_acid=[0,0.27)} 0.01169771 0.9870130 0.01185162 3.152716 76
plot(rules)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
