library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
library(methods)
데이터 불러오기
# Order line items (columns: order_id, product_id, add_to_cart_order,
# reordered -- see the parsed column specification below).
ordr_pr <- read_csv(file = "dataset/instacartData/order_products__prior.csv")
## Parsed with column specification:
## cols(
## order_id = col_double(),
## product_id = col_double(),
## add_to_cart_order = col_double(),
## reordered = col_double()
## )
# Product lookup table (columns: product_id, product_name, aisle_id,
# department_id); joined to the order lines via product_id further down.
prods <- read_csv(file = "dataset/instacartData/products.csv")
## Parsed with column specification:
## cols(
## product_id = col_double(),
## product_name = col_character(),
## aisle_id = col_double(),
## department_id = col_double()
## )
데이터 준비: order_id로 그룹핑해서 주문별 상품명을 하나의 리스트로 묶기
# Build one row per order whose `basket` list-column holds the names of all
# products in that order. The inner join attaches product_name to each order
# line; list() collects the names of a group into a single list element.
order_baskets <- inner_join(ordr_pr, prods, by = "product_id") %>%
  group_by(order_id) %>%
  summarise(basket = list(product_name))
transaction 타입으로 변경
# Coerce the list of per-order product-name vectors into an arules
# `transactions` object, then preview the first few transactions.
transactions <- as(order_baskets[["basket"]], "transactions")
head(transactions)
## transactions in sparse format with
## 6 transactions (rows) and
## 31325 items (columns)
head()로 앞의 6개 거래만 확인한 결과, 31,325개 상품(열)을 가진 transactions 형식으로 정상적으로 변환됨 (6개는 전체 거래 수가 아니라 head()가 보여주는 개수임). 장바구니 분석하기 * 먼저 데이터를 살펴보자
# Histogram of basket sizes (number of items per transaction).
basket_sizes <- size(transactions)
# hist() stops with "some 'x' not counted" when the breaks do not span the
# data, so extend the default 0:100 grid if any basket holds more than 100
# items. With smaller baskets the breaks are exactly 0:100 as before.
upper_break <- max(100, basket_sizes)
hist(
  basket_sizes,
  breaks = 0:upper_break,
  xaxt = "n",
  ylim = c(0, 10000),
  main = "Number of Items per basket",
  xlab = "#Items"
)
# Custom x axis: a tick every 10 items (0..90 when the breaks stop at 100).
axis(1, at = seq(0, max(90, upper_break - 10), by = 10), cex.axis = 0.8)
# Subtitle with the total number of baskets and of items across all baskets.
mtext(paste("Total:", length(transactions), "baskets,", sum(basket_sizes), "items"))
어떤 아이템이 빈번하게 나타날까? support 기준을 0.02로 설정: 아이템이 전체 장바구니의 최소 2%에는 포함되어야 한다는 의미
# Absolute occurrence count of every item across all transactions.
item_frequency <- itemFrequency(transactions, type = "absolute")
support <- 0.02
# Minimum number of baskets an item must appear in to meet the support level.
min_count <- support * length(transactions)
# Sort ascending so the most frequent item ends up at the top of the
# horizontal bar chart, then keep only items above the support threshold.
freq_items <- sort(item_frequency, decreasing = FALSE)
freq_items <- freq_items[freq_items > min_count]
# Wide left margin for the item names; scipen avoids scientific notation on
# the count axis.
par(mar = c(2, 10, 2, 2))
options(scipen = 5)
barplot(
  freq_items,
  horiz = TRUE,
  las = 1,
  main = "Frequent Items",
  cex.names = .8,
  xlim = c(0, 11000)
)
mtext(paste("support:", support), padj = .8)
# Red vertical line marks the support threshold expressed as a count.
abline(v = min_count, col = "red")
과일이랑 야채들이 많음 - 바나나, 딸기, 아보카도 등등 Frequent Itemsets 빈발 아이템을 계산!, 크기가 2이상인 빈번한 항목 집합을 관찰할 가능성이 작은 것을 고려해 support 기준을 낮춤
# Mine frequent itemsets with at least 2 items at a lower support threshold.
support <- 0.008
itemsets <- apriori(
  transactions,
  parameter = list(
    target = "frequent itemsets",
    support = support,
    minlen = 2
  ),
  control = list(verbose = FALSE)
)
# Very wide left margin so the itemset labels fit.
par(mar = c(5, 18, 2, 2) + .1)
# Ascending sort puts the itemset with the highest support at the top of the
# horizontal bar chart.
sets_order_supp <- DATAFRAME(sort(itemsets, by = "support", decreasing = FALSE))
barplot(
  sets_order_supp$support,
  names.arg = sets_order_supp$items,
  xlim = c(0, 0.02),
  horiz = TRUE,
  las = 2,
  cex.names = .8,
  main = "Frequent Itemsets"
)
mtext(paste("support:", support), padj = .8)
우선, support 임계값이 0.008인 경우 빈번한 항목 집합으로는 쌍(2개 조합)만 관찰됨. 두 번째로, 바나나가 많음: support가 가장 높은 8쌍에는 바나나가 들어있음. 거의 모든 항목 집합에 과일과 야채가 들어있음. 우유를 포함한, 빈번하게 나타나는 쌍도 보임.
낮은 support 기준과 높은 confidence 를 사용해서 작은 항목에 대해서도 강력한 규칙을 생성
# Mine association rules with a very low support (0.0001) but a high
# confidence (0.6), limited to at most 3 items per rule.
rules1 <- apriori(
  transactions,
  parameter = list(support = 0.0001, confidence = 0.6, maxlen = 3),
  control = list(verbose = FALSE)
)
# Distribution of the quality measures (support, confidence, lift, count).
summary(quality(rules1))
## support confidence lift count
## Min. :0.0001089 Min. :0.6000 Min. : 4.088 Min. : 7.000
## 1st Qu.:0.0001089 1st Qu.:0.6364 1st Qu.: 5.116 1st Qu.: 7.000
## Median :0.0001245 Median :0.6667 Median : 8.254 Median : 8.000
## Mean :0.0001491 Mean :0.6981 Mean : 143.684 Mean : 9.586
## 3rd Qu.:0.0001556 3rd Qu.:0.7333 3rd Qu.: 93.511 3rd Qu.:10.000
## Max. :0.0010424 Max. :1.0000 Max. :4249.719 Max. :67.000
# Default arulesViz plot of the mined rules.
plot(rules1)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
품목 간에 강한 연관성을 나타내는, 향상도(lift)가 매우 높은 규칙이 있다! 이러한 규칙을 자세히 살펴보자
# Show the 10 rules with the highest lift. head() is used instead of [1:10]
# so the call still works when fewer than 10 rules were mined.
inspect(head(sort(rules1, by = "lift"), n = 10))
## lhs rhs support confidence lift count
## [1] {Purina Mixed Grill Classic Pate Cat Food} => {Liver & Chicken Dinner Classic Pate Cat Food} 0.0001244613 0.7272727 4249.719 8
## [2] {Liver & Chicken Dinner Classic Pate Cat Food} => {Purina Mixed Grill Classic Pate Cat Food} 0.0001244613 0.7272727 4249.719 8
## [3] {Fancy Feast Wet Classic Chicken Feast Cat Food} => {Classic Tender Liver & Chicken Feast Cat Food} 0.0001089037 0.7000000 3461.069 7
## [4] {Organic Basil Babe Level 1 Baby Food} => {Peachy Keen Organic Level 1} 0.0001089037 0.6363636 3146.427 7
## [5] {Vanilla Pot De Creme} => {Dark Chocolate Pot De Creme} 0.0001244613 0.7272727 2921.682 8
## [6] {Organic Nondairy Lemon Cashew Yogurt} => {Organic Nondairy Strawberry Cashew Yogurt} 0.0001244613 0.8000000 2448.648 8
## [7] {Organic Blueberry Lowfat Yogurt} => {Yogurt, Organic, Lowfat, Strawberry} 0.0001089037 0.6363636 2406.091 7
## [8] {Stage 1 First Apples} => {Organic Stage 1 First Peas Baby Food} 0.0001244613 0.8888889 2285.404 8
## [9] {Unsweet Peach Water,
## Unsweetened Blackberry Water} => {Water, Unsweet, Blood Orange} 0.0001089037 0.6363636 2272.419 7
## [10] {9 Inch Plates,
## Plastic Spoons} => {Compostable Forks} 0.0001089037 1.0000000 2008.656 7
# Show the 10 rules with the highest confidence. head() is used instead of
# [1:10] so the call still works when fewer than 10 rules were mined.
inspect(head(sort(rules1, by = "confidence"), n = 10))
## lhs rhs support confidence lift count
## [1] {Water Mineral,
## Zero Calorie Cola} => {Soda} 0.0001089037 1 87.33288 7
## [2] {Sandwich Cookies & Crackers Variety Snack Packs,
## Zero Calorie Cola} => {Soda} 0.0001089037 1 87.33288 7
## [3] {Mighty 4 Sweet Potato, Blueberry, Millet & Greek Yogurt Tots Snack,
## Mighty 4: Pumpkin Pomegranate Quinoa Greek Yogurt Nutrition Blend for Tots} => {Mighty 4 Purple Carrot Blackberry Quinoa & Greek Yogurt Tots Snack} 0.0001400190 1 1260.33333 9
## [4] {Chocolate Sandwich Cookies,
## Clementines} => {Hass Avocados} 0.0001089037 1 65.58878 7
## [5] {Chocolate Sandwich Cookies,
## Hass Avocados} => {Clementines} 0.0001089037 1 113.96631 7
## [6] {9 Inch Plates,
## Plastic Spoons} => {Compostable Forks} 0.0001089037 1 2008.65625 7
## [7] {Milk Chocolate Covered Raisins,
## Popcorn} => {Extra Fancy Unsalted Mixed Nuts} 0.0001089037 1 389.55758 7
## [8] {Lemon Sparkling Water,
## Peach-Pear Sparkling Water} => {Grapefruit Sparkling Water} 0.0001400190 1 209.37134 9
## [9] {Mixed Fruit Fruit Snacks,
## Orange & Lemon Flavor Variety Pack Sparkling Fruit Beverage} => {Soda} 0.0001089037 1 87.33288 7
## [10] {Lemon Fruit & Nut Food Bar,
## Pecan Pie Fruit & Nut Food Bar} => {Blueberry Muffin Bar} 0.0001400190 1 455.86525 9
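이러한 규칙은 대개 유사한 품목끼리 함께 구입되는 경우에 해당하는 것 같음. 바나나를 포함하는 규칙은 없음. * 다음으로 먼저 confidence를 낮추고(rules2: support 0.0001 유지, confidence 0.4), 이어서 support를 높이고 confidence를 더 낮추어(rules3: support 0.0005, confidence 0.1) 좀 더 빈번한 품목 규칙을 보자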
# Mine association rules with a very low support (0.0001) and a moderate
# confidence (0.4), limited to at most 3 items per rule.
rules2 <- apriori(
  transactions,
  parameter = list(support = 0.0001, confidence = 0.4, maxlen = 3),
  control = list(verbose = FALSE)
)
# Distribution of the quality measures (support, confidence, lift, count).
summary(quality(rules2))
## support confidence lift count
## Min. :0.0001089 Min. :0.4000 Min. : 2.725 Min. : 7.00
## 1st Qu.:0.0001089 1st Qu.:0.4324 1st Qu.: 3.634 1st Qu.: 7.00
## Median :0.0001400 Median :0.4737 Median : 5.260 Median : 9.00
## Mean :0.0001771 Mean :0.5082 Mean : 67.092 Mean : 11.38
## 3rd Qu.:0.0001867 3rd Qu.:0.5500 3rd Qu.: 13.151 3rd Qu.: 12.00
## Max. :0.0034538 Max. :1.0000 Max. :4249.719 Max. :222.00
# Default arulesViz plot of the mined rules.
plot(rules2)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Show the 10 rules with the highest lift. head() is used instead of [1:10]
# so the call still works when fewer than 10 rules were mined.
inspect(head(sort(rules2, by = "lift"), n = 10))
## lhs rhs support confidence lift count
## [1] {Purina Mixed Grill Classic Pate Cat Food} => {Liver & Chicken Dinner Classic Pate Cat Food} 0.0001244613 0.7272727 4249.719 8
## [2] {Liver & Chicken Dinner Classic Pate Cat Food} => {Purina Mixed Grill Classic Pate Cat Food} 0.0001244613 0.7272727 4249.719 8
## [3] {Classic Tender Liver & Chicken Feast Cat Food} => {Fancy Feast Wet Classic Chicken Feast Cat Food} 0.0001089037 0.5384615 3461.069 7
## [4] {Fancy Feast Wet Classic Chicken Feast Cat Food} => {Classic Tender Liver & Chicken Feast Cat Food} 0.0001089037 0.7000000 3461.069 7
## [5] {Organic Basil Babe Level 1 Baby Food} => {Peachy Keen Organic Level 1} 0.0001089037 0.6363636 3146.427 7
## [6] {Peachy Keen Organic Level 1} => {Organic Basil Babe Level 1 Baby Food} 0.0001089037 0.5384615 3146.427 7
## [7] {Vanilla Pot De Creme} => {Dark Chocolate Pot De Creme} 0.0001244613 0.7272727 2921.682 8
## [8] {Dark Chocolate Pot De Creme} => {Vanilla Pot De Creme} 0.0001244613 0.5000000 2921.682 8
## [9] {Organic Forest Berry Cream On Top Whole Milk Yogurt} => {Organic Blueberry Cream On Top Whole Milk Yogurt} 0.0001089037 0.5833333 2884.224 7
## [10] {Organic Blueberry Cream On Top Whole Milk Yogurt} => {Organic Forest Berry Cream On Top Whole Milk Yogurt} 0.0001089037 0.5384615 2884.224 7
# Show the 10 rules with the highest confidence. head() is used instead of
# [1:10] so the call still works when fewer than 10 rules were mined.
inspect(head(sort(rules2, by = "confidence"), n = 10))
## lhs rhs support confidence lift count
## [1] {Water Mineral,
## Zero Calorie Cola} => {Soda} 0.0001089037 1 87.33288 7
## [2] {Sandwich Cookies & Crackers Variety Snack Packs,
## Zero Calorie Cola} => {Soda} 0.0001089037 1 87.33288 7
## [3] {Mighty 4 Sweet Potato, Blueberry, Millet & Greek Yogurt Tots Snack,
## Mighty 4: Pumpkin Pomegranate Quinoa Greek Yogurt Nutrition Blend for Tots} => {Mighty 4 Purple Carrot Blackberry Quinoa & Greek Yogurt Tots Snack} 0.0001400190 1 1260.33333 9
## [4] {Chocolate Sandwich Cookies,
## Clementines} => {Hass Avocados} 0.0001089037 1 65.58878 7
## [5] {Chocolate Sandwich Cookies,
## Hass Avocados} => {Clementines} 0.0001089037 1 113.96631 7
## [6] {9 Inch Plates,
## Plastic Spoons} => {Compostable Forks} 0.0001089037 1 2008.65625 7
## [7] {Milk Chocolate Covered Raisins,
## Popcorn} => {Extra Fancy Unsalted Mixed Nuts} 0.0001089037 1 389.55758 7
## [8] {Lemon Sparkling Water,
## Peach-Pear Sparkling Water} => {Grapefruit Sparkling Water} 0.0001400190 1 209.37134 9
## [9] {Mixed Fruit Fruit Snacks,
## Orange & Lemon Flavor Variety Pack Sparkling Fruit Beverage} => {Soda} 0.0001089037 1 87.33288 7
## [10] {Lemon Fruit & Nut Food Bar,
## Pecan Pie Fruit & Nut Food Bar} => {Blueberry Muffin Bar} 0.0001400190 1 455.86525 9
# Mine association rules with a higher support (0.0005) and a low confidence
# (0.1), limited to at most 3 items per rule.
rules3 <- apriori(
  transactions,
  parameter = list(support = 0.0005, confidence = 0.1, maxlen = 3),
  control = list(verbose = FALSE)
)
# Distribution of the quality measures (support, confidence, lift, count).
summary(quality(rules3))
## support confidence lift count
## Min. :0.0005134 Min. :0.1000 Min. : 0.7237 Min. : 33.00
## 1st Qu.:0.0006067 1st Qu.:0.1409 1st Qu.: 1.9424 1st Qu.: 39.00
## Median :0.0007468 Median :0.1912 Median : 2.7183 Median : 48.00
## Mean :0.0011682 Mean :0.2104 Mean : 7.2884 Mean : 75.09
## 3rd Qu.:0.0011202 3rd Qu.:0.2570 3rd Qu.: 3.9553 3rd Qu.: 72.00
## Max. :0.1467710 Max. :0.7679 Max. :766.8133 Max. :9434.00
# Default arulesViz plot of the mined rules.
plot(rules3)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Show the 10 rules with the highest lift. head() is used instead of [1:10]
# so the call still works when fewer than 10 rules were mined.
inspect(head(sort(rules3, by = "lift"), n = 10))
## lhs rhs support confidence lift count
## [1] {Oh My Yog! Organic Wild Quebec Blueberry Cream Top Yogurt & Fruit} => {Oh My Yog! Pacific Coast Strawberry Trilayer Yogurt} 0.0005289606 0.6800000 766.8133 34
## [2] {Oh My Yog! Pacific Coast Strawberry Trilayer Yogurt} => {Oh My Yog! Organic Wild Quebec Blueberry Cream Top Yogurt & Fruit} 0.0005289606 0.5964912 766.8133 34
## [3] {Greek Whole Milk Blended Blueberry Yogurt} => {Organic Greek Whole Milk Blended Strawberry Yogurt} 0.0005445183 0.6481481 534.1156 35
## [4] {Organic Greek Whole Milk Blended Strawberry Yogurt} => {Greek Whole Milk Blended Blueberry Yogurt} 0.0005445183 0.4487179 534.1156 35
## [5] {Coconut Chocolate Bar} => {Chocolate Sea Salt} 0.0005289606 0.5230769 400.2597 34
## [6] {Chocolate Sea Salt} => {Coconut Chocolate Bar} 0.0005289606 0.4047619 400.2597 34
## [7] {Almond Milk Strawberry Yogurt} => {Almond Milk Blueberry Yogurt} 0.0008089986 0.4482759 335.0445 52
## [8] {Almond Milk Blueberry Yogurt} => {Almond Milk Strawberry Yogurt} 0.0008089986 0.6046512 335.0445 52
## [9] {Almond Milk Peach Yogurt} => {Almond Milk Strawberry Yogurt} 0.0009179022 0.6020408 333.5981 59
## [10] {Almond Milk Strawberry Yogurt} => {Almond Milk Peach Yogurt} 0.0009179022 0.5086207 333.5981 59
# Show the 10 rules with the highest confidence. head() is used instead of
# [1:10] so the call still works when fewer than 10 rules were mined.
inspect(head(sort(rules3, by = "confidence"), n = 10))
## lhs rhs support confidence lift count
## [1] {Peach Pear Flavored Sparkling Water,
## Pure Sparkling Water} => {Sparkling Water Grapefruit} 0.0006689796 0.7678571 31.13915 43
## [2] {Total 2% All Natural Greek Strained Yogurt with Honey,
## Total 2% Lowfat Greek Strained Yogurt With Blueberry} => {Total 2% with Strawberry Lowfat Greek Strained Yogurt} 0.0008867869 0.7307692 76.87668 57
## [3] {Pure Sparkling Water,
## Sparkling Water Berry} => {Sparkling Water Grapefruit} 0.0006845372 0.6875000 27.88040 44
## [4] {Lime Sparkling Water,
## Peach Pear Flavored Sparkling Water} => {Sparkling Water Grapefruit} 0.0010112482 0.6842105 27.74700 65
## [5] {Total 2% All Natural Greek Strained Yogurt with Honey,
## Total 2% Lowfat Greek Strained Yogurt with Peach} => {Total 2% with Strawberry Lowfat Greek Strained Yogurt} 0.0006689796 0.6825397 71.80295 43
## [6] {Oh My Yog! Organic Wild Quebec Blueberry Cream Top Yogurt & Fruit} => {Oh My Yog! Pacific Coast Strawberry Trilayer Yogurt} 0.0005289606 0.6800000 766.81333 34
## [7] {Blueberry on the Bottom Nonfat Greek Yogurt,
## Peach on the Bottom Nonfat Greek Yogurt} => {Strawberry on the Bottom Nonfat Greek Yogurt} 0.0005289606 0.6800000 291.38907 34
## [8] {Total 2% All Natural Greek Strained Yogurt with Honey,
## Total 2% Greek Strained Yogurt with Cherry 5.3 oz} => {Total 2% with Strawberry Lowfat Greek Strained Yogurt} 0.0006067489 0.6724138 70.73771 39
## [9] {Pure Sparkling Water,
## Sparkling Lemon Water} => {Sparkling Water Grapefruit} 0.0009801329 0.6702128 27.17935 63
## [10] {Lime Sparkling Water,
## Sparkling Water Berry} => {Sparkling Water Grapefruit} 0.0009334599 0.6666667 27.03554 60