Understanding customer preferences and trends is important for businesses to adjust their products, marketing strategies, and overall customer experience. This study analyzes the relationship between factors influencing the purchase frequency of target customer groups by using association rules to determine the purchasing patterns of segment groups.
# Packages & Libraries
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load(arulesCBA,arulesViz)
The dataset used for this analysis is shopping_trends.csv, which contains information about customer behavior including: age, gender, category,frequency provided by Kaggle (https://www.kaggle.com/datasets/iamsouravbanerjee/customer-shopping-trends-datasets)
# Import dataset
data<-read.csv("shopping_trends.csv", sep=",", dec=".", header=TRUE)
data <- data[-1]
head(data)
## Age Gender Item.Purchased Category Purchase.Amount..USD. Location Size
## 1 55 Male Blouse Clothing 53 Kentucky L
## 2 19 Male Sweater Clothing 64 Maine L
## 3 50 Male Jeans Clothing 73 Massachusetts S
## 4 21 Male Sandals Footwear 90 Rhode Island M
## 5 45 Male Blouse Clothing 49 Oregon M
## 6 46 Male Sneakers Footwear 20 Wyoming M
## Color Season Review.Rating Subscription.Status Payment.Method
## 1 Gray Winter 3.1 Yes Credit Card
## 2 Maroon Winter 3.1 Yes Bank Transfer
## 3 Maroon Spring 3.1 Yes Cash
## 4 Maroon Spring 3.5 Yes PayPal
## 5 Turquoise Spring 2.7 Yes Cash
## 6 White Summer 2.9 Yes Venmo
## Shipping.Type Discount.Applied Promo.Code.Used Previous.Purchases
## 1 Express Yes Yes 14
## 2 Express Yes Yes 2
## 3 Free Shipping Yes Yes 23
## 4 Next Day Air Yes Yes 49
## 5 Free Shipping Yes Yes 31
## 6 Standard Yes Yes 14
## Preferred.Payment.Method Frequency.of.Purchases
## 1 Venmo Fortnightly
## 2 Cash Fortnightly
## 3 Credit Card Weekly
## 4 PayPal Weekly
## 5 PayPal Annually
## 6 Venmo Weekly
# Checking data
dim(data)
## [1] 3900 18
summary(data)
## Age Gender Item.Purchased Category
## Min. :18.00 Length:3900 Length:3900 Length:3900
## 1st Qu.:31.00 Class :character Class :character Class :character
## Median :44.00 Mode :character Mode :character Mode :character
## Mean :44.07
## 3rd Qu.:57.00
## Max. :70.00
## Purchase.Amount..USD. Location Size Color
## Min. : 20.00 Length:3900 Length:3900 Length:3900
## 1st Qu.: 39.00 Class :character Class :character Class :character
## Median : 60.00 Mode :character Mode :character Mode :character
## Mean : 59.76
## 3rd Qu.: 81.00
## Max. :100.00
## Season Review.Rating Subscription.Status Payment.Method
## Length:3900 Min. :2.50 Length:3900 Length:3900
## Class :character 1st Qu.:3.10 Class :character Class :character
## Mode :character Median :3.70 Mode :character Mode :character
## Mean :3.75
## 3rd Qu.:4.40
## Max. :5.00
## Shipping.Type Discount.Applied Promo.Code.Used Previous.Purchases
## Length:3900 Length:3900 Length:3900 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.:13.00
## Mode :character Mode :character Mode :character Median :25.00
## Mean :25.35
## 3rd Qu.:38.00
## Max. :50.00
## Preferred.Payment.Method Frequency.of.Purchases
## Length:3900 Length:3900
## Class :character Class :character
## Mode :character Mode :character
##
##
##
Checking distribution of numeric datas
summary(data$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 31.00 44.00 44.07 57.00 70.00
hist(data$Age, col = "brown",
xlab = "Age", ylab = "Number of customer",
main = "Histogram of age variable")
summary(data$Purchase.Amount..USD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.00 39.00 60.00 59.76 81.00 100.00
hist(data$Purchase.Amount..USD, col = "blue",
xlab = "Purchase.Amount", ylab = "Number of customer",
main = "Histogram of Purchase.Amount variable")
summary(data$Previous.Purchases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 13.00 25.00 25.35 38.00 50.00
hist(data$Previous.Purchases, col = "green",
xlab = "Previous.Purchases", ylab = "Number of customer",
main = "Histogram of Previous.Purchases variable")
summary(data$Review.Rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.50 3.10 3.70 3.75 4.40 5.00
hist(data$Previous.Purchases, col = "yellow",
xlab = "Review.Rating", ylab = "Number of customer",
main = "Histogram of Review.Rating variable")
# Preparation
columns_to_factor <- c("Gender", "Item.Purchased", "Category",
"Location", "Size", "Color", "Season", "Season",
"Subscription.Status", "Payment.Method",
"Shipping.Type", "Discount.Applied",
"Promo.Code.Used", "Preferred.Payment.Method", "Frequency.of.Purchases")
# Eliminate duplication
columns_to_factor <- unique(columns_to_factor)
columns_to_factor <- intersect(columns_to_factor, colnames(data))
# Covert columns to factor
data[columns_to_factor] <- lapply(data[columns_to_factor], factor)
med.mdlp <- discretizeDF.supervised(Frequency.of.Purchases ~ ., data = data, method = "mdlp")
summary(med.mdlp)
## Age Gender Item.Purchased Category
## [-Inf, Inf]:3900 Female:1248 Blouse : 171 Accessories:1240
## Male :2652 Jewelry: 171 Clothing :1737
## Pants : 171 Footwear : 599
## Shirt : 169 Outerwear : 324
## Dress : 166
## Sweater: 164
## (Other):2888
## Purchase.Amount..USD. Location Size Color Season
## [-Inf, Inf]:3900 Montana : 96 L :1053 Olive : 177 Fall :975
## California: 95 M :1755 Yellow : 174 Spring:999
## Idaho : 93 S : 663 Silver : 173 Summer:955
## Illinois : 92 XL: 429 Teal : 172 Winter:971
## Alabama : 89 Green : 169
## Minnesota : 88 Black : 167
## (Other) :3347 (Other):2868
## Review.Rating Subscription.Status Payment.Method
## [-Inf, Inf]:3900 No :2847 Bank Transfer:632
## Yes:1053 Cash :648
## Credit Card :696
## Debit Card :633
## PayPal :638
## Venmo :653
##
## Shipping.Type Discount.Applied Promo.Code.Used Previous.Purchases
## 2-Day Shipping:627 No :2223 No :2223 [-Inf, Inf]:3900
## Express :646 Yes:1677 Yes:1677
## Free Shipping :675
## Next Day Air :648
## Standard :654
## Store Pickup :650
##
## Preferred.Payment.Method Frequency.of.Purchases
## Bank Transfer:612 Annually :572
## Cash :670 Bi-Weekly :547
## Credit Card :671 Every 3 Months:584
## Debit Card :636 Fortnightly :542
## PayPal :677 Monthly :553
## Venmo :634 Quarterly :563
## Weekly :539
Method mdlp do not devide continuous data in a effective way
med.caim <- discretizeDF.supervised(Frequency.of.Purchases ~ ., data = data, method = "caim")
summary(med.caim)
## Age Gender Item.Purchased Category
## [29.5,52.5):1691 Female:1248 Blouse : 171 Accessories:1240
## [55.5,67.5): 875 Male :2652 Jewelry: 171 Clothing :1737
## [22.5,29.5): 531 Pants : 171 Footwear : 599
## [18,22.5) : 347 Shirt : 169 Outerwear : 324
## [52.5,55.5): 226 Dress : 166
## [67.5,69.5): 163 Sweater: 164
## (Other) : 67 (Other):2888
## Purchase.Amount..USD. Location Size Color Season
## [59.5,91.5):1518 Montana : 96 L :1053 Olive : 177 Fall :975
## [45.5,59.5): 637 California: 95 M :1755 Yellow : 174 Spring:999
## [20,29.5) : 503 Idaho : 93 S : 663 Silver : 173 Summer:955
## [91.5,100) : 409 Illinois : 92 XL: 429 Teal : 172 Winter:971
## [32.5,39.5): 349 Alabama : 89 Green : 169
## [39.5,45.5): 286 Minnesota : 88 Black : 167
## (Other) : 198 (Other) :3347 (Other):2868
## Review.Rating Subscription.Status Payment.Method
## [2.55,3.55):1580 No :2847 Bank Transfer:632
## [4.15,4.85):1081 Yes:1053 Cash :648
## [3.95,4.15): 329 Credit Card :696
## [3.55,3.75): 305 Debit Card :633
## [3.75,3.95): 305 PayPal :638
## [4.85,5) : 166 Venmo :653
## (Other) : 134
## Shipping.Type Discount.Applied Promo.Code.Used Previous.Purchases
## 2-Day Shipping:627 No :2223 No :2223 [1,21.5) :1657
## Express :646 Yes:1677 Yes:1677 [35.5,48.5):1012
## Free Shipping :675 [21.5,31.5): 791
## Next Day Air :648 [31.5,33.5): 163
## Standard :654 [33.5,34.5): 79
## Store Pickup :650 [50, Inf] : 77
## (Other) : 121
## Preferred.Payment.Method Frequency.of.Purchases
## Bank Transfer:612 Annually :572
## Cash :670 Bi-Weekly :547
## Credit Card :671 Every 3 Months:584
## Debit Card :636 Fortnightly :542
## PayPal :677 Monthly :553
## Venmo :634 Quarterly :563
## Weekly :539
With caim method, continuous datas seem to be devided better
# Association rules with support= 3% and confidence 10%
M2.trans<-transactions(med.caim)
M2.ass <- mineCARs(Frequency.of.Purchases ~ ., transactions = M2.trans, support = 0.03, confidence = 0.1)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.1 0.1 1 none FALSE FALSE 5 0.03 1
## maxlen target ext
## 5 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 117
##
## set item appearances ...[181 item(s)] done [0.00s].
## set transactions ...[181 item(s), 3900 transaction(s)] done [0.00s].
## sorting and recoding items ... [176 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 done [0.08s].
## writing ... [42233 rule(s)] done [0.01s].
## creating S4 object ... done [0.01s].
# Cleaning rules
M2.clean <- M2.ass[!is.redundant(M2.ass)]
M2.clean<-M2.clean[is.significant(M2.clean, M2.trans)]
M2.clean<-M2.clean[is.maximal(M2.clean)]
summary(M2.clean)
## set of 120 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5
## 5 18 81 16
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 4.0 4.0 3.9 4.0 5.0
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.006923 Min. :0.1784 Min. :0.03000 Min. :1.291
## 1st Qu.:0.007692 1st Qu.:0.2131 1st Qu.:0.03308 1st Qu.:1.498
## Median :0.008718 Median :0.2240 Median :0.03923 Median :1.566
## Mean :0.009429 Mean :0.2234 Mean :0.04276 Mean :1.559
## 3rd Qu.:0.010513 3rd Qu.:0.2326 3rd Qu.:0.04827 3rd Qu.:1.621
## Max. :0.023077 Max. :0.2727 Max. :0.12103 Max. :1.889
## count
## Min. :27.00
## 1st Qu.:30.00
## Median :34.00
## Mean :36.77
## 3rd Qu.:41.00
## Max. :90.00
##
## mining info:
## data ntransactions support confidence
## transactions 3900 0.03 0.1
## call
## apriori(data = transactions, parameter = parameter, appearance = list(rhs = vars$class_items, lhs = vars$feature_items), control = control)
inspectDT(M2.clean)
plot(M2.clean, method="graph", engine="htmlwidget")
## Warning: Too many rules supplied. Only plotting the best 100 using 'lift'
## (change control parameter max if needed).
plot(M2.clean, measure=c("support","lift"), shading="confidence")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.