Loading Data and Libraries
### Connecting data base and checking data
library(DBI)
library(RMySQL)
library(dplyr)
library(ggplot2)
library(arules)
# 2. Connecting to db
db_user <- 'data_student_berlin'
db_password <- 'waai_has_shitty_internet'
db_name <- 'pricehub'
db_host <- '34.89.228.59' # for local access
db_port <- 3306
mydb <- dbConnect(MySQL(), user = db_user, password = db_password,
dbname = db_name, host = db_host, port = db_port)
# 3. Read data from db
db_table <- 'line_item'
s <- paste0("select * from ", db_table)
rs <- dbSendQuery(mydb, s)
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 4 imported as
## numeric
line_item <- fetch(rs, n = -1)
##Import df 'orders'
db_table <- 'orders'
s <- paste0("select * from ", db_table)
rs <- dbSendQuery(mydb, s)
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 3 imported as
## numeric
orders <- fetch(rs, n = -1)
##Import df 'products'
db_table <- 'products'
s <- paste0("select * from ", db_table)
rs <- dbSendQuery(mydb, s)
## Warning in .local(conn, statement, ...): Decimal MySQL column 3 imported as
## numeric
products <- fetch(rs, n = -1)
on.exit(dbDisconnect(mydb))
## Warning: Closing open result sets
Data Cleaning and filtering
as.factor(orders$state)
## joining line item and orders
join1 <- inner_join(line_item, orders, by="id_order")
head(join1)
## id id_order product_quantity sku unit_price date
## 1 1119109 299539 1 OTT0127 19 2017-01-01 00:07:19
## 2 1119110 299540 1 LGE0037 399 2017-01-01 00:19:45
## 3 1119111 299541 1 PAR0065 474 2017-01-01 00:20:57
## 4 1119112 299542 1 WDT0309 68 2017-01-01 00:51:40
## 5 1119113 299543 1 JBL0098 24 2017-01-01 01:06:38
## 6 1119115 299544 1 APP1576 1138 2017-01-01 01:17:21
## created_date state total_paid
## 1 2017-01-01 00:07:19 Shopping basket 19
## 2 2017-01-01 00:19:45 Shopping basket 399
## 3 2017-01-01 00:20:57 Shopping basket 474
## 4 2017-01-01 00:51:40 Shopping basket 68
## 5 2017-01-01 01:06:38 Shopping basket 24
## 6 2017-01-01 01:17:21 Shopping basket 1138
## joining previous with products
join_full <- inner_join(join1, products, by="sku")
## filtering out non completed orders
join_full <- join_full %>%
filter(join_full$state == "Completed")
summary(join_full)
## id id_order product_quantity sku
## Min. :1119116 Min. :241423 Min. : 1.000 Length:61742
## 1st Qu.:1266564 1st Qu.:363127 1st Qu.: 1.000 Class :character
## Median :1390455 Median :417656 Median : 1.000 Mode :character
## Mean :1386498 Mean :414741 Mean : 1.121
## 3rd Qu.:1514202 3rd Qu.:470231 3rd Qu.: 1.000
## Max. :1649593 Max. :527112 Max. :72.000
## unit_price date created_date state
## Min. : 1.0 Length:61742 Length:61742 Length:61742
## 1st Qu.: 29.0 Class :character Class :character Class :character
## Median : 70.0 Mode :character Mode :character Mode :character
## Mean : 240.2
## 3rd Qu.: 186.0
## Max. :8288.0
## total_paid name_en short_desc_en
## Min. : 0.0 Length:61742 Length:61742
## 1st Qu.: 57.0 Class :character Class :character
## Median : 139.0 Mode :character Mode :character
## Mean : 403.5
## 3rd Qu.: 389.0
## Max. :13388.0
## price salable stock_qty in_stock
## Min. : 3 Min. :0.0000 Min. :-25.00 Min. :0.0000
## 1st Qu.: 40 1st Qu.:1.0000 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 90 Median :1.0000 Median : 2.00 Median :1.0000
## Mean : 307642 Mean :0.7882 Mean : 10.55 Mean :0.5634
## 3rd Qu.: 249 3rd Qu.:1.0000 3rd Qu.: 6.00 3rd Qu.:1.0000
## Max. :115900092 Max. :1.0000 Max. :348.00 Max. :1.0000
## ProductId purchasable brand manual_categories
## Min. : 4 Min. :0.0000 Length:61742 Length:61742
## 1st Qu.: 4479 1st Qu.:1.0000 Class :character Class :character
## Median :14174 Median :1.0000 Mode :character Mode :character
## Mean :12238 Mean :0.7613
## 3rd Qu.:16839 3rd Qu.:1.0000
## Max. :22700 Max. :1.0000
dim(join_full)
## [1] 61742 19
## create column with unit price * quantity
join_full <- join_full %>%
mutate(Paid_per_product = unit_price * product_quantity)
Grouping by id_order, calculating difference in the total amount paid, keeping only orders with price diff/total paid < 0.3
diff_table <- join_full %>%
group_by(id_order) %>%
summarise(paid_per_order = sum(Paid_per_product),
total_paid = mean(total_paid)) %>%
mutate(diff_total_paid = abs(total_paid-paid_per_order),
ratio = diff_total_paid/total_paid) %>%
filter(ratio < 0.3)
## creating out of line_item a table that contains only relevant rows for id_order
final_table <- line_item[line_item$id_order %in% diff_table$id_order, ]
## excluding transactions of size 1
not1 <- final_table %>%
group_by(id_order) %>%
count(id_order)
not1 <- not1 %>% filter(n != 1)
transactions_not1 <- final_table[final_table$id_order %in%
not1$id_order, ]
Creating the transactional file
## exchanging sku with name of products
transactions_not1_name <- transactions_not1 %>%
left_join(products, by ="sku")
transactions_not1_name <- transactions_not1_name %>%
select(id_order,name_en)
## deleting missing values
sum(is.na(transactions_not1_name))
## [1] 130
transactions_not1_name <- na.omit(transactions_not1_name)
## taking only id_order and sku into the transactional file
transactional_file <- transactions_not1 %>% select(id_order, sku)
Investigating the data
## Getting to know the large transactions file
large_transactions_name <- large_transactions
large_transactions_name
## transactions in sparse format with
## 10321 transactions (rows) and
## 4206 items (columns)
inspect(head(large_transactions_name))
## items transactionID
## [1] {IFX0014,IFX0049,MOP0083,TUC0302} 246018
## [2] {APP0405,APP1208} 251302
## [3] {APP0432,APP1457,APP2487,APP2552,BOS0059} 251688
## [4] {APP1208,APP1459,ELA0004,LAC0221,LGE0038} 253220
## [5] {APP1916,APP2498} 253306
## [6] {ALL0004,APP2523,LAC0166,SAN0150,WOE0002} 253307
length(large_transactions_name)
## [1] 10321
size(head(large_transactions_name, 100))
## [1] 4 2 5 5 2 5 2 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 4 4 2 3 3 3 3 2 2 2 4 2 2
## [36] 2 4 2 2 3 4 2 2 4 2 5 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 2 4 2 2 2 2 4 2 3
## [71] 2 2 2 3 5 2 3 2 2 2 2 3 2 3 2 2 2 2 3 3 3 2 3 2 2 4 2 2 2 3
Plotting and imaging the large transactions file
itemFrequencyPlot(large_transactions_name)

itemFrequencyPlot(large_transactions_name,
topN = 15,
type = c("absolute"),
col = rainbow(20),
horiz = TRUE,
xlab = "Item Frequency, absolute values")

image(large_transactions_name)

Cross Table of items purchased together, also testing for Chi-Squared
cross_table <- crossTable(large_transactions_name,
sort = TRUE)
cross_table[1:5,1:5]
## APP0692 APP1184 SAM0068 APP1208 WDT0177
## APP0692 289 3 1 1 0
## APP1184 3 283 1 0 1
## SAM0068 1 1 214 0 1
## APP1208 1 0 0 210 1
## WDT0177 0 1 1 1 180
cross_table_chi <- crossTable(large_transactions_name, measure = "chi")
head(cross_table_chi[1:5])
## [1] NA 1.126517e-07 2.816292e-08 2.816292e-08 1.126517e-07
Applying the Apriori Algorithm
rules <- apriori(large_transactions_name,
parameter = list(supp = 0.001, # 5 rules
conf = 0.6,
target = "rules"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 10
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4206 item(s), 10321 transaction(s)] done [0.02s].
## sorting and recoding items ... [579 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [5 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules <- apriori(large_transactions_name,
parameter = list(supp = 0.0005, # 48 rules
conf = 0.5,
target = "rules"),
control = list())
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 5e-04 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 5
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4206 item(s), 10321 transaction(s)] done [0.02s].
## sorting and recoding items ... [1110 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.01s].
## writing ... [48 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
rules
## set of 48 rules
inspect(rules[1:10])
## lhs rhs support confidence lift count
## [1] {OWC0056} => {OWC0054} 0.0005813390 1.0000000 860.08333 6
## [2] {OWC0054} => {OWC0056} 0.0005813390 0.5000000 860.08333 6
## [3] {WAC0158} => {WAC0156} 0.0006782289 1.0000000 1474.42857 7
## [4] {WAC0156} => {WAC0158} 0.0006782289 1.0000000 1474.42857 7
## [5] {PRY0004} => {PRY0003} 0.0005813390 1.0000000 607.11765 6
## [6] {NES0009} => {NES0006} 0.0006782289 1.0000000 1032.10000 7
## [7] {NES0006} => {NES0009} 0.0006782289 0.7000000 1032.10000 7
## [8] {APP2125} => {APP1215} 0.0005813390 0.7500000 54.13112 6
## [9] {PAC2115} => {CAD0005} 0.0005813390 0.7500000 193.51875 6
## [10] {QNA0149} => {WDT0177} 0.0005813390 0.6666667 38.22593 6
summary(rules)
## set of 48 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 47 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.021 2.000 3.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.0005813 Min. :0.5000 Min. : 18.23 Min. : 6.000
## 1st Qu.:0.0005813 1st Qu.:0.5455 1st Qu.: 39.37 1st Qu.: 6.000
## Median :0.0006782 Median :0.6177 Median : 130.22 Median : 7.000
## Mean :0.0009346 Mean :0.6527 Mean : 306.02 Mean : 9.646
## 3rd Qu.:0.0009931 3rd Qu.:0.6942 3rd Qu.: 333.91 3rd Qu.:10.250
## Max. :0.0045538 Max. :1.0000 Max. :1474.43 Max. :47.000
##
## mining info:
## data ntransactions support confidence
## large_transactions_name 10321 5e-04 0.5
Removing redundant rules with is.redundant ()
redundant_rules <- is.redundant(rules)
summary(redundant_rules)
## Mode FALSE TRUE
## logical 47 1
rules <- rules[!redundant_rules]
length(rules)
## [1] 47
Removing redundant rules second option
redundant_rules <- which(colSums(is.subset(rules)) >1)
length(redundant_rules)
## [1] 14
rules <- rules[- redundant_rules]
length(rules)
## [1] 33
Checking on frequent items in the basket, confidence and lift parameters
frequentItems <- eclat (large_transactions_name,
parameter = list(supp = 0.01, maxlen = 15))
## Eclat
##
## parameter specification:
## tidLists support minlen maxlen target ext
## FALSE 0.01 1 15 frequent itemsets FALSE
##
## algorithmic control:
## sparse sort verbose
## 7 -2 TRUE
##
## Absolute minimum support count: 103
##
## create itemset ...
## set transactions ...[4206 item(s), 10321 transaction(s)] done [0.02s].
## sorting and recoding items ... [16 item(s)] done [0.00s].
## creating sparse bit matrix ... [16 row(s), 10321 column(s)] done [0.00s].
## writing ... [16 set(s)] done [0.00s].
## Creating S4 object ... done [0.00s].
inspect(sort(frequentItems,
decreasing = TRUE, by = "support")[1:5])
## items support count
## [1] {APP0692} 0.02800116 289
## [2] {APP1184} 0.02741982 283
## [3] {SAM0068} 0.02073442 214
## [4] {APP1208} 0.02034687 210
## [5] {WDT0177} 0.01744017 180
rules_conf <- sort(rules, by = "confidence", desc = TRUE)
inspect(rules_conf[1:10])
## lhs rhs support confidence lift count
## [1] {PRY0004} => {PRY0003} 0.0005813390 1.0000000 607.11765 6
## [2] {APP1803} => {APP1215} 0.0016471272 0.8947368 64.57748 17
## [3] {APP2113} => {APP1215} 0.0006782289 0.7777778 56.13598 7
## [4] {APP2125} => {APP1215} 0.0005813390 0.7500000 54.13112 6
## [5] {PAC2115} => {CAD0005} 0.0005813390 0.7500000 193.51875 6
## [6] {IFX0087} => {IFX0028} 0.0007751187 0.7272727 326.35573 8
## [7] {APP2486} => {APP1184} 0.0008720085 0.6923077 25.24844 9
## [8] {QNA0149} => {WDT0177} 0.0005813390 0.6666667 38.22593 6
## [9] {APP1575} => {LIBRO} 0.0007751187 0.6666667 140.42177 8
## [10] {APP1623} => {APP1215} 0.0005813390 0.6666667 48.11655 6
rules_lift <- sort(rules, by = "lift", desc = TRUE)
inspect(rules_lift[1:10])
## lhs rhs support confidence lift count
## [1] {PRY0004} => {PRY0003} 0.0005813390 1.0000000 607.1176 6
## [2] {SPH0014} => {SPH0016} 0.0005813390 0.5454545 331.1551 6
## [3] {SPH0014} => {SPH0015} 0.0006782289 0.6363636 328.3955 7
## [4] {IFX0087} => {IFX0028} 0.0007751187 0.7272727 326.3557 8
## [5] {ALL0011} => {ALL0002} 0.0006782289 0.5833333 200.6861 7
## [6] {PAC2115} => {CAD0005} 0.0005813390 0.7500000 193.5188 6
## [7] {SNS0019} => {SNS0014} 0.0007751187 0.5000000 191.1296 8
## [8] {APP1914} => {APP1041} 0.0011626780 0.5217391 163.1779 12
## [9] {APP1040} => {APP1565} 0.0009688984 0.5555556 147.0228 10
## [10] {APP1575} => {LIBRO} 0.0007751187 0.6666667 140.4218 8