Loading Data and Libraries

### Connecting data base and checking data

library(DBI)
library(RMySQL)
library(dplyr)
library(ggplot2)
library(arules)


# 2. Connecting to db
db_user <- 'data_student_berlin'
db_password <- 'waai_has_shitty_internet'
db_name <- 'pricehub'
db_host <- '34.89.228.59' # for local access
db_port <- 3306
mydb <-  dbConnect(MySQL(), user = db_user, password = db_password,
                   dbname = db_name, host = db_host, port = db_port)

# 3. Read data from db
db_table <- 'line_item'
s <- paste0("select * from ", db_table)
rs <- dbSendQuery(mydb, s)
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 4 imported as
## numeric
line_item <-  fetch(rs, n = -1)

##Import df 'orders'
db_table <- 'orders'
s <- paste0("select * from ", db_table)
rs <- dbSendQuery(mydb, s)
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported
## as numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 3 imported as
## numeric
orders <-  fetch(rs, n = -1)

##Import df 'products'
db_table <- 'products'
s <- paste0("select * from ", db_table)
rs <- dbSendQuery(mydb, s)
## Warning in .local(conn, statement, ...): Decimal MySQL column 3 imported as
## numeric
products <-  fetch(rs, n = -1)
on.exit(dbDisconnect(mydb))
## Warning: Closing open result sets

Data Cleaning and filtering

as.factor(orders$state)
## joining line item and orders

join1 <- inner_join(line_item, orders, by="id_order")

head(join1)
##        id id_order product_quantity     sku unit_price                date
## 1 1119109   299539                1 OTT0127         19 2017-01-01 00:07:19
## 2 1119110   299540                1 LGE0037        399 2017-01-01 00:19:45
## 3 1119111   299541                1 PAR0065        474 2017-01-01 00:20:57
## 4 1119112   299542                1 WDT0309         68 2017-01-01 00:51:40
## 5 1119113   299543                1 JBL0098         24 2017-01-01 01:06:38
## 6 1119115   299544                1 APP1576       1138 2017-01-01 01:17:21
##          created_date           state total_paid
## 1 2017-01-01 00:07:19 Shopping basket         19
## 2 2017-01-01 00:19:45 Shopping basket        399
## 3 2017-01-01 00:20:57 Shopping basket        474
## 4 2017-01-01 00:51:40 Shopping basket         68
## 5 2017-01-01 01:06:38 Shopping basket         24
## 6 2017-01-01 01:17:21 Shopping basket       1138
## joining previous with products

join_full <- inner_join(join1, products, by="sku")


## filtering out non completed orders

join_full <- join_full %>%  
              filter(join_full$state == "Completed")

summary(join_full)
##        id             id_order      product_quantity     sku           
##  Min.   :1119116   Min.   :241423   Min.   : 1.000   Length:61742      
##  1st Qu.:1266564   1st Qu.:363127   1st Qu.: 1.000   Class :character  
##  Median :1390455   Median :417656   Median : 1.000   Mode  :character  
##  Mean   :1386498   Mean   :414741   Mean   : 1.121                     
##  3rd Qu.:1514202   3rd Qu.:470231   3rd Qu.: 1.000                     
##  Max.   :1649593   Max.   :527112   Max.   :72.000                     
##    unit_price         date           created_date          state          
##  Min.   :   1.0   Length:61742       Length:61742       Length:61742      
##  1st Qu.:  29.0   Class :character   Class :character   Class :character  
##  Median :  70.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 240.2                                                           
##  3rd Qu.: 186.0                                                           
##  Max.   :8288.0                                                           
##    total_paid        name_en          short_desc_en     
##  Min.   :    0.0   Length:61742       Length:61742      
##  1st Qu.:   57.0   Class :character   Class :character  
##  Median :  139.0   Mode  :character   Mode  :character  
##  Mean   :  403.5                                        
##  3rd Qu.:  389.0                                        
##  Max.   :13388.0                                        
##      price              salable         stock_qty         in_stock     
##  Min.   :        3   Min.   :0.0000   Min.   :-25.00   Min.   :0.0000  
##  1st Qu.:       40   1st Qu.:1.0000   1st Qu.:  0.00   1st Qu.:0.0000  
##  Median :       90   Median :1.0000   Median :  2.00   Median :1.0000  
##  Mean   :   307642   Mean   :0.7882   Mean   : 10.55   Mean   :0.5634  
##  3rd Qu.:      249   3rd Qu.:1.0000   3rd Qu.:  6.00   3rd Qu.:1.0000  
##  Max.   :115900092   Max.   :1.0000   Max.   :348.00   Max.   :1.0000  
##    ProductId      purchasable        brand           manual_categories 
##  Min.   :    4   Min.   :0.0000   Length:61742       Length:61742      
##  1st Qu.: 4479   1st Qu.:1.0000   Class :character   Class :character  
##  Median :14174   Median :1.0000   Mode  :character   Mode  :character  
##  Mean   :12238   Mean   :0.7613                                        
##  3rd Qu.:16839   3rd Qu.:1.0000                                        
##  Max.   :22700   Max.   :1.0000
dim(join_full)
## [1] 61742    19
## create column with unit price * quantity

join_full <- join_full %>% 
  mutate(Paid_per_product = unit_price * product_quantity)

Grouping by id_order, calculating difference in the total amount paid, keeping only orders with price diff/total paid < 0.3

diff_table <- join_full %>% 
  group_by(id_order) %>%
  summarise(paid_per_order = sum(Paid_per_product),
            total_paid = mean(total_paid)) %>% 
  mutate(diff_total_paid = abs(total_paid-paid_per_order),
         ratio = diff_total_paid/total_paid) %>% 
  filter(ratio < 0.3)


## creating out of line_item a table that contains only relevant rows for id_order

final_table <- line_item[line_item$id_order %in% diff_table$id_order, ]


## excluding transactions of size 1

not1 <- final_table %>% 
  group_by(id_order) %>% 
  count(id_order)


not1 <- not1 %>% filter(n != 1)

transactions_not1 <- final_table[final_table$id_order %in% 
                                          not1$id_order, ]

Creating the transactional file

## exchanging sku with name of products

transactions_not1_name <- transactions_not1 %>% 
        left_join(products, by ="sku")


transactions_not1_name <- transactions_not1_name %>% 
        select(id_order,name_en)


## deleting missing values

sum(is.na(transactions_not1_name))
## [1] 130
transactions_not1_name <- na.omit(transactions_not1_name)


## taking only id_order and sku into the transactional file

transactional_file <- transactions_not1 %>% select(id_order, sku)

Transforming transactional file using read.transactions

write.csv(transactional_file, 
          file = "transactional_file.csv",
          row.names=FALSE)


large_transactions <- read.transactions(
  "transactional_file.csv",
  format = "single",
  cols = c(1,2),
  header = TRUE,
  sep = ","
)

large_transactions
## transactions in sparse format with
##  10321 transactions (rows) and
##  4206 items (columns)
inspect(head(large_transactions))
##     items                                     transactionID
## [1] {IFX0014,IFX0049,MOP0083,TUC0302}         246018       
## [2] {APP0405,APP1208}                         251302       
## [3] {APP0432,APP1457,APP2487,APP2552,BOS0059} 251688       
## [4] {APP1208,APP1459,ELA0004,LAC0221,LGE0038} 253220       
## [5] {APP1916,APP2498}                         253306       
## [6] {ALL0004,APP2523,LAC0166,SAN0150,WOE0002} 253307

Investigating the data

## Getting to know the large transactions file

large_transactions_name <- large_transactions

large_transactions_name
## transactions in sparse format with
##  10321 transactions (rows) and
##  4206 items (columns)
inspect(head(large_transactions_name))
##     items                                     transactionID
## [1] {IFX0014,IFX0049,MOP0083,TUC0302}         246018       
## [2] {APP0405,APP1208}                         251302       
## [3] {APP0432,APP1457,APP2487,APP2552,BOS0059} 251688       
## [4] {APP1208,APP1459,ELA0004,LAC0221,LGE0038} 253220       
## [5] {APP1916,APP2498}                         253306       
## [6] {ALL0004,APP2523,LAC0166,SAN0150,WOE0002} 253307
length(large_transactions_name)
## [1] 10321
size(head(large_transactions_name, 100))
##   [1] 4 2 5 5 2 5 2 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 4 4 2 3 3 3 3 2 2 2 4 2 2
##  [36] 2 4 2 2 3 4 2 2 4 2 5 2 2 2 2 2 2 2 2 2 2 2 2 3 3 2 2 4 2 2 2 2 4 2 3
##  [71] 2 2 2 3 5 2 3 2 2 2 2 3 2 3 2 2 2 2 3 3 3 2 3 2 2 4 2 2 2 3

Plotting and imaging the large transactions file

itemFrequencyPlot(large_transactions_name)

itemFrequencyPlot(large_transactions_name, 
                  topN = 15, 
                  type = c("absolute"), 
                  col = rainbow(20),
                  horiz = TRUE,
                  xlab = "Item Frequency, absolute values")

image(large_transactions_name)

Cross Table of items purchased together, also testing for Chi-Squared

cross_table <- crossTable(large_transactions_name, 
                          sort = TRUE)

cross_table[1:5,1:5]
##         APP0692 APP1184 SAM0068 APP1208 WDT0177
## APP0692     289       3       1       1       0
## APP1184       3     283       1       0       1
## SAM0068       1       1     214       0       1
## APP1208       1       0       0     210       1
## WDT0177       0       1       1       1     180
cross_table_chi <-  crossTable(large_transactions_name, measure = "chi")

head(cross_table_chi[1:5])
## [1]           NA 1.126517e-07 2.816292e-08 2.816292e-08 1.126517e-07

Applying the Apriori Algorithm

rules <- apriori(large_transactions_name, 
                 parameter = list(supp = 0.001,   # 5 rules
                                  conf = 0.6, 
                                  target = "rules"))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 10 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4206 item(s), 10321 transaction(s)] done [0.02s].
## sorting and recoding items ... [579 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [5 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
rules <- apriori(large_transactions_name, 
                 parameter = list(supp = 0.0005,   # 48 rules
                                  conf = 0.5, 
                                  target = "rules"),
                 control = list())
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5   5e-04      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 5 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4206 item(s), 10321 transaction(s)] done [0.02s].
## sorting and recoding items ... [1110 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.01s].
## writing ... [48 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
rules
## set of 48 rules
inspect(rules[1:10])
##      lhs          rhs       support      confidence lift       count
## [1]  {OWC0056} => {OWC0054} 0.0005813390 1.0000000   860.08333 6    
## [2]  {OWC0054} => {OWC0056} 0.0005813390 0.5000000   860.08333 6    
## [3]  {WAC0158} => {WAC0156} 0.0006782289 1.0000000  1474.42857 7    
## [4]  {WAC0156} => {WAC0158} 0.0006782289 1.0000000  1474.42857 7    
## [5]  {PRY0004} => {PRY0003} 0.0005813390 1.0000000   607.11765 6    
## [6]  {NES0009} => {NES0006} 0.0006782289 1.0000000  1032.10000 7    
## [7]  {NES0006} => {NES0009} 0.0006782289 0.7000000  1032.10000 7    
## [8]  {APP2125} => {APP1215} 0.0005813390 0.7500000    54.13112 6    
## [9]  {PAC2115} => {CAD0005} 0.0005813390 0.7500000   193.51875 6    
## [10] {QNA0149} => {WDT0177} 0.0005813390 0.6666667    38.22593 6
summary(rules)
## set of 48 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3 
## 47  1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.000   2.021   2.000   3.000 
## 
## summary of quality measures:
##     support            confidence          lift             count       
##  Min.   :0.0005813   Min.   :0.5000   Min.   :  18.23   Min.   : 6.000  
##  1st Qu.:0.0005813   1st Qu.:0.5455   1st Qu.:  39.37   1st Qu.: 6.000  
##  Median :0.0006782   Median :0.6177   Median : 130.22   Median : 7.000  
##  Mean   :0.0009346   Mean   :0.6527   Mean   : 306.02   Mean   : 9.646  
##  3rd Qu.:0.0009931   3rd Qu.:0.6942   3rd Qu.: 333.91   3rd Qu.:10.250  
##  Max.   :0.0045538   Max.   :1.0000   Max.   :1474.43   Max.   :47.000  
## 
## mining info:
##                     data ntransactions support confidence
##  large_transactions_name         10321   5e-04        0.5

Removing redundant rules with is.redundant ()

redundant_rules <- is.redundant(rules)
summary(redundant_rules)
##    Mode   FALSE    TRUE 
## logical      47       1
rules <- rules[!redundant_rules]
length(rules)
## [1] 47

Removing redundant rules second option

redundant_rules <- which(colSums(is.subset(rules)) >1)
length(redundant_rules)
## [1] 14
rules <- rules[- redundant_rules]
length(rules)
## [1] 33

Checking on frequent items in the basket, confidence and lift parameters

frequentItems <- eclat (large_transactions_name, 
                        parameter = list(supp = 0.01, maxlen = 15))
## Eclat
## 
## parameter specification:
##  tidLists support minlen maxlen            target   ext
##     FALSE    0.01      1     15 frequent itemsets FALSE
## 
## algorithmic control:
##  sparse sort verbose
##       7   -2    TRUE
## 
## Absolute minimum support count: 103 
## 
## create itemset ... 
## set transactions ...[4206 item(s), 10321 transaction(s)] done [0.02s].
## sorting and recoding items ... [16 item(s)] done [0.00s].
## creating sparse bit matrix ... [16 row(s), 10321 column(s)] done [0.00s].
## writing  ... [16 set(s)] done [0.00s].
## Creating S4 object  ... done [0.00s].
inspect(sort(frequentItems, 
             decreasing = TRUE, by = "support")[1:5])
##     items     support    count
## [1] {APP0692} 0.02800116 289  
## [2] {APP1184} 0.02741982 283  
## [3] {SAM0068} 0.02073442 214  
## [4] {APP1208} 0.02034687 210  
## [5] {WDT0177} 0.01744017 180
rules_conf <- sort(rules, by = "confidence", desc = TRUE)
inspect(rules_conf[1:10])
##      lhs          rhs       support      confidence lift      count
## [1]  {PRY0004} => {PRY0003} 0.0005813390 1.0000000  607.11765  6   
## [2]  {APP1803} => {APP1215} 0.0016471272 0.8947368   64.57748 17   
## [3]  {APP2113} => {APP1215} 0.0006782289 0.7777778   56.13598  7   
## [4]  {APP2125} => {APP1215} 0.0005813390 0.7500000   54.13112  6   
## [5]  {PAC2115} => {CAD0005} 0.0005813390 0.7500000  193.51875  6   
## [6]  {IFX0087} => {IFX0028} 0.0007751187 0.7272727  326.35573  8   
## [7]  {APP2486} => {APP1184} 0.0008720085 0.6923077   25.24844  9   
## [8]  {QNA0149} => {WDT0177} 0.0005813390 0.6666667   38.22593  6   
## [9]  {APP1575} => {LIBRO}   0.0007751187 0.6666667  140.42177  8   
## [10] {APP1623} => {APP1215} 0.0005813390 0.6666667   48.11655  6
rules_lift <- sort(rules, by = "lift", desc = TRUE)
inspect(rules_lift[1:10])
##      lhs          rhs       support      confidence lift     count
## [1]  {PRY0004} => {PRY0003} 0.0005813390 1.0000000  607.1176  6   
## [2]  {SPH0014} => {SPH0016} 0.0005813390 0.5454545  331.1551  6   
## [3]  {SPH0014} => {SPH0015} 0.0006782289 0.6363636  328.3955  7   
## [4]  {IFX0087} => {IFX0028} 0.0007751187 0.7272727  326.3557  8   
## [5]  {ALL0011} => {ALL0002} 0.0006782289 0.5833333  200.6861  7   
## [6]  {PAC2115} => {CAD0005} 0.0005813390 0.7500000  193.5188  6   
## [7]  {SNS0019} => {SNS0014} 0.0007751187 0.5000000  191.1296  8   
## [8]  {APP1914} => {APP1041} 0.0011626780 0.5217391  163.1779 12   
## [9]  {APP1040} => {APP1565} 0.0009688984 0.5555556  147.0228 10   
## [10] {APP1575} => {LIBRO}   0.0007751187 0.6666667  140.4218  8