Movies Market Basket Analysis with Apriori - A Customer Recommendation Engine

Dawn Daras, M.S.

##We are going to be building a movie recommendation engine using the Rotten Tomatoes Movie Recommendation Dataset

Movie Audience

knitr::opts_chunk$set

## function (...) 
## {
##     set2(resolve(...))
## }
## <bytecode: 0x59f65799fbb8>
## <environment: 0x59f657995198>

##Roadmap ##1) EDA’s - Exploratory Data Analysis of our data ##2) Preprocessing/Cleaning the Data ##3) Building an interactive table to explore the “Golden Age” of Movies ##4) Market Basket Analysis (Apriori)

library(dplyr) # for data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr) # for data manipulation
library(caret) # for sampling

## Loading required package: ggplot2

## Loading required package: lattice

library(caTools) # for train/test split
library(ggplot2) # for data visualization
library(corrplot) # for correlations

## corrplot 0.92 loaded

library(Rtsne) # for tsne plotting
library(ggplot2)
library(repr)
library(RColorBrewer)
library(DMwR)

## Loading required package: grid

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(tidyr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(arules)  ##specifically for apriori

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'arules'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz) ##specifically for apriori
library(lubridate)
library(vembedr)

## 
## Attaching package: 'vembedr'
## 
## The following object is masked from 'package:lubridate':
## 
##     hms

library(hrbrthemes)
library(rstatix)

## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter

library(formattable)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(xlsx)
library(arules)
library(arulesViz)
library(plyr)

## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:rstatix':
## 
##     desc, mutate
## 
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## The following object is masked from 'package:DMwR':
## 
##     join
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(IRdisplay)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## 
## The following object is masked from 'package:formattable':
## 
##     style
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(stringdist)

## 
## Attaching package: 'stringdist'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract

library(Matrix)

##Bringing in two files: Movies & Ratings

dfmovies <- read.csv("/cloud/project/movies.csv", header=TRUE, stringsAsFactors=FALSE)

dfratings <- read.csv("/cloud/project/ratings.csv",header=TRUE,stringsAsFactors=FALSE)

result <-merge(dfmovies, dfratings, by.x = "movieId", by.y = "movieId", all = TRUE)

head(result,5)

##   movieId            title                                      genres userId
## 1       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy      1
## 2       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    555
## 3       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    232
## 4       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    590
## 5       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    601
##   rating  timestamp
## 1    4.0  964982703
## 2    4.0  978746159
## 3    3.5 1076955621
## 4    4.0 1258420408
## 5    4.0 1521467801

# Number of Users who rated at least one movie:
print("Number of Unique Users")

## [1] "Number of Unique Users"

length(unique(result$userId))

## [1] 611

#Number of movies
print("Number of Unique Movies")

## [1] "Number of Unique Movies"

length(unique(result$movieId))

## [1] 9742

##Extracting movie release year into one column

# Split name column into title and year
df2 <- separate(data = result, col = title, into = c("Movie", "Year"), sep = "\\(" )

## Warning: Expected 2 pieces. Additional pieces discarded in 6782 rows [1424, 1425, 1426,
## 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439,
## 1440, 1441, 1442, 1443, ...].

## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 17 rows [81310, 81311,
## 99344, 99345, 99346, 99347, 99478, 99545, 99636, 99818, 100070, 100288, 100445,
## 100446, 100447, 100467, 100620].

df3 <- df2 %>% mutate(Year = as.numeric(gsub(")", "", Year)))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Year = as.numeric(gsub(")", "", Year))`.
## Caused by warning:
## ! NAs introduced by coercion

head(df3,2)

##   movieId      Movie Year                                      genres userId
## 1       1 Toy Story  1995 Adventure|Animation|Children|Comedy|Fantasy      1
## 2       1 Toy Story  1995 Adventure|Animation|Children|Comedy|Fantasy    555
##   rating timestamp
## 1      4 964982703
## 2      4 978746159

#convert UNIX timestamp to date using Lubridate

df3$timestamp <- as.POSIXct(df3$timestamp, origin = "1970-01-01")

##Quick EDAs on data to see what other cleaning must be done:

summary(df3)

##     movieId          Movie                Year         genres         
##  Min.   :     1   Length:100854      Min.   :   6   Length:100854     
##  1st Qu.:  1199   Class :character   1st Qu.:1990   Class :character  
##  Median :  2991   Mode  :character   Median :1997   Mode  :character  
##  Mean   : 19435                      Mean   :1995                     
##  3rd Qu.:  8128                      3rd Qu.:2003                     
##  Max.   :193609                      Max.   :2018                     
##                                      NA's   :6799                     
##      userId          rating        timestamp                     
##  Min.   :  1.0   Min.   :0.500   Min.   :1996-03-29 18:36:55.00  
##  1st Qu.:177.0   1st Qu.:3.000   1st Qu.:2002-04-18 09:57:46.00  
##  Median :325.0   Median :3.500   Median :2007-08-02 20:31:02.00  
##  Mean   :326.1   Mean   :3.502   Mean   :2008-03-19 17:01:27.36  
##  3rd Qu.:477.0   3rd Qu.:4.000   3rd Qu.:2015-07-04 07:15:44.50  
##  Max.   :610.0   Max.   :5.000   Max.   :2018-09-24 14:27:30.00  
##  NA's   :18      NA's   :18      NA's   :18

##Removing data with missing userIDs

MISSING <- is.na(df3$userId)|
          is.na(df3$timestamp) |
           is.na(df3$rating)|
          is.na(df3$Year)

sum(MISSING)

## [1] 6815

dfclean <- subset(df3, 
                      subset = !MISSING)

nrow(dfclean)

## [1] 94039

summary(dfclean)

##     movieId          Movie                Year         genres         
##  Min.   :     1   Length:94039       Min.   :   6   Length:94039      
##  1st Qu.:  1197   Class :character   1st Qu.:1990   Class :character  
##  Median :  2970   Mode  :character   Median :1997   Mode  :character  
##  Mean   : 19784                      Mean   :1995                     
##  3rd Qu.:  8360                      3rd Qu.:2003                     
##  Max.   :193609                      Max.   :2018                     
##      userId          rating        timestamp                     
##  Min.   :  1.0   Min.   :0.500   Min.   :1996-03-29 18:36:55.00  
##  1st Qu.:177.0   1st Qu.:3.000   1st Qu.:2001-12-30 06:34:33.50  
##  Median :325.0   Median :3.500   Median :2007-08-02 20:28:57.00  
##  Mean   :325.5   Mean   :3.485   Mean   :2008-03-10 17:44:45.02  
##  3rd Qu.:477.0   3rd Qu.:4.000   3rd Qu.:2015-07-04 07:09:53.50  
##  Max.   :610.0   Max.   :5.000   Max.   :2018-09-24 14:27:30.00

##Looking at the ratings to see the frequency counts

freqrat <- table(dfclean$rating)

print(freqrat)

## 
##   0.5     1   1.5     2   2.5     3   3.5     4   4.5     5 
##  1304  2686  1712  7228  5278 18992 12194 24872  7741 12032

##Looking at frequency of UserIDs ##create min, max, med, mode and look at outliers, remove any extremes do a histogram

hist(dfclean$userId,
  xlab = "UserId",
  main = "Histogram of UserID",
  breaks = sqrt(nrow(df3))
) # set number of bins

##The userIds that are above frequency counts of 1500 frequency are the anomalies will skew the recommendating engine. We need to remove them. We are going to create a frequency table to identify these anomalies

frequency <- freq_table(dfclean$userId)

##Now we are creating a dataframe of just the userId’s that have a frequency count > 1200 and printing a table of those

largeanomaly <- frequency %>% 
  filter(n > 1200)

print(largeanomaly)

## # A tibble: 6 × 3
##   group     n  prop
##   <int> <int> <dbl>
## 1    68  1209   1.3
## 2   274  1261   1.3
## 3   414  2574   2.7
## 4   448  1813   1.9
## 5   474  1945   2.1
## 6   599  2250   2.4

##Now we will drop the vars we don’t need rename group to “userId” and do an anti-join to remove the outliers from our dataset now that we have identified them

names(largeanomaly)[names(largeanomaly) == "group"] <- "userId"
largeanomaly = subset(largeanomaly, select = -c(n,prop) ) 

print(largeanomaly)

## # A tibble: 6 × 1
##   userId
##    <int>
## 1     68
## 2    274
## 3    414
## 4    448
## 5    474
## 6    599

##Here’s our anti-join

dfclean <- dfclean %>%
  anti_join(largeanomaly)

## Joining with `by = join_by(userId)`

##Let’s look at the histogram again - to see that we did it!!

hist(dfclean$userId,
  xlab = "UserId",
  main = "Histogram of UserID",
  breaks = sqrt(nrow(dfclean))
) # set number of bins

##Checking the range of Year to see if there is any cleaning up to do - yes the lowest Year is ‘6’

range(dfclean$Year)

## [1]    6 2018

##Round off ratings to single digit

dfclean$rating <- round(dfclean$rating)

ratingtable <-freq_table(dfclean$rating)
print(ratingtable)

## # A tibble: 6 × 3
##   group     n  prop
##   <dbl> <int> <dbl>
## 1     0  1220   1.5
## 2     1  2419   2.9
## 3     2 10967  13.2
## 4     3 16469  19.8
## 5     4 40315  48.6
## 6     5 11597  14

##Creating a dataframe of just the outlying ‘6’ Year

##Finding Year 6
out <-dfclean %>%  filter(Year=='6')

##Antijoin to remove the outlier Year

##Antijoin to remove #6

clean <- dfclean %>%
  anti_join(out)

## Joining with `by = join_by(movieId, Movie, Year, genres, userId, rating,
## timestamp)`

##Okay we removed it!

range(clean$Year)

## [1] 1903 2018

##We are going to save out some data for later by creating another dataframe:

modeldata <-clean

##Preparing data to build table and visualizations for 1960’s to 1980’s or “The Golden Age” of Film: ##removing movies with years out of range or older

golden <-clean[clean$Year > 1959 &  clean$Year < 1981, ]

range(golden$Year)

## [1] 1960 1980

##Checking to see how many unique users and unique movies we have in our dataset

# Number of Users who rated at least one movie:
print("Number of Unique Users")

## [1] "Number of Unique Users"

length(unique(golden$userId))

## [1] 487

#Number of movies
print("Number of Unique Movies")

## [1] "Number of Unique Movies"

length(unique(golden$movieId))

## [1] 723

##Building a custom table

##Cleaning genres to first three descriptors

golden$new <-unlist(lapply(strsplit(golden$genres, '|', fixed = TRUE), '[', 2))

##Checking our frequencies by Genre for our Golden Years dataset

New <-freq_table(golden$new)

print(New)

## # A tibble: 17 × 3
##    group           n  prop
##    <chr>       <int> <dbl>
##  1 Adventure     984  16.7
##  2 Animation      92   1.6
##  3 Children      249   4.2
##  4 Comedy        599  10.2
##  5 Crime         237   4  
##  6 Documentary     2   0  
##  7 Drama        1618  27.5
##  8 Fantasy       132   2.2
##  9 Film-Noir      59   1  
## 10 Horror        320   5.4
## 11 Musical       151   2.6
## 12 Mystery       184   3.1
## 13 Romance       242   4.1
## 14 Sci-Fi        466   7.9
## 15 Thriller      179   3  
## 16 War           213   3.6
## 17 Western       163   2.8

##Displaying it in a Histogram

counts <- table(golden$new)
barplot(counts, main="Ratings by Genre",
   col="hot pink",las=2)

##Average ratings by Year for our Golden Years Dataset ##Building the aggregate dataset of number of ratings per month

ratings <- aggregate(golden$rating, by = list(golden$Year), FUN = sum)

head(ratings)

##   Group.1    x
## 1    1960  724
## 2    1961  536
## 3    1962  855
## 4    1963  822
## 5    1964 1124
## 6    1965  605

#Rename Columns

colnames(ratings)[1] <- "Year"      
colnames(ratings)[2] <- "Count"  
head(ratings, 5)

##   Year Count
## 1 1960   724
## 2 1961   536
## 3 1962   855
## 4 1963   822
## 5 1964  1124

##Trend Line

ratings %>%
  ggplot(aes(x = Year, y = Count, group = 1)) +
  geom_line(aes(col = 'hotpink', las = 2,))

## Warning in geom_line(aes(col = "hotpink", las = 2, )): Ignoring unknown
## aesthetics: las

  theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1))

## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

##Top rated movies ##Aggregate movies by movie, rating, year, genre #Rename new to genre

names(golden)[8] <- "Genre"

nrow(distinct(golden))

## [1] 7251

##Sample a table of movies where ratings are 4 and >

df4 <- filter(golden, rating > 3)

second <-aggregate(df4$rating, by = list(df4$Movie, df4$Year, df4$Genre), FUN = mean)

#rename the columns before building our table

names(second)[1] <- "Movie"
names(second)[2] <- "Year"
names(second)[3] <- "Genre"
names(second)[4] <- "Avg Rating"

##Round Avg Rating to two digits

second$`Avg Rating`<- round(second$`Avg Rating`, digits = 2)

library(data.table)
library(DT)

##Build an interactable data table from the new aggregate table we have built for movies from the Golden Age, rated 4 -5

datatable(second,extensions = 'Buttons',
options = list(dom='Bfrtip',
buttons=c('copy', 'csv', 'excel', 'print', 'pdf')))

##Now let’s build our Market Basket - Apriori Model ##For this we are going back to the data before we split the genres ##pulling the model data - dropping all the vars except userId and movieId

data2 = subset(modeldata, select = -c(Movie,Year,genres,rating,timestamp) ) 

head(data2,5)

##   movieId userId
## 1       1      1
## 2       1    555
## 3       1    232
## 4       1    590
## 5       1    601

dim(data2)[1]

## [1] 82986

##Now we have to translate our dataframe into a spare User-Item matrix focused on the transactions

#convert rating-per-row dataframe into sparse User-Item matrix
user_item_matrix <- as(split(data2[,"movieId"],data2[,"userId"]), "transactions")

#investigate the User-Item matrix
#transactions (rows) -> number of raters
#items (columns) -> number of movies
user_item_matrix

## transactions in sparse format with
##  604 transactions (rows) and
##  7618 items (columns)

##Setting the parameters for the Apriori algotrithm

rule_param = list(
    supp = 0.001,
    conf = 0.7,
    maxlen = 2
)

##Running the algorithm

assoc_rules = apriori(user_item_matrix,parameter = rule_param)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##       2  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 0 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7618 item(s), 604 transaction(s)] done [0.04s].
## sorting and recoding items ... [7618 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2

## Warning in apriori(user_item_matrix, parameter = rule_param): Mining stopped
## (maxlen reached). Only patterns up to a length of 2 returned!

##  done [0.18s].
## writing ... [1681728 rule(s)] done [0.29s].
## creating S4 object  ... done [0.83s].

##Printing out the association rules

summary(assoc_rules)

## set of 1681728 rules
## 
## rule length distribution (lhs + rhs):sizes
##       2 
## 1681728 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence       coverage             lift        
##  Min.   :0.001656   Min.   :0.700   Min.   :0.001656   Min.   :  1.309  
##  1st Qu.:0.001656   1st Qu.:1.000   1st Qu.:0.001656   1st Qu.:  9.015  
##  Median :0.001656   Median :1.000   Median :0.001656   Median : 22.370  
##  Mean   :0.002763   Mean   :0.983   Mean   :0.003045   Mean   : 95.256  
##  3rd Qu.:0.001656   3rd Qu.:1.000   3rd Qu.:0.001656   3rd Qu.: 86.286  
##  Max.   :0.374172   Max.   :1.000   Max.   :0.516556   Max.   :604.000  
##      count        
##  Min.   :  1.000  
##  1st Qu.:  1.000  
##  Median :  1.000  
##  Mean   :  1.669  
##  3rd Qu.:  1.000  
##  Max.   :226.000  
## 
## mining info:
##              data ntransactions support confidence
##  user_item_matrix           604   0.001        0.7
##                                                      call
##  apriori(data = user_item_matrix, parameter = rule_param)

##Because there are so many rules we are going to only chose those whose lift have exceeded the 75th percentile, or in this case >= 86.286

assoc_rules = subset(assoc_rules, lift >= 86.286)

summary(assoc_rules)

## set of 390205 rules
## 
## rule length distribution (lhs + rhs):sizes
##      2 
## 390205 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence        coverage             lift      
##  Min.   :0.001656   Min.   :0.7500   Min.   :0.001656   Min.   : 90.6  
##  1st Qu.:0.001656   1st Qu.:1.0000   1st Qu.:0.001656   1st Qu.:151.0  
##  Median :0.001656   Median :1.0000   Median :0.001656   Median :302.0  
##  Mean   :0.001688   Mean   :0.9997   Mean   :0.001690   Mean   :334.4  
##  3rd Qu.:0.001656   3rd Qu.:1.0000   3rd Qu.:0.001656   3rd Qu.:604.0  
##  Max.   :0.009934   Max.   :1.0000   Max.   :0.009934   Max.   :604.0  
##      count     
##  Min.   :1.00  
##  1st Qu.:1.00  
##  Median :1.00  
##  Mean   :1.02  
##  3rd Qu.:1.00  
##  Max.   :6.00  
## 
## mining info:
##              data ntransactions support confidence
##  user_item_matrix           604   0.001        0.7
##                                                      call
##  apriori(data = user_item_matrix, parameter = rule_param)

##Putting our association rules into a data frame

assoc_rules = as(assoc_rules,"data.frame")

head(assoc_rules)

##                rules     support confidence    coverage     lift count
## 1  {26555} => {3404} 0.001655629          1 0.001655629 100.6667     1
## 2  {26555} => {4387} 0.001655629          1 0.001655629 100.6667     1
## 15   {2493} => {389} 0.001655629          1 0.001655629 302.0000     1
## 16  {2493} => {2390} 0.001655629          1 0.001655629 100.6667     1
## 17  {2493} => {1365} 0.001655629          1 0.001655629 100.6667     1
## 32     {626} => {40} 0.001655629          1 0.001655629 302.0000     1

##Now we have to break apart the rules into separate columns - or right-hand and left-hand rules so we can re-attach the movie names and make the rule-sets useful or consummable at the business level

rules = sapply(assoc_rules$rules,function(x){
    x = gsub("[\\{\\}]", "", regmatches(x, gregexpr("\\{.*\\}", x))[[1]])
    x = gsub("=>",",",x)
    x = str_replace_all(x," ","")
    return( x )
})

rules = as.character(rules)
rules = str_split(rules,",")

assoc_rules$lhs_movie = sapply( rules, "[[", 1)
assoc_rules$rhs_movie = sapply( rules , "[[", 2)

assoc_rules$rules = NULL
rm(rules)
gc()

##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  3426271 183.0    7364918 393.4  7364918 393.4
## Vcells 12058915  92.1   42729456 326.0 42625524 325.3

assoc_rules$lhs_movie = as.numeric(assoc_rules$lhs_movie)
assoc_rules$rhs_movie = as.numeric(assoc_rules$rhs_movie)

##Preparing data to create a dataset to extract rules

movienames = subset(clean, select = -c(userId,Year,timestamp,genres,rating) ) 

movienames <-unique(movienames)

##Joinging with a moviename only dataset to extract movie names

assoc_rules = assoc_rules %>% left_join(modeldata,by=c("lhs_movie" = "movieId") )

## Warning in left_join(., modeldata, by = c(lhs_movie = "movieId")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1005 of `x` matches multiple rows in `y`.
## ℹ Row 64660 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

assoc_rules <-left_join(assoc_rules, movienames, by=c('rhs_movie'='movieId'))

##Renaming movie columns

colnames(assoc_rules)[8] <- "left.title"      
colnames(assoc_rules)[14] <- "right.title"  
head(assoc_rules, 5)

##       support confidence    coverage     lift count lhs_movie rhs_movie
## 1 0.001655629          1 0.001655629 100.6667     1     26555      3404
## 2 0.001655629          1 0.001655629 100.6667     1     26555      4387
## 3 0.001655629          1 0.001655629 302.0000     1      2493       389
## 4 0.001655629          1 0.001655629 100.6667     1      2493      2390
## 5 0.001655629          1 0.001655629 100.6667     1      2493      1365
##         left.title Year genres userId rating           timestamp
## 1   Spies Like Us  1985 Comedy    320      4 2010-09-12 00:25:03
## 2   Spies Like Us  1985 Comedy    320      4 2010-09-12 00:25:03
## 3 Harmonists, The  1997  Drama    467      4 1999-02-22 08:05:53
## 4 Harmonists, The  1997  Drama    467      4 1999-02-22 08:05:53
## 5 Harmonists, The  1997  Drama    467      4 1999-02-22 08:05:53
##            right.title
## 1             Titanic 
## 2  Kiss of the Dragon 
## 3 Colonel Chabert, Le 
## 4        Little Voice 
## 5            Ridicule

##Now look at the rules we mined - we mined the top rules with the highest lift

assoc_rules %>% arrange(desc(lift)) %>% select(left.title,right.title,support,confidence,lift) %>% head()

##        left.title     right.title     support confidence lift
## 1           Okja  The Red Turtle  0.001655629          1  604
## 2 The Red Turtle            Okja  0.001655629          1  604
## 3           Okja        Lemonade  0.001655629          1  604
## 4       Lemonade            Okja  0.001655629          1  604
## 5 The Red Turtle        Lemonade  0.001655629          1  604
## 6       Lemonade  The Red Turtle  0.001655629          1  604

##We do a naive filter here. Results with a number on both sides or similar opening string is removed,

assoc_rules = assoc_rules %>% 
    filter( ! (grepl("[0-9]",left.title,perl = TRUE) &  grepl("[0-9]",right.title,perl = TRUE) ) ) %>%
    filter( ! (grepl("Lemonade",left.title,perl = TRUE) &  grepl("Lemonade",right.title,perl = TRUE) ) ) %>%
    filter( substr( left.title,start = 1,stop = min(5,str_length(left.title),str_length(right.title)) ) != substr( right.title,start = 1,stop = min(5,str_length(left.title),str_length(right.title)) ) ) %>%
    arrange(desc(lift))

head(assoc_rules %>% select(left.title,right.title,support,confidence,lift),10)

##             left.title         right.title     support confidence lift
## 1                Okja      The Red Turtle  0.001655629          1  604
## 2      The Red Turtle                Okja  0.001655629          1  604
## 3                Okja            Lemonade  0.001655629          1  604
## 4            Lemonade                Okja  0.001655629          1  604
## 5      The Red Turtle            Lemonade  0.001655629          1  604
## 6            Lemonade      The Red Turtle  0.001655629          1  604
## 7  Too Late for Tears             Villain  0.001655629          1  604
## 8             Villain  Too Late for Tears  0.001655629          1  604
## 9      Lifeguard, The     One I Love, The  0.001655629          1  604
## 10    One I Love, The      Lifeguard, The  0.001655629          1  604

##Lastly, we can use association rules to recommend a potential movie. Tomb Raider is fun – let’s see what movies based we could explore based on it.

assoc_rules %>% 
    filter(str_detect(left.title,"Tomb Raider") | str_detect(right.title,"Tomb Raider")) %>%
    select(left.title,right.title,support,confidence,lift) %>%
    head(20)

##                        left.title  right.title     support confidence lift
## 1                   Wonder Wheel  Tomb Raider  0.001655629          1  151
## 2                      Lady Bird  Tomb Raider  0.001655629          1  151
## 3                       The Wait  Tomb Raider  0.001655629          1  151
## 4                     Disconnect  Tomb Raider  0.001655629          1  151
## 5                         Güeros  Tomb Raider  0.001655629          1  151
## 6          There Will Come a Day  Tomb Raider  0.001655629          1  151
## 7                  Way Back, The  Tomb Raider  0.001655629          1  151
## 8                      Self/less  Tomb Raider  0.001655629          1  151
## 9                          Alpha  Tomb Raider  0.001655629          1  151
## 10                    Game Night  Tomb Raider  0.001655629          1  151
## 11                     The Mummy  Tomb Raider  0.001655629          1  151
## 12 The Star Wars Holiday Special  Tomb Raider  0.001655629          1  151
## 13                          Fuzz  Tomb Raider  0.001655629          1  151
## 14             Cannonball Run II  Tomb Raider  0.001655629          1  151
## 15                  Fire and Ice  Tomb Raider  0.001655629          1  151
## 16       Humanoids from the Deep  Tomb Raider  0.001655629          1  151
## 17                     Delirious  Tomb Raider  0.001655629          1  151
## 18          Happy Birthday to Me  Tomb Raider  0.001655629          1  151
## 19        Company of Wolves, The  Tomb Raider  0.001655629          1  151
## 20                        Squirm  Tomb Raider  0.001655629          1  151

embed_url("https://youtu.be/Hby8GJscdho?si=mYKPG5uM0l-2-y5h")

Movies Market Basket Analysis with Apriori - A Customer Recommendation Engine

Dawn Daras MS

2024-06-11