We are going to build a movie recommendation engine on the Rotten Tomatoes dataset using Market Basket Analysis (Apriori)

Our Roadmap

library(dplyr) # for data manipulation
library(stringr) # for data manipulation
library(caret) # for sampling
library(caTools) # for train/test split
library(ggplot2) # for data visualization
library(corrplot) # for correlations
library(Rtsne) # for tsne plotting
library(repr)
library(RColorBrewer)
library(factoextra)
library(tidyr)
library(tidyverse)
library(arules)  ##specifically for apriori
library(arulesViz) ##specifically for apriori
library(lubridate)
library(vembedr)
library(hrbrthemes)
library(rstatix)
library(formattable)
library(data.table)
library(xlsx)
library(plyr) # note: loading plyr after dplyr masks some dplyr verbs (e.g. summarise)
library(IRdisplay)
library(plotly)
library(stringdist)
library(Matrix)

Bringing in our data

dfmovies <- read.csv("/cloud/project/movies.csv", header=TRUE, stringsAsFactors=FALSE)
dfratings <- read.csv("/cloud/project/ratings.csv", header=TRUE, stringsAsFactors=FALSE)

Merging the dataframes into one (all = TRUE makes this a full outer join, so movies with no ratings will carry NAs)

result <- merge(dfmovies, dfratings, by = "movieId", all = TRUE)

head(result,5)
##   movieId            title                                      genres userId
## 1       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy      1
## 2       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    555
## 3       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    232
## 4       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    590
## 5       1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy    601
##   rating  timestamp
## 1    4.0  964982703
## 2    4.0  978746159
## 3    3.5 1076955621
## 4    4.0 1258420408
## 5    4.0 1521467801

Extracting the movie release year into a separate column

Split the title column into Movie and Year at the first opening parenthesis

library(tidyr)
df2 <- separate(data = result, col = title, into = c("Movie", "Year"), sep = "\\(" )
df3 <- df2 %>% mutate(Year = as.numeric(gsub(")", "", Year)))

head(df3,2)
##   movieId      Movie Year                                      genres userId
## 1       1 Toy Story  1995 Adventure|Animation|Children|Comedy|Fantasy      1
## 2       1 Toy Story  1995 Adventure|Animation|Children|Comedy|Fantasy    555
##   rating timestamp
## 1      4 964982703
## 2      4 978746159
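
Note that splitting on the first “(” will mangle any title that itself contains parentheses – this is likely where the stray Year value of 6 we remove later comes from. A more defensive sketch (assuming stringr is loaded; df3_alt is a hypothetical name) anchors on a trailing four-digit year instead:

# hedged alternative: pull the year only from a trailing "(YYYY)" pattern
df3_alt <- result %>%
  mutate(Year  = as.numeric(str_match(title, "\\((\\d{4})\\)\\s*$")[, 2]),
         Movie = str_trim(str_remove(title, "\\s*\\(\\d{4}\\)\\s*$")))

Titles without a trailing year simply get an NA and fall out with the other missing values below.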

Convert the UNIX timestamp to a date-time (base R's as.POSIXct; lubridate's as_datetime() would do the same)

df3$timestamp <- as.POSIXct(df3$timestamp, origin = "1970-01-01")

head(df3,3)
##   movieId      Movie Year                                      genres userId
## 1       1 Toy Story  1995 Adventure|Animation|Children|Comedy|Fantasy      1
## 2       1 Toy Story  1995 Adventure|Animation|Children|Comedy|Fantasy    555
## 3       1 Toy Story  1995 Adventure|Animation|Children|Comedy|Fantasy    232
##   rating           timestamp
## 1    4.0 2000-07-30 18:45:03
## 2    4.0 2001-01-06 01:55:59
## 3    3.5 2004-02-16 18:20:21

Quick EDAs & Preprocessing of Data

summary(df3)
##     movieId          Movie                Year         genres         
##  Min.   :     1   Length:100854      Min.   :   6   Length:100854     
##  1st Qu.:  1199   Class :character   1st Qu.:1990   Class :character  
##  Median :  2991   Mode  :character   Median :1997   Mode  :character  
##  Mean   : 19435                      Mean   :1995                     
##  3rd Qu.:  8128                      3rd Qu.:2003                     
##  Max.   :193609                      Max.   :2018                     
##                                      NA's   :6799                     
##      userId          rating        timestamp                     
##  Min.   :  1.0   Min.   :0.500   Min.   :1996-03-29 18:36:55.00  
##  1st Qu.:177.0   1st Qu.:3.000   1st Qu.:2002-04-18 09:57:46.00  
##  Median :325.0   Median :3.500   Median :2007-08-02 20:31:02.00  
##  Mean   :326.1   Mean   :3.502   Mean   :2008-03-19 17:01:27.36  
##  3rd Qu.:477.0   3rd Qu.:4.000   3rd Qu.:2015-07-04 07:15:44.50  
##  Max.   :610.0   Max.   :5.000   Max.   :2018-09-24 14:27:30.00  
##  NA's   :18      NA's   :18      NA's   :18

Remove rows with missing values

MISSING <- is.na(df3$userId) |
  is.na(df3$timestamp) |
  is.na(df3$rating) |
  is.na(df3$Year)
sum(MISSING)
## [1] 6815
dfclean <- subset(df3, subset = !MISSING)

Count the number of rows left

nrow(dfclean)
## [1] 94039

Look at frequency counts of ratings

freqrat <- table(dfclean$rating)

print(freqrat)
## 
##   0.5     1   1.5     2   2.5     3   3.5     4   4.5     5 
##  1304  2686  1712  7228  5278 18992 12194 24872  7741 12032

Looking for extremes in userId frequencies, to see whether users who rate very often might be skewing the data

hist(dfclean$userId,
  xlab = "UserId",
  main = "Histogram of UserID",
  breaks = sqrt(nrow(dfclean)) # set the number of bins
)

The userIds with very high rating counts (here, above 1,200 ratings) are anomalies that will skew the recommendation engine, so we need to remove them. We will build a frequency table to identify them.

frequency <- freq_table(dfclean,userId)

head(frequency,5)
## # A tibble: 5 × 3
##   userId     n  prop
##    <int> <int> <dbl>
## 1      1   222   0.2
## 2      2    29   0  
## 3      3    36   0  
## 4      4   186   0.2
## 5      5    41   0
largeanomaly <- frequency %>% 
  filter(n > 1200)

print(largeanomaly)
## # A tibble: 6 × 3
##   userId     n  prop
##    <int> <int> <dbl>
## 1     68  1209   1.3
## 2    274  1261   1.3
## 3    414  2574   2.7
## 4    448  1813   1.9
## 5    474  1945   2.1
## 6    599  2250   2.4
largeanomaly <- subset(largeanomaly, select = -c(n)) # drop the count column; keep userId and prop
head(largeanomaly,5)
## # A tibble: 5 × 2
##   userId  prop
##    <int> <dbl>
## 1     68   1.3
## 2    274   1.3
## 3    414   2.7
## 4    448   1.9
## 5    474   2.1
largeanomaly$userId<-as.integer(largeanomaly$userId)

Anti-join to remove the outliers

dfclean <- dfclean %>%
  anti_join(largeanomaly, by = "userId")

Looking at the histogram again

hist(dfclean$userId,
  xlab = "UserId",
  main = "Histogram of UserID",
  breaks = sqrt(nrow(dfclean))
) # set number of bins

Range of year

range(dfclean$Year)
## [1]    6 2018

Round ratings to the nearest whole number

dfclean$rating <- round(dfclean$rating)

head(dfclean$rating, 5)
## [1] 4 4 4 4 4
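
One caveat: R’s round() uses round-half-to-even, so the half-star ratings do not all round up – for example:

# round-half-to-even: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2, 3.5 -> 4, 4.5 -> 4
round(c(0.5, 1.5, 2.5, 3.5, 4.5))

If strict round-half-up behavior were wanted, floor(x + 0.5) is the usual workaround for positive values.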

Removing the outlier Year of “6” (an artifact of splitting titles on the first parenthesis)

out <-dfclean %>%  filter(Year=='6')

Anti-join to remove it

clean <- dfclean %>%
  anti_join(out)

Checking the Year range now

range(clean$Year)
## [1] 1903 2018

We are going to save a copy of this cleaned data for our Market Basket Analysis later

modeldata <-clean

Preparing data to build the table and visualizations for the 1960s to 1980s, or “The Golden Age” of Film: removing movies released outside that range

Building Interactive Table

golden <-clean[clean$Year > 1959 &  clean$Year < 1981, ]
range(golden$Year)
## [1] 1960 1980

Checking to see how many unique users and unique movies we have in our dataset

Number of Users who rated at least one movie & Number of movies

print("Number of Unique Users")
## [1] "Number of Unique Users"
length(unique(golden$userId))
## [1] 487
#Number of movies
print("Number of Unique Movies")
## [1] "Number of Unique Movies"
length(unique(golden$movieId))
## [1] 723

Building a custom table

Reducing each movie's genre list to a single descriptor

golden$new <- unlist(lapply(strsplit(golden$genres, '|', fixed = TRUE), '[', 2)) # take the second listed genre
counts <- table(golden$new)
barplot(counts, main="Ratings by Genre",
   col="violet",las=2)

Total rating score by Year for our Golden Years dataset

Building the aggregate dataset of summed rating values per year

ratings <- aggregate(golden$rating, by = list(golden$Year), FUN = sum)

head(ratings)
##   Group.1    x
## 1    1960  724
## 2    1961  536
## 3    1962  855
## 4    1963  822
## 5    1964 1124
## 6    1965  605
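
For reference, the same aggregation can be written with dplyr, which yields named columns in one step – a sketch (ratings_alt is a hypothetical name; the dplyr:: prefix guards against plyr masking summarise):

# equivalent dplyr aggregation with named columns
ratings_alt <- golden %>%
  group_by(Year) %>%
  dplyr::summarise(Count = sum(rating))

We keep the aggregate() version above, so the renaming step below still applies.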

Rename Columns

colnames(ratings)[1] <- "Year"      
colnames(ratings)[2] <- "Count"  
head(ratings, 5)
##   Year Count
## 1 1960   724
## 2 1961   536
## 3 1962   855
## 4 1963   822
## 5 1964  1124
# Libraries
library(ggplot2)
library(hrbrthemes)

Trend line

ggplot(ratings, aes(x = Year, y = Count, group = 1)) +
  geom_line(color = "#800080",    # Color of the line
            lwd = 1,      # Width of the line
            linetype = 1) 

Top rated movies: aggregate by movie, year, and genre. First, rename the new column to Genre

names(golden)[8] <- "Genre"

Subset to a table where ratings are 4 and above

df4 <- filter(golden, rating > 3)
second <-aggregate(df4$rating, by = list(df4$Movie, df4$Year, df4$Genre), FUN = mean)

Rename columns for our table

names(second)[1] <- "Movie"
names(second)[2] <- "Year"
names(second)[3] <- "Genre"
names(second)[4] <- "Avg Rating"

Avg ratings to 2 digits

second$`Avg Rating`<- round(second$`Avg Rating`, digits = 2)
library(DT) # datatable() comes from the DT package
datatable(second,extensions = 'Buttons',
options = list(dom='Bfrtip',
buttons=c('copy', 'csv', 'excel', 'print', 'pdf')))

Market Basket - Apriori Model

Now let’s build our Market Basket (Apriori) model. For this we go back to the model data we saved before splitting the genres, dropping all the variables except userId and movieId.

data2 = subset(modeldata, select = -c(Movie,Year,genres,rating,timestamp) ) 

head(data2,5)
##   movieId userId
## 1       1      1
## 2       1    555
## 3       1    232
## 4       1    590
## 5       1    601
dim(data2)[1]
## [1] 82986
library(arules)
library(dplyr)
library(stringr)
library(Matrix)
#convert rating-per-row dataframe into sparse User-Item matrix
user_item_matrix <- as(split(data2[,"movieId"],data2[,"userId"]), "transactions")

#investigate the User-Item matrix
#transactions (rows) -> number of raters
#items (columns) -> number of movies
user_item_matrix
## transactions in sparse format with
##  604 transactions (rows) and
##  7618 items (columns)
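
To see what the split() plus as(…, "transactions") conversion is doing, here is a toy version with hypothetical user and movie IDs:

# toy example: three users become three transactions; the items are the movies they rated
toy <- data.frame(userId = c(1, 1, 2, 2, 3), movieId = c(10, 20, 10, 30, 40))
inspect(as(split(toy$movieId, toy$userId), "transactions"))

Each row of the sparse matrix is one user’s “basket” of rated movies, which is exactly what Apriori mines.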

Setting the parameters for the Apriori

rule_param = list(
    supp = 0.001, # minimum support: share of user baskets containing the itemset
    conf = 0.7,   # minimum confidence required for a rule
    maxlen = 2    # rules of at most two items: one movie implies another
)
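
A quick sanity check on these settings – with 604 user baskets, the absolute support floor implied by supp = 0.001 is:

floor(0.001 * 604)
# 0 transactions, matching "Absolute minimum support count: 0" in the log below

So effectively no support floor applies, which is why the run produces over 1.6 million rules; maxlen = 2 at least keeps them to simple one-movie-implies-another pairs.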

Running the algorithm

assoc_rules = apriori(user_item_matrix,parameter = rule_param)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##       2  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 0 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7618 item(s), 604 transaction(s)] done [0.02s].
## sorting and recoding items ... [7618 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2
##  done [0.18s].
## writing ... [1681728 rule(s)] done [0.25s].
## creating S4 object  ... done [0.65s].

Printing out the Association Rules

summary(assoc_rules)
## set of 1681728 rules
## 
## rule length distribution (lhs + rhs):sizes
##       2 
## 1681728 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence       coverage             lift        
##  Min.   :0.001656   Min.   :0.700   Min.   :0.001656   Min.   :  1.309  
##  1st Qu.:0.001656   1st Qu.:1.000   1st Qu.:0.001656   1st Qu.:  9.015  
##  Median :0.001656   Median :1.000   Median :0.001656   Median : 22.370  
##  Mean   :0.002763   Mean   :0.983   Mean   :0.003045   Mean   : 95.256  
##  3rd Qu.:0.001656   3rd Qu.:1.000   3rd Qu.:0.001656   3rd Qu.: 86.286  
##  Max.   :0.374172   Max.   :1.000   Max.   :0.516556   Max.   :604.000  
##      count        
##  Min.   :  1.000  
##  1st Qu.:  1.000  
##  Median :  1.000  
##  Mean   :  1.669  
##  3rd Qu.:  1.000  
##  Max.   :226.000  
## 
## mining info:
##              data ntransactions support confidence
##  user_item_matrix           604   0.001        0.7
##                                                      call
##  apriori(data = user_item_matrix, parameter = rule_param)

Because there are so many rules, we are going to keep only those whose lift exceeds the 75th percentile, in this case >= 86.286.
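
Rather than reading the threshold off summary(), it can be computed directly from the rules object with arules’ quality() accessor – a small sketch (lift_cut is a hypothetical name):

# compute the 75th-percentile lift programmatically instead of hard-coding it
lift_cut <- quantile(quality(assoc_rules)$lift, probs = 0.75)

We use the hard-coded value below so the result matches the summary above.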

assoc_rules = subset(assoc_rules, lift >= 86.286)

summary(assoc_rules)
## set of 390205 rules
## 
## rule length distribution (lhs + rhs):sizes
##      2 
## 390205 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence        coverage             lift      
##  Min.   :0.001656   Min.   :0.7500   Min.   :0.001656   Min.   : 90.6  
##  1st Qu.:0.001656   1st Qu.:1.0000   1st Qu.:0.001656   1st Qu.:151.0  
##  Median :0.001656   Median :1.0000   Median :0.001656   Median :302.0  
##  Mean   :0.001688   Mean   :0.9997   Mean   :0.001690   Mean   :334.4  
##  3rd Qu.:0.001656   3rd Qu.:1.0000   3rd Qu.:0.001656   3rd Qu.:604.0  
##  Max.   :0.009934   Max.   :1.0000   Max.   :0.009934   Max.   :604.0  
##      count     
##  Min.   :1.00  
##  1st Qu.:1.00  
##  Median :1.00  
##  Mean   :1.02  
##  3rd Qu.:1.00  
##  Max.   :6.00  
## 
## mining info:
##              data ntransactions support confidence
##  user_item_matrix           604   0.001        0.7
##                                                      call
##  apriori(data = user_item_matrix, parameter = rule_param)

Putting our association rules into a dataframe

assoc_rules = as(assoc_rules,"data.frame")

head(assoc_rules)
##                rules     support confidence    coverage     lift count
## 1  {26555} => {3404} 0.001655629          1 0.001655629 100.6667     1
## 2  {26555} => {4387} 0.001655629          1 0.001655629 100.6667     1
## 15   {2493} => {389} 0.001655629          1 0.001655629 302.0000     1
## 16  {2493} => {2390} 0.001655629          1 0.001655629 100.6667     1
## 17  {2493} => {1365} 0.001655629          1 0.001655629 100.6667     1
## 32     {626} => {40} 0.001655629          1 0.001655629 302.0000     1

Now we break the rules apart into separate left-hand-side and right-hand-side columns so we can re-attach the movie names and make the rule sets consumable at the business level.

rules = sapply(assoc_rules$rules, function(x){
    # "{a} => {b}" -> "a,b": strip the braces, turn "=>" into a comma, drop spaces
    x = gsub("[\\{\\}]", "", regmatches(x, gregexpr("\\{.*\\}", x))[[1]])
    x = gsub("=>", ",", x)
    x = str_replace_all(x, " ", "")
    return(x)
})

rules = as.character(rules)
rules = str_split(rules,",")

assoc_rules$lhs_movie = sapply( rules, "[[", 1)
assoc_rules$rhs_movie = sapply( rules , "[[", 2)

assoc_rules$rules = NULL
rm(rules)
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  3134835 167.5    6838394 365.3  6838394 365.3
## Vcells 11524252  88.0   41451706 316.3 41443217 316.2
assoc_rules$lhs_movie = as.numeric(assoc_rules$lhs_movie)
assoc_rules$rhs_movie = as.numeric(assoc_rules$rhs_movie)
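
As an aside, had we kept the rules object around (i.e., before the as(…, "data.frame") conversion above), arules’ lhs() and rhs() accessors would give the same two columns without the regex round-trip – a sketch, where rules_object stands in for the pre-conversion rules:

# hypothetical alternative on the rules object itself; labels() returns strings like "{26555}"
lhs_ids <- as.numeric(gsub("[{}]", "", labels(lhs(rules_object))))
rhs_ids <- as.numeric(gsub("[{}]", "", labels(rhs(rules_object))))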

Preparing a movie-name lookup table so we can attach titles to the rules

movienames = subset(clean, select = -c(userId,Year,timestamp,genres,rating) ) 

movienames <-unique(movienames)
assoc_rules = assoc_rules %>% left_join(modeldata,by=c("lhs_movie" = "movieId") )
assoc_rules <-left_join(assoc_rules, movienames, by=c('rhs_movie'='movieId'))

Renaming Movie Columns

colnames(assoc_rules)[8] <- "left.title"      
colnames(assoc_rules)[14] <- "right.title"  
head(assoc_rules, 5)
##       support confidence    coverage     lift count lhs_movie rhs_movie
## 1 0.001655629          1 0.001655629 100.6667     1     26555      3404
## 2 0.001655629          1 0.001655629 100.6667     1     26555      4387
## 3 0.001655629          1 0.001655629 302.0000     1      2493       389
## 4 0.001655629          1 0.001655629 100.6667     1      2493      2390
## 5 0.001655629          1 0.001655629 100.6667     1      2493      1365
##         left.title Year genres userId rating           timestamp
## 1   Spies Like Us  1985 Comedy    320      4 2010-09-12 00:25:03
## 2   Spies Like Us  1985 Comedy    320      4 2010-09-12 00:25:03
## 3 Harmonists, The  1997  Drama    467      4 1999-02-22 08:05:53
## 4 Harmonists, The  1997  Drama    467      4 1999-02-22 08:05:53
## 5 Harmonists, The  1997  Drama    467      4 1999-02-22 08:05:53
##            right.title
## 1             Titanic 
## 2  Kiss of the Dragon 
## 3 Colonel Chabert, Le 
## 4        Little Voice 
## 5            Ridicule

Now look at the rules we mined, starting with those that have the highest lift

assoc_rules %>% arrange(desc(lift)) %>% select(left.title,right.title,support,confidence,lift) %>% head()
##        left.title     right.title     support confidence lift
## 1           Okja  The Red Turtle  0.001655629          1  604
## 2 The Red Turtle            Okja  0.001655629          1  604
## 3           Okja        Lemonade  0.001655629          1  604
## 4       Lemonade            Okja  0.001655629          1  604
## 5 The Red Turtle        Lemonade  0.001655629          1  604
## 6       Lemonade  The Red Turtle  0.001655629          1  604

We apply a naive filter here: rules with a number in both titles (mostly sequels) or the same opening string (same franchise) are removed, since those pairings are trivially expected.

assoc_rules = assoc_rules %>% 
    filter( !(grepl("[0-9]", left.title, perl = TRUE) & grepl("[0-9]", right.title, perl = TRUE)) ) %>%
    filter( !(grepl("Lemonade", left.title, perl = TRUE) & grepl("Lemonade", right.title, perl = TRUE)) ) %>%
    # pmin (not min) so the prefix length is computed row by row
    filter( substr(left.title, 1, pmin(5, str_length(left.title), str_length(right.title))) !=
            substr(right.title, 1, pmin(5, str_length(left.title), str_length(right.title))) ) %>%
    arrange(desc(lift))

head(assoc_rules %>% select(left.title,right.title,support,confidence,lift),10)
##             left.title         right.title     support confidence lift
## 1                Okja      The Red Turtle  0.001655629          1  604
## 2      The Red Turtle                Okja  0.001655629          1  604
## 3                Okja            Lemonade  0.001655629          1  604
## 4            Lemonade                Okja  0.001655629          1  604
## 5      The Red Turtle            Lemonade  0.001655629          1  604
## 6            Lemonade      The Red Turtle  0.001655629          1  604
## 7  Too Late for Tears             Villain  0.001655629          1  604
## 8             Villain  Too Late for Tears  0.001655629          1  604
## 9      Lifeguard, The     One I Love, The  0.001655629          1  604
## 10    One I Love, The      Lifeguard, The  0.001655629          1  604

Lastly, we can use the association rules to recommend a potential movie. Tomb Raider is fun – let’s see what movies we could explore based on it.

assoc_rules %>% 
    filter(str_detect(left.title,"Tomb Raider") | str_detect(right.title,"Tomb Raider")) %>%
    select(left.title,right.title,support,confidence,lift) %>%
    head(20)
##                        left.title  right.title     support confidence lift
## 1                   Wonder Wheel  Tomb Raider  0.001655629          1  151
## 2                      Lady Bird  Tomb Raider  0.001655629          1  151
## 3                       The Wait  Tomb Raider  0.001655629          1  151
## 4                     Disconnect  Tomb Raider  0.001655629          1  151
## 5                         Güeros  Tomb Raider  0.001655629          1  151
## 6          There Will Come a Day  Tomb Raider  0.001655629          1  151
## 7                  Way Back, The  Tomb Raider  0.001655629          1  151
## 8                      Self/less  Tomb Raider  0.001655629          1  151
## 9                          Alpha  Tomb Raider  0.001655629          1  151
## 10                    Game Night  Tomb Raider  0.001655629          1  151
## 11                     The Mummy  Tomb Raider  0.001655629          1  151
## 12 The Star Wars Holiday Special  Tomb Raider  0.001655629          1  151
## 13                          Fuzz  Tomb Raider  0.001655629          1  151
## 14             Cannonball Run II  Tomb Raider  0.001655629          1  151
## 15                  Fire and Ice  Tomb Raider  0.001655629          1  151
## 16       Humanoids from the Deep  Tomb Raider  0.001655629          1  151
## 17                     Delirious  Tomb Raider  0.001655629          1  151
## 18          Happy Birthday to Me  Tomb Raider  0.001655629          1  151
## 19        Company of Wolves, The  Tomb Raider  0.001655629          1  151
## 20                        Squirm  Tomb Raider  0.001655629          1  151
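
To make this lookup reusable for any title, a small convenience wrapper is enough – a sketch (recommend_for is a hypothetical helper, not part of the pipeline above):

# hypothetical helper: movies most strongly associated with a given title
recommend_for <- function(rules_df, title, n = 10) {
  rules_df %>%
    filter(str_detect(left.title, title) | str_detect(right.title, title)) %>%
    arrange(desc(lift)) %>%
    select(left.title, right.title, support, confidence, lift) %>%
    head(n)
}
recommend_for(assoc_rules, "Tomb Raider")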