We are going to build a movie recommendation engine using the MovieLens dataset - Market Basket Analysis - Apriori
library(plyr) # load before dplyr so plyr does not mask the dplyr verbs used below
library(dplyr) # for data manipulation
library(stringr) # for string manipulation
library(caret) # for sampling
library(caTools) # for train/test split
library(ggplot2) # for data visualization
library(corrplot) # for correlations
library(Rtsne) # for t-SNE plotting
library(repr)
library(RColorBrewer)
library(factoextra)
library(tidyr)
library(tidyverse)
library(arules) ##specifically for apriori
library(arulesViz) ##specifically for apriori
library(lubridate)
library(vembedr)
library(hrbrthemes)
library(rstatix) # for freq_table()
library(formattable)
library(data.table)
library(xlsx)
library(IRdisplay)
library(plotly)
library(stringdist)
library(Matrix)
dfmovies <- read.csv("/cloud/project/movies.csv", header=TRUE, stringsAsFactors=FALSE)
dfratings <- read.csv("/cloud/project/ratings.csv", header=TRUE, stringsAsFactors=FALSE)
# full outer join on movieId, so movies with no ratings are kept (with NA user fields)
result <- merge(dfmovies, dfratings, by = "movieId", all = TRUE)
head(result,5)
## movieId title genres userId
## 1 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1
## 2 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 555
## 3 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 232
## 4 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 590
## 5 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 601
## rating timestamp
## 1 4.0 964982703
## 2 4.0 978746159
## 3 3.5 1076955621
## 4 4.0 1258420408
## 5 4.0 1521467801
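Because all = TRUE performs a full outer join, movies nobody rated carry NA in the user columns; a quick check (a minimal sketch - the summary further down shows 18 such rows):
# rows for movies that no user rated
sum(is.na(result$rating))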
# Number of Users who rated at least one movie:
print("Number of Unique Users")
## [1] "Number of Unique Users"
length(unique(result$userId))
## [1] 611
#Number of movies
print("Number of Unique Movies")
## [1] "Number of Unique Movies"
length(unique(result$movieId))
## [1] 9742
# split the year out of the title; this naive split on "(" misparses titles that
# contain their own parentheses, which is why Year picks up NAs (and a few bogus
# values) below
df2 <- separate(data = result, col = title, into = c("Movie", "Year"), sep = "\\(" )
df3 <- df2 %>% mutate(Year = as.numeric(gsub(")", "", Year)))
head(df3,2)
## movieId Movie Year genres userId
## 1 1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy 1
## 2 1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy 555
## rating timestamp
## 1 4 964982703
## 2 4 978746159
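A quick way to see which titles fell foul of the naive year split (a minimal sketch):
# titles whose Year failed to parse as a number after the split
head(unique(df2$Movie[is.na(df3$Year)]))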
# convert Unix epoch seconds into human-readable date-times
df3$timestamp <- as.POSIXct(df3$timestamp, origin = "1970-01-01")
head(df3,3)
## movieId Movie Year genres userId
## 1 1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy 1
## 2 1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy 555
## 3 1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy 232
## rating timestamp
## 1 4.0 2000-07-30 18:45:03
## 2 4.0 2001-01-06 01:55:59
## 3 3.5 2004-02-16 18:20:21
summary(df3)
## movieId Movie Year genres
## Min. : 1 Length:100854 Min. : 6 Length:100854
## 1st Qu.: 1199 Class :character 1st Qu.:1990 Class :character
## Median : 2991 Mode :character Median :1997 Mode :character
## Mean : 19435 Mean :1995
## 3rd Qu.: 8128 3rd Qu.:2003
## Max. :193609 Max. :2018
## NA's :6799
## userId rating timestamp
## Min. : 1.0 Min. :0.500 Min. :1996-03-29 18:36:55.00
## 1st Qu.:177.0 1st Qu.:3.000 1st Qu.:2002-04-18 09:57:46.00
## Median :325.0 Median :3.500 Median :2007-08-02 20:31:02.00
## Mean :326.1 Mean :3.502 Mean :2008-03-19 17:01:27.36
## 3rd Qu.:477.0 3rd Qu.:4.000 3rd Qu.:2015-07-04 07:15:44.50
## Max. :610.0 Max. :5.000 Max. :2018-09-24 14:27:30.00
## NA's :18 NA's :18 NA's :18
MISSING <- is.na(df3$userId) |
  is.na(df3$timestamp) |
  is.na(df3$rating) |
  is.na(df3$Year)
sum(MISSING)
## [1] 6815
dfclean <- subset(df3, subset = !MISSING)
nrow(dfclean)
## [1] 94039
freqrat <- table(dfclean$rating)
print(freqrat)
##
## 0.5 1 1.5 2 2.5 3 3.5 4 4.5 5
## 1304 2686 1712 7228 5278 18992 12194 24872 7741 12032
hist(dfclean$userId,
     xlab = "UserId",
     main = "Histogram of UserID",
     breaks = sqrt(nrow(dfclean)) # set number of bins
)
The userIds with frequency counts above 1,200 are anomalies that will skew the recommendation engine, and we need to remove them. We will create a frequency table to identify these anomalies.
frequency <- freq_table(dfclean,userId)
head(frequency,5)
## # A tibble: 5 × 3
## userId n prop
## <int> <int> <dbl>
## 1 1 222 0.2
## 2 2 29 0
## 3 3 36 0
## 4 4 186 0.2
## 5 5 41 0
largeanomaly <- frequency %>%
filter(n > 1200)
print(largeanomaly)
## # A tibble: 6 × 3
## userId n prop
## <int> <int> <dbl>
## 1 68 1209 1.3
## 2 274 1261 1.3
## 3 414 2574 2.7
## 4 448 1813 1.9
## 5 474 1945 2.1
## 6 599 2250 2.4
# the frequency table already names its first column userId, so we only need to
# drop the count column before the anti-join
largeanomaly = subset(largeanomaly, select = -c(n) )
head(largeanomaly,5)
## # A tibble: 5 × 2
## userId prop
## <int> <dbl>
## 1 68 1.3
## 2 274 1.3
## 3 414 2.7
## 4 448 1.9
## 5 474 2.1
largeanomaly$userId<-as.integer(largeanomaly$userId)
dfclean <- dfclean %>%
  anti_join(largeanomaly, by = "userId") # drop every rating from the anomalous users
hist(dfclean$userId,
xlab = "UserId",
main = "Histogram of UserID",
breaks = sqrt(nrow(dfclean))
) # set number of bins
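As a quick sanity check (a minimal sketch), no remaining user should exceed the 1,200-rating cutoff:
# largest number of ratings left for any single user
max(table(dfclean$userId))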
range(dfclean$Year)
## [1] 6 2018
# round half-star ratings to whole stars (note that R rounds halves to the
# nearest even integer, so 3.5 becomes 4 but 2.5 becomes 2)
dfclean$rating <- round(dfclean$rating)
head(dfclean$rating, 5)
## [1] 4 4 4 4 4
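After rounding, the half-star levels collapse to whole-star levels; a quick look at the new distribution (a minimal sketch):
# frequency of each rounded rating
table(dfclean$rating)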
# a Year of 6 is a parsing artifact of the title split, not a real release year
out <- dfclean %>% filter(Year == 6)
clean <- dfclean %>%
  anti_join(out)
range(clean$Year)
## [1] 1903 2018
We are going to set aside a copy of the cleaned data for our Market Basket Analysis later
modeldata <-clean
Preparing data to build a table and visualizations for the 1960s to 1980s, or “The Golden Age” of film: removing movies with years outside that range
golden <-clean[clean$Year > 1959 & clean$Year < 1981, ]
range(golden$Year)
## [1] 1960 1980
print("Number of Unique Users")
## [1] "Number of Unique Users"
length(unique(golden$userId))
## [1] 487
#Number of movies
print("Number of Unique Movies")
## [1] "Number of Unique Movies"
length(unique(golden$movieId))
## [1] 723
# take the second genre listed for each movie as its representative genre
golden$new <- unlist(lapply(strsplit(golden$genres, '|', fixed = TRUE), '[', 2))
counts <- table(golden$new)
barplot(counts, main="Ratings by Genre",
col="violet",las=2)
# total (summed) rating points per release year
ratings <- aggregate(golden$rating, by = list(golden$Year), FUN = sum)
head(ratings)
## Group.1 x
## 1 1960 724
## 2 1961 536
## 3 1962 855
## 4 1963 822
## 5 1964 1124
## 6 1965 605
colnames(ratings)[1] <- "Year"
colnames(ratings)[2] <- "TotalRating" # these are summed rating points, not a count
head(ratings, 5)
## Year TotalRating
## 1 1960 724
## 2 1961 536
## 3 1962 855
## 4 1963 822
## 5 1964 1124
# Libraries
library(ggplot2)
library(hrbrthemes)
ggplot(ratings, aes(x = Year, y = TotalRating, group = 1)) +
  geom_line(color = "#800080", # color of the line
            lwd = 1,           # width of the line
            linetype = 1)
names(golden)[8] <- "Genre"
# keep only well-rated titles (rounded rating of 4 or 5)
df4 <- filter(golden, rating > 3)
second <-aggregate(df4$rating, by = list(df4$Movie, df4$Year, df4$Genre), FUN = mean)
names(second)[1] <- "Movie"
names(second)[2] <- "Year"
names(second)[3] <- "Genre"
names(second)[4] <- "Avg Rating"
second$`Avg Rating`<- round(second$`Avg Rating`, digits = 2)
library(data.table)
library(DT)
datatable(second,extensions = 'Buttons',
options = list(dom='Bfrtip',
buttons=c('copy', 'csv', 'excel', 'print', 'pdf')))
Now let’s build our Market Basket - Apriori model. For this we go back to the data we saved before splitting the genres (modeldata), dropping all the variables except userId and movieId.
# keep only userId and movieId - the "customer" and the "item" for basket analysis
data2 = subset(modeldata, select = -c(Movie,Year,genres,rating,timestamp) )
head(data2,5)
## movieId userId
## 1 1 1
## 2 1 555
## 3 1 232
## 4 1 590
## 5 1 601
dim(data2)[1]
## [1] 82986
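Each unique user becomes one “basket” of movies, so the number of baskets should match the transaction count reported below (a minimal sketch):
# number of baskets we expect the transactions object to contain
length(unique(data2$userId))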
# convert the rating-per-row dataframe into a sparse user-item matrix:
# split() groups each user's movieIds into one "basket" per user, and
# as(..., "transactions") coerces that list into an arules transactions object
user_item_matrix <- as(split(data2[,"movieId"],data2[,"userId"]), "transactions")
# investigate the user-item matrix:
# transactions (rows) -> number of raters; items (columns) -> number of movies
user_item_matrix
## transactions in sparse format with
## 604 transactions (rows) and
## 7618 items (columns)
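Before mining, it helps to eyeball which movies appear in the most baskets; a minimal sketch using the item frequency plot built into arules:
# relative frequency of the 20 most-watched movies across the 604 baskets
itemFrequencyPlot(user_item_matrix, topN = 20, cex.names = 0.7)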
rule_param = list(
  supp = 0.001, # minimum support: share of baskets containing the itemset
  conf = 0.7,   # minimum confidence: P(rhs | lhs)
  maxlen = 2    # pairs only: one movie on each side of a rule
)
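Note that with only 604 baskets, supp = 0.001 translates to an absolute support count of floor(0.001 * 604) = 0, as the apriori log below confirms, so even pairs watched together by a single user survive - which is why the miner emits over 1.6 million rules:
# absolute minimum support count implied by the relative threshold
floor(rule_param$supp * length(user_item_matrix))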
assoc_rules = apriori(user_item_matrix,parameter = rule_param)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 2 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 0
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7618 item(s), 604 transaction(s)] done [0.02s].
## sorting and recoding items ... [7618 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2
## done [0.18s].
## writing ... [1681728 rule(s)] done [0.25s].
## creating S4 object ... done [0.65s].
summary(assoc_rules)
## set of 1681728 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 1681728
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001656 Min. :0.700 Min. :0.001656 Min. : 1.309
## 1st Qu.:0.001656 1st Qu.:1.000 1st Qu.:0.001656 1st Qu.: 9.015
## Median :0.001656 Median :1.000 Median :0.001656 Median : 22.370
## Mean :0.002763 Mean :0.983 Mean :0.003045 Mean : 95.256
## 3rd Qu.:0.001656 3rd Qu.:1.000 3rd Qu.:0.001656 3rd Qu.: 86.286
## Max. :0.374172 Max. :1.000 Max. :0.516556 Max. :604.000
## count
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 1.669
## 3rd Qu.: 1.000
## Max. :226.000
##
## mining info:
## data ntransactions support confidence
## user_item_matrix 604 0.001 0.7
## call
## apriori(data = user_item_matrix, parameter = rule_param)
Because there are so many rules, we are going to keep only those whose lift exceeds the 75th percentile, in this case >= 86.286.
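Rather than hard-coding that threshold, it can be computed from the quality slot of the rules object (a minimal sketch; subsetting on lift >= lift_cut below would then be equivalent):
# 75th percentile of lift across all mined rules (about 86.286 here)
lift_cut <- quantile(quality(assoc_rules)$lift, 0.75)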
assoc_rules = subset(assoc_rules, lift >= 86.286)
summary(assoc_rules)
## set of 390205 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 390205
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001656 Min. :0.7500 Min. :0.001656 Min. : 90.6
## 1st Qu.:0.001656 1st Qu.:1.0000 1st Qu.:0.001656 1st Qu.:151.0
## Median :0.001656 Median :1.0000 Median :0.001656 Median :302.0
## Mean :0.001688 Mean :0.9997 Mean :0.001690 Mean :334.4
## 3rd Qu.:0.001656 3rd Qu.:1.0000 3rd Qu.:0.001656 3rd Qu.:604.0
## Max. :0.009934 Max. :1.0000 Max. :0.009934 Max. :604.0
## count
## Min. :1.00
## 1st Qu.:1.00
## Median :1.00
## Mean :1.02
## 3rd Qu.:1.00
## Max. :6.00
##
## mining info:
## data ntransactions support confidence
## user_item_matrix 604 0.001 0.7
## call
## apriori(data = user_item_matrix, parameter = rule_param)
assoc_rules = as(assoc_rules,"data.frame")
head(assoc_rules)
## rules support confidence coverage lift count
## 1 {26555} => {3404} 0.001655629 1 0.001655629 100.6667 1
## 2 {26555} => {4387} 0.001655629 1 0.001655629 100.6667 1
## 15 {2493} => {389} 0.001655629 1 0.001655629 302.0000 1
## 16 {2493} => {2390} 0.001655629 1 0.001655629 100.6667 1
## 17 {2493} => {1365} 0.001655629 1 0.001655629 100.6667 1
## 32 {626} => {40} 0.001655629 1 0.001655629 302.0000 1
Now we break each rule apart into separate left-hand and right-hand columns so we can re-attach the movie titles and make the rule sets consumable at the business level
rules = sapply(assoc_rules$rules,function(x){
  # pull out the "{lhs} => {rhs}" part of the rule string, then strip the braces
  x = gsub("[\\{\\}]", "", regmatches(x, gregexpr("\\{.*\\}", x))[[1]])
  # replace the implication arrow with a plain comma separator
  x = gsub("=>",",",x)
  x = str_replace_all(x," ","")
  return( x )
})
rules = as.character(rules)
rules = str_split(rules,",")
assoc_rules$lhs_movie = sapply( rules, "[[", 1)
assoc_rules$rhs_movie = sapply( rules , "[[", 2)
assoc_rules$rules = NULL
rm(rules)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3134835 167.5 6838394 365.3 6838394 365.3
## Vcells 11524252 88.0 41451706 316.3 41443217 316.2
assoc_rules$lhs_movie = as.numeric(assoc_rules$lhs_movie)
assoc_rules$rhs_movie = as.numeric(assoc_rules$rhs_movie)
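As an aside, arules can extract these columns directly from the S4 rules object (before the as(., "data.frame") coercion above), avoiding the regex step; a sketch, where rules_s4 is a hypothetical name for the pre-coercion rules object:
# labels() on the lhs/rhs itemMatrix returns strings like "{26555}"
# (rules_s4 is hypothetical - the rules object kept before the data.frame coercion)
rules_df <- as(rules_s4, "data.frame")
rules_df$lhs_movie <- as.numeric(gsub("[{}]", "", labels(lhs(rules_s4))))
rules_df$rhs_movie <- as.numeric(gsub("[{}]", "", labels(rhs(rules_s4))))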
Preparing a movie-title lookup table so we can attach names to the rules
# one row per movieId with just the title
movienames = subset(clean, select = -c(userId,Year,timestamp,genres,rating) )
movienames <- unique(movienames)
# note: joining modeldata on the left-hand side brings in one row per rating, so a
# rule whose lhs movie was rated by several remaining users gets duplicated;
# joining the deduplicated movienames table on both sides would avoid that
assoc_rules = assoc_rules %>% left_join(modeldata,by=c("lhs_movie" = "movieId") )
assoc_rules <- left_join(assoc_rules, movienames, by=c('rhs_movie'='movieId'))
colnames(assoc_rules)[8] <- "left.title"
colnames(assoc_rules)[14] <- "right.title"
head(assoc_rules, 5)
## support confidence coverage lift count lhs_movie rhs_movie
## 1 0.001655629 1 0.001655629 100.6667 1 26555 3404
## 2 0.001655629 1 0.001655629 100.6667 1 26555 4387
## 3 0.001655629 1 0.001655629 302.0000 1 2493 389
## 4 0.001655629 1 0.001655629 100.6667 1 2493 2390
## 5 0.001655629 1 0.001655629 100.6667 1 2493 1365
## left.title Year genres userId rating timestamp
## 1 Spies Like Us 1985 Comedy 320 4 2010-09-12 00:25:03
## 2 Spies Like Us 1985 Comedy 320 4 2010-09-12 00:25:03
## 3 Harmonists, The 1997 Drama 467 4 1999-02-22 08:05:53
## 4 Harmonists, The 1997 Drama 467 4 1999-02-22 08:05:53
## 5 Harmonists, The 1997 Drama 467 4 1999-02-22 08:05:53
## right.title
## 1 Titanic
## 2 Kiss of the Dragon
## 3 Colonel Chabert, Le
## 4 Little Voice
## 5 Ridicule
Now let’s look at the rules we mined - these are the top rules with the highest lift
assoc_rules %>% arrange(desc(lift)) %>% select(left.title,right.title,support,confidence,lift) %>% head()
## left.title right.title support confidence lift
## 1 Okja The Red Turtle 0.001655629 1 604
## 2 The Red Turtle Okja 0.001655629 1 604
## 3 Okja Lemonade 0.001655629 1 604
## 4 Lemonade Okja 0.001655629 1 604
## 5 The Red Turtle Lemonade 0.001655629 1 604
## 6 Lemonade The Red Turtle 0.001655629 1 604
We apply a naive filter here: pairs where both titles contain a number, or where the two titles share the same opening characters, are removed (this mostly drops sequels and near-duplicate titles).
assoc_rules = assoc_rules %>%
  filter( ! (grepl("[0-9]",left.title,perl = TRUE) & grepl("[0-9]",right.title,perl = TRUE) ) ) %>%
  filter( ! (grepl("Lemonade",left.title,perl = TRUE) & grepl("Lemonade",right.title,perl = TRUE) ) ) %>%
  # compare the opening characters of each pair; pmin() (not min()) keeps the length
  # cap rowwise instead of collapsing it to a single value for the whole column
  filter( substr(left.title, 1, pmin(5L, str_length(left.title), str_length(right.title))) !=
          substr(right.title, 1, pmin(5L, str_length(left.title), str_length(right.title))) ) %>%
  arrange(desc(lift))
head(assoc_rules %>% select(left.title,right.title,support,confidence,lift),10)
## left.title right.title support confidence lift
## 1 Okja The Red Turtle 0.001655629 1 604
## 2 The Red Turtle Okja 0.001655629 1 604
## 3 Okja Lemonade 0.001655629 1 604
## 4 Lemonade Okja 0.001655629 1 604
## 5 The Red Turtle Lemonade 0.001655629 1 604
## 6 Lemonade The Red Turtle 0.001655629 1 604
## 7 Too Late for Tears Villain 0.001655629 1 604
## 8 Villain Too Late for Tears 0.001655629 1 604
## 9 Lifeguard, The One I Love, The 0.001655629 1 604
## 10 One I Love, The Lifeguard, The 0.001655629 1 604
Lastly, we can use the association rules to recommend a potential movie. Tomb Raider is fun – let’s see what movies we could explore based on it.
assoc_rules %>%
filter(str_detect(left.title,"Tomb Raider") | str_detect(right.title,"Tomb Raider")) %>%
select(left.title,right.title,support,confidence,lift) %>%
head(20)
## left.title right.title support confidence lift
## 1 Wonder Wheel Tomb Raider 0.001655629 1 151
## 2 Lady Bird Tomb Raider 0.001655629 1 151
## 3 The Wait Tomb Raider 0.001655629 1 151
## 4 Disconnect Tomb Raider 0.001655629 1 151
## 5 Güeros Tomb Raider 0.001655629 1 151
## 6 There Will Come a Day Tomb Raider 0.001655629 1 151
## 7 Way Back, The Tomb Raider 0.001655629 1 151
## 8 Self/less Tomb Raider 0.001655629 1 151
## 9 Alpha Tomb Raider 0.001655629 1 151
## 10 Game Night Tomb Raider 0.001655629 1 151
## 11 The Mummy Tomb Raider 0.001655629 1 151
## 12 The Star Wars Holiday Special Tomb Raider 0.001655629 1 151
## 13 Fuzz Tomb Raider 0.001655629 1 151
## 14 Cannonball Run II Tomb Raider 0.001655629 1 151
## 15 Fire and Ice Tomb Raider 0.001655629 1 151
## 16 Humanoids from the Deep Tomb Raider 0.001655629 1 151
## 17 Delirious Tomb Raider 0.001655629 1 151
## 18 Happy Birthday to Me Tomb Raider 0.001655629 1 151
## 19 Company of Wolves, The Tomb Raider 0.001655629 1 151
## 20 Squirm Tomb Raider 0.001655629 1 151
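To make this lookup reusable, it can be wrapped in a small helper; a minimal sketch (recommend is a hypothetical name, and it assumes the filtered assoc_rules data frame built above):
# return the top-n right-hand-side movies whose rules fire for a seed title
recommend <- function(rules_df, seed, n = 10) {
  rules_df %>%
    filter(str_detect(left.title, seed)) %>%
    arrange(desc(lift), desc(confidence)) %>%
    select(right.title, support, confidence, lift) %>%
    head(n)
}
recommend(assoc_rules, "Tomb Raider")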