##We are going to be building a movie recommendation engine using the Rotten Tomatoes Movie Recommendation Dataset
knitr::opts_chunk$set
## function (...)
## {
## set2(resolve(...))
## }
## <bytecode: 0x59f65799fbb8>
## <environment: 0x59f657995198>
##Roadmap ##1) EDA’s - Exploratory Data Analysis of our data ##2) Preprocessing/Cleaning the Data ##3) Building an interactive table to explore the “Golden Age” of Movies ##4) Market Basket Analysis (Apriori)
library(dplyr) # for data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr) # for data manipulation
library(caret) # for sampling
## Loading required package: ggplot2
## Loading required package: lattice
library(caTools) # for train/test split
library(ggplot2) # for data visualization
library(corrplot) # for correlations
## corrplot 0.92 loaded
library(Rtsne) # for tsne plotting
library(ggplot2)
library(repr)
library(RColorBrewer)
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(arules) ##specifically for apriori
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Attaching package: 'arules'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz) ##specifically for apriori
library(lubridate)
library(vembedr)
##
## Attaching package: 'vembedr'
##
## The following object is masked from 'package:lubridate':
##
## hms
library(hrbrthemes)
library(rstatix)
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
library(formattable)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(xlsx)
library(arules)
library(arulesViz)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:rstatix':
##
## desc, mutate
##
## The following object is masked from 'package:purrr':
##
## compact
##
## The following object is masked from 'package:DMwR':
##
## join
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(IRdisplay)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
##
## The following object is masked from 'package:formattable':
##
## style
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(stringdist)
##
## Attaching package: 'stringdist'
##
## The following object is masked from 'package:tidyr':
##
## extract
library(Matrix)
##Bringing in two files: Movies & Ratings
dfmovies <- read.csv("/cloud/project/movies.csv", header=TRUE, stringsAsFactors=FALSE)
dfratings <- read.csv("/cloud/project/ratings.csv",header=TRUE,stringsAsFactors=FALSE)
result <-merge(dfmovies, dfratings, by.x = "movieId", by.y = "movieId", all = TRUE)
head(result,5)
## movieId title genres userId
## 1 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1
## 2 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 555
## 3 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 232
## 4 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 590
## 5 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 601
## rating timestamp
## 1 4.0 964982703
## 2 4.0 978746159
## 3 3.5 1076955621
## 4 4.0 1258420408
## 5 4.0 1521467801
# Number of Users who rated at least one movie:
print("Number of Unique Users")
## [1] "Number of Unique Users"
length(unique(result$userId))
## [1] 611
#Number of movies
print("Number of Unique Movies")
## [1] "Number of Unique Movies"
length(unique(result$movieId))
## [1] 9742
##Extracting movie release year into one column
# Split name column into title and year
df2 <- separate(data = result, col = title, into = c("Movie", "Year"), sep = "\\(" )
## Warning: Expected 2 pieces. Additional pieces discarded in 6782 rows [1424, 1425, 1426,
## 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436, 1437, 1438, 1439,
## 1440, 1441, 1442, 1443, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 17 rows [81310, 81311,
## 99344, 99345, 99346, 99347, 99478, 99545, 99636, 99818, 100070, 100288, 100445,
## 100446, 100447, 100467, 100620].
df3 <- df2 %>% mutate(Year = as.numeric(gsub(")", "", Year)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Year = as.numeric(gsub(")", "", Year))`.
## Caused by warning:
## ! NAs introduced by coercion
head(df3,2)
## movieId Movie Year genres userId
## 1 1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy 1
## 2 1 Toy Story 1995 Adventure|Animation|Children|Comedy|Fantasy 555
## rating timestamp
## 1 4 964982703
## 2 4 978746159
#convert UNIX timestamp to date using Lubridate
df3$timestamp <- as.POSIXct(df3$timestamp, origin = "1970-01-01")
##Quick EDAs on data to see what other cleaning must be done:
summary(df3)
## movieId Movie Year genres
## Min. : 1 Length:100854 Min. : 6 Length:100854
## 1st Qu.: 1199 Class :character 1st Qu.:1990 Class :character
## Median : 2991 Mode :character Median :1997 Mode :character
## Mean : 19435 Mean :1995
## 3rd Qu.: 8128 3rd Qu.:2003
## Max. :193609 Max. :2018
## NA's :6799
## userId rating timestamp
## Min. : 1.0 Min. :0.500 Min. :1996-03-29 18:36:55.00
## 1st Qu.:177.0 1st Qu.:3.000 1st Qu.:2002-04-18 09:57:46.00
## Median :325.0 Median :3.500 Median :2007-08-02 20:31:02.00
## Mean :326.1 Mean :3.502 Mean :2008-03-19 17:01:27.36
## 3rd Qu.:477.0 3rd Qu.:4.000 3rd Qu.:2015-07-04 07:15:44.50
## Max. :610.0 Max. :5.000 Max. :2018-09-24 14:27:30.00
## NA's :18 NA's :18 NA's :18
##Removing data with missing userIDs
MISSING <- is.na(df3$userId)|
is.na(df3$timestamp) |
is.na(df3$rating)|
is.na(df3$Year)
sum(MISSING)
## [1] 6815
dfclean <- subset(df3,
subset = !MISSING)
nrow(dfclean)
## [1] 94039
summary(dfclean)
## movieId Movie Year genres
## Min. : 1 Length:94039 Min. : 6 Length:94039
## 1st Qu.: 1197 Class :character 1st Qu.:1990 Class :character
## Median : 2970 Mode :character Median :1997 Mode :character
## Mean : 19784 Mean :1995
## 3rd Qu.: 8360 3rd Qu.:2003
## Max. :193609 Max. :2018
## userId rating timestamp
## Min. : 1.0 Min. :0.500 Min. :1996-03-29 18:36:55.00
## 1st Qu.:177.0 1st Qu.:3.000 1st Qu.:2001-12-30 06:34:33.50
## Median :325.0 Median :3.500 Median :2007-08-02 20:28:57.00
## Mean :325.5 Mean :3.485 Mean :2008-03-10 17:44:45.02
## 3rd Qu.:477.0 3rd Qu.:4.000 3rd Qu.:2015-07-04 07:09:53.50
## Max. :610.0 Max. :5.000 Max. :2018-09-24 14:27:30.00
##Looking at the ratings to see the frequency counts
freqrat <- table(dfclean$rating)
print(freqrat)
##
## 0.5 1 1.5 2 2.5 3 3.5 4 4.5 5
## 1304 2686 1712 7228 5278 18992 12194 24872 7741 12032
##Looking at frequency of UserIDs ##create min, max, med, mode and look at outliers, remove any extremes do a histogram
hist(dfclean$userId,
xlab = "UserId",
main = "Histogram of UserID",
breaks = sqrt(nrow(df3))
) # set number of bins
##The userIds that are above frequency counts of 1500 frequency are the anomalies will skew the recommendating engine. We need to remove them. We are going to create a frequency table to identify these anomalies
frequency <- freq_table(dfclean$userId)
##Now we are creating a dataframe of just the userId’s that have a frequency count > 1200 and printing a table of those
largeanomaly <- frequency %>%
filter(n > 1200)
print(largeanomaly)
## # A tibble: 6 × 3
## group n prop
## <int> <int> <dbl>
## 1 68 1209 1.3
## 2 274 1261 1.3
## 3 414 2574 2.7
## 4 448 1813 1.9
## 5 474 1945 2.1
## 6 599 2250 2.4
##Now we will drop the vars we don’t need rename group to “userId” and do an anti-join to remove the outliers from our dataset now that we have identified them
names(largeanomaly)[names(largeanomaly) == "group"] <- "userId"
largeanomaly = subset(largeanomaly, select = -c(n,prop) )
print(largeanomaly)
## # A tibble: 6 × 1
## userId
## <int>
## 1 68
## 2 274
## 3 414
## 4 448
## 5 474
## 6 599
##Here’s our anti-join
dfclean <- dfclean %>%
anti_join(largeanomaly)
## Joining with `by = join_by(userId)`
##Let’s look at the histogram again - to see that we did it!!
hist(dfclean$userId,
xlab = "UserId",
main = "Histogram of UserID",
breaks = sqrt(nrow(dfclean))
) # set number of bins
##Checking the range of Year to see if there is any cleaning up to do - yes the lowest Year is ‘6’
range(dfclean$Year)
## [1] 6 2018
##Round off ratings to single digit
dfclean$rating <- round(dfclean$rating)
ratingtable <-freq_table(dfclean$rating)
print(ratingtable)
## # A tibble: 6 × 3
## group n prop
## <dbl> <int> <dbl>
## 1 0 1220 1.5
## 2 1 2419 2.9
## 3 2 10967 13.2
## 4 3 16469 19.8
## 5 4 40315 48.6
## 6 5 11597 14
##Creating a dataframe of just the outlying ‘6’ Year
##Finding Year 6
out <-dfclean %>% filter(Year=='6')
##Antijoin to remove the outlier Year
##Antijoin to remove #6
clean <- dfclean %>%
anti_join(out)
## Joining with `by = join_by(movieId, Movie, Year, genres, userId, rating,
## timestamp)`
##Okay we removed it!
range(clean$Year)
## [1] 1903 2018
##We are going to save out some data for later by creating another dataframe:
modeldata <-clean
##Preparing data to build table and visualizations for 1960’s to 1980’s or “The Golden Age” of Film: ##removing movies with years out of range or older
golden <-clean[clean$Year > 1959 & clean$Year < 1981, ]
range(golden$Year)
## [1] 1960 1980
##Checking to see how many unique users and unique movies we have in our dataset
# Number of Users who rated at least one movie:
print("Number of Unique Users")
## [1] "Number of Unique Users"
length(unique(golden$userId))
## [1] 487
#Number of movies
print("Number of Unique Movies")
## [1] "Number of Unique Movies"
length(unique(golden$movieId))
## [1] 723
##Building a custom table
##Cleaning genres to first three descriptors
golden$new <-unlist(lapply(strsplit(golden$genres, '|', fixed = TRUE), '[', 2))
##Checking our frequencies by Genre for our Golden Years dataset
New <-freq_table(golden$new)
print(New)
## # A tibble: 17 × 3
## group n prop
## <chr> <int> <dbl>
## 1 Adventure 984 16.7
## 2 Animation 92 1.6
## 3 Children 249 4.2
## 4 Comedy 599 10.2
## 5 Crime 237 4
## 6 Documentary 2 0
## 7 Drama 1618 27.5
## 8 Fantasy 132 2.2
## 9 Film-Noir 59 1
## 10 Horror 320 5.4
## 11 Musical 151 2.6
## 12 Mystery 184 3.1
## 13 Romance 242 4.1
## 14 Sci-Fi 466 7.9
## 15 Thriller 179 3
## 16 War 213 3.6
## 17 Western 163 2.8
##Displaying it in a Histogram
counts <- table(golden$new)
barplot(counts, main="Ratings by Genre",
col="hot pink",las=2)
##Average ratings by Year for our Golden Years Dataset ##Building the aggregate dataset of number of ratings per month
ratings <- aggregate(golden$rating, by = list(golden$Year), FUN = sum)
head(ratings)
## Group.1 x
## 1 1960 724
## 2 1961 536
## 3 1962 855
## 4 1963 822
## 5 1964 1124
## 6 1965 605
#Rename Columns
colnames(ratings)[1] <- "Year"
colnames(ratings)[2] <- "Count"
head(ratings, 5)
## Year Count
## 1 1960 724
## 2 1961 536
## 3 1962 855
## 4 1963 822
## 5 1964 1124
##Trend Line
ratings %>%
ggplot(aes(x = Year, y = Count, group = 1)) +
geom_line(aes(col = 'hotpink', las = 2,))
## Warning in geom_line(aes(col = "hotpink", las = 2, )): Ignoring unknown
## aesthetics: las
theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1))
## List of 1
## $ axis.text.x:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : num 0.5
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
##Top rated movies ##Aggregate movies by movie, rating, year, genre #Rename new to genre
names(golden)[8] <- "Genre"
nrow(distinct(golden))
## [1] 7251
##Sample a table of movies where ratings are 4 and >
df4 <- filter(golden, rating > 3)
second <-aggregate(df4$rating, by = list(df4$Movie, df4$Year, df4$Genre), FUN = mean)
#rename the columns before building our table
names(second)[1] <- "Movie"
names(second)[2] <- "Year"
names(second)[3] <- "Genre"
names(second)[4] <- "Avg Rating"
##Round Avg Rating to two digits
second$`Avg Rating`<- round(second$`Avg Rating`, digits = 2)
library(data.table)
library(DT)
##Build an interactable data table from the new aggregate table we have built for movies from the Golden Age, rated 4 -5
datatable(second,extensions = 'Buttons',
options = list(dom='Bfrtip',
buttons=c('copy', 'csv', 'excel', 'print', 'pdf')))
##Now let’s build our Market Basket - Apriori Model ##For this we are going back to the data before we split the genres ##pulling the model data - dropping all the vars except userId and movieId
data2 = subset(modeldata, select = -c(Movie,Year,genres,rating,timestamp) )
head(data2,5)
## movieId userId
## 1 1 1
## 2 1 555
## 3 1 232
## 4 1 590
## 5 1 601
dim(data2)[1]
## [1] 82986
##Now we have to translate our dataframe into a spare User-Item matrix focused on the transactions
#convert rating-per-row dataframe into sparse User-Item matrix
user_item_matrix <- as(split(data2[,"movieId"],data2[,"userId"]), "transactions")
#investigate the User-Item matrix
#transactions (rows) -> number of raters
#items (columns) -> number of movies
user_item_matrix
## transactions in sparse format with
## 604 transactions (rows) and
## 7618 items (columns)
##Setting the parameters for the Apriori algotrithm
rule_param = list(
supp = 0.001,
conf = 0.7,
maxlen = 2
)
##Running the algorithm
assoc_rules = apriori(user_item_matrix,parameter = rule_param)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 2 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 0
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7618 item(s), 604 transaction(s)] done [0.04s].
## sorting and recoding items ... [7618 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2
## Warning in apriori(user_item_matrix, parameter = rule_param): Mining stopped
## (maxlen reached). Only patterns up to a length of 2 returned!
## done [0.18s].
## writing ... [1681728 rule(s)] done [0.29s].
## creating S4 object ... done [0.83s].
##Printing out the association rules
summary(assoc_rules)
## set of 1681728 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 1681728
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001656 Min. :0.700 Min. :0.001656 Min. : 1.309
## 1st Qu.:0.001656 1st Qu.:1.000 1st Qu.:0.001656 1st Qu.: 9.015
## Median :0.001656 Median :1.000 Median :0.001656 Median : 22.370
## Mean :0.002763 Mean :0.983 Mean :0.003045 Mean : 95.256
## 3rd Qu.:0.001656 3rd Qu.:1.000 3rd Qu.:0.001656 3rd Qu.: 86.286
## Max. :0.374172 Max. :1.000 Max. :0.516556 Max. :604.000
## count
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 1.000
## Mean : 1.669
## 3rd Qu.: 1.000
## Max. :226.000
##
## mining info:
## data ntransactions support confidence
## user_item_matrix 604 0.001 0.7
## call
## apriori(data = user_item_matrix, parameter = rule_param)
##Because there are so many rules we are going to only chose those whose lift have exceeded the 75th percentile, or in this case >= 86.286
assoc_rules = subset(assoc_rules, lift >= 86.286)
summary(assoc_rules)
## set of 390205 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 390205
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.001656 Min. :0.7500 Min. :0.001656 Min. : 90.6
## 1st Qu.:0.001656 1st Qu.:1.0000 1st Qu.:0.001656 1st Qu.:151.0
## Median :0.001656 Median :1.0000 Median :0.001656 Median :302.0
## Mean :0.001688 Mean :0.9997 Mean :0.001690 Mean :334.4
## 3rd Qu.:0.001656 3rd Qu.:1.0000 3rd Qu.:0.001656 3rd Qu.:604.0
## Max. :0.009934 Max. :1.0000 Max. :0.009934 Max. :604.0
## count
## Min. :1.00
## 1st Qu.:1.00
## Median :1.00
## Mean :1.02
## 3rd Qu.:1.00
## Max. :6.00
##
## mining info:
## data ntransactions support confidence
## user_item_matrix 604 0.001 0.7
## call
## apriori(data = user_item_matrix, parameter = rule_param)
##Putting our association rules into a data frame
assoc_rules = as(assoc_rules,"data.frame")
head(assoc_rules)
## rules support confidence coverage lift count
## 1 {26555} => {3404} 0.001655629 1 0.001655629 100.6667 1
## 2 {26555} => {4387} 0.001655629 1 0.001655629 100.6667 1
## 15 {2493} => {389} 0.001655629 1 0.001655629 302.0000 1
## 16 {2493} => {2390} 0.001655629 1 0.001655629 100.6667 1
## 17 {2493} => {1365} 0.001655629 1 0.001655629 100.6667 1
## 32 {626} => {40} 0.001655629 1 0.001655629 302.0000 1
##Now we have to break apart the rules into separate columns - or right-hand and left-hand rules so we can re-attach the movie names and make the rule-sets useful or consummable at the business level
rules = sapply(assoc_rules$rules,function(x){
x = gsub("[\\{\\}]", "", regmatches(x, gregexpr("\\{.*\\}", x))[[1]])
x = gsub("=>",",",x)
x = str_replace_all(x," ","")
return( x )
})
rules = as.character(rules)
rules = str_split(rules,",")
assoc_rules$lhs_movie = sapply( rules, "[[", 1)
assoc_rules$rhs_movie = sapply( rules , "[[", 2)
assoc_rules$rules = NULL
rm(rules)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3426271 183.0 7364918 393.4 7364918 393.4
## Vcells 12058915 92.1 42729456 326.0 42625524 325.3
assoc_rules$lhs_movie = as.numeric(assoc_rules$lhs_movie)
assoc_rules$rhs_movie = as.numeric(assoc_rules$rhs_movie)
##Preparing data to create a dataset to extract rules
movienames = subset(clean, select = -c(userId,Year,timestamp,genres,rating) )
movienames <-unique(movienames)
##Joinging with a moviename only dataset to extract movie names
assoc_rules = assoc_rules %>% left_join(modeldata,by=c("lhs_movie" = "movieId") )
## Warning in left_join(., modeldata, by = c(lhs_movie = "movieId")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1005 of `x` matches multiple rows in `y`.
## ℹ Row 64660 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
assoc_rules <-left_join(assoc_rules, movienames, by=c('rhs_movie'='movieId'))
##Renaming movie columns
colnames(assoc_rules)[8] <- "left.title"
colnames(assoc_rules)[14] <- "right.title"
head(assoc_rules, 5)
## support confidence coverage lift count lhs_movie rhs_movie
## 1 0.001655629 1 0.001655629 100.6667 1 26555 3404
## 2 0.001655629 1 0.001655629 100.6667 1 26555 4387
## 3 0.001655629 1 0.001655629 302.0000 1 2493 389
## 4 0.001655629 1 0.001655629 100.6667 1 2493 2390
## 5 0.001655629 1 0.001655629 100.6667 1 2493 1365
## left.title Year genres userId rating timestamp
## 1 Spies Like Us 1985 Comedy 320 4 2010-09-12 00:25:03
## 2 Spies Like Us 1985 Comedy 320 4 2010-09-12 00:25:03
## 3 Harmonists, The 1997 Drama 467 4 1999-02-22 08:05:53
## 4 Harmonists, The 1997 Drama 467 4 1999-02-22 08:05:53
## 5 Harmonists, The 1997 Drama 467 4 1999-02-22 08:05:53
## right.title
## 1 Titanic
## 2 Kiss of the Dragon
## 3 Colonel Chabert, Le
## 4 Little Voice
## 5 Ridicule
##Now look at the rules we mined - we mined the top rules with the highest lift
assoc_rules %>% arrange(desc(lift)) %>% select(left.title,right.title,support,confidence,lift) %>% head()
## left.title right.title support confidence lift
## 1 Okja The Red Turtle 0.001655629 1 604
## 2 The Red Turtle Okja 0.001655629 1 604
## 3 Okja Lemonade 0.001655629 1 604
## 4 Lemonade Okja 0.001655629 1 604
## 5 The Red Turtle Lemonade 0.001655629 1 604
## 6 Lemonade The Red Turtle 0.001655629 1 604
##We do a naive filter here. Results with a number on both sides or similar opening string is removed,
assoc_rules = assoc_rules %>%
filter( ! (grepl("[0-9]",left.title,perl = TRUE) & grepl("[0-9]",right.title,perl = TRUE) ) ) %>%
filter( ! (grepl("Lemonade",left.title,perl = TRUE) & grepl("Lemonade",right.title,perl = TRUE) ) ) %>%
filter( substr( left.title,start = 1,stop = min(5,str_length(left.title),str_length(right.title)) ) != substr( right.title,start = 1,stop = min(5,str_length(left.title),str_length(right.title)) ) ) %>%
arrange(desc(lift))
head(assoc_rules %>% select(left.title,right.title,support,confidence,lift),10)
## left.title right.title support confidence lift
## 1 Okja The Red Turtle 0.001655629 1 604
## 2 The Red Turtle Okja 0.001655629 1 604
## 3 Okja Lemonade 0.001655629 1 604
## 4 Lemonade Okja 0.001655629 1 604
## 5 The Red Turtle Lemonade 0.001655629 1 604
## 6 Lemonade The Red Turtle 0.001655629 1 604
## 7 Too Late for Tears Villain 0.001655629 1 604
## 8 Villain Too Late for Tears 0.001655629 1 604
## 9 Lifeguard, The One I Love, The 0.001655629 1 604
## 10 One I Love, The Lifeguard, The 0.001655629 1 604
##Lastly, we can use association rules to recommend a potential movie. Tomb Raider is fun – let’s see what movies based we could explore based on it.
assoc_rules %>%
filter(str_detect(left.title,"Tomb Raider") | str_detect(right.title,"Tomb Raider")) %>%
select(left.title,right.title,support,confidence,lift) %>%
head(20)
## left.title right.title support confidence lift
## 1 Wonder Wheel Tomb Raider 0.001655629 1 151
## 2 Lady Bird Tomb Raider 0.001655629 1 151
## 3 The Wait Tomb Raider 0.001655629 1 151
## 4 Disconnect Tomb Raider 0.001655629 1 151
## 5 Güeros Tomb Raider 0.001655629 1 151
## 6 There Will Come a Day Tomb Raider 0.001655629 1 151
## 7 Way Back, The Tomb Raider 0.001655629 1 151
## 8 Self/less Tomb Raider 0.001655629 1 151
## 9 Alpha Tomb Raider 0.001655629 1 151
## 10 Game Night Tomb Raider 0.001655629 1 151
## 11 The Mummy Tomb Raider 0.001655629 1 151
## 12 The Star Wars Holiday Special Tomb Raider 0.001655629 1 151
## 13 Fuzz Tomb Raider 0.001655629 1 151
## 14 Cannonball Run II Tomb Raider 0.001655629 1 151
## 15 Fire and Ice Tomb Raider 0.001655629 1 151
## 16 Humanoids from the Deep Tomb Raider 0.001655629 1 151
## 17 Delirious Tomb Raider 0.001655629 1 151
## 18 Happy Birthday to Me Tomb Raider 0.001655629 1 151
## 19 Company of Wolves, The Tomb Raider 0.001655629 1 151
## 20 Squirm Tomb Raider 0.001655629 1 151
embed_url("https://youtu.be/Hby8GJscdho?si=mYKPG5uM0l-2-y5h")