library(dplyr)
library(readr)
library(data.table)
library(tidyverse)
library(ggplot2)
library(reshape2)
library(NbClust)
library(factoextra)
library(miscTools)
# Load the three input tables from the working directory: movie metadata,
# user ratings, and the genome table (keyed by movieId further down).
movie_data <- readr::read_csv(file = "movies.csv")
rating_data <- readr::read_csv(file = "ratings.csv")
genome_data <- readr::read_csv(file = "MovieGenome.csv")
# Build a one-hot genre table: one row per movie, one 0/1 integer column per
# genre in list_genre, plus the movieId key.
# movie_data$genres is split on "|" into one column per listed genre.
movie_genre <- as.data.frame(movie_data$genres, stringsAsFactors = FALSE)
movie_genre2 <- as.data.frame(
  tstrsplit(movie_genre[, 1], "[|]", type.convert = TRUE),
  stringsAsFactors = FALSE
) # DataFlair
# Number the split columns from the data instead of assuming exactly ten.
colnames(movie_genre2) <- seq_len(ncol(movie_genre2))
list_genre <- c(
  "Action", "Adventure", "Animation", "Children",
  "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
  "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
  "Sci-Fi", "Thriller", "War", "Western"
)
# The indicator matrix is sized from the data (previously a hard-coded
# 27279 x 18) and holds integers directly, so no header row or
# character-to-integer round trip is needed.
genre_mat1 <- matrix(
  0L,
  nrow = nrow(movie_genre2),
  ncol = length(list_genre),
  dimnames = list(NULL, list_genre)
)
for (col in seq_len(ncol(movie_genre2))) {
  # match() yields NA for empty slots and for labels that are not in
  # list_genre; those entries are skipped, as in the original loop.
  gen_col <- match(movie_genre2[[col]], list_genre)
  matched <- which(!is.na(gen_col))
  genre_mat1[cbind(matched, gen_col[matched])] <- 1L
}
genre_mat2 <- as.data.frame(genre_mat1, stringsAsFactors = FALSE)
genre_mat2$movieId <- movie_data$movieId
# Mean user rating per movieId, with the columns named in one step
rating_avg <- setNames(
  aggregate(
    x = rating_data$rating,
    by = list(rating_data$movieId),
    FUN = mean
  ),
  c("movieId", "avgRating")
)
# Attach the genre indicators to the averages; all.x = TRUE keeps every
# rated movie even when it has no matching row in genre_mat2
movie_avg <- merge(x = rating_avg, y = genre_mat2, by = "movieId", all.x = TRUE)
# Mean of avgRating over the rows of movie_avg whose indicator column for
# the given genre equals 1.
#   genre: name of a genre column in movie_avg, as a string.
#   Returns a single numeric value.
avgRatingByGenre <- function(genre) {
  in_genre <- dplyr::filter(movie_avg, .data[[genre]] == 1)
  mean(in_genre$avgRating)
}
# Applying function to all genres in the dataset.
# Genre columns are selected by name (everything except the key and the
# rating) rather than by the fixed positions 3:20, so a change in column
# order or genre count cannot silently select the wrong columns.
genreNames <- setdiff(colnames(movie_avg), c("movieId", "avgRating"))
# vapply() guarantees one numeric score per genre; sapply() would change
# its return type on empty or irregular input.
genreScores <- vapply(genreNames, avgRatingByGenre, numeric(1))
# Converting the named vector to a data frame
genreRatings <- data.frame(
  genre = genreNames,
  avgRating = genreScores,
  stringsAsFactors = FALSE
)
# Showing average genre ratings in descending order
genreRatings[order(-genreRatings$avgRating), ]
# Little Women (1994) is movieId 261. Its genome row is separated from all
# other rows so it is not compared against itself.
littleWomenID <- 261
littleWomenRow <- dplyr::filter(genome_data, movieId == littleWomenID)
noLittleWomen <- dplyr::filter(genome_data, movieId != littleWomenID)
# Nearest-neighbour search, left disabled because it takes far too long to
# run; the recorded answer is Little Princess (1995).
#which.min(distances = dist(rbind(littleWomenRow[,2:ncol(littleWomenRow)], noLittleWomen[,2:ncol(noLittleWomen)]))[1:nrow(noLittleWomen)])
# Work on the first 5000 genome rows; head() also copes with a table that
# has fewer rows than that.
first5k <- head(genome_data, 5000)
# Column 1 (movieId) is scaled along with the rest so that first5kScaled
# keeps the same layout as first5k; it is excluded wherever we cluster.
first5kScaled <- scale(first5k)
set.seed(1)
# Elbow plot of total within-cluster sum of squares for k = 1..15.
# The movieId column is dropped so the plot uses exactly the features that
# the k-means call below uses (an identifier is not a feature).
fviz_nbclust(
  first5kScaled[, 2:ncol(first5kScaled)],
  kmeans,
  method = "wss",
  k.max = 15,
  verbose = TRUE
) +
  theme_minimal() +
  ggtitle("Within Sum of Square value by K")
Because the total within-cluster sum of squares always decreases as k increases, we look for the “elbow” point, where adding another cluster no longer yields a large reduction, to indicate the optimal cluster count. Ultimately, we settle on k = 7.
# Creating clusters with k = 7 on the scaled genome columns (movieId excluded)
clusters <- kmeans(first5kScaled[, 2:ncol(first5kScaled)], centers = 7)
# Pair every movie id with its assigned cluster. clusters$cluster is the
# assignment vector; clusters[1] was a one-element list that only produced
# the right column name by accident.
idToCluster <- data.frame(movieId = first5k$movieId, cluster = clusters$cluster)
# Join on movieId against the full movie_avg table. Slicing movie_avg to its
# first 5000 rows before the join dropped every clustered movie whose id was
# not among the first 5000 rated movies, because the two tables are not
# row-aligned.
clusterGenre <- merge(movie_avg, idToCluster, by = "movieId")
# Restrict to the movies assigned to cluster 2
cluster2 <- dplyr::filter(clusterGenre, cluster == 2)
# Per-genre share within cluster 2 minus the share across all clustered
# movies (columns 3:20 hold the genre indicators)
genreDiff <- data.frame(
  diff = colMeans(cluster2[, 3:20]) - colMeans(clusterGenre[, 3:20])
)
genreDiff$genre <- rownames(genreDiff)
# Bar chart of the differences, largest positive difference first
ggplot(genreDiff, aes(reorder(genre, -diff, sum), y = diff)) +
  geom_col() +
  labs(x = "Genre", y = "Difference in Prevalence of Genre") +
  ggtitle("Comparing the difference in prevalence of genre's\nbetween cluster 2 and the first 5000 movies") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
According to the bar chart above, the second cluster primarily contains children’s movies, and genres marketed more towards adults, such as thriller and horror, are markedly under-represented in it.
The five movies we have selected: 1. Dune (1984) -> id : 2021 2. Forrest Gump -> id : 356 3. 2001: A Space Odyssey -> id : 924 4. Planes, Trains & Automobiles -> id : 4002 5. The Thing (2011) -> id : 90345
# Keep only the ratings that cover one of the selected five movies
fiveIds <- c(2021, 356, 924, 4002, 90345)
fiveMovies <- rating_data %>%
  filter(movieId %in% fiveIds)
# Keep only reviewers who rated all five movies: count the distinct movies
# per user and compare against the number of selected ids rather than a
# literal 5.
reviewers <- aggregate(
  fiveMovies$movieId,
  by = list(userId = fiveMovies$userId),
  FUN = function(ids) length(unique(ids))
) %>%
  filter(x == length(fiveIds)) %>%
  select(userId)
# Ratings of the five movies by those reviewers. Joining against fiveMovies
# gives the same rows as joining the whole ratings table and filtering
# afterwards, without re-scanning every rating.
fiveMovieRating <- merge(reviewers, fiveMovies, by = "userId")
# Pivot to one row per user and one column per movie. value.var is explicit
# because dcast otherwise guesses the last column, which is not the rating
# when the ratings file carries extra columns such as a timestamp.
fiveMovieRating <- dcast(fiveMovieRating, userId ~ movieId, value.var = "rating")
# Rename the movieId columns to titles by id, not by assumed column order
movieTitles <- c(
  "356" = "Forrest Gump",
  "924" = "2001: A Space Odyssey",
  "2021" = "Dune",
  "4002" = "Planes, Trains & Automobiles",
  "90345" = "The Thing"
)
colnames(fiveMovieRating) <- c(
  "userId",
  unname(movieTitles[colnames(fiveMovieRating)[-1]])
)
# Elbow plot over k = 1..29 for the five rating columns (userId excluded)
# to choose the number of clusters
fviz_nbclust(
  fiveMovieRating[, -1],
  kmeans,
  method = "wss",
  k.max = 29,
  verbose = TRUE
) +
  theme_minimal() +
  ggtitle("Total Within Sum of Square value by K")
Once again looking for the elbow point, we identify k = 8 as the optimal selection.
# Cluster the five rating columns with k = 8 and keep the cluster centers
# as a data frame, one row per cluster and one column per movie
fiveMoviesClusters <- kmeans(fiveMovieRating[, -1], centers = 8)
fiveMoviesClustersCenters <- as.data.frame(fiveMoviesClusters$centers)
colnames(fiveMoviesClustersCenters) <- c(
  "Forrest Gump",
  "2001: A Space Odyssey",
  "Dune",
  "Planes, Trains & Automobiles",
  "The Thing"
)
# Showing cluster centers
fiveMoviesClustersCenters
Friend’s Movie Ratings: 1. Forrest Gump : 5/5 2. 2001: A Space Odyssey : 3/5 3. Dune (1984) : 2/5 4. Planes, Trains & Automobiles : 4/5
# Turn the friend's four ratings into a one-row data frame with the same
# columns as the first four center columns. The appended row is taken as the
# last row rather than the literal row 9, so this still works if the number
# of clusters changes.
friendRating <- rbind(fiveMoviesClustersCenters[, 1:4], c(5, 3, 2, 4))
friendRating <- friendRating[nrow(friendRating), ]
# Find the cluster whose center is closest to the friend's ratings. With the
# friend in row 1, the first nrow(centers) entries of dist() are the
# distances from the friend to each center, in cluster order.
which.min(
  dist(rbind(friendRating, fiveMoviesClustersCenters[, 1:4]))[
    seq_len(nrow(fiveMoviesClustersCenters))
  ]
)
## [1] 8
According to their ratings of the first four movies, they best align with the 8th cluster.
# Center of cluster 8 for the movie left out of the match (The Thing),
# i.e. the rating the friend is expected to give it
fiveMoviesClustersCenters[8, "The Thing"]
## [1] 3.125
For the cluster they were assigned to, the average rating given to The Thing was 3.125. Whether or not I recommend this film therefore depends entirely on whether they are OK with watching remarkably average movies. Ultimately, this could go either way considering its middle-of-the-pack score, but I for one would appreciate not being told to watch a movie that I would find completely and utterly average.