Content-based system
# Libraries and function
library(tidyverse)
library(igraph)
library(magrittr)
library(sentimentr)
source("extract_json.R")
# Negation of %in%: for every element of x, TRUE when it does not occur in y.
`%out%` <- function(x, y) {
  !(x %in% y)
}
# Print numbers in fixed notation instead of scientific notation
options(scipen = 999)

# Data: read the movie table from movies2.csv in the working directory
movies2 <- read.csv("movies2.csv")

# Split the "|"-delimited plot_keywords column into separate columns
# (plot_keywords_1, plot_keywords_2, plot_keywords_3 are used further down)
library(splitstackshape)
movies2 <- cSplit(movies2, splitCols = "plot_keywords", sep = "|")

# Drop the budget, gross, tagline and X columns
movies2 <- movies2 %>%
  select(-c(budget, gross, tagline, X))
#### ==================================
#### Network with clusters and keywords
#### ==================================
# For each of the first three plot keywords:
#   1. build an undirected graph whose edges link a movie's keyword to its
#      content rating,
#   2. detect communities in that graph with the walktrap algorithm,
#   3. keep the community id of every keyword vertex (rating vertices are
#      filtered out) and join it back onto movies2.
# inner_join() keeps only movies whose keyword has a matching vertex, so
# movies2 may lose rows at each step. The join key is spelled out explicitly
# so that the join never silently picks up additional shared column names.

### 1st keyword
key_1 <- movies2 %>% select(plot_keywords_1, content_rating)
graph_1 <- graph_from_data_frame(key_1, directed = FALSE)
clusters_1 <- cluster_walktrap(graph_1)
clusters2 <- data.frame(
  n_clstr = clusters_1$membership,
  plot_keywords_1 = V(graph_1)$name
)
# The graph's vertices include the content ratings; drop them
clusters2 <- clusters2 %>%
  filter(plot_keywords_1 %out% unique(key_1$content_rating))
movies2 <- inner_join(clusters2, movies2, by = "plot_keywords_1")

### 2nd keyword
key_2 <- movies2 %>% select(plot_keywords_2, content_rating)
graph_2 <- graph_from_data_frame(key_2, directed = FALSE)
clusters_2 <- cluster_walktrap(graph_2)
clusters3 <- data.frame(
  n_clstr_1 = clusters_2$membership,
  plot_keywords_2 = V(graph_2)$name
)
clusters3 <- clusters3 %>%
  filter(plot_keywords_2 %out% unique(key_2$content_rating))
movies2 <- inner_join(clusters3, movies2, by = "plot_keywords_2")

### 3rd keyword
key_3 <- movies2 %>% select(plot_keywords_3, content_rating)
graph_3 <- graph_from_data_frame(key_3, directed = FALSE)
clusters_3 <- cluster_walktrap(graph_3)
clusters4 <- data.frame(
  n_clstr_2 = clusters_3$membership,
  plot_keywords_3 = V(graph_3)$name
)
clusters4 <- clusters4 %>%
  filter(plot_keywords_3 %out% unique(key_3$content_rating))
movies2 <- inner_join(clusters4, movies2, by = "plot_keywords_3")
# Time period
# Bin the release year into six right-closed intervals and store the bin
# number (1 = (1910,1960] ... 6 = (2000,2005]) as a numeric code. Years outside
# (1910, 2005] fall into no bin and therefore get NA.
movies2$movie_year <- as.numeric(unlist(movies2$movie_year))
movies2$time_category <- cut(
  movies2$movie_year,
  breaks = c(1910, 1960, 1970, 1980, 1990, 2000, 2005)
)
# The factor's integer codes follow the order of the breaks, i.e. 1..6
movies2$time_coded <- as.numeric(movies2$time_category)

# Content rating
# Collapse the rating labels onto a five-level numeric code; any rating that
# is not listed (or is missing) stays NA.
movies2$content_coded <- dplyr::case_when(
  movies2$content_rating %in% c("G", "Approved", "Passed") ~ 1,
  movies2$content_rating %in% c("PG", "GP") ~ 2,
  movies2$content_rating %in% "PG-13" ~ 3,
  movies2$content_rating %in% c("R", "X", "NC-17") ~ 4,
  movies2$content_rating %in% "Unrated" ~ 5
)
# Sentiments
# Coerce the overview text to character, give every row a sequential id and
# store one average sentiment score per id.
movies2$overview <- as.character(movies2$overview)
movies2$movie_id <- seq.int(nrow(movies2))
# NOTE(review): the scores are assigned positionally, which presumes that
# sentiment_by() returns its groups in movie_id order - confirm.
movies2$sent <- sentiment_by(
  movies2$overview,
  by = movies2$movie_id,
  averaging.function = sentimentr::average_downweighted_zero
)[["ave_sentiment"]]
# Data frame for matrix
# Pick the numeric features, label the rows with movie_id, then drop the id
# column and standardise every feature column.
movies_test <- movies2 %>%
  select(
    time_coded, runtime, content_coded, sent,
    movie_id, n_clstr, n_clstr_1, n_clstr_2
  )
rownames(movies_test) <- movies_test$movie_id
movies_test <- scale(dplyr::select(movies_test, -movie_id))

# Matrix
# lsa::cosine() compares columns, so the feature matrix is transposed to get
# a movie-by-movie similarity matrix. The diagonal is zeroed so that a movie
# is never its own best match.
# NOTE(review): the printed sample contains a row/column of NA - presumably a
# movie with a missing feature value; confirm whether that is acceptable.
sim <- lsa::cosine(t(as.matrix(movies_test)))
diag(sim) <- 0
round(sim[1:10, 1:10], 4)
## 1 2 3 4 5 6 7 8 9
## 1 0.0000 0.4211 0.3056 -0.4886 NA -0.0289 -0.2714 -0.5437 -0.3189
## 2 0.4211 0.0000 0.8527 -0.4952 NA 0.1493 -0.7978 -0.6269 -0.4242
## 3 0.3056 0.8527 0.0000 -0.3529 NA -0.0518 -0.8121 -0.3652 -0.1717
## 4 -0.4886 -0.4952 -0.3529 0.0000 NA 0.1906 0.6914 0.9321 0.0748
## 5 NA NA NA NA 0 NA NA NA NA
## 6 -0.0289 0.1493 -0.0518 0.1906 NA 0.0000 0.2434 -0.1101 0.1980
## 7 -0.2714 -0.7978 -0.8121 0.6914 NA 0.2434 0.0000 0.6077 0.0673
## 8 -0.5437 -0.6269 -0.3652 0.9321 NA -0.1101 0.6077 0.0000 0.2230
## 9 -0.3189 -0.4242 -0.1717 0.0748 NA 0.1980 0.0673 0.2230 0.0000
## 10 -0.0632 -0.6187 -0.2395 0.2171 NA -0.1377 0.2555 0.4335 0.8162
## 10
## 1 -0.0632
## 2 -0.6187
## 3 -0.2395
## 4 0.2171
## 5 NA
## 6 -0.1377
## 7 0.2555
## 8 0.4335
## 9 0.8162
## 10 0.0000
# Recommendation function
# Recommend films similar to a given title.
#
# Arguments:
#   title2  Title to look up in movies2$title (exact match).
#   N       Number of highest similarity values to keep (default 5). Ties at
#           the cut-off can yield more than N films.
#
# Returns a data frame:
#   * title known:   title, movie_year and director_name of the films whose
#                    similarity to the requested film is among the N highest
#                    values, in the row order of movies2;
#   * title unknown: the title column of all films with popularity >= 100.
#
# Reads the globals movies2 and sim (similarity matrix whose row/column names
# are movie ids).
getRecFilms <- function(title2, N = 5) {
  film <- movies2 %>% dplyr::filter(title == title2)

  if (nrow(film) == 0) {
    # Unknown title: fall back to the popular films
    recommend <- movies2 %>%
      dplyr::filter(popularity >= 100) %>%
      dplyr::select(title)
  } else {
    # One column per matching film (a plain vector when there is one match)
    scores <- sim[, as.character(film$movie_id)]
    # sort() drops NA similarities
    mostSimilar <- head(sort(scores, decreasing = TRUE), n = N)
    hits <- which(scores %in% mostSimilar)
    # Convert linear positions to row numbers. A plain `hits %% nrow(sim)`
    # would turn the last row into index 0 and silently drop that film.
    rows <- ((hits - 1) %% nrow(sim)) + 1
    result <- rownames(sim)[rows]
    recommend <- movies2 %>%
      dplyr::filter(movie_id %in% result) %>%
      dplyr::select(title, movie_year, director_name)
  }

  recommend
}
# Example: recommendations based on the three highest similarity values
getRecFilms(title2 = "Batman Begins", N = 3)
## title movie_year director_name
## 1 Robin Hood: Prince of Thieves 1991 Kevin Reynolds
## 2 The Sum of All Fears 2002 Phil Alden Robinson
## 3 White Squall 1996 Ridley Scott
Sentiments Visualization
library(cowplot)
library(viridis)
# Panel A: sentiment score against film popularity, points coloured by score
plot_a <- ggplot() +
  geom_point(
    data = movies2,
    mapping = aes(x = popularity, y = sent, color = sent)
  ) +
  # No fill aesthetic is mapped in this plot, so this scale has nothing to act on
  scale_fill_continuous(guide = guide_legend()) +
  theme(legend.position = "right") +
  labs(x = "Популярность фильма", y = "Значения сентиментов") +
  # Reference lines: dashed at 0, solid at +1 and -1
  geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
  geom_hline(yintercept = 1, color = "darkgray") +
  geom_hline(yintercept = -1, color = "darkgray")
# Panel B: sentiment score against release year, points coloured by score
# on the viridis "D" palette
plot_b <- ggplot() +
  geom_point(
    data = movies2,
    mapping = aes(x = movie_year, y = sent, color = sent)
  ) +
  # No fill aesthetic is mapped in this plot, so this scale has nothing to act on
  scale_fill_continuous(guide = guide_legend()) +
  theme(legend.position = "right") +
  scale_color_viridis(option = "D") +
  labs(x = "Год выпуска фильма", y = "Значения сентиментов") +
  # Reference lines: dashed at 0, solid at +1 and -1
  geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
  geom_hline(yintercept = 1, color = "darkgray") +
  geom_hline(yintercept = -1, color = "darkgray")

# Show both panels side by side, labelled A and B
plot_grid(plot_a, plot_b, labels = c("A", "B"))
