Content-based system

# Libraries and functions
library(tidyverse)
library(igraph)
library(magrittr)
library(sentimentr)

source("extract_json.R") 
# Negated %in%: TRUE for every element of x that has no match in y
`%out%` <- function(x, y) {
  !(x %in% y)
}

# Print numbers in full instead of scientific notation
options(scipen = 999)

# Data
library(splitstackshape)

# Read the movie table, spread the "|"-separated plot keywords over
# plot_keywords_1, plot_keywords_2, ... columns, then drop unused columns
movies2 <- read.csv("movies2.csv") %>%
  cSplit("plot_keywords", "|") %>%
  select(-budget, -gross, -tagline, -X)

#### ================================== 
#### Network with clusters and keywords
#### ==================================

### 1st keyword
# Edge list linking each movie's first keyword to its content rating
key_1 <- select(movies2, plot_keywords_1, content_rating)
graph_1 <- graph_from_data_frame(key_1, directed = FALSE)
clusters_1 <- cluster_walktrap(graph_1)
# Vertices include both keywords and ratings; keep only the keyword vertices
clusters2 <- data.frame(
  n_clstr = clusters_1$membership,
  plot_keywords_1 = V(graph_1)$name
) %>%
  filter(plot_keywords_1 %out% unique(key_1$content_rating))
# Attach the first-keyword cluster id to the movies (join on shared columns)
movies2 <- inner_join(clusters2, movies2)

### 2nd keyword
# Same procedure for the second keyword; cluster id goes to n_clstr_1
key_2 <- select(movies2, plot_keywords_2, content_rating)
graph_2 <- graph_from_data_frame(key_2, directed = FALSE)
clusters_2 <- cluster_walktrap(graph_2)
clusters3 <- data.frame(
  n_clstr_1 = clusters_2$membership,
  plot_keywords_2 = V(graph_2)$name
) %>%
  filter(plot_keywords_2 %out% unique(key_2$content_rating))
movies2 <- inner_join(clusters3, movies2)

### 3rd keyword
# Same procedure for the third keyword; cluster id goes to n_clstr_2
key_3 <- select(movies2, plot_keywords_3, content_rating)
graph_3 <- graph_from_data_frame(key_3, directed = FALSE)
clusters_3 <- cluster_walktrap(graph_3)
clusters4 <- data.frame(
  n_clstr_2 = clusters_3$membership,
  plot_keywords_3 = V(graph_3)$name
) %>%
  filter(plot_keywords_3 %out% unique(key_3$content_rating))
movies2 <- inner_join(clusters4, movies2)

# Time period
movies2$movie_year <- as.numeric(unlist(movies2$movie_year))
# Bin release years into six right-closed intervals. dig.lab = 4 pins the
# labels to plain four-digit years ("(1910,1960]", ...) instead of depending
# on cut()'s default three-digit label formatting.
movies2$time_category <- cut(
  movies2$movie_year,
  breaks = c(1910, 1960, 1970, 1980, 1990, 2000, 2005),
  dig.lab = 4
)
# Numeric period code taken from the interval index rather than by matching
# label strings: 1 = (1910,1960], ..., 6 = (2000,2005]; years outside the
# breaks stay NA
movies2$time_coded <- as.numeric(movies2$time_category)

# Content rating
# Map each rating label to a numeric code through a named lookup vector;
# ratings that are not listed (and missing ratings) become NA
movies2$content_coded <- unname(c(
  "G" = 1, "Approved" = 1, "Passed" = 1,
  "PG" = 2, "GP" = 2,
  "PG-13" = 3,
  "R" = 4, "X" = 4, "NC-17" = 4,
  "Unrated" = 5
)[as.character(movies2$content_rating)])

# Sentiments
# Coerce the overview text to character before scoring it
movies2$overview <- as.character(movies2$overview)
# Sequential row number serves as the movie identifier
movies2$movie_id <- seq.int(nrow(movies2))
# Average sentiment of each overview, grouped by movie_id
movies2$sent <- sentiment_by(
  movies2$overview,
  by = movies2$movie_id,
  averaging.function = sentimentr::average_downweighted_zero
)[["ave_sentiment"]]

# Data frame for matrix
# Pick the numeric features, label the rows with movie_id, then centre and
# scale every column (scale() returns a numeric matrix)
movies_test <- movies2 %>%
  dplyr::select(
    time_coded, runtime, content_coded, sent,
    n_clstr, n_clstr_1, n_clstr_2
  ) %>%
  magrittr::set_rownames(movies2$movie_id) %>%
  scale()

# Matrix
# Cosine similarity between movies: transpose so that each movie is a column
sim <- movies_test %>%
  as.matrix() %>%
  t() %>%
  lsa::cosine()
# Zero out self-similarity on the diagonal
diag(sim) <- 0
round(sim[1:10, 1:10], 4)
##          1       2       3       4  5       6       7       8       9
## 1   0.0000  0.4211  0.3056 -0.4886 NA -0.0289 -0.2714 -0.5437 -0.3189
## 2   0.4211  0.0000  0.8527 -0.4952 NA  0.1493 -0.7978 -0.6269 -0.4242
## 3   0.3056  0.8527  0.0000 -0.3529 NA -0.0518 -0.8121 -0.3652 -0.1717
## 4  -0.4886 -0.4952 -0.3529  0.0000 NA  0.1906  0.6914  0.9321  0.0748
## 5       NA      NA      NA      NA  0      NA      NA      NA      NA
## 6  -0.0289  0.1493 -0.0518  0.1906 NA  0.0000  0.2434 -0.1101  0.1980
## 7  -0.2714 -0.7978 -0.8121  0.6914 NA  0.2434  0.0000  0.6077  0.0673
## 8  -0.5437 -0.6269 -0.3652  0.9321 NA -0.1101  0.6077  0.0000  0.2230
## 9  -0.3189 -0.4242 -0.1717  0.0748 NA  0.1980  0.0673  0.2230  0.0000
## 10 -0.0632 -0.6187 -0.2395  0.2171 NA -0.1377  0.2555  0.4335  0.8162
##         10
## 1  -0.0632
## 2  -0.6187
## 3  -0.2395
## 4   0.2171
## 5       NA
## 6  -0.1377
## 7   0.2555
## 8   0.4335
## 9   0.8162
## 10  0.0000
# Recommendation function
# Recommend films similar to a given title.
#
# Looks `title2` up in the global `movies2` table and picks the films with the
# highest similarity to it in the global matrix `sim`, whose rows and columns
# are addressed by movie_id.
#
# Args:
#   title2: Title to look up; compared with `movies2$title` using `==`.
#   N:      Number of top similarity values to keep (default 5). Tied
#           similarity values can produce more than N rows.
#
# Returns:
#   A data frame with columns title, movie_year and director_name for the
#   recommended films, in `movies2` row order. If the title is not found,
#   a data frame with only the titles of films with popularity >= 100.
getRecFilms <- function(title2, N = 5) {
  film <- movies2 %>% dplyr::filter(title == title2)
  if (nrow(film) == 0) {
    # Unknown title: fall back to the popular films
    recommend <- dplyr::filter(movies2, popularity >= 100) %>%
      dplyr::select(title)
  } else {
    # One similarity column per matching film (a matrix if the title repeats)
    scores <- sim[, as.character(film$movie_id)]
    mostSimilar <- head(sort(scores, decreasing = TRUE), n = N)
    # Linear positions of the top values within `scores`
    a <- which(scores %in% mostSimilar)
    # Convert linear positions to 1-based row numbers. A plain `a %% nrow(sim)`
    # maps the last row to 0 and silently drops that film.
    rows <- ((a - 1) %% nrow(sim)) + 1
    result <- rownames(sim)[rows]
    recommend <- dplyr::filter(movies2, movie_id %in% result) %>%
      dplyr::select(title, movie_year, director_name)
  }
  recommend
}
# Example: recommendations for "Batman Begins" using the top 3 similarity values
getRecFilms(title2 = "Batman Begins", N = 3)
##                           title movie_year       director_name
## 1 Robin Hood: Prince of Thieves       1991      Kevin Reynolds
## 2          The Sum of All Fears       2002 Phil Alden Robinson
## 3                  White Squall       1996        Ridley Scott

Sentiments Visualization

library(cowplot)
library(viridis)

# Panel A: sentiment score against popularity, with reference lines at
# 0 (dashed), 1 and -1
plot_a <- ggplot(movies2, aes(x = popularity, y = sent, color = sent)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
  geom_hline(yintercept = 1, color = "darkgray") +
  geom_hline(yintercept = -1, color = "darkgray") +
  scale_fill_continuous(guide = guide_legend()) +
  labs(x = "Популярность фильма", y = "Значения сентиментов") +
  theme(legend.position = "right")

# Panel B: sentiment score against release year, viridis colour scale
plot_b <- ggplot(movies2, aes(x = movie_year, y = sent, color = sent)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "darkgray", linetype = "dashed") +
  geom_hline(yintercept = 1, color = "darkgray") +
  geom_hline(yintercept = -1, color = "darkgray") +
  scale_fill_continuous(guide = guide_legend()) +
  scale_color_viridis(option = "D") +
  labs(x = "Год выпуска фильма", y = "Значения сентиментов") +
  theme(legend.position = "right")

# Show both panels side by side
plot_grid(plot_a, plot_b, labels = c("A", "B"))