Cargamos las librerías

library(plotly)
library(dplyr)
library(stringr)
library(reshape2)
library(ggplot2)
library(arules)
library(arulesViz)
library(stringdist)
library(Matrix)

Cargamos los CSV

movies <- read.csv("movies.csv", 
                  colClasses = c("integer","character","character"),
                  sep = ",",
                  stringsAsFactors = FALSE)

ratings <- read.csv("ratings.csv",
                   colClasses = c("integer","integer","numeric","character"),
                   sep=",",
                   stringsAsFactors = FALSE)

head(movies, 5)
##   movieId                              title
## 1       1                   Toy Story (1995)
## 2       2                     Jumanji (1995)
## 3       3            Grumpier Old Men (1995)
## 4       4           Waiting to Exhale (1995)
## 5       5 Father of the Bride Part II (1995)
##                                        genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2                  Adventure|Children|Fantasy
## 3                              Comedy|Romance
## 4                        Comedy|Drama|Romance
## 5                                      Comedy
head(ratings, 5)
##   userId movieId rating  timestamp
## 1      1       2    3.5 1112486027
## 2      1      29    3.5 1112484676
## 3      1      32    3.5 1112484819
## 4      1      47    3.5 1112484727
## 5      1      50    3.5 1112484580

Preparamos el dataset de películas

Extraemos el año de la película y eliminamos las filas que no tienen este valor:

movies$year = as.numeric(str_sub( str_trim(movies$title) ,start = -5,end = -2))
movies <- na.omit(movies)

Agrupamos por año y graficamos la cantidad de producciones:

mov_per_y <- movies %>% 
      group_by(year) %>% 
      summarise(Total = n()) %>% 
      arrange(year)
plot_ly(data = mov_per_y, x=~year, y=~Total ,name='Demanda', type = 'scatter', mode = 'lines', line = list(color = 'steelblue') ) %>% layout(title ="Películas por Año")

Vamos a obtener el promedio de ratings por película y la cantidad de usuarios que hicieron rate:

users_rat <- ratings %>%
              group_by(movieId) %>%
              summarise(avg_rating = mean(rating),
                        num_users = n_distinct(userId))

Unimos los dos datasets para continuar con el análisis:

 df_mov_rat <- movies %>% 
                inner_join(users_rat, by = "movieId")

Listamos las 10 películas con más y mejor rating:

df_mov_rat %>% 
  arrange(desc(num_users), desc(avg_rating)) %>% 
  select('Título' = title, 'Rating Promedio' = avg_rating, 'Cantidad Ratings' = num_users) %>% 
  head(n=10)
##                                       Título Rating Promedio
## 1                        Pulp Fiction (1994)        4.174231
## 2                        Forrest Gump (1994)        4.029000
## 3           Shawshank Redemption, The (1994)        4.446990
## 4           Silence of the Lambs, The (1991)        4.177057
## 5                       Jurassic Park (1993)        3.664741
## 6  Star Wars: Episode IV - A New Hope (1977)        4.190672
## 7                          Braveheart (1995)        4.042534
## 8          Terminator 2: Judgment Day (1991)        3.931954
## 9                         Matrix, The (1999)        4.187186
## 10                   Schindler's List (1993)        4.310175
##    Cantidad Ratings
## 1             67310
## 2             66172
## 3             63366
## 4             63299
## 5             59715
## 6             54502
## 7             53769
## 8             52244
## 9             51334
## 10            50054

Reglas de Asociación - Creamos la matriz dispersa película - usuario:

user_item_matrix <- as(split(ratings[,"movieId"], ratings[,"userId"]), "transactions")
user_item_matrix
## transactions in sparse format with
##  138493 transactions (rows) and
##  26744 items (columns)

Veamos cómo se comportan los usuarios y las películas:

summary(size(user_item_matrix))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    20.0    35.0    68.0   144.4   155.0  9254.0

Los usuarios vieron entre 20 y 9254 películas con un promedio de 144.

params = list(
    supp = 0.001,
    conf = 0.7,
    maxlen = 2
)

reglas <- apriori(user_item_matrix, parameter = params)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target   ext
##       2  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 138 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[26744 item(s), 138493 transaction(s)] done [20.39s].
## sorting and recoding items ... [7693 item(s)] done [0.84s].
## creating transaction tree ... done [0.28s].
## checking subsets of size 1 2 done [18.30s].
## writing ... [189736 rule(s)] done [0.28s].
## creating S4 object  ... done [0.18s].
summary(reglas)
## set of 189736 rules
## 
## rule length distribution (lhs + rhs):sizes
##      2 
## 189736 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence          lift             count        
##  Min.   :0.001004   Min.   :0.7000   Min.   :  1.440   Min.   :  139.0  
##  1st Qu.:0.001516   1st Qu.:0.7240   1st Qu.:  2.246   1st Qu.:  210.0  
##  Median :0.002520   Median :0.7532   Median :  3.057   Median :  349.0  
##  Mean   :0.006659   Mean   :0.7637   Mean   :  3.929   Mean   :  922.2  
##  3rd Qu.:0.005712   3rd Qu.:0.7941   3rd Qu.:  4.323   3rd Qu.:  791.0  
##  Max.   :0.344516   Max.   :0.9700   Max.   :663.359   Max.   :47713.0  
## 
## mining info:
##              data ntransactions support confidence
##  user_item_matrix        138493   0.001        0.7

Se crearon 189736 reglas. Vamos a filtrar las reglas que tenga un lift mayor al cuartil 3 (4.323).

reglas_2 <- subset(reglas, lift >= 4.323)

summary(reglas_2)
## set of 47438 rules
## 
## rule length distribution (lhs + rhs):sizes
##     2 
## 47438 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       2       2       2       2       2       2 
## 
## summary of quality measures:
##     support           confidence          lift             count      
##  Min.   :0.001004   Min.   :0.7000   Min.   :  4.323   Min.   :  139  
##  1st Qu.:0.001329   1st Qu.:0.7208   1st Qu.:  4.777   1st Qu.:  184  
##  Median :0.001935   Median :0.7463   Median :  5.592   Median :  268  
##  Mean   :0.003393   Mean   :0.7565   Mean   :  7.648   Mean   :  470  
##  3rd Qu.:0.003488   3rd Qu.:0.7824   3rd Qu.:  7.584   3rd Qu.:  483  
##  Max.   :0.121017   Max.   :0.9700   Max.   :663.359   Max.   :16760  
## 
## mining info:
##              data ntransactions support confidence
##  user_item_matrix        138493   0.001        0.7

####Redujimos a 47438 reglas!!! Vamos a crear un gráfico que relacione el soporte y la confianza de cada regla:

plot(reglas_2, method = "two-key plot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

df_reglas <- as(reglas_2, "data.frame")
head(df_reglas)
##                 rules     support confidence     lift count
## 1      {834} => {788} 0.001039764  0.7093596 5.225881   144
## 9       {732} => {95} 0.001249161  0.7393162 4.632199   173
## 30   {8485} => {4973} 0.001032543  0.8827160 5.020740   143
## 33 {73759} => {58559} 0.001090308  0.8531073 5.780869   151
## 37      {706} => {95} 0.001321366  0.7290837 4.568087   183
## 38     {706} => {788} 0.001379131  0.7609562 5.605995   191

Separemos los movieId de las reglas:

Ahora que tenemos los Ids unamos con el dataset de películas:

df_reglas <-  df_reglas %>% left_join(movies,by=c("lhs_movie" = "movieId") )

df_reglas$lhs_movie = NULL
col_name = colnames(df_reglas)
col_name[5:24] = str_c("left.",col_name[5:24])


df_reglas = df_reglas %>% left_join(movies,by=c("rhs_movie" = "movieId"))
df_reglas$rhs_movie = NULL
col_name = colnames(df_reglas)
col_name[24:43] = str_c("right.",col_name[24:43])

df_reglas <- df_reglas %>% select(  -genres.x, -genres.y, -year.x, -year.y) 

Ordenamos las reglas para las primeras 20 con mayor lift y ploteamos:

lift_20 <- df_reglas %>% select(lift, title.x, title.y) %>% arrange(desc(lift)) %>% head(n=20)
ggplot(lift_20, aes(x = title.x, y = title.y, fill = lift)) +
  geom_tile() + 
  labs(title = "Reglas de asociación", subtitle = "Top 20 Lift")+
  theme(axis.text.x=element_text(angle = -45, hjust = 0))

Hacemos el mismo procedimiento para la métrica de confianza:

conf_20 <- df_reglas %>% select(confidence, title.x, title.y) %>% arrange(desc(confidence)) %>% head(n=20)
ggplot(conf_20, aes(x = title.x, y = title.y, fill = confidence)) +
  geom_tile() + 
  labs(title = "Reglas de asociación", subtitle = "Top 20 COnfidence")+
  theme(axis.text.x=element_text(angle = -45, hjust = 0))