library(plotly)
library(dplyr)
library(stringr)
library(reshape2)
library(ggplot2)
library(arules)
library(arulesViz)
library(stringdist)
library(Matrix)
movies <- read.csv("movies.csv",
colClasses = c("integer","character","character"),
sep = ",",
stringsAsFactors = FALSE)
ratings <- read.csv("ratings.csv",
colClasses = c("integer","integer","numeric","character"),
sep=",",
stringsAsFactors = FALSE)
head(movies, 5)
## movieId title
## 1 1 Toy Story (1995)
## 2 2 Jumanji (1995)
## 3 3 Grumpier Old Men (1995)
## 4 4 Waiting to Exhale (1995)
## 5 5 Father of the Bride Part II (1995)
## genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2 Adventure|Children|Fantasy
## 3 Comedy|Romance
## 4 Comedy|Drama|Romance
## 5 Comedy
head(ratings, 5)
## userId movieId rating timestamp
## 1 1 2 3.5 1112486027
## 2 1 29 3.5 1112484676
## 3 1 32 3.5 1112484819
## 4 1 47 3.5 1112484727
## 5 1 50 3.5 1112484580
movies$year = as.numeric(str_sub( str_trim(movies$title) ,start = -5,end = -2))
movies <- na.omit(movies)
mov_per_y <- movies %>%
group_by(year) %>%
summarise(Total = n()) %>%
arrange(year)
plot_ly(data = mov_per_y, x=~year, y=~Total ,name='Demanda', type = 'scatter', mode = 'lines', line = list(color = 'steelblue') ) %>% layout(title ="PelÃculas por Año")
users_rat <- ratings %>%
group_by(movieId) %>%
summarise(avg_rating = mean(rating),
num_users = n_distinct(userId))
df_mov_rat <- movies %>%
inner_join(users_rat, by = "movieId")
df_mov_rat %>%
arrange(desc(num_users), desc(avg_rating)) %>%
select('TÃtulo' = title, 'Rating Promedio' = avg_rating, 'Cantidad Ratings' = num_users) %>%
head(n=10)
## TÃtulo Rating Promedio
## 1 Pulp Fiction (1994) 4.174231
## 2 Forrest Gump (1994) 4.029000
## 3 Shawshank Redemption, The (1994) 4.446990
## 4 Silence of the Lambs, The (1991) 4.177057
## 5 Jurassic Park (1993) 3.664741
## 6 Star Wars: Episode IV - A New Hope (1977) 4.190672
## 7 Braveheart (1995) 4.042534
## 8 Terminator 2: Judgment Day (1991) 3.931954
## 9 Matrix, The (1999) 4.187186
## 10 Schindler's List (1993) 4.310175
## Cantidad Ratings
## 1 67310
## 2 66172
## 3 63366
## 4 63299
## 5 59715
## 6 54502
## 7 53769
## 8 52244
## 9 51334
## 10 50054
user_item_matrix <- as(split(ratings[,"movieId"], ratings[,"userId"]), "transactions")
user_item_matrix
## transactions in sparse format with
## 138493 transactions (rows) and
## 26744 items (columns)
summary(size(user_item_matrix))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.0 35.0 68.0 144.4 155.0 9254.0
params = list(
supp = 0.001,
conf = 0.7,
maxlen = 2
)
reglas <- apriori(user_item_matrix, parameter = params)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.7 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 2 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 138
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[26744 item(s), 138493 transaction(s)] done [20.39s].
## sorting and recoding items ... [7693 item(s)] done [0.84s].
## creating transaction tree ... done [0.28s].
## checking subsets of size 1 2 done [18.30s].
## writing ... [189736 rule(s)] done [0.28s].
## creating S4 object ... done [0.18s].
summary(reglas)
## set of 189736 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 189736
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001004 Min. :0.7000 Min. : 1.440 Min. : 139.0
## 1st Qu.:0.001516 1st Qu.:0.7240 1st Qu.: 2.246 1st Qu.: 210.0
## Median :0.002520 Median :0.7532 Median : 3.057 Median : 349.0
## Mean :0.006659 Mean :0.7637 Mean : 3.929 Mean : 922.2
## 3rd Qu.:0.005712 3rd Qu.:0.7941 3rd Qu.: 4.323 3rd Qu.: 791.0
## Max. :0.344516 Max. :0.9700 Max. :663.359 Max. :47713.0
##
## mining info:
## data ntransactions support confidence
## user_item_matrix 138493 0.001 0.7
reglas_2 <- subset(reglas, lift >= 4.323)
summary(reglas_2)
## set of 47438 rules
##
## rule length distribution (lhs + rhs):sizes
## 2
## 47438
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 2 2 2 2 2
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001004 Min. :0.7000 Min. : 4.323 Min. : 139
## 1st Qu.:0.001329 1st Qu.:0.7208 1st Qu.: 4.777 1st Qu.: 184
## Median :0.001935 Median :0.7463 Median : 5.592 Median : 268
## Mean :0.003393 Mean :0.7565 Mean : 7.648 Mean : 470
## 3rd Qu.:0.003488 3rd Qu.:0.7824 3rd Qu.: 7.584 3rd Qu.: 483
## Max. :0.121017 Max. :0.9700 Max. :663.359 Max. :16760
##
## mining info:
## data ntransactions support confidence
## user_item_matrix 138493 0.001 0.7
####Redujimos a 47438 reglas!!! Vamos a crear un gráfico que relacione el soporte y la confianza de cada regla:
plot(reglas_2, method = "two-key plot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
df_reglas <- as(reglas_2, "data.frame")
head(df_reglas)
## rules support confidence lift count
## 1 {834} => {788} 0.001039764 0.7093596 5.225881 144
## 9 {732} => {95} 0.001249161 0.7393162 4.632199 173
## 30 {8485} => {4973} 0.001032543 0.8827160 5.020740 143
## 33 {73759} => {58559} 0.001090308 0.8531073 5.780869 151
## 37 {706} => {95} 0.001321366 0.7290837 4.568087 183
## 38 {706} => {788} 0.001379131 0.7609562 5.605995 191
df_reglas <- df_reglas %>% left_join(movies,by=c("lhs_movie" = "movieId") )
df_reglas$lhs_movie = NULL
col_name = colnames(df_reglas)
col_name[5:24] = str_c("left.",col_name[5:24])
df_reglas = df_reglas %>% left_join(movies,by=c("rhs_movie" = "movieId"))
df_reglas$rhs_movie = NULL
col_name = colnames(df_reglas)
col_name[24:43] = str_c("right.",col_name[24:43])
df_reglas <- df_reglas %>% select( -genres.x, -genres.y, -year.x, -year.y)
lift_20 <- df_reglas %>% select(lift, title.x, title.y) %>% arrange(desc(lift)) %>% head(n=20)
ggplot(lift_20, aes(x = title.x, y = title.y, fill = lift)) +
geom_tile() +
labs(title = "Reglas de asociación", subtitle = "Top 20 Lift")+
theme(axis.text.x=element_text(angle = -45, hjust = 0))
conf_20 <- df_reglas %>% select(confidence, title.x, title.y) %>% arrange(desc(confidence)) %>% head(n=20)
ggplot(conf_20, aes(x = title.x, y = title.y, fill = confidence)) +
geom_tile() +
labs(title = "Reglas de asociación", subtitle = "Top 20 COnfidence")+
theme(axis.text.x=element_text(angle = -45, hjust = 0))