Recomendation System Based Classification

Analise de dados e Particao em Teste e Treino

library(dslabs)
library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

data("movielens")

head(movielens)

##   movieId                                   title year
## 1      31                         Dangerous Minds 1995
## 2    1029                                   Dumbo 1941
## 3    1061                                Sleepers 1996
## 4    1129                    Escape from New York 1981
## 5    1172 Cinema Paradiso (Nuovo cinema Paradiso) 1989
## 6    1263                        Deer Hunter, The 1978
##                             genres userId rating  timestamp
## 1                            Drama      1    2.5 1260759144
## 2 Animation|Children|Drama|Musical      1    3.0 1260759179
## 3                         Thriller      1    3.0 1260759182
## 4 Action|Adventure|Sci-Fi|Thriller      1    2.0 1260759185
## 5                            Drama      1    4.0 1260759205
## 6                        Drama|War      1    2.0 1260759151

keep <- movielens %>%
  dplyr::count(movieId) %>%
  top_n(5) %>%
  pull(movieId)

## Selecting by n

tab <- movielens %>%
  filter(userId %in% c(13:20)) %>% 
  filter(movieId %in% keep) %>% 
  select(userId, title, rating) %>% 
  spread(title, rating)
tab %>% knitr::kable()     #Exemplo de user

userId	Forrest Gump	Pulp Fiction	Shawshank Redemption, The	Silence of the Lambs, The	Star Wars: Episode IV - A New Hope
13	5.0	3.5	4.5	NA	NA
15	1.0	5.0	2.0	5.0	5.0
16	NA	NA	4.0	NA	NA
17	2.5	5.0	5.0	4.5	3.5
18	NA	NA	NA	NA	3.0
19	5.0	5.0	4.0	3.0	4.0
20	2.0	0.5	4.5	0.5	1.5

users <- sample(unique(movielens$userId), 100)
rafalib::mypar()
movielens %>% filter(userId %in% users) %>% 
  select(userId, movieId, rating) %>%
  mutate(rating = 1) %>%
  spread(movieId, rating) %>% select(sample(ncol(.), 100)) %>% 
  as.matrix() %>% t(.) %>%
  image(1:100, 1:100,. , xlab="Movies", ylab="Users")
abline(h=0:100+0.5, v=0:100+0.5, col = "grey") #Matriz da combinacao de user com produtos

movielens %>% 
  dplyr::count(movieId) %>% 
  ggplot(aes(n)) + 
  geom_histogram(bins = 30, color = "black") + 
  scale_x_log10() + 
  ggtitle("Movies")   #Distribuicao de classificacao por produtos

movielens %>%
  dplyr::count(userId) %>% 
  ggplot(aes(n)) + 
  geom_histogram(bins = 30, color = "black") + 
  scale_x_log10() +
  ggtitle("Users")    #Distribuicao de classificacoes por utilizadores

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

set.seed(755)
test_index <- createDataPartition(y = movielens$rating, times = 1,
                                  p = 0.2, list = FALSE)
train_set <- movielens[-test_index,]
test_set <- movielens[test_index,]  #Dividir os dados em teste e treino

test_set <- test_set %>% 
  semi_join(train_set, by = "movieId") %>%
  semi_join(train_set, by = "userId") #Removemos do teste os utilizadores e filmes que nao aparecem no conjunto de treino

Desenvolvimento do Sistema de Recomendacao

RMSE <- function(true_ratings, predicted_ratings){
  sqrt(mean((true_ratings - predicted_ratings)^2)) #Erro residual (se a classificacao for de 1 a 5, então RMSE>1 torna a predicao má)
}

mu_hat <- mean(train_set$rating)
mu_hat

## [1] 3.543305

naive_rmse <- RMSE(test_set$rating, mu_hat)
naive_rmse

## [1] 1.052805

rmse_results <- data_frame(method = "Just the average", RMSE = naive_rmse)

## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

mu <- mean(train_set$rating)


 
movie_avgs <- train_set %>% 
  group_by(movieId) %>% 
  summarize(b_i = mean(rating - mu))
predicted_ratings <- mu + test_set %>% 
  left_join(movie_avgs, by='movieId') %>%
  .$b_i
model_1_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Movie Effect Model",
                                     RMSE = model_1_rmse ))
rmse_results %>% knitr::kable()                                             #Modelo 1

method	RMSE
Just the average	1.0528047
Movie Effect Model	0.9885931

user_avgs <- test_set %>% 
  left_join(movie_avgs, by='movieId') %>%
  group_by(userId) %>%
  summarize(b_u = mean(rating - mu - b_i))
predicted_ratings <- test_set %>% 
  left_join(movie_avgs, by='movieId') %>%
  left_join(user_avgs, by='userId') %>%
  mutate(pred = mu + b_i + b_u) %>%
  .$pred
model_2_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Movie + User Effects Model",  
                                     RMSE = model_2_rmse ))
rmse_results %>% knitr::kable()    #Modelo 2

method	RMSE
Just the average	1.0528047
Movie Effect Model	0.9885931
Movie + User Effects Model	0.8812843

lambdas <- seq(0, 10, 0.25)
rmses <- sapply(lambdas, function(l){
  mu <- mean(train_set$rating)
  b_i <- train_set %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - mu)/(n()+l))
  b_u <- train_set %>% 
    left_join(b_i, by="movieId") %>%
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - mu)/(n()+l))
  predicted_ratings <- 
    test_set %>% 
    left_join(b_i, by = "movieId") %>%
    left_join(b_u, by = "userId") %>%
    mutate(pred = mu + b_i + b_u) %>%
    .$pred
  return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)

lambda <- lambdas[which.min(rmses)]
lambda

## [1] 3.25

rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Regularized Movie + User Effect Model",  
                                     RMSE = min(rmses)))
rmse_results %>% knitr::kable()                                   #Modelo 3

method	RMSE
Just the average	1.0528047
Movie Effect Model	0.9885931
Movie + User Effects Model	0.8812843
Regularized Movie + User Effect Model	0.8814297

Recomendation System Based Classification

Luís

12/05/2020

Analise de dados e Particao em Teste e Treino

Desenvolvimento do Sistema de Recomendacao

userId	Forrest Gump	Pulp Fiction	Shawshank Redemption, The	Silence of the Lambs, The	Star Wars: Episode IV - A New Hope
13	5.0	3.5	4.5	NA	NA
15	1.0	5.0	2.0	5.0	5.0
16	NA	NA	4.0	NA	NA
17	2.5	5.0	5.0	4.5	3.5
18	NA	NA	NA	NA	3.0
19	5.0	5.0	4.0	3.0	4.0
20	2.0	0.5	4.5	0.5	1.5

userId	Forrest Gump	Pulp Fiction	Shawshank Redemption, The	Silence of the Lambs, The	Star Wars: Episode IV - A New Hope
13	5.0	3.5	4.5	NA	NA
15	1.0	5.0	2.0	5.0	5.0
16	NA	NA	4.0	NA	NA
17	2.5	5.0	5.0	4.5	3.5
18	NA	NA	NA	NA	3.0
19	5.0	5.0	4.0	3.0	4.0
20	2.0	0.5	4.5	0.5	1.5

userId	Forrest Gump	Pulp Fiction	Shawshank Redemption, The	Silence of the Lambs, The	Star Wars: Episode IV - A New Hope
13	5.0	3.5	4.5	NA	NA
15	1.0	5.0	2.0	5.0	5.0
16	NA	NA	4.0	NA	NA
17	2.5	5.0	5.0	4.5	3.5
18	NA	NA	NA	NA	3.0
19	5.0	5.0	4.0	3.0	4.0
20	2.0	0.5	4.5	0.5	1.5