Analise de dados e Particao em Teste e Treino

library(dslabs)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
data("movielens")

head(movielens)  
##   movieId                                   title year
## 1      31                         Dangerous Minds 1995
## 2    1029                                   Dumbo 1941
## 3    1061                                Sleepers 1996
## 4    1129                    Escape from New York 1981
## 5    1172 Cinema Paradiso (Nuovo cinema Paradiso) 1989
## 6    1263                        Deer Hunter, The 1978
##                             genres userId rating  timestamp
## 1                            Drama      1    2.5 1260759144
## 2 Animation|Children|Drama|Musical      1    3.0 1260759179
## 3                         Thriller      1    3.0 1260759182
## 4 Action|Adventure|Sci-Fi|Thriller      1    2.0 1260759185
## 5                            Drama      1    4.0 1260759205
## 6                        Drama|War      1    2.0 1260759151
keep <- movielens %>%
  dplyr::count(movieId) %>%
  top_n(5) %>%
  pull(movieId)
## Selecting by n
tab <- movielens %>%
  filter(userId %in% c(13:20)) %>% 
  filter(movieId %in% keep) %>% 
  select(userId, title, rating) %>% 
  spread(title, rating)
tab %>% knitr::kable()     #Exemplo de user 
userId Forrest Gump Pulp Fiction Shawshank Redemption, The Silence of the Lambs, The Star Wars: Episode IV - A New Hope
13 5.0 3.5 4.5 NA NA
15 1.0 5.0 2.0 5.0 5.0
16 NA NA 4.0 NA NA
17 2.5 5.0 5.0 4.5 3.5
18 NA NA NA NA 3.0
19 5.0 5.0 4.0 3.0 4.0
20 2.0 0.5 4.5 0.5 1.5
users <- sample(unique(movielens$userId), 100)
rafalib::mypar()
movielens %>% filter(userId %in% users) %>% 
  select(userId, movieId, rating) %>%
  mutate(rating = 1) %>%
  spread(movieId, rating) %>% select(sample(ncol(.), 100)) %>% 
  as.matrix() %>% t(.) %>%
  image(1:100, 1:100,. , xlab="Movies", ylab="Users")
abline(h=0:100+0.5, v=0:100+0.5, col = "grey") #Matriz da combinacao de user com produtos

movielens %>% 
  dplyr::count(movieId) %>% 
  ggplot(aes(n)) + 
  geom_histogram(bins = 30, color = "black") + 
  scale_x_log10() + 
  ggtitle("Movies")   #Distribuicao de classificacao por produtos

movielens %>%
  dplyr::count(userId) %>% 
  ggplot(aes(n)) + 
  geom_histogram(bins = 30, color = "black") + 
  scale_x_log10() +
  ggtitle("Users")    #Distribuicao de classificacoes por utilizadores

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(755)
test_index <- createDataPartition(y = movielens$rating, times = 1,
                                  p = 0.2, list = FALSE)
train_set <- movielens[-test_index,]
test_set <- movielens[test_index,]  #Dividir os dados em teste e treino

test_set <- test_set %>% 
  semi_join(train_set, by = "movieId") %>%
  semi_join(train_set, by = "userId") #Removemos do teste os utilizadores e filmes que nao aparecem no conjunto de treino

Desenvolvimento do Sistema de Recomendacao

RMSE <- function(true_ratings, predicted_ratings){
  sqrt(mean((true_ratings - predicted_ratings)^2)) #Erro residual (se a classificacao for de 1 a 5, então RMSE>1 torna a predicao má)
}

mu_hat <- mean(train_set$rating)
mu_hat
## [1] 3.543305
naive_rmse <- RMSE(test_set$rating, mu_hat)
naive_rmse
## [1] 1.052805
rmse_results <- data_frame(method = "Just the average", RMSE = naive_rmse)
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
mu <- mean(train_set$rating)


 
movie_avgs <- train_set %>% 
  group_by(movieId) %>% 
  summarize(b_i = mean(rating - mu))
predicted_ratings <- mu + test_set %>% 
  left_join(movie_avgs, by='movieId') %>%
  .$b_i
model_1_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Movie Effect Model",
                                     RMSE = model_1_rmse ))
rmse_results %>% knitr::kable()                                             #Modelo 1
method RMSE
Just the average 1.0528047
Movie Effect Model 0.9885931
user_avgs <- test_set %>% 
  left_join(movie_avgs, by='movieId') %>%
  group_by(userId) %>%
  summarize(b_u = mean(rating - mu - b_i))
predicted_ratings <- test_set %>% 
  left_join(movie_avgs, by='movieId') %>%
  left_join(user_avgs, by='userId') %>%
  mutate(pred = mu + b_i + b_u) %>%
  .$pred
model_2_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Movie + User Effects Model",  
                                     RMSE = model_2_rmse ))
rmse_results %>% knitr::kable()    #Modelo 2
method RMSE
Just the average 1.0528047
Movie Effect Model 0.9885931
Movie + User Effects Model 0.8812843
lambdas <- seq(0, 10, 0.25)
rmses <- sapply(lambdas, function(l){
  mu <- mean(train_set$rating)
  b_i <- train_set %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - mu)/(n()+l))
  b_u <- train_set %>% 
    left_join(b_i, by="movieId") %>%
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - mu)/(n()+l))
  predicted_ratings <- 
    test_set %>% 
    left_join(b_i, by = "movieId") %>%
    left_join(b_u, by = "userId") %>%
    mutate(pred = mu + b_i + b_u) %>%
    .$pred
  return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)  

lambda <- lambdas[which.min(rmses)]
lambda
## [1] 3.25
rmse_results <- bind_rows(rmse_results,
                          data_frame(method="Regularized Movie + User Effect Model",  
                                     RMSE = min(rmses)))
rmse_results %>% knitr::kable()                                   #Modelo 3
method RMSE
Just the average 1.0528047
Movie Effect Model 0.9885931
Movie + User Effects Model 0.8812843
Regularized Movie + User Effect Model 0.8814297