Analise de dados e Particao em Teste e Treino
library(dslabs)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data("movielens")
head(movielens)
## movieId title year
## 1 31 Dangerous Minds 1995
## 2 1029 Dumbo 1941
## 3 1061 Sleepers 1996
## 4 1129 Escape from New York 1981
## 5 1172 Cinema Paradiso (Nuovo cinema Paradiso) 1989
## 6 1263 Deer Hunter, The 1978
## genres userId rating timestamp
## 1 Drama 1 2.5 1260759144
## 2 Animation|Children|Drama|Musical 1 3.0 1260759179
## 3 Thriller 1 3.0 1260759182
## 4 Action|Adventure|Sci-Fi|Thriller 1 2.0 1260759185
## 5 Drama 1 4.0 1260759205
## 6 Drama|War 1 2.0 1260759151
keep <- movielens %>%
dplyr::count(movieId) %>%
top_n(5) %>%
pull(movieId)
## Selecting by n
tab <- movielens %>%
filter(userId %in% c(13:20)) %>%
filter(movieId %in% keep) %>%
select(userId, title, rating) %>%
spread(title, rating)
tab %>% knitr::kable() #Exemplo de user
| 13 |
5.0 |
3.5 |
4.5 |
NA |
NA |
| 15 |
1.0 |
5.0 |
2.0 |
5.0 |
5.0 |
| 16 |
NA |
NA |
4.0 |
NA |
NA |
| 17 |
2.5 |
5.0 |
5.0 |
4.5 |
3.5 |
| 18 |
NA |
NA |
NA |
NA |
3.0 |
| 19 |
5.0 |
5.0 |
4.0 |
3.0 |
4.0 |
| 20 |
2.0 |
0.5 |
4.5 |
0.5 |
1.5 |
users <- sample(unique(movielens$userId), 100)
rafalib::mypar()
movielens %>% filter(userId %in% users) %>%
select(userId, movieId, rating) %>%
mutate(rating = 1) %>%
spread(movieId, rating) %>% select(sample(ncol(.), 100)) %>%
as.matrix() %>% t(.) %>%
image(1:100, 1:100,. , xlab="Movies", ylab="Users")
abline(h=0:100+0.5, v=0:100+0.5, col = "grey") #Matriz da combinacao de user com produtos

movielens %>%
dplyr::count(movieId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 30, color = "black") +
scale_x_log10() +
ggtitle("Movies") #Distribuicao de classificacao por produtos

movielens %>%
dplyr::count(userId) %>%
ggplot(aes(n)) +
geom_histogram(bins = 30, color = "black") +
scale_x_log10() +
ggtitle("Users") #Distribuicao de classificacoes por utilizadores

library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(755)
test_index <- createDataPartition(y = movielens$rating, times = 1,
p = 0.2, list = FALSE)
train_set <- movielens[-test_index,]
test_set <- movielens[test_index,] #Dividir os dados em teste e treino
test_set <- test_set %>%
semi_join(train_set, by = "movieId") %>%
semi_join(train_set, by = "userId") #Removemos do teste os utilizadores e filmes que nao aparecem no conjunto de treino
Desenvolvimento do Sistema de Recomendacao
RMSE <- function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings - predicted_ratings)^2)) #Erro residual (se a classificacao for de 1 a 5, então RMSE>1 torna a predicao má)
}
mu_hat <- mean(train_set$rating)
mu_hat
## [1] 3.543305
naive_rmse <- RMSE(test_set$rating, mu_hat)
naive_rmse
## [1] 1.052805
rmse_results <- data_frame(method = "Just the average", RMSE = naive_rmse)
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
mu <- mean(train_set$rating)
movie_avgs <- train_set %>%
group_by(movieId) %>%
summarize(b_i = mean(rating - mu))
predicted_ratings <- mu + test_set %>%
left_join(movie_avgs, by='movieId') %>%
.$b_i
model_1_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie Effect Model",
RMSE = model_1_rmse ))
rmse_results %>% knitr::kable() #Modelo 1
| Just the average |
1.0528047 |
| Movie Effect Model |
0.9885931 |
user_avgs <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
group_by(userId) %>%
summarize(b_u = mean(rating - mu - b_i))
predicted_ratings <- test_set %>%
left_join(movie_avgs, by='movieId') %>%
left_join(user_avgs, by='userId') %>%
mutate(pred = mu + b_i + b_u) %>%
.$pred
model_2_rmse <- RMSE(predicted_ratings, test_set$rating)
rmse_results <- bind_rows(rmse_results,
data_frame(method="Movie + User Effects Model",
RMSE = model_2_rmse ))
rmse_results %>% knitr::kable() #Modelo 2
| Just the average |
1.0528047 |
| Movie Effect Model |
0.9885931 |
| Movie + User Effects Model |
0.8812843 |
lambdas <- seq(0, 10, 0.25)
rmses <- sapply(lambdas, function(l){
mu <- mean(train_set$rating)
b_i <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu)/(n()+l))
b_u <- train_set %>%
left_join(b_i, by="movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu)/(n()+l))
predicted_ratings <-
test_set %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(pred = mu + b_i + b_u) %>%
.$pred
return(RMSE(predicted_ratings, test_set$rating))
})
qplot(lambdas, rmses)

lambda <- lambdas[which.min(rmses)]
lambda
## [1] 3.25
rmse_results <- bind_rows(rmse_results,
data_frame(method="Regularized Movie + User Effect Model",
RMSE = min(rmses)))
rmse_results %>% knitr::kable() #Modelo 3
| Just the average |
1.0528047 |
| Movie Effect Model |
0.9885931 |
| Movie + User Effects Model |
0.8812843 |
| Regularized Movie + User Effect Model |
0.8814297 |