Capstone-Part-1-V3.R

#Movielens Data. This project will utilize the following dataset:
#https://grouplens.org/datasets/movielens/10m/.  The initial code was provided by the course
#materials and we used to do the data pull as shown below:

##########################################################
# Create edx set, validation set (final hold-out test set)
##########################################################

if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")

## Loading required package: tidyverse

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")

## Loading required package: caret

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

if(!require(data.table)) install.packages("data.table", repos = "http://cran.us.r-project.org")

## Loading required package: data.table

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

library(tidyverse)
library(caret)
library(data.table)

# MovieLens 10M dataset:
# https://grouplens.org/datasets/movielens/10m/
# http://files.grouplens.org/datasets/movielens/ml-10m.zip

dl <- tempfile()
download.file("http://files.grouplens.org/datasets/movielens/ml-10m.zip", dl)

ratings <- fread(text = gsub("::", "\t", readLines(unzip(dl, "ml-10M100K/ratings.dat"))),
                 col.names = c("userId", "movieId", "rating", "timestamp"))

movies <- str_split_fixed(readLines(unzip(dl, "ml-10M100K/movies.dat")), "\\::", 3)
colnames(movies) <- c("movieId", "title", "genres")

movies <- as.data.frame(movies) %>% mutate(movieId = as.numeric(movieId),
                                           title = as.character(title),
                                           genres = as.character(genres))


movielens <- left_join(ratings, movies, by = "movieId")

movielens <- mutate(movielens, rating_year= year(as.Date(as.POSIXct(timestamp, 
                                                            origin = "1970-01-01"))))
#create movie year

movie_year <- stringi::stri_extract(movielens$title, regex = "(\\d{4})", comments = TRUE) %>% 
  as.numeric()
movielens <- movielens %>% mutate(movie_year = movie_year) %>% select(-timestamp)

summary(movielens)

##      userId         movieId          rating         title          
##  Min.   :    1   Min.   :    1   Min.   :0.500   Length:10000054   
##  1st Qu.:18123   1st Qu.:  648   1st Qu.:3.000   Class :character  
##  Median :35740   Median : 1834   Median :4.000   Mode  :character  
##  Mean   :35870   Mean   : 4120   Mean   :3.512                     
##  3rd Qu.:53608   3rd Qu.: 3624   3rd Qu.:4.000                     
##  Max.   :71567   Max.   :65133   Max.   :5.000                     
##     genres           rating_year     movie_year  
##  Length:10000054    Min.   :1995   Min.   :1000  
##  Class :character   1st Qu.:2000   1st Qu.:1987  
##  Mode  :character   Median :2002   Median :1994  
##                     Mean   :2002   Mean   :1991  
##                     3rd Qu.:2005   3rd Qu.:1998  
##                     Max.   :2009   Max.   :9000

movielens %>% filter(movie_year > 2018) %>% 
  group_by(movieId, title, movie_year) %>% 
  summarize(n = n())

## `summarise()` has grouped output by 'movieId', 'title'. You can override using
## the `.groups` argument.

## # A tibble: 6 × 4
## # Groups:   movieId, title [6]
##   movieId title                                          movie_year     n
##     <dbl> <chr>                                               <dbl> <int>
## 1     671 Mystery Science Theater 3000: The Movie (1996)       3000  3620
## 2    2308 Detroit 9000 (1973)                                  9000    24
## 3    4159 3000 Miles to Graceland (2001)                       3000   788
## 4    5310 Transylvania 6-5000 (1985)                           5000   218
## 5    8864 Mr. 3000 (2004)                                      3000   163
## 6   27266 2046 (2004)                                          2046   472

movielens %>% filter(movie_year < 1900) %>% 
  group_by(movieId, title, movie_year) %>% 
  summarize(n = n())

## `summarise()` has grouped output by 'movieId', 'title'. You can override using
## the `.groups` argument.

## # A tibble: 8 × 4
## # Groups:   movieId, title [8]
##   movieId title                                                 movie_year     n
##     <dbl> <chr>                                                      <dbl> <int>
## 1    1422 Murder at 1600 (1997)                                       1600  1742
## 2    4311 Bloody Angels (1732 Høtten: Marerittet Har et Postnu…       1732     9
## 3    5472 1776 (1972)                                                 1776   205
## 4    6290 House of 1000 Corpses (2003)                                1000   406
## 5    6645 THX 1138 (1971)                                             1138   525
## 6    8198 1000 Eyes of Dr. Mabuse, The (Tausend Augen des Dr. …       1000    30
## 7    8905 1492: Conquest of Paradise (1992)                           1492   152
## 8   53953 1408 (2007)                                                 1408   520

#Correct movie dates dates

movielens[movielens$movieId == "27266", "movie_year"] <- 2004
movielens[movielens$movieId == "671", "movie_year"] <- 1996
movielens[movielens$movieId == "2308", "movie_year"] <- 1973
movielens[movielens$movieId == "4159", "movie_year"] <- 2001
movielens[movielens$movieId == "5310", "movie_year"] <- 1985
movielens[movielens$movieId == "8864", "movie_year"] <- 2004
movielens[movielens$movieId == "1422", "movie_year"] <- 1997
movielens[movielens$movieId == "4311", "movie_year"] <- 1998
movielens[movielens$movieId == "5472", "movie_year"] <- 1972
movielens[movielens$movieId == "6290", "movie_year"] <- 2003
movielens[movielens$movieId == "6645", "movie_year"] <- 1971
movielens[movielens$movieId == "8198", "movie_year"] <- 1960
movielens[movielens$movieId == "8905", "movie_year"] <- 1992
movielens[movielens$movieId == "53953", "movie_year"] <- 2007

#Calculate the age of movie

movielens <-movielens %>% mutate(movie_age = 2022 - movie_year)

#Create the validation set that will be 10% of MovieLens data.

set.seed(1, sample.kind="Rounding")

## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

test_index <- createDataPartition(y = movielens$rating, times = 1, p = 0.1, list = FALSE)
edx <- movielens[-test_index,]
temp <- movielens[test_index,]

#Make sure userId and movieId in validation set are also in edx set

validation <- temp %>% 
  semi_join(edx, by = "movieId") %>%
  semi_join(edx, by = "userId")

#Add rows removed from validation set back into edx set

removed <- anti_join(temp, validation)

## Joining, by = c("userId", "movieId", "rating", "title", "genres",
## "rating_year", "movie_year", "movie_age")

train <- rbind(edx, removed)

rm(dl, ratings, movies, test_index, temp, movielens, removed, edx)

set.seed(1, sample.kind="Rounding")

## Warning in set.seed(1, sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used

test_index <- createDataPartition(y = train$rating, times = 1, p = 0.1, list = FALSE)
train_data <- train[-test_index,]
temp <- train[test_index,]

#Matching userId and movieId in both train and test sets

test_data <- temp %>%
  semi_join(train_data, by = "movieId") %>%
  semi_join(train_data, by = "userId")

# Adding back rows into train set

removed <- anti_join(temp, test_data)

## Joining, by = c("userId", "movieId", "rating", "title", "genres",
## "rating_year", "movie_year", "movie_age")

train_data <- rbind(train_data, removed)

rm(test_index, temp, removed)

summary(train)

##      userId         movieId          rating         title          
##  Min.   :    1   Min.   :    1   Min.   :0.500   Length:9000055    
##  1st Qu.:18124   1st Qu.:  648   1st Qu.:3.000   Class :character  
##  Median :35738   Median : 1834   Median :4.000   Mode  :character  
##  Mean   :35870   Mean   : 4122   Mean   :3.512                     
##  3rd Qu.:53607   3rd Qu.: 3626   3rd Qu.:4.000                     
##  Max.   :71567   Max.   :65133   Max.   :5.000                     
##     genres           rating_year     movie_year     movie_age     
##  Length:9000055     Min.   :1995   Min.   :1900   Min.   : 12.00  
##  Class :character   1st Qu.:2000   1st Qu.:1987   1st Qu.: 24.00  
##  Mode  :character   Median :2002   Median :1994   Median : 28.00  
##                     Mean   :2002   Mean   :1990   Mean   : 31.73  
##                     3rd Qu.:2005   3rd Qu.:1998   3rd Qu.: 35.00  
##                     Max.   :2009   Max.   :2010   Max.   :122.00

head(train)

##    userId movieId rating                         title
## 1:      1     122      5              Boomerang (1992)
## 2:      1     185      5               Net, The (1995)
## 3:      1     292      5               Outbreak (1995)
## 4:      1     316      5               Stargate (1994)
## 5:      1     329      5 Star Trek: Generations (1994)
## 6:      1     355      5       Flintstones, The (1994)
##                           genres rating_year movie_year movie_age
## 1:                Comedy|Romance        1996       1992        30
## 2:         Action|Crime|Thriller        1996       1995        27
## 3:  Action|Drama|Sci-Fi|Thriller        1996       1995        27
## 4:       Action|Adventure|Sci-Fi        1996       1994        28
## 5: Action|Adventure|Drama|Sci-Fi        1996       1994        28
## 6:       Children|Comedy|Fantasy        1996       1994        28

#Data Analysis

train %>% summarize(unique_users = length(unique(userId)),
                  unique_movies = length(unique(movieId)),
                  unique_genres = length(unique(genres)))

##   unique_users unique_movies unique_genres
## 1        69878         10677           797

#Analysis of Rating
#Distribution and Avg. Rating by Genre 

train %>% separate_rows(genres, sep = "\\|") %>%
  group_by(genres) %>%
  summarize(count = n(), avg_rating = mean(rating)) %>%
  arrange(desc(count))

## # A tibble: 20 × 3
##    genres               count avg_rating
##    <chr>                <int>      <dbl>
##  1 Drama              3910127       3.67
##  2 Comedy             3540930       3.44
##  3 Action             2560545       3.42
##  4 Thriller           2325899       3.51
##  5 Adventure          1908892       3.49
##  6 Romance            1712100       3.55
##  7 Sci-Fi             1341183       3.40
##  8 Crime              1327715       3.67
##  9 Fantasy             925637       3.50
## 10 Children            737994       3.42
## 11 Horror              691485       3.27
## 12 Mystery             568332       3.68
## 13 War                 511147       3.78
## 14 Animation           467168       3.60
## 15 Musical             433080       3.56
## 16 Western             189394       3.56
## 17 Film-Noir           118541       4.01
## 18 Documentary          93066       3.78
## 19 IMAX                  8181       3.77
## 20 (no genres listed)       7       3.64

#Avg.rating by genre visually
train %>% separate_rows(genres, sep = "\\|") %>%
  group_by(genres) %>%
  summarize(avg_rating = mean(rating)) %>%
  arrange(desc(avg_rating)) %>%
  ggplot(aes(reorder(genres, avg_rating), avg_rating, fill= avg_rating)) +
  geom_bar(stat = "identity") + coord_flip() +
  scale_fill_distiller(palette = "Reds") + labs(y = "Avg Rating", x = "Genre") +
  ggtitle("Average Rating by Genre")

#Analysis of ratings: compare n of ratings by level
train %>% group_by(rating) %>% summarize(count = n())

## # A tibble: 10 × 2
##    rating   count
##     <dbl>   <int>
##  1    0.5   85374
##  2    1    345679
##  3    1.5  106426
##  4    2    711422
##  5    2.5  333010
##  6    3   2121240
##  7    3.5  791624
##  8    4   2588430
##  9    4.5  526736
## 10    5   1390114

#Ratings' distribution by level visually
  train %>% 
  group_by(rating) %>%
  summarize(count = n()) %>%
  ggplot(aes(x = rating, y = count)) +
  scale_y_continuous(labels = scales::comma) +
  geom_line() +
  ggtitle("Ratings' Distribution by level")

#Relationship between movie age and ratings  
train %>% group_by(movie_age) %>%
    summarize(rating = mean(rating)) %>%
    ggplot(aes(movie_age, rating)) +
    geom_point() +
    geom_smooth() +
  ggtitle("Avg. Rating by Movie Age")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Distribution of ratings by title

  train %>% group_by(title) %>% summarize(count = n()) %>%
    ggplot(aes(count)) + geom_histogram(fill = "blue", color = "grey", bins = 100) +
    labs(x = "No. of Movie Ratings", y = "Count") +
    scale_x_continuous(trans="log10") +
  ggtitle("Number of Ratings by Title")

#Distribution of ratings by users

train %>% group_by(userId) %>% summarize(count = n()) %>%
    ggplot(aes(count)) + geom_histogram(fill = "skyblue", color = "black", bins = 100) +
    labs(x = "No. of Movie Ratings", y = "Count") +
    scale_x_log10() +
  ggtitle("Number of Ratings by User")

#By user, distribution of avg. rating

user_rating <- train %>%
  group_by(userId) %>%
  summarize(count = n(), rating = mean(rating)) %>%
  arrange(desc(count))

summary(user_rating)

##      userId          count            rating     
##  Min.   :    1   Min.   :  10.0   Min.   :0.500  
##  1st Qu.:17943   1st Qu.:  32.0   1st Qu.:3.357  
##  Median :35798   Median :  62.0   Median :3.635  
##  Mean   :35782   Mean   : 128.8   Mean   :3.614  
##  3rd Qu.:53620   3rd Qu.: 141.0   3rd Qu.:3.903  
##  Max.   :71567   Max.   :6616.0   Max.   :5.000

train %>% group_by(userId) %>% summarize(rating = mean(rating)) %>%
  ggplot(aes(rating,userId)) +
  geom_point() +
  geom_smooth() +
ggtitle("Avg. Rating by User")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Modeling

RMSE <- function(true_ratings, predicted_ratings){
  sqrt(mean((true_ratings - predicted_ratings)^2))
}

mu <- mean(train_data$rating)
mu

## [1] 3.512456

naive_rmse <- RMSE(test_data$rating, mu)
naive_rmse

## [1] 1.060054

rmse_results <- tibble(method = "Just the average", RMSE = naive_rmse)
rmse_results %>% knitr::kable()

method	RMSE
Just the average	1.060054

#effect of movie ratings 
# calculate RMSE of movie ranking effect

mu <- mean(train_data$rating) 
movie_avgs <- train_data %>% 
  group_by(movieId) %>% 
  summarize(b_i = mean(rating - mu))

predicted_ratings <- test_data %>% 
  left_join(movie_avgs, by='movieId') %>%
  mutate(pred = mu + b_i) %>%
  pull(pred)
RMSE(predicted_ratings, test_data$rating)

## [1] 0.9429615

movie_bias <- RMSE(predicted_ratings, test_data$rating)
rmse_results <- bind_rows(rmse_results, tibble(method = "movie bias added", RMSE = movie_bias))
rmse_results %>% knitr::kable()

method	RMSE
Just the average	1.0600537
movie bias added	0.9429615

#Looking at the estimates taking into account the movie bias, we see that they vary.

qplot(b_i, data = movie_avgs, bins = 10, color = I("black"))

#effect of users

user_avgs <- train_data %>% 
  left_join(movie_avgs, by='movieId') %>%
  group_by(userId) %>%
  summarize(b_u = mean(rating - mu - b_i))

predicted_ratings <- test_data %>% 
  left_join(movie_avgs, by='movieId') %>%
  left_join(user_avgs, by='userId') %>%
  mutate(pred = mu + b_i + b_u) %>%
  pull(pred)
user_bias <- RMSE(predicted_ratings, test_data$rating)
rmse_results <- bind_rows(rmse_results, tibble(method = "movie and user bias added", RMSE = user_bias))
rmse_results %>% knitr::kable()

method	RMSE
Just the average	1.0600537
movie bias added	0.9429615
movie and user bias added	0.8646843

#Regularization method

lambdas <- seq(0,10,0.25)
rmses <- sapply(lambdas, function(a){
  mu <- mean(train_data$rating)
  
  b_i <- train_data %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - mu)/(n() + a))
  
  b_u <- train_data %>%
    left_join(b_i, by='movieId') %>% 
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - mu)/(n() + a))
  
  predicted_ratings <- test_data %>%
    left_join(b_i, by = "movieId") %>%
    left_join(b_u, by = "userId") %>%
    mutate(pred = mu + b_i +  b_u) %>% .$pred
  
  return(RMSE(predicted_ratings, test_data$rating))
})

rmse_results <- bind_rows(rmse_results, tibble(method = "regularization method", RMSE = min(rmses)))
rmse_results %>% knitr::kable()

method	RMSE
Just the average	1.0600537
movie bias added	0.9429615
movie and user bias added	0.8646843
regularization method	0.8641362

qplot(lambdas, rmses, color = I("green"))

lambdas[which.min(rmses)]

## [1] 5

#Regularization with additional attributes

lambdas <- seq(0, 10, 0.25)

rmses <- sapply(lambdas, function(a){
  mu <- mean(train_data$rating)
  
  b_i <- train_data %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - mu)/(n() + a))
  
  b_u <- train_data %>%
    left_join(b_i, by='movieId') %>% 
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - mu)/(n() + a))
  
  b_y <- train_data %>%
    left_join(b_i, by='movieId') %>%
    left_join(b_u, by='userId') %>%
    group_by(movie_year) %>%
    summarize(b_y = sum(rating - mu - b_i - b_u)/(n() + a))
  
  b_g <- train_data %>%
    left_join(b_i, by='movieId') %>%
    left_join(b_u, by='userId') %>%
    left_join(b_y, by = 'movie_year') %>%
    group_by(genres) %>%
    summarize(b_g = sum(rating - mu - b_i - b_u - b_y)/(n() + a))
  
  predicted_ratings <- test_data %>%
    left_join(b_i, by = "movieId") %>%
    left_join(b_u, by = "userId") %>%
    left_join(b_y, by = 'movie_year') %>%
    left_join(b_g, by = 'genres') %>%
    mutate(pred = mu + b_i +  b_u + b_y + b_g) %>% .$pred
  
  return(RMSE(predicted_ratings, test_data$rating))
})

rmse_results <- bind_rows(rmse_results, 
tibble(method = "regularization method enhanced with year and genre", RMSE = min(rmses)))
rmse_results %>% knitr::kable()

method	RMSE
Just the average	1.0600537
movie bias added	0.9429615
movie and user bias added	0.8646843
regularization method	0.8641362
regularization method enhanced with year and genre	0.8636089

# Compute new predictions using the optimal lambda

qplot(lambdas, rmses)

a <- lambdas[which.min(rmses)]

#Validation results

mu <- mean(validation$rating)

b_title <- validation %>%
  group_by(movieId) %>%
  summarize(b_title = sum(rating - mu)/(n() + a))

b_user <- validation %>%
  left_join(b_title, by='movieId') %>% 
  group_by(userId) %>%
  summarize(b_user = sum(rating - b_title - mu)/(n() + a))

predicted_ratings <- validation %>%
  left_join(b_title, by = "movieId") %>%
  left_join(b_user, by = "userId") %>%
  mutate(pred = mu + b_title +  b_user) %>% .$pred

RMSE(predicted_ratings, validation$rating)

## [1] 0.8388471

Capstone-Part-1-V3.R

alevcieslinski

2022-02-15