Exploring MovieLens

library(dplyr)    # install.packages('dplyr')

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) # install.packages('ggplot2')
library(scales)    # install.packages('scales')

# use the black-and-white ggplot2 theme for every plot that follows
ggplot2::theme_set(ggplot2::theme_bw())

# directory that contains the movielens/ data folder
data.dir <- '.'

# read ratings from csv file
# the file has no header row, so column names and types are given explicitly;
# the load is wrapped in system.time() so its duration is reported
system.time(
  ratings <- read.delim(file.path(data.dir, 'movielens', 'ratings.csv'),
                        sep=',', header=FALSE,
                        col.names=c('user_id','movie_id','rating','timestamp'),
                        colClasses=c('integer','integer','numeric','integer'))
)

##    user  system elapsed 
##   37.36    0.51   38.06

# report how much memory the ratings data frame occupies, in megabytes
print(utils::object.size(ratings), units = "Mb")

## 190.7 Mb

####################
# brief look at data
####################

# show the first six rows
utils::head(ratings, n = 6L)

##   user_id movie_id rating timestamp
## 1       1      122      5 838985046
## 2       1      185      5 838983525
## 3       1      231      5 838983392
## 4       1      292      5 838983421
## 5       1      316      5 838983392
## 6       1      329      5 838983392

# total number of rating records (first element of the dimensions)
dim(ratings)[1L]

## [1] 10000054

# compact overview of column names, types and leading values
utils::str(ratings)

## 'data.frame':    10000054 obs. of  4 variables:
##  $ user_id  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ movie_id : int  122 185 231 292 316 329 355 356 362 364 ...
##  $ rating   : num  5 5 5 5 5 5 5 5 5 5 ...
##  $ timestamp: int  838985046 838983525 838983392 838983421 838983392 838983392 838984474 838983653 838984885 838983707 ...

####################
# aggregate stats
####################

# five-number summary plus mean of the rating column
summary(ratings[["rating"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.50    3.00    4.00    3.51    4.00    5.00

# histogram of individual rating values, with comma-formatted counts
ggplot(ratings, aes(rating)) +
  geom_histogram() +
  scale_y_continuous(labels = comma) +
  labs(x = 'Rating', y = 'Count')

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-1

# save the most recent plot as a 4x4 pdf; the argument is spelled out as
# `filename` rather than relying on partial matching of `file`
ggsave(filename="figures/rating_dist.pdf", width=4, height=4)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

####################
# per-movie stats
####################

# one row per movie: how many ratings it received and their average
movie_stats <- summarise(
  group_by(ratings, movie_id),
  num_ratings = n(),
  mean_rating = mean(rating)
)

# column-wise summary statistics of the per-movie table
movie_stats %>% summary()

##     movie_id      num_ratings     mean_rating  
##  Min.   :    1   Min.   :    1   Min.   :0.50  
##  1st Qu.: 2754   1st Qu.:   34   1st Qu.:2.85  
##  Median : 5434   Median :  135   Median :3.27  
##  Mean   :13105   Mean   :  937   Mean   :3.19  
##  3rd Qu.: 8710   3rd Qu.:  626   3rd Qu.:3.61  
##  Max.   :65133   Max.   :34864   Max.   :5.00

# plot distribution of movie popularity on a log-scaled x axis
# a single x scale carries both the log10 transform and the comma labels:
# adding scale_x_continuous() and then scale_x_log10() makes the second
# replace the first, which silently drops the comma labels
ggplot(data=movie_stats, aes(x=num_ratings)) +
  geom_histogram() +
  scale_x_log10(labels=comma) +
  scale_y_continuous(labels=comma) +
  xlab('Number of Ratings by Movie') + ylab('Count')

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-1

# save the most recent plot as a 4x4 pdf; the argument is spelled out as
# `filename` rather than relying on partial matching of `file`
ggsave(filename="figures/movie_popularity_dist.pdf", width=4, height=4)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# density of per-movie mean ratings; y axis text and ticks are hidden
ggplot(movie_stats, aes(mean_rating)) +
  stat_density() +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(x = 'Mean Rating by Movie', y = 'Density') +
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

plot of chunk unnamed-chunk-1

# save the most recent plot as a 4x4 pdf; the argument is spelled out as
# `filename` rather than relying on partial matching of `file`
ggsave(filename="figures/mean_rating_by_movie_dist.pdf", width=4, height=4)

# rank movies by popularity (rank 1 = most ratings) and compute, for each
# rank, the cumulative share of all ratings held by movies up to that rank
movie_stats <- movie_stats %>%
  # ties.method="first" gives every movie a distinct integer rank; the default
  # ("average") assigns tied movies one shared fractional rank, which stacks
  # several cdf values on the same x position when plotting cdf against rank
  mutate(rank=rank(-num_ratings, ties.method="first")) %>%
  arrange(rank) %>%
  mutate(cdf=cumsum(num_ratings)/sum(num_ratings))

# plot CDF of movie popularity: cumulative share of ratings against movie rank
ggplot(movie_stats, aes(rank, cdf)) +
  geom_line() +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = percent) +
  labs(x = 'Movie Rank', y = 'CDF')

plot of chunk unnamed-chunk-1

# save the most recent plot as a 4x4 pdf; the argument is spelled out as
# `filename` rather than relying on partial matching of `file`
ggsave(filename="figures/movie_popularity_cdf.pdf", width=4, height=4)


####################
# per-user stats
####################

# one row per user: how many ratings they gave and their average rating
user_stats <- summarise(
  group_by(ratings, user_id),
  num_ratings = n(),
  mean_rating = mean(rating)
)

# column-wise summary statistics of the per-user table
user_stats %>% summary()

##     user_id       num_ratings    mean_rating  
##  Min.   :    1   Min.   :  20   Min.   :0.50  
##  1st Qu.:17943   1st Qu.:  35   1st Qu.:3.36  
##  Median :35798   Median :  69   Median :3.63  
##  Mean   :35782   Mean   : 143   Mean   :3.61  
##  3rd Qu.:53620   3rd Qu.: 156   3rd Qu.:3.90  
##  Max.   :71567   Max.   :7359   Max.   :5.00

# plot distribution of user activity on a log-scaled x axis
# a single x scale carries both the log10 transform and the comma labels:
# adding scale_x_continuous() and then scale_x_log10() makes the second
# replace the first, which silently drops the comma labels
ggplot(data=user_stats, aes(x=num_ratings)) +
  geom_histogram() +
  scale_x_log10(labels=comma) +
  scale_y_continuous(labels=comma) +
  xlab('Number of Ratings by User') + ylab('Count')

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-1

# save the most recent plot as a 4x4 pdf; the argument is spelled out as
# `filename` rather than relying on partial matching of `file`
ggsave(filename="figures/user_activity_dist.pdf", width=4, height=4)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Exploring MovieLens

Jake Hofman

February 6, 2015