library(dplyr) # install.packages('dplyr')
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # install.packages('ggplot2')
library(scales) # install.packages('scales')
# Use the black-and-white ggplot2 theme for every plot created from here on.
theme_set(theme_bw())

# Directory that holds the movielens/ data folder.
data.dir <- "."

# Figures are written to figures/ by the ggsave() calls in this script; make
# sure that directory exists so saving cannot fail on a missing path.
# showWarnings = FALSE keeps this quiet when the directory is already there.
dir.create("figures", showWarnings = FALSE)
# Read the ratings from a headerless CSV file. Column names and column types
# are given explicitly; system.time() reports how long the load takes.
system.time(
  ratings <- read.csv(
    file.path(data.dir, "movielens", "ratings.csv"),
    # FALSE spelled out: `F` is an ordinary variable that can be reassigned.
    header = FALSE,
    col.names = c("user_id", "movie_id", "rating", "timestamp"),
    colClasses = c("integer", "integer", "numeric", "integer")
  )
)
## user system elapsed
## 37.36 0.51 38.06

# Report the in-memory size of the ratings data frame in megabytes.
print(object.size(ratings), units = "Mb")
## 190.7 Mb
####################
# brief look at data
####################
# first six rows
head(ratings, n = 6L)
## user_id movie_id rating timestamp
## 1 1 122 5 838985046
## 2 1 185 5 838983525
## 3 1 231 5 838983392
## 4 1 292 5 838983421
## 5 1 316 5 838983392
## 6 1 329 5 838983392

# total number of rating rows
nrow(ratings)
## [1] 10000054

# column types and leading values
str(ratings)
## 'data.frame': 10000054 obs. of 4 variables:
## $ user_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ movie_id : int 122 185 231 292 316 329 355 356 362 364 ...
## $ rating : num 5 5 5 5 5 5 5 5 5 5 ...
## $ timestamp: int 838985046 838983525 838983392 838983421 838983392 838983392 838984474 838983653 838984885 838983707 ...
####################
# aggregate stats
####################
# compute aggregate stats (quartiles and mean) over all ratings
summary(ratings$rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.50 3.00 4.00 3.51 4.00 5.00

# plot distribution of ratings
# binwidth = 0.5 gives one bar per half-star step; the default of 30 bins
# over this range would leave empty gaps between the bars.
# NOTE(review): assumes ratings come in half-star increments (summary above
# shows 0.5 to 5.0) -- confirm against the dataset documentation.
ggplot(data = ratings, aes(x = rating)) +
  geom_histogram(binwidth = 0.5) +
  scale_y_continuous(labels = comma) +
  xlab("Rating") +
  ylab("Count")

# save the plot just drawn (ggsave defaults to the last plot)
ggsave(filename = "figures/rating_dist.pdf", width = 4, height = 4)
####################
# per-movie stats
####################
# one row per movie: number of ratings received and their mean
movie_stats <- summarize(
  group_by(ratings, movie_id),
  num_ratings = n(),
  mean_rating = mean(rating)
)

# distribution of the per-movie columns
summary(movie_stats)
## movie_id num_ratings mean_rating
## Min. : 1 Min. : 1 Min. :0.50
## 1st Qu.: 2754 1st Qu.: 34 1st Qu.:2.85
## Median : 5434 Median : 135 Median :3.27
## Mean :13105 Mean : 937 Mean :3.19
## 3rd Qu.: 8710 3rd Qu.: 626 3rd Qu.:3.61
## Max. :65133 Max. :34864 Max. :5.00
# plot distribution of movie popularity (ratings per movie, log10 x axis)
# A single log10 x scale carries the comma labels: adding a separate
# scale_x_continuous() as well would be replaced by the log scale, dropping
# its labels and raising a "Scale for 'x' is already present" message.
ggplot(data = movie_stats, aes(x = num_ratings)) +
  geom_histogram() +
  scale_x_log10(labels = comma) +
  scale_y_continuous(labels = comma) +
  xlab("Number of Ratings by Movie") +
  ylab("Count")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# save the plot just drawn (ggsave defaults to the last plot)
ggsave(filename = "figures/movie_popularity_dist.pdf", width = 4, height = 4)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# density of per-movie mean ratings; y-axis tick marks and labels are hidden
ggplot(movie_stats, aes(mean_rating)) +
  stat_density() +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(labels = comma) +
  labs(x = "Mean Rating by Movie", y = "Density") +
  theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())

# write the plot just drawn to a 4x4 PDF
ggsave("figures/mean_rating_by_movie_dist.pdf", width = 4, height = 4)
# rank movies by popularity (rank 1 = most rated) and compute the cdf:
# the cumulative share of all ratings covered by movies up to each rank.
# ties.method = "first" gives every movie its own integer rank 1..n; the
# default ("average") hands tied movies one shared fractional rank, which
# would put several cdf values at the same rank.
movie_stats <- movie_stats %>%
  mutate(rank = rank(-num_ratings, ties.method = "first")) %>%
  arrange(rank) %>%
  mutate(cdf = cumsum(num_ratings) / sum(num_ratings))
# plot CDF of movie popularity: cdf column against movie rank
ggplot(movie_stats, aes(rank, cdf)) +
  geom_line() +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = comma) +
  labs(x = "Movie Rank", y = "CDF")

# write the plot just drawn to a 4x4 PDF
ggsave("figures/movie_popularity_cdf.pdf", width = 4, height = 4)
####################
# per-user stats
####################
# one row per user: number of ratings given and their mean
user_stats <- summarize(
  group_by(ratings, user_id),
  num_ratings = n(),
  mean_rating = mean(rating)
)

# distribution of the per-user columns
summary(user_stats)
## user_id num_ratings mean_rating
## Min. : 1 Min. : 20 Min. :0.50
## 1st Qu.:17943 1st Qu.: 35 1st Qu.:3.36
## Median :35798 Median : 69 Median :3.63
## Mean :35782 Mean : 143 Mean :3.61
## 3rd Qu.:53620 3rd Qu.: 156 3rd Qu.:3.90
## Max. :71567 Max. :7359 Max. :5.00
# plot distribution of user activity (ratings per user, log10 x axis)
# A single log10 x scale carries the comma labels: adding a separate
# scale_x_continuous() as well would be replaced by the log scale, dropping
# its labels and raising a "Scale for 'x' is already present" message.
ggplot(data = user_stats, aes(x = num_ratings)) +
  geom_histogram() +
  scale_x_log10(labels = comma) +
  scale_y_continuous(labels = comma) +
  xlab("Number of Ratings by User") +
  ylab("Count")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# save the plot just drawn (ggsave defaults to the last plot)
ggsave(filename = "figures/user_activity_dist.pdf", width = 4, height = 4)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.