Started at 1:58 pm PCT on 04/04/2016 Total Time Spent 3.5 hours
Install data table package and read in data
# data is being stored in data.table
library(data.table)
# genres
data.genres = as.data.table(read.table('data/u.genre', sep = '|'))
setnames(data.genres, c('nameGenre', 'idGenre'))
# contains information about each movie, from which genres binary data is most important for
# the manipulations which will be done
data.movies = as.data.table(read.table('data/u.item', sep = '|', quote = ""))
# removing the columns in which all values are NA (in my case 4th column)
data.movies = data.movies[, colSums(is.na(data.movies))<nrow(data.movies), with = F]
# setting names of the columns in data.movies
setnames(data.movies, c('idMovie', 'nameMovie', 'dateRelease', 'imdbUrl',
as.character(data.genres$nameGenre)))
# stores data about each user
data.users = as.data.table(read.table('data/u.user', sep = '|'))
# removing 5th row, because I had no idea how to use ZIP code for the future algorithms
data.users = data.users[, -5, with = F]
setnames(data.users, c('idUser', 'age', 'gender', 'occupation'))
# loading ratings given by users
data.ratings = as.data.table(read.table('data/u.data', sep = '\t'))
# deleting 4th column which represents the time when the movie was rated by that user
data.ratings = data.ratings[, -4, with = F]
# setting column names
setnames(data.ratings, c('idUser', 'idMovie', 'rating'))
# make ratings floats
data.ratings$rating = as.numeric(data.ratings$rating)
Basic summaries for all read data tables before normalization
summary(data.users)
## idUser age gender occupation
## Min. : 1.0 Min. : 7.00 F:273 student :196
## 1st Qu.:236.5 1st Qu.:25.00 M:670 other :105
## Median :472.0 Median :31.00 educator : 95
## Mean :472.0 Mean :34.05 administrator: 79
## 3rd Qu.:707.5 3rd Qu.:43.00 engineer : 67
## Max. :943.0 Max. :73.00 programmer : 66
## (Other) :335
summary(data.movies)
## idMovie nameMovie dateRelease
## Min. : 1.0 Body Snatchers (1993) : 2 01-Jan-1995:215
## 1st Qu.: 421.2 Butcher Boy, The (1998) : 2 01-Jan-1994:213
## Median : 841.5 Chairman of the Board (1998) : 2 01-Jan-1993:126
## Mean : 841.5 Chasing Amy (1997) : 2 01-Jan-1997: 98
## 3rd Qu.:1261.8 Deceiver (1997) : 2 01-Jan-1992: 37
## Max. :1682.0 Designated Mourner, The (1997): 2 01-Jan-1996: 26
## (Other) :1670 (Other) :967
## imdbUrl
## : 3
## http://us.imdb.com/M/title-exact?Body%20Snatchers%20(1993) : 2
## http://us.imdb.com/M/title-exact?Chasing+Amy+(1997) : 2
## http://us.imdb.com/M/title-exact?Designated%20Mourner%2C%20The%20%281997%29: 2
## http://us.imdb.com/M/title-exact?Fly%20Away%20Home%20(1996) : 2
## http://us.imdb.com/M/title-exact?Hugo+Pool+(1997) : 2
## (Other) :1669
## unknown Action Adventure Animation
## Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.001189 Mean :0.1492 Mean :0.08026 Mean :0.02497
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000000 Max. :1.0000 Max. :1.00000 Max. :1.00000
##
## Children Comedy Crime Documentary
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.07253 Mean :0.3002 Mean :0.0648 Mean :0.02973
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Drama Fantasy FilmNoir Horror
## Min. :0.000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.431 Mean :0.01308 Mean :0.01427 Mean :0.0547
## 3rd Qu.:1.000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## Musical Mystery Romance SciFi
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.03329 Mean :0.03627 Mean :0.1468 Mean :0.06005
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
##
## Thriller War Western
## Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.1492 Mean :0.04221 Mean :0.01605
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000
##
summary(data.ratings)
## idUser idMovie rating
## Min. : 1.0 Min. : 1.0 Min. :1.00
## 1st Qu.:254.0 1st Qu.: 175.0 1st Qu.:3.00
## Median :447.0 Median : 322.0 Median :4.00
## Mean :462.5 Mean : 425.5 Mean :3.53
## 3rd Qu.:682.0 3rd Qu.: 631.0 3rd Qu.:4.00
## Max. :943.0 Max. :1682.0 Max. :5.00
Generate descriptive statistics and plots
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
ggplot(data.users, aes(age, fill=gender, color=gender)) +
geom_density(alpha = 0.5) +
ggtitle('Age Distribution') +
theme_minimal() +
guides(fill = guide_legend(override.aes = list(alpha = 1))) +
ggtitle("Age Distribution") +
theme(text = element_text(size=20),
legend.position= "bottom",
axis.text = element_blank(),
axis.title=element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
ggplot(data.ratings, aes(rating)) +
geom_bar(alpha = 0.8, width=.8) +
ggtitle("Movie Ratings Distribution") +
xlab('Rating') +
theme_minimal() +
theme(text = element_text(size=20),
legend.position= "none",
axis.text.y = element_blank(),
axis.title.y =element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
Merge users and ratings data
data.userRating = merge(data.users, data.ratings, by = "idUser")
# merge userRating with movie to have actual movie titles instead of ids
data.userMovieRating = merge(data.userRating, data.movies, by = "idMovie")
ggplot(data.userMovieRating, aes(rating, fill=gender, color=gender)) +
geom_bar(alpha = 0.5, position="dodge", width=1) +
facet_wrap(~gender,scales = "free") +
ggtitle('Movie Ratings by Gender\n') +
theme_minimal() +
ylab("") +
xlab("Rating") +
guides(fill = guide_legend(override.aes = list(alpha = 1))) +
theme(text = element_text(size=15),
legend.position= "none",
axis.text.y = element_blank(),
# axis.title=element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
Feature scaling and data normalization. Scale feature values to fall in the interval [0, 1]
# norm.users stores the users with normalized age between [0, 1]
norm.users = data.users
# normalizing users age
norm.users$age = (norm.users$age - min(norm.users$age)) / (max(norm.users$age) - min(norm.users$age))
norm.userRating = data.userRating
norm.userRating$age = (data.userRating$age - min(data.userRating$age)) / (max(data.userRating$age) - min(data.userRating$age))
norm.userRating$rating = (data.userRating$rating - min(data.userRating$rating)) / (max(data.userRating$rating) - min(data.userRating$rating))
norm.userMovieRating = data.userMovieRating
norm.userMovieRating$age = (data.userMovieRating$age - min(data.userMovieRating$age)) / (max(data.userMovieRating$age) - min(data.userMovieRating$age))
norm.userMovieRating$rating = (data.userMovieRating$rating - min(data.userMovieRating$rating)) / (max(data.userMovieRating$rating) - min(data.userMovieRating$rating))
Normalized data summary and plots
summary(norm.userRating$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2576 0.3485 0.3935 0.5000 1.0000
# Normalized ratings summary
summary(norm.userRating$rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.5000 0.7500 0.6325 0.7500 1.0000
Split data into training and test sets
head(norm.userMovieRating)
## idMovie idUser age gender occupation rating nameMovie
## 1: 1 1 0.2575758 M technician 1.00 Toy Story (1995)
## 2: 1 2 0.6969697 F other 0.75 Toy Story (1995)
## 3: 1 5 0.3939394 F other 0.75 Toy Story (1995)
## 4: 1 6 0.5303030 M executive 0.75 Toy Story (1995)
## 5: 1 10 0.6969697 M lawyer 0.75 Toy Story (1995)
## 6: 1 13 0.6060606 M educator 0.50 Toy Story (1995)
## dateRelease imdbUrl
## 1: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 2: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 3: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 4: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 5: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 6: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## unknown Action Adventure Animation Children Comedy Crime Documentary
## 1: 0 0 0 1 1 1 0 0
## 2: 0 0 0 1 1 1 0 0
## 3: 0 0 0 1 1 1 0 0
## 4: 0 0 0 1 1 1 0 0
## 5: 0 0 0 1 1 1 0 0
## 6: 0 0 0 1 1 1 0 0
## Drama Fantasy FilmNoir Horror Musical Mystery Romance SciFi Thriller
## 1: 0 0 0 0 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0 0
## 3: 0 0 0 0 0 0 0 0 0
## 4: 0 0 0 0 0 0 0 0 0
## 5: 0 0 0 0 0 0 0 0 0
## 6: 0 0 0 0 0 0 0 0 0
## War Western
## 1: 0 0
## 2: 0 0
## 3: 0 0
## 4: 0 0
## 5: 0 0
## 6: 0 0
importantFeatures = c('age', 'gender', as.character(data.genres[2:nrow(data.genres)]$nameGenre), 'rating')
# choose important feature to create the model data
modelData = norm.userMovieRating[, importantFeatures, with = F]
# converting male and female to 1 and 2 before applyin KNN
modelData$gender = as.numeric(modelData$gender)
# ratings should not be normalized
modelData$rating = data.userMovieRating$rating
# splitting data into 80% train and 20% test set
ind = sample(2, nrow(modelData), replace=TRUE, prob=c(0.8, 0.2))
trainData = modelData[ind==1, ]
testData = modelData[ind==2, ]
## 80% of the sample size
smp_size <- floor(0.8 * nrow(modelData))
# set seed value to make partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(modelData)), size = smp_size)
train <- modelData[train_ind, ]
test <- modelData[-train_ind, ]
# use the different features to predict the rating
formula = rating ~ age + gender + Action + Adventure + Animation + Children +
+ Comedy + Crime + Documentary + Drama + Fantasy + FilmNoir + Horror +
+ Musical + Mystery + Romance + SciFi + Thriller + War + Western
Develop the ALS optimized softimpute algorithm to predict which movies users will like on the scale of 1-5. Fit ALS Soft Impute Model on the train data and make predictions on the test data.
library(softImpute)
## Loading required package: Matrix
## Loaded softImpute 1.4
require(softImpute)
trainData <- as.matrix(trainData)
fits = softImpute(trainData, trace=TRUE,type="als")
## 1 : obj 0.45936 ratio 670533.6
## 2 : obj 0.04228 ratio 0.04638652
## 3 : obj 0.04114 ratio 0.001235584
## 4 : obj 0.04077 ratio 0.0006998015
## 5 : obj 0.04053 ratio 0.0004588863
## 6 : obj 0.04037 ratio 0.0002900495
## 7 : obj 0.04028 ratio 0.0001760577
## 8 : obj 0.04022 ratio 0.0001040315
## 9 : obj 0.04019 ratio 6.052576e-05
## 10 : obj 0.04017 ratio 3.493359e-05
## 11 : obj 0.04016 ratio 2.009324e-05
## 12 : obj 0.04015 ratio 1.154772e-05
## 13 : obj 0.04015 ratio 6.640565e-06
fits$d
## [1] 1172.0584 184.1307
Impute the missing values using complete(), which returns the full matrix:
testData <- as.matrix(testData)
out <- complete(testData,fits)
head(out)
## age gender Action Adventure Animation Children Comedy Crime
## [1,] 0.2121212 2 0 0 1 1 1 0
## [2,] 0.3939394 2 0 0 1 1 1 0
## [3,] 0.3333333 1 0 0 1 1 1 0
## [4,] 0.2424242 1 0 0 1 1 1 0
## [5,] 0.1363636 2 0 0 1 1 1 0
## [6,] 0.2575758 2 0 0 1 1 1 0
## Documentary Drama Fantasy FilmNoir Horror Musical Mystery Romance
## [1,] 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0
## [4,] 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0
## SciFi Thriller War Western rating
## [1,] 0 0 0 0 5
## [2,] 0 0 0 0 4
## [3,] 0 0 0 0 5
## [4,] 0 0 0 0 2
## [5,] 0 0 0 0 5
## [6,] 0 0 0 0 4
predicted <- out[,21]
head(predicted)
## [1] 5 4 5 2 5 4
Store true results in bin
true <- testData[,21]
table(true, predicted)
## predicted
## true 1 2 3 4 5
## 1 1296 0 0 0 0
## 2 0 2255 0 0 0
## 3 0 0 5520 0 0
## 4 0 0 0 6783 0
## 5 0 0 0 0 4203
Store summary for the created model
library(caret)
## Loading required package: lattice
summarization = confusionMatrix(predicted, true)
summarization
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5
## 1 1296 0 0 0 0
## 2 0 2255 0 0 0
## 3 0 0 5520 0 0
## 4 0 0 0 6783 0
## 5 0 0 0 0 4203
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9998, 1)
## No Information Rate : 0.3382
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 1.00000 1.0000 1.0000 1.0000 1.0000
## Specificity 1.00000 1.0000 1.0000 1.0000 1.0000
## Pos Pred Value 1.00000 1.0000 1.0000 1.0000 1.0000
## Neg Pred Value 1.00000 1.0000 1.0000 1.0000 1.0000
## Prevalence 0.06462 0.1124 0.2752 0.3382 0.2096
## Detection Rate 0.06462 0.1124 0.2752 0.3382 0.2096
## Detection Prevalence 0.06462 0.1124 0.2752 0.3382 0.2096
## Balanced Accuracy 1.00000 1.0000 1.0000 1.0000 1.0000