Started at 1:58 pm PCT on 04/04/2016 Total Time Spent 3.5 hours

I. Data Preprocessing

Install data table package and read in data

# data is being stored in data.table
library(data.table)

# genres
data.genres = as.data.table(read.table('data/u.genre', sep = '|'))
setnames(data.genres, c('nameGenre', 'idGenre'))

# contains information about each movie, from which genres binary data is most important for
# the manipulations which will be done
data.movies = as.data.table(read.table('data/u.item', sep = '|', quote = ""))
# removing the columns in which all values are NA (in my case 4th column)
data.movies = data.movies[, colSums(is.na(data.movies))<nrow(data.movies), with = F]

# setting names of the columns in data.movies
setnames(data.movies, c('idMovie', 'nameMovie', 'dateRelease', 'imdbUrl', 
                        as.character(data.genres$nameGenre)))
# stores data about each user
data.users = as.data.table(read.table('data/u.user', sep = '|'))

# removing 5th row, because I had no idea how to use ZIP code for the future algorithms
data.users = data.users[, -5, with = F]
setnames(data.users, c('idUser', 'age', 'gender', 'occupation'))

# loading ratings given by users
data.ratings = as.data.table(read.table('data/u.data', sep = '\t'))

# deleting 4th column which represents the time when the movie was rated by that user
data.ratings = data.ratings[, -4, with = F]

# setting column names
setnames(data.ratings, c('idUser', 'idMovie', 'rating'))

# make ratings floats
data.ratings$rating = as.numeric(data.ratings$rating)

Basic summaries for all read data tables before normalization

summary(data.users)

##      idUser           age        gender          occupation 
##  Min.   :  1.0   Min.   : 7.00   F:273   student      :196  
##  1st Qu.:236.5   1st Qu.:25.00   M:670   other        :105  
##  Median :472.0   Median :31.00           educator     : 95  
##  Mean   :472.0   Mean   :34.05           administrator: 79  
##  3rd Qu.:707.5   3rd Qu.:43.00           engineer     : 67  
##  Max.   :943.0   Max.   :73.00           programmer   : 66  
##                                          (Other)      :335

summary(data.movies)

##     idMovie                                nameMovie         dateRelease 
##  Min.   :   1.0   Body Snatchers (1993)         :   2   01-Jan-1995:215  
##  1st Qu.: 421.2   Butcher Boy, The (1998)       :   2   01-Jan-1994:213  
##  Median : 841.5   Chairman of the Board (1998)  :   2   01-Jan-1993:126  
##  Mean   : 841.5   Chasing Amy (1997)            :   2   01-Jan-1997: 98  
##  3rd Qu.:1261.8   Deceiver (1997)               :   2   01-Jan-1992: 37  
##  Max.   :1682.0   Designated Mourner, The (1997):   2   01-Jan-1996: 26  
##                   (Other)                       :1670   (Other)    :967  
##                                                                         imdbUrl    
##                                                                             :   3  
##  http://us.imdb.com/M/title-exact?Body%20Snatchers%20(1993)                 :   2  
##  http://us.imdb.com/M/title-exact?Chasing+Amy+(1997)                        :   2  
##  http://us.imdb.com/M/title-exact?Designated%20Mourner%2C%20The%20%281997%29:   2  
##  http://us.imdb.com/M/title-exact?Fly%20Away%20Home%20(1996)                :   2  
##  http://us.imdb.com/M/title-exact?Hugo+Pool+(1997)                          :   2  
##  (Other)                                                                    :1669  
##     unknown             Action         Adventure         Animation      
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.001189   Mean   :0.1492   Mean   :0.08026   Mean   :0.02497  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##                                                                         
##     Children           Comedy           Crime         Documentary     
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.07253   Mean   :0.3002   Mean   :0.0648   Mean   :0.02973  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                       
##      Drama          Fantasy           FilmNoir           Horror      
##  Min.   :0.000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.431   Mean   :0.01308   Mean   :0.01427   Mean   :0.0547  
##  3rd Qu.:1.000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##                                                                      
##     Musical           Mystery           Romance           SciFi        
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.03329   Mean   :0.03627   Mean   :0.1468   Mean   :0.06005  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##                                                                        
##     Thriller           War             Western       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.1492   Mean   :0.04221   Mean   :0.01605  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##

summary(data.ratings)

##      idUser         idMovie           rating    
##  Min.   :  1.0   Min.   :   1.0   Min.   :1.00  
##  1st Qu.:254.0   1st Qu.: 175.0   1st Qu.:3.00  
##  Median :447.0   Median : 322.0   Median :4.00  
##  Mean   :462.5   Mean   : 425.5   Mean   :3.53  
##  3rd Qu.:682.0   3rd Qu.: 631.0   3rd Qu.:4.00  
##  Max.   :943.0   Max.   :1682.0   Max.   :5.00

Generate descriptive statistics and plots

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.4

ggplot(data.users, aes(age, fill=gender, color=gender)) +
 geom_density(alpha = 0.5) +
  ggtitle('Age Distribution') +
  theme_minimal() +
    guides(fill = guide_legend(override.aes = list(alpha = 1))) +
    ggtitle("Age Distribution") +
    theme(text = element_text(size=20),
      legend.position= "bottom",
      axis.text = element_blank(), 
      axis.title=element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank())

ggplot(data.ratings, aes(rating)) +
 geom_bar(alpha = 0.8, width=.8) +
  ggtitle("Movie Ratings Distribution") +
  xlab('Rating') +
  theme_minimal() +
    theme(text = element_text(size=20),
      legend.position= "none",
      axis.text.y = element_blank(), 
      axis.title.y =element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank())

Merge users and ratings data

data.userRating = merge(data.users, data.ratings, by = "idUser")

# merge userRating with movie to have actual movie titles instead of ids
data.userMovieRating = merge(data.userRating, data.movies, by = "idMovie")

ggplot(data.userMovieRating, aes(rating, fill=gender, color=gender)) +
 geom_bar(alpha = 0.5, position="dodge", width=1) + 
  facet_wrap(~gender,scales = "free") +
  ggtitle('Movie Ratings by Gender\n') +
  theme_minimal() +
  ylab("") + 
  xlab("Rating") +
    guides(fill = guide_legend(override.aes = list(alpha = 1))) +
    theme(text = element_text(size=15),
      legend.position= "none",
      axis.text.y = element_blank(), 
    #  axis.title=element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      panel.background = element_blank())

Feature scaling and data normalization. Scale feature values to fall in the interval [0, 1]

# norm.users stores the users with normalized age between [0, 1]
norm.users = data.users

# normalizing users age
norm.users$age = (norm.users$age - min(norm.users$age)) / (max(norm.users$age) - min(norm.users$age))
norm.userRating = data.userRating
norm.userRating$age = (data.userRating$age - min(data.userRating$age)) / (max(data.userRating$age) - min(data.userRating$age))
norm.userRating$rating = (data.userRating$rating - min(data.userRating$rating)) / (max(data.userRating$rating) - min(data.userRating$rating))

norm.userMovieRating = data.userMovieRating
norm.userMovieRating$age = (data.userMovieRating$age - min(data.userMovieRating$age)) / (max(data.userMovieRating$age) - min(data.userMovieRating$age))
norm.userMovieRating$rating = (data.userMovieRating$rating - min(data.userMovieRating$rating)) / (max(data.userMovieRating$rating) - min(data.userMovieRating$rating))

Normalized data summary and plots

summary(norm.userRating$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2576  0.3485  0.3935  0.5000  1.0000

# Normalized ratings summary
summary(norm.userRating$rating)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5000  0.7500  0.6325  0.7500  1.0000

Split data into training and test sets

head(norm.userMovieRating)

##    idMovie idUser       age gender occupation rating        nameMovie
## 1:       1      1 0.2575758      M technician   1.00 Toy Story (1995)
## 2:       1      2 0.6969697      F      other   0.75 Toy Story (1995)
## 3:       1      5 0.3939394      F      other   0.75 Toy Story (1995)
## 4:       1      6 0.5303030      M  executive   0.75 Toy Story (1995)
## 5:       1     10 0.6969697      M     lawyer   0.75 Toy Story (1995)
## 6:       1     13 0.6060606      M   educator   0.50 Toy Story (1995)
##    dateRelease                                               imdbUrl
## 1: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 2: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 3: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 4: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 5: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
## 6: 01-Jan-1995 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
##    unknown Action Adventure Animation Children Comedy Crime Documentary
## 1:       0      0         0         1        1      1     0           0
## 2:       0      0         0         1        1      1     0           0
## 3:       0      0         0         1        1      1     0           0
## 4:       0      0         0         1        1      1     0           0
## 5:       0      0         0         1        1      1     0           0
## 6:       0      0         0         1        1      1     0           0
##    Drama Fantasy FilmNoir Horror Musical Mystery Romance SciFi Thriller
## 1:     0       0        0      0       0       0       0     0        0
## 2:     0       0        0      0       0       0       0     0        0
## 3:     0       0        0      0       0       0       0     0        0
## 4:     0       0        0      0       0       0       0     0        0
## 5:     0       0        0      0       0       0       0     0        0
## 6:     0       0        0      0       0       0       0     0        0
##    War Western
## 1:   0       0
## 2:   0       0
## 3:   0       0
## 4:   0       0
## 5:   0       0
## 6:   0       0

importantFeatures = c('age', 'gender', as.character(data.genres[2:nrow(data.genres)]$nameGenre), 'rating')

# choose important feature to create the model data
modelData = norm.userMovieRating[, importantFeatures, with = F]
# converting male and female to 1 and 2 before applyin KNN
modelData$gender = as.numeric(modelData$gender)

# ratings should not be normalized
modelData$rating = data.userMovieRating$rating

# splitting data into 80% train and 20% test set
ind = sample(2, nrow(modelData), replace=TRUE, prob=c(0.8, 0.2))
trainData = modelData[ind==1, ]
testData = modelData[ind==2, ]

## 80% of the sample size
smp_size <- floor(0.8 * nrow(modelData))

# set seed value to make partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(modelData)), size = smp_size)

train <- modelData[train_ind, ]
test <- modelData[-train_ind, ]

# use the different features to predict the rating
formula = rating ~ age + gender  +  Action + Adventure + Animation + Children +
  + Comedy + Crime + Documentary + Drama + Fantasy + FilmNoir + Horror +     
  + Musical + Mystery + Romance + SciFi + Thriller + War + Western

II. Model

Develop the ALS optimized softimpute algorithm to predict which movies users will like on the scale of 1-5. Fit ALS Soft Impute Model on the train data and make predictions on the test data.

library(softImpute)

## Loading required package: Matrix

## Loaded softImpute 1.4

require(softImpute)

trainData <- as.matrix(trainData) 
fits = softImpute(trainData, trace=TRUE,type="als")

## 1 : obj 0.45936 ratio 670533.6 
## 2 : obj 0.04228 ratio 0.04638652 
## 3 : obj 0.04114 ratio 0.001235584 
## 4 : obj 0.04077 ratio 0.0006998015 
## 5 : obj 0.04053 ratio 0.0004588863 
## 6 : obj 0.04037 ratio 0.0002900495 
## 7 : obj 0.04028 ratio 0.0001760577 
## 8 : obj 0.04022 ratio 0.0001040315 
## 9 : obj 0.04019 ratio 6.052576e-05 
## 10 : obj 0.04017 ratio 3.493359e-05 
## 11 : obj 0.04016 ratio 2.009324e-05 
## 12 : obj 0.04015 ratio 1.154772e-05 
## 13 : obj 0.04015 ratio 6.640565e-06

fits$d

## [1] 1172.0584  184.1307

Impute the missing values using complete(), which returns the full matrix:

testData <- as.matrix(testData) 
out <- complete(testData,fits)
head(out)

##            age gender Action Adventure Animation Children Comedy Crime
## [1,] 0.2121212      2      0         0         1        1      1     0
## [2,] 0.3939394      2      0         0         1        1      1     0
## [3,] 0.3333333      1      0         0         1        1      1     0
## [4,] 0.2424242      1      0         0         1        1      1     0
## [5,] 0.1363636      2      0         0         1        1      1     0
## [6,] 0.2575758      2      0         0         1        1      1     0
##      Documentary Drama Fantasy FilmNoir Horror Musical Mystery Romance
## [1,]           0     0       0        0      0       0       0       0
## [2,]           0     0       0        0      0       0       0       0
## [3,]           0     0       0        0      0       0       0       0
## [4,]           0     0       0        0      0       0       0       0
## [5,]           0     0       0        0      0       0       0       0
## [6,]           0     0       0        0      0       0       0       0
##      SciFi Thriller War Western rating
## [1,]     0        0   0       0      5
## [2,]     0        0   0       0      4
## [3,]     0        0   0       0      5
## [4,]     0        0   0       0      2
## [5,]     0        0   0       0      5
## [6,]     0        0   0       0      4

predicted <- out[,21]
head(predicted)

## [1] 5 4 5 2 5 4

Store true results in bin

true <- testData[,21]
table(true, predicted)

##     predicted
## true    1    2    3    4    5
##    1 1296    0    0    0    0
##    2    0 2255    0    0    0
##    3    0    0 5520    0    0
##    4    0    0    0 6783    0
##    5    0    0    0    0 4203

Store summary for the created model

library(caret)

## Loading required package: lattice

summarization = confusionMatrix(predicted, true)
summarization

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    4    5
##          1 1296    0    0    0    0
##          2    0 2255    0    0    0
##          3    0    0 5520    0    0
##          4    0    0    0 6783    0
##          5    0    0    0    0 4203
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9998, 1)
##     No Information Rate : 0.3382     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           1.00000   1.0000   1.0000   1.0000   1.0000
## Specificity           1.00000   1.0000   1.0000   1.0000   1.0000
## Pos Pred Value        1.00000   1.0000   1.0000   1.0000   1.0000
## Neg Pred Value        1.00000   1.0000   1.0000   1.0000   1.0000
## Prevalence            0.06462   0.1124   0.2752   0.3382   0.2096
## Detection Rate        0.06462   0.1124   0.2752   0.3382   0.2096
## Detection Prevalence  0.06462   0.1124   0.2752   0.3382   0.2096
## Balanced Accuracy     1.00000   1.0000   1.0000   1.0000   1.0000

ML Interview Assignment

Michael Jules

April 6, 2016

I. Data Preprocessing

II. Model