Recommendation system 643 :: Project 3 - Singular Value Decomposition

Step 1 : Import required libraries and data

library(recommenderlab)
library(tm)
data(MovieLense)

Step 2 : Set up initial data

#hardcoding rating data
myMovieDataSet <- matrix(c(3, 2, NA, 1, 2, 3, 5, 1, 4, 3, 4, 1, 5, 3, 5, 4, 2, NA, 2, 1, NA, 2, 4, 2, 3, 5, 3, NA, 3, 1, 5, NA, 3, 5, 2, 3, 1, 3, 2, 3, NA, 3), nrow = 6, byrow = T)

# getting movie names from movieLense database
colnames(myMovieDataSet)<-colnames(MovieLense)[1:7]

# writing user names
rownames(myMovieDataSet) <- c("U1", "U2", "U3", "U4", "U5", "U6")

myMovieDataSet

##    Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## U1                3                2                NA                 1
## U2                1                4                 3                 4
## U3                5                4                 2                NA
## U4                2                4                 2                 3
## U5                3                1                 5                NA
## U6                3                1                 3                 2
##    Copycat (1995) Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
## U1              2                                                    3
## U2              1                                                    5
## U3              2                                                    1
## U4              5                                                    3
## U5              3                                                    5
## U6              3                                                   NA
##    Twelve Monkeys (1995)
## U1                     5
## U2                     3
## U3                    NA
## U4                    NA
## U5                     2
## U6                     3

Step 3 : Do SVD (Singular Value Decomposition) using built in function

#A matrix containing ratings (typically 1-5 stars, etc.).
ratingMatrix <- as(myMovieDataSet, "realRatingMatrix")
#Dissimilarity and Similarity Calculation Between Rating Data
#method=>similarity measure to use
similarity_rating_matrix <- as.matrix(similarity(ratingMatrix, method = "cosine", which = "items"))


s <- svd(similarity_rating_matrix)
#
s$u #s$u is the matrix whose columns contain the left singular vectors of X

##            [,1]       [,2]        [,3]       [,4]        [,5]        [,6]
## [1,] -0.3770026  0.5554078 -0.21864128  0.1402564 -0.36774684  0.19172555
## [2,] -0.3962431 -0.5311554 -0.12675147  0.5098699 -0.33376133 -0.41640854
## [3,] -0.3862971 -0.2434491 -0.51584529 -0.2377167  0.63757372  0.03676604
## [4,] -0.3562993  0.4002149 -0.25214857 -0.3254674 -0.13035985 -0.39787763
## [5,] -0.3972987 -0.1625243  0.63906292 -0.5314793 -0.08428847 -0.18803891
## [6,] -0.3926946  0.2870112  0.44458107  0.5128880  0.49873172  0.09732846
## [7,] -0.3356046 -0.2848984 -0.02002608 -0.1113474 -0.27209156  0.76508525
##              [,7]
## [1,] -0.556703613
## [2,] -0.006276293
## [3,] -0.247082023
## [4,]  0.606688246
## [5,] -0.287062270
## [6,]  0.210957105
## [7,]  0.366079082

V <- s$v
t(V) #s$v is the matrix whose columns contain the right singular vectors of X

##            [,1]         [,2]        [,3]       [,4]        [,5]
## [1,] -0.3770026 -0.396243078 -0.38629709 -0.3562993 -0.39729870
## [2,] -0.5554078  0.531155412  0.24344906 -0.4002149  0.16252433
## [3,]  0.2186413  0.126751470  0.51584529  0.2521486 -0.63906292
## [4,] -0.1402564 -0.509869911  0.23771670  0.3254674  0.53147930
## [5,]  0.3677468  0.333761334 -0.63757372  0.1303599  0.08428847
## [6,] -0.1917256  0.416408538 -0.03676604  0.3978776  0.18803891
## [7,]  0.5567036  0.006276293  0.24708202 -0.6066882  0.28706227
##             [,6]        [,7]
## [1,] -0.39269464 -0.33560462
## [2,] -0.28701125  0.28489843
## [3,] -0.44458107  0.02002608
## [4,] -0.51288798  0.11134737
## [5,] -0.49873172  0.27209156
## [6,] -0.09732846 -0.76508525
## [7,] -0.21095710 -0.36607908

s$d #s$d is the vector that contains the singular values of X

## [1] 4.1499588 1.0000000 0.8707070 0.7778677 0.6395443 0.4728235 0.3890163

s$u %*% diag(s$d) %*% t(s$v)

##               [,1]         [,2]         [,3]         [,4]         [,5]
## [1,] -4.024558e-16 7.930825e-01 7.604337e-01 4.594683e-01 8.265582e-01
## [2,]  7.930825e-01 5.839445e-17 6.859943e-01 7.950464e-01 7.925939e-01
## [3,]  7.604337e-01 6.859943e-01 4.996004e-16 6.135720e-01 7.961540e-01
## [4,]  4.594683e-01 7.950464e-01 6.135720e-01 2.498002e-16 6.835991e-01
## [5,]  8.265582e-01 7.925939e-01 7.961540e-01 6.835991e-01 9.020562e-17
## [6,]  6.378198e-01 7.699747e-01 8.091548e-01 7.033392e-01 7.178641e-01
## [7,]  6.375696e-01 5.359422e-01 5.719052e-01 6.125172e-01 5.663794e-01
##               [,6]          [,7]
## [1,]  6.378198e-01  6.375696e-01
## [2,]  7.699747e-01  5.359422e-01
## [3,]  8.091548e-01  5.719052e-01
## [4,]  7.033392e-01  6.125172e-01
## [5,]  7.178641e-01  5.663794e-01
## [6,] -4.163336e-17  7.024033e-01
## [7,]  7.024033e-01 -2.151057e-16

Step 4 : Get text data (plot summary of films) from IMDB

plot_summary <- c("A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boy's room", 
                  
"James Bond teams up with the lone survivor of a destroyed Russian research center to stop the hijacking of a nuclear space weapon by a fellow agent formerly believed to be dead", 

"Four interlocking tales that take place in a fading hotel on New Year's Eve", 

"A mobster travels to Hollywood to collect a debt and discovers that the movie business is much the same as his current job", 

"An agoraphobic psychologist and a female detective must work together to take down a serial killer who copies serial killers from the past", 

"A provincial boy related to a Shanghai crime family is recruited by his uncle into cosmopolitan Shanghai in the 1930s to be a servant to a ganglord's mistress", 

"In a future world devastated by disease, a convict is sent back in time to gather information about the man-made virus that wiped out most of the human population on the planet")

Step 5 : Build a corpus

movie_corpus <- Corpus(VectorSource(plot_summary))
dtm <- DocumentTermMatrix(movie_corpus, control = list(removePunctuation = TRUE, removeNumbers = TRUE, stopwords = TRUE, tolower = TRUE, weighting = weightTfIdf))
dtm_matrix <- as.matrix(dtm)
dimnames(dtm_matrix)$Docs <- colnames(myMovieDataSet)

Step 6 : Do SVD on Content

movie_svd <- svd(dtm_matrix)
#Sigmak <- movie_svd$d
Uk <- movie_svd$u
#Vk <- t(as.matrix(movie_svd$v))

movie_cat <- as.matrix(t(Uk))
dimnames(movie_cat) <- list(SVs = paste0("sv", 1:7), Movies = colnames(myMovieDataSet))

movies <- as(myMovieDataSet, "realRatingMatrix")
movies_norm <- normalize(movies)
norm_cat <- as.vector(movies_norm@data)
norm_cat <- ifelse(norm_cat == 0, NA, norm_cat)
norm_cat <- matrix(norm_cat, nrow = nrow(movies), ncol = ncol(movies))

user_cat <- matrix(nrow = nrow(myMovieDataSet), ncol = ncol(myMovieDataSet))
for (i in 1:nrow(user_cat)){
  for (j in 1:ncol(user_cat)){
    user_cat[i,j] <- sum(norm_cat[i,]*movie_cat[j,], na.rm = T)/sum(movie_cat[j,] != 0) 
  }
}

Step 7 : Estimate users’ ratings for unrated movies.

estimated <- matrix(nrow = nrow(myMovieDataSet), ncol = ncol(myMovieDataSet))
colnames(estimated) = colnames(myMovieDataSet)
rownames(estimated) = rownames(myMovieDataSet)
for (i in 1:nrow(estimated))
{
  for(j in 1:ncol(estimated))
  {
    if(is.na(norm_cat[i,j]))
    {
      numerator <- 0
      denominator <- 1
      
      for(k in 1:nrow(movie_cat))
      {
        numerator <- sum(user_cat[i,k]*movie_cat[k,j])
        denominator <- sqrt(sum(user_cat[i,]^2, na.rm = TRUE))*sqrt(sum(movie_cat[,j]^2, na.rm = TRUE))
      }
      estimated[i,j] <- numerator/denominator
    }
    else
    {
      estimated[i,j] <- NA
    }
  }
}


estimated

##    Toy Story (1995) GoldenEye (1995) Four Rooms (1995) Get Shorty (1995)
## U1               NA               NA     -2.887986e-16                NA
## U2               NA               NA      1.511556e-15                NA
## U3               NA               NA                NA      6.986927e-17
## U4               NA               NA                NA                NA
## U5               NA               NA                NA     -5.958972e-17
## U6               NA               NA                NA                NA
##    Copycat (1995) Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
## U1             NA                                                   NA
## U2             NA                                                   NA
## U3             NA                                                   NA
## U4             NA                                                   NA
## U5             NA                                                   NA
## U6             NA                                         2.270232e-15
##    Twelve Monkeys (1995)
## U1                    NA
## U2                     0
## U3                     0
## U4                     0
## U5                    NA
## U6                    NA

Looks like something gone wrong ! values are not as expected. Unfortunately changing the initial value didn’t change result much.

References

Data Mining Algorithms In R/Dimensionality Reduction/Singular Value Decomposition => https://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Dimensionality_Reduction/Singular_Value_Decomposition

Singular Value Decomposition | Stanford University https://www.youtube.com/watch?v=P5mlg91as1c

Recommendation system 643 :: Project 3 - Singular Value Decomposition

Chirag Vithalani

March 05, 2017

References