DATA 643 - Project 1

Build a basic recommender system

Description

This system recommends the tv shows to users.

Source of the survey data is from: http://cs.mtu.edu/~lebrown/TechHacks-f13/rec-sys/data/cs1000_f13_data.csv

Load the data & Analyse

Lets find the top 3 TV shows by mean rating.

data <- read.csv("tv_ratings.csv")
head(data)

##   American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## 1             1              NA            5               5
## 2             2               3           NA              NA
## 3             3               1            5              NA
## 4            NA               1            4              NA
## 5             2              NA            5               5
## 6             1               5           NA              NA
##   Grey.s.Anatomy Homeland How.I.Met.Your.Mother Justified Mad.Men
## 1             NA        3                     4        NA       4
## 2              2        2                    NA         2       3
## 3             NA       NA                     5        NA      NA
## 4             NA       NA                    NA        NA      NA
## 5              1        3                     2        NA       5
## 6              3        3                    NA         4      NA
##   Modern.Family NCIS Parks.and.Recreation Real.Housewives.of.New.Jersey
## 1             4   NA                    4                             1
## 2             2   NA                    5                            NA
## 3            NA   NA                   NA                            NA
## 4             3   NA                   NA                            NA
## 5            NA    4                    5                             1
## 6             2    2                    1                             1
##   The.Voice The.Walking.Dead
## 1        NA                2
## 2         3                4
## 3         2                2
## 4        NA               NA
## 5         2               NA
## 6        NA                5

mns <- colMeans(data, na.rm = TRUE)
colnames(data)[order(mns, decreasing = TRUE)[1:3]]

## [1] "Breaking.Bad"     "The.Walking.Dead" "Game.of.Thrones"

Let’s look at the mean rating for each user and plot this as a histogram.

rmns <- rowMeans(data, na.rm = TRUE)
hist(rmns, breaks=15, main="Histogram of Mean User Ratings", xlab="Ratings", col= "light blue")

Mean ratings of each show.

cmns <- colMeans(data, na.rm = TRUE)
hist(cmns, breaks=15, main="Histogram of Mean TV Rating", xlab="Ratings", col= "light gray")

Item Based Similarity

Similarity of each tv show with the rest of the shows. We would need to compare each column in our data set with every other column in the data set. Basically, we will be comparing the ‘Cosine Similarity’ here:

#cosine sim = crossprod(x, y)/ ( sqrt(crossprod(x,x)) * sqrt(crossprod(y,y)) )
# In otherwords , its simply the (dotproduct of x,y) / (Magnitude of x * Magnitude of y)
cosineSim <- function(x, y)
{
  return ( sum(x*y , na.rm = TRUE) / ( sqrt(sum(x*x , na.rm = TRUE)) * sqrt(sum(y*y, na.rm = TRUE)) )  )
}

Now, lets prepare empty place holder to keep the cosine similarities, lising tv-show Vs tv-show.

data.ibs.sim <- matrix(NA, nrow=ncol(data),ncol=ncol(data),dimnames=list(colnames(data),colnames(data)))

#display just the first row.
head(data.ibs.sim,1)

##               American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## American.Idol            NA              NA           NA              NA
##               Grey.s.Anatomy Homeland How.I.Met.Your.Mother Justified
## American.Idol             NA       NA                    NA        NA
##               Mad.Men Modern.Family NCIS Parks.and.Recreation
## American.Idol      NA            NA   NA                   NA
##               Real.Housewives.of.New.Jersey The.Voice The.Walking.Dead
## American.Idol                            NA        NA               NA

Now, lets fill in the matrix with cosine similarities

#for each column in the data set
for(i in 1:ncol(data)) {
  #Loop thru the columns for each column
  for (j in 1:ncol(data)){
    #transforming the columns into matrices using as.matrix since 
    #the matrix operations are     generally faster
    data.ibs.sim[i,j] <- cosineSim(as.matrix(data[i]), as.matrix(data[j]))
  }
}

#transform the matrix back to data frame.
data.ibs.sim <- as.data.frame(data.ibs.sim)
head(data.ibs.sim, 2)

##                 American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## American.Idol       1.0000000       0.4904383    0.5315357       0.3810056
## Big.Bang.Theory     0.4904383       1.0000000    0.4768924       0.5086207
##                 Grey.s.Anatomy  Homeland How.I.Met.Your.Mother Justified
## American.Idol        0.4783666 0.4433691             0.5447451 0.3937809
## Big.Bang.Theory      0.5178657 0.4683435             0.6604074 0.5512647
##                   Mad.Men Modern.Family      NCIS Parks.and.Recreation
## American.Idol   0.4519371     0.4936818 0.5029490            0.5116847
## Big.Bang.Theory 0.3990493     0.5061522 0.5999742            0.5233974
##                 Real.Housewives.of.New.Jersey The.Voice The.Walking.Dead
## American.Idol                       0.4307218 0.5512879        0.5903987
## Big.Bang.Theory                     0.5630100 0.5816683        0.5874211

Since we have got the similarity matrix, we can make some recommendations. Lets find the top 3 neighbours of each tv-show.

Lets create a place holder again to keep the recommendations.

data.neighbours <- matrix(NA, nrow = ncol(data.ibs.sim), ncol = 3, dimnames = list(colnames(data.ibs.sim)))
data.neighbours

##                               [,1] [,2] [,3]
## American.Idol                   NA   NA   NA
## Big.Bang.Theory                 NA   NA   NA
## Breaking.Bad                    NA   NA   NA
## Game.of.Thrones                 NA   NA   NA
## Grey.s.Anatomy                  NA   NA   NA
## Homeland                        NA   NA   NA
## How.I.Met.Your.Mother           NA   NA   NA
## Justified                       NA   NA   NA
## Mad.Men                         NA   NA   NA
## Modern.Family                   NA   NA   NA
## NCIS                            NA   NA   NA
## Parks.and.Recreation            NA   NA   NA
## Real.Housewives.of.New.Jersey   NA   NA   NA
## The.Voice                       NA   NA   NA
## The.Walking.Dead                NA   NA   NA

Find the neighbours for each column.

for(i in 1:ncol(data))
{
  data.neighbours[i,]  <- (t(head(n=3,rownames(data.ibs.sim[order(data.ibs.sim[,i],decreasing=TRUE),][i]))))
}
data.neighbours

##                               [,1]                           
## American.Idol                 "American.Idol"                
## Big.Bang.Theory               "Big.Bang.Theory"              
## Breaking.Bad                  "Breaking.Bad"                 
## Game.of.Thrones               "Game.of.Thrones"              
## Grey.s.Anatomy                "Grey.s.Anatomy"               
## Homeland                      "Homeland"                     
## How.I.Met.Your.Mother         "How.I.Met.Your.Mother"        
## Justified                     "Justified"                    
## Mad.Men                       "Mad.Men"                      
## Modern.Family                 "Modern.Family"                
## NCIS                          "NCIS"                         
## Parks.and.Recreation          "Parks.and.Recreation"         
## Real.Housewives.of.New.Jersey "Real.Housewives.of.New.Jersey"
## The.Voice                     "The.Voice"                    
## The.Walking.Dead              "The.Walking.Dead"             
##                               [,2]                   
## American.Idol                 "The.Walking.Dead"     
## Big.Bang.Theory               "How.I.Met.Your.Mother"
## Breaking.Bad                  "How.I.Met.Your.Mother"
## Game.of.Thrones               "Parks.and.Recreation" 
## Grey.s.Anatomy                "Mad.Men"              
## Homeland                      "Justified"            
## How.I.Met.Your.Mother         "Big.Bang.Theory"      
## Justified                     "Homeland"             
## Mad.Men                       "Grey.s.Anatomy"       
## Modern.Family                 "How.I.Met.Your.Mother"
## NCIS                          "Big.Bang.Theory"      
## Parks.and.Recreation          "Mad.Men"              
## Real.Housewives.of.New.Jersey "Justified"            
## The.Voice                     "How.I.Met.Your.Mother"
## The.Walking.Dead              "Justified"            
##                               [,3]                           
## American.Idol                 "The.Voice"                    
## Big.Bang.Theory               "NCIS"                         
## Breaking.Bad                  "Parks.and.Recreation"         
## Game.of.Thrones               "Real.Housewives.of.New.Jersey"
## Grey.s.Anatomy                "Justified"                    
## Homeland                      "Mad.Men"                      
## How.I.Met.Your.Mother         "Parks.and.Recreation"         
## Justified                     "Real.Housewives.of.New.Jersey"
## Mad.Men                       "Parks.and.Recreation"         
## Modern.Family                 "Parks.and.Recreation"         
## NCIS                          "How.I.Met.Your.Mother"        
## Parks.and.Recreation          "How.I.Met.Your.Mother"        
## Real.Housewives.of.New.Jersey "Mad.Men"                      
## The.Voice                     "Parks.and.Recreation"         
## The.Walking.Dead              "How.I.Met.Your.Mother"

User Based Similarity

Lets try to normalize the ratings by subtracting from each rating the average rating of that user. This would turn low ratings into negative numbers and high ratings into positive numbers. We will then take the cosine distance. Uses with similar opinions about the show rated in common will have a relatively small angle between them.

data.mat <- as.matrix(data)
data.mat[is.na(data.mat)] <- 0
row.means <- rowMeans(data.mat)
#sweep the data - here 1 indicate row-wise
data.mat.normal <- sweep(data.mat, 1, row.means, FUN="-", check.margin = FALSE)

Prepare the similarity matrix using the cosine similarity between users

#prepare place holder matrix to keep the ubs
data.ubs.sim <- matrix(NA, nrow=nrow(data),ncol=nrow(data),dimnames=list(rownames(data),rownames(data)))

#display just the first row.
head(data.ubs.sim,1)

##    1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##   25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##   49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
##   73 74 75 76 77 78 79 80 81 82 83 84 85
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA

#Now lets fill-in the place holder matrix.
#for each row in the data set
for(i in 1:nrow(data)) {
  #Loop thru the rows for each row
  for (j in 1:nrow(data)){
    #transforming the rows into matrices using as.matrix since 
    #the matrix operations are     generally faster
    data.ubs.sim[i,j] <- cosineSim(as.matrix(data[i,]), as.matrix(data[j,]))
  }
}

#transform the matrix back to data frame.
data.ubs.sim <- as.data.frame(data.ubs.sim)
head(data.ubs.sim, 2)

##           1         2         3         4         5         6         7
## 1 1.0000000 0.5255957 0.5552058 0.5525460 0.8214676 0.2980968 0.7770178
## 2 0.5255957 1.0000000 0.2973254 0.1881545 0.5244200 0.7218398 0.6898997
##           8         9        10        11        12        13        14
## 1 0.7238999 0.7043607 0.5506465 0.6038670 0.5576189 0.2819657 0.5979140
## 2 0.6741999 0.2132007 0.1621687 0.4089373 0.5448463 0.4473375 0.4653789
##          15        16        17        18        19        20        21
## 1 0.6670322 0.3451057 0.7686806 0.5693280 0.2988844 0.6750124 0.4793489
## 2 0.7040681 0.4499770 0.5128226 0.7385489 0.2302831 0.7195524 0.2731155
##          22        23        24        25        26        27        28
## 1 0.5254558 0.4255513 0.2902542 0.6482194 0.6867768 0.4402255 0.2779426
## 2 0.4274430 0.4086347 0.4978513 0.7074523 0.7416198 0.0000000 0.3167230
##          29        30        31         32        33 34        35
## 1 0.4661681 0.6129807 0.3052858 0.05906244 0.6444241  0 0.6395394
## 2 0.5352180 0.5121982 0.4981889 0.26220221 0.5201565  0 0.3712496
##          36        37        38        39        40        41        42
## 1 0.5478641 0.5660507 0.3760399 0.8075350 0.2750067 0.8619130 0.4047931
## 2 0.3919647 0.6521826 0.3167230 0.7314248 0.0000000 0.5716888 0.8372579
##          43        44        45        46        47        48        49
## 1 0.7494896 0.5251722 0.2156655 0.6922844 0.5682412 0.6718340 0.6307786
## 2 0.4900190 0.3668370 0.4569538 0.6639061 0.6517861 0.6747699 0.6364279
##          50        51        52        53        54        55        56
## 1 0.2512693 0.3334490 0.2386529 0.8116573 0.3472722 0.7137749 0.5780443
## 2 0.2808218 0.4427924 0.6281486 0.3139219 0.2038589 0.7312470 0.1399731
##          57        58        59        60        61        62        63
## 1 0.5433745 0.7589992 0.3304136 0.3486408 0.6493556 0.6603382 0.4794770
## 2 0.7341662 0.4324500 0.5231388 0.4070403 0.5309441 0.5196767 0.5383054
##          64        65        66        67        68        69        70
## 1 0.6130673 0.5537780 0.4847743 0.3939839 0.6302478 0.2167520 0.5634886
## 2 0.5195887 0.7598833 0.7336740 0.3618734 0.5540103 0.1574592 0.3198011
##          71        72        73        74        75        76        77
## 1 0.7237947 0.6621573 0.2541643 0.8399485 0.7887638 0.0000000 0.5781033
## 2 0.8158958 0.4750845 0.2461830 0.4858835 0.6126375 0.1997781 0.7214736
##          78        79        80        81         82       83        84
## 1 0.4707188 0.7421207 0.0000000 0.6150155 0.02784230 0.392184 0.6201556
## 2 0.5815526 0.5717859 0.2926152 0.5112374 0.06741999 0.332385 0.5124861
##          85
## 1 0.6503915
## 2 0.6428243

Lets look at the top 3 neighbours of the user 3:

nd <- order(data.ubs.sim, decreasing = TRUE)

head(order(data.ubs.sim[3,],decreasing=TRUE),4)[-1]

## [1] 11 70 81

#So users 11, 70, and 81 are similar to user 3.
data[11,]

##    American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## 11             2               3            5              NA
##    Grey.s.Anatomy Homeland How.I.Met.Your.Mother Justified Mad.Men
## 11             NA       NA                     4        NA      NA
##    Modern.Family NCIS Parks.and.Recreation Real.Housewives.of.New.Jersey
## 11             2   NA                   NA                            NA
##    The.Voice The.Walking.Dead
## 11        NA                4

data[70,]

##    American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## 70             3              NA            4               3
##    Grey.s.Anatomy Homeland How.I.Met.Your.Mother Justified Mad.Men
## 70             NA       NA                     5        NA      NA
##    Modern.Family NCIS Parks.and.Recreation Real.Housewives.of.New.Jersey
## 70            NA    4                   NA                            NA
##    The.Voice The.Walking.Dead
## 70         4                3

data[81,]

##    American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## 81             2               5            5              NA
##    Grey.s.Anatomy Homeland How.I.Met.Your.Mother Justified Mad.Men
## 81             NA       NA                     4        NA      NA
##    Modern.Family NCIS Parks.and.Recreation Real.Housewives.of.New.Jersey
## 81             3   NA                    3                            NA
##    The.Voice The.Walking.Dead
## 81         2               NA

data[3,]

##   American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## 3             3               1            5              NA
##   Grey.s.Anatomy Homeland How.I.Met.Your.Mother Justified Mad.Men
## 3             NA       NA                     5        NA      NA
##   Modern.Family NCIS Parks.and.Recreation Real.Housewives.of.New.Jersey
## 3            NA   NA                   NA                            NA
##   The.Voice The.Walking.Dead
## 3         2                2

Analysis using R’s recommenderlab package:

#install.packages('recommenderlab', dependencies=TRUE)
library(recommenderlab)

rating.matrix <- as.matrix(data)
r1 <- as(rating.matrix, "realRatingMatrix")

rn1 <- normalize(r1)
image(r1, main="Ratings")

image(rn1, main="Normalized Ratings")

Recommendation System

Recommend the shows for a given user using User Similarity [ Userbased Collaborative Filtering ]

ubr <- Recommender(r1, method="UBCF") #Userbased Collaborative Filtering
pred <- predict(ubr, r1, type="ratings")
#as(pred, "matrix")

The supplied ratings for User 3 are:

data[3,]

##   American.Idol Big.Bang.Theory Breaking.Bad Game.of.Thrones
## 3             3               1            5              NA
##   Grey.s.Anatomy Homeland How.I.Met.Your.Mother Justified Mad.Men
## 3             NA       NA                     5        NA      NA
##   Modern.Family NCIS Parks.and.Recreation Real.Housewives.of.New.Jersey
## 3            NA   NA                   NA                            NA
##   The.Voice The.Walking.Dead
## 3         2                2

The predicted ratings for the missing shows are:

colnames(data)[is.na(data[3,])]

## [1] "Game.of.Thrones"               "Grey.s.Anatomy"               
## [3] "Homeland"                      "Justified"                    
## [5] "Mad.Men"                       "Modern.Family"                
## [7] "NCIS"                          "Parks.and.Recreation"         
## [9] "Real.Housewives.of.New.Jersey"

getRatings(pred[3,])

## [1] 3.413844 2.627422 2.871503 3.034138 3.020983 2.931710 3.098739 3.152282
## [9] 2.657036

Present the top 3 predicted ratings for user 3

pred <- predict(ubr, r1[3], n=3)
as(pred, "list")

## [[1]]
## [1] "Game.of.Thrones"      "Parks.and.Recreation" "NCIS"