Recommendation Based on User Ratings

library(magrittr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(dplyr)

User-Based Filtering:

We find the distance between User A and User A1, A2, A3,…

We look for the nearest neighbor of A and recommend to A the products that this nearest neighbor rated highly.

Reading the File

# Load the small ratings table and keep the short alias `m.r`, which the
# following steps work on.
m.r <- movie.rating <- read.csv("C:\\DATA607\\Week912 Material\\Movie_Ratings.csv")
# Quick inspection helpers, left disabled:
# str(m.r)
# head(m.r)
# tail(m.r)

Example: Computing the Distance Between Patrick.C and Bryan

# Euclidean distance between Patrick.C and Bryan, restricted to the rows
# where both of them have a rating.
m.r %>%
  drop_na(Patrick.C, Bryan) %>%
  summarise(distance = sqrt(sum((Patrick.C - Bryan)^2)))
##   distance
## 1 4.472136

Using R's dist() function

# Pairwise Euclidean distances between users.
# dist() measures distances between the ROWS of its input, whereas every user
# is a COLUMN of m.r, so the ratings are transposed first.  dist() already
# returns the Euclidean distance (square root included) and rescales the sum
# when ratings are missing, so no further sqrt() is applied.
# NOTE(review): distances printed elsewhere in this document were produced
# before this correction and should be regenerated.
m.x <- select(m.r, -X)
ed <- dist(t(m.x), method = "euclidean", upper = TRUE)

Rearranging the dist output into a matrix

# Lay the distances out as a lower-triangular matrix, with 6 as the filler
# value above the diagonal.
# `ed` is a "dist" object: a plain vector that stores the lower triangle
# column by column, column i holding (n - i) values.  Indexing it as if it
# were a full square matrix (starts 1, 26, 51, ...) reads the wrong elements
# and runs past the end of the vector, so as.matrix() is used to expand it.
n.obj <- attr(ed, "Size")
# Dropping the first row and the last column leaves the (n - 1) x (n - 1)
# block whose cell [r, c] is the distance between object r + 1 and object c.
eds <- as.matrix(ed)[-1, -n.obj, drop = FALSE]
eds[upper.tri(eds)] <- 6

Nearest neighbor of Patrick.C

# Pair the first 24 entries of `ed` with the names of columns 3 to 26 of m.r
# and list the neighbours from the smallest distance to the largest.
e.dist <- ed[seq_len(24)]
Patrick.C <- data.frame(e.dist, neighbor = colnames(m.r)[3:26])
Patrick.C %>% arrange(e.dist)
##      e.dist  neighbor
## 1  2.080896       ben
## 2  2.340347     Chris
## 3  2.340347   Stephen
## 4  2.432299   Chris.1
## 5  2.514867   vanessa
## 6  2.514867      Gary
## 7  2.571843    Thomas
## 8  2.571843       Amy
## 9  2.610336     Bryan
## 10 2.610336      Matt
## 11 2.695335 Patrick.T
## 12 2.811707      Josh
## 13 2.827199      Erin
## 14 2.886751      greg
## 15 2.977256   Heather
## 16 3.010526  Jonathan
## 17 3.070520     aaron
## 18 3.117411 Katherine
## 19 3.162278       Zak
## 20 3.201086       Zwe
## 21 3.343702     brian
## 22 3.364407   Jessica
## 23 3.384736      Jeff
## 24 3.899603   Valerie

We can choose Ben and Chris as Patrick.C's closest neighbors.

 # Movies that at least one of the two nearest neighbours rated 5.
 # `==` returns NA where a rating is missing, and an NA inside a logical row
 # index makes `[` emit an all-NA row.  %in% never returns NA, so only the
 # movies that really carry a 5 are selected.
 pat.c <- m.r[m.r$ben %in% 5 | m.r$Chris %in% 5, c("X", "Patrick.C", "ben", "Chris")]
 # Keep the ones Patrick.C has not rated yet: these are the candidates to
 # recommend.
 filter(pat.c, is.na(Patrick.C))
##              X Patrick.C ben Chris
## 1    Gladiator        NA   5     3
## 2 Pulp Fiction        NA   4     5

We can recommend Gladiator and Pulp Fiction to Patrick.C.

Another Method

Euclidean Distance

# Euclidean distance between two rating vectors A and B.
# Positions where either rating is missing are left out of the sum.
squaroot <- function(A, B) {
  gap <- A - B
  sqrt(sum(gap * gap, na.rm = TRUE))
}
# Example: distance between Patrick.C and Heather.
with(m.r, squaroot(Patrick.C, Heather))
## [1] 4.123106

Computation of the Euclidean Distance for All Users

  # Euclidean distance between every pair of users (columns 2 to 26 of m.r).
  # Column k of `euclidist` holds the distances from the k-th user to all 25
  # users, in column order, so the result is a symmetric 25 x 25 matrix.
  # Every user is compared against the same full set of columns; seeding the
  # rows for columns 3..26 with column 3 would duplicate that column and leave
  # out the distance to column 2.
  user.cols <- 2:26
  euclidist <- vapply(
    user.cols,
    function(i) {
      vapply(user.cols, function(j) squaroot(m.r[, i], m.r[, j]), numeric(1))
    },
    numeric(length(user.cols))
  )
  # Label both dimensions with the user names so that cells can be looked up
  # by name.
  dimnames(euclidist) <- list(colnames(m.r)[user.cols], colnames(m.r)[user.cols])

Item-Based Filtering

User-based filtering consumes a lot of memory, and the computation becomes slow as the number of users increases.

Item-based filtering finds the items most similar to a target item and combines that with the user's ratings to generate recommendations.

Reading the File

# Load the long-format ratings file (one row per rating) and point the
# working alias `m.r` at it, then show its structure.
m.r <- movie.rating <- read.csv("C:\\DATA607\\Project4\\ml-20m\\ratings.csv")
str(m.r)
## 'data.frame':    20000263 obs. of  4 variables:
##  $ userId   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ movieId  : int  2 29 32 47 50 112 151 223 253 260 ...
##  $ rating   : num  3.5 3.5 3.5 3.5 3.5 3.5 4 4 4 4 ...
##  $ timestamp: int  1112486027 1112484676 1112484819 1112484727 1112484580 1094785740 1094785734 1112485573 1112484940 1112484826 ...

Calculating the Adjusted Cosine Similarity

Computing Each User Average

 # Mean rating given by each user: one row per userId, mean in column Ru.hat.
 Ru.hat <- summarise(
   group_by(m.r, userId),
   Ru.hat = mean(rating, na.rm = TRUE)
 )
 str(Ru.hat)
## Classes 'tbl_df', 'tbl' and 'data.frame':    138493 obs. of  2 variables:
##  $ userId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Ru.hat: num  3.74 4 4.12 3.57 4.27 ...
# Attach each user's mean rating to every one of that user's rating rows.
m.r <- merge(x = m.r, y = Ru.hat, by = "userId")

Computing \(x-\bar x\) and \((x-\bar x)^2\)

 # Centre every rating on its user's mean (D.Ru) and keep the squared
 # deviation as well (DS.Ru).
 m.r$D.Ru <- m.r$rating - m.r$Ru.hat
 m.r$DS.Ru <- m.r$D.Ru^2
 # Number of ratings received by each movie.
 mvi <- summarise(group_by(m.r, movieId), nm = n())

Correlation between different items

    # Item-item similarity: for each pair of movies, correlate the
    # user-centred ratings (D.Ru) of the users who rated both movies.
    #
    # The pairs are enumerated explicitly.  Loop bounds written as
    # `1:length(mvi-1)` and `i+1:length(mvi)` do not do this: length() of a
    # data frame is its number of columns, and `:` binds tighter than `+`, so
    # the second bound means `i + (1:length(mvi))`.
    #
    # n.items is the number of movies (in movieId order) that are compared.
    # It is kept at 4, the range the earlier loop actually visited.  Raise it
    # deliberately: every pair is visited, so the work grows quadratically.
    n.items <- min(nrow(mvi), 4L)
    ids <- mvi$movieId[seq_len(n.items)]

    # Scan the large ratings table once and keep one small table per movie,
    # instead of subsetting all of m.r again for every pair.
    centred <- m.r[m.r$movieId %in% ids, c("userId", "movieId", "D.Ru")]
    by.movie <- split(centred[, c("userId", "D.Ru")], centred$movieId)

    # Correlation of centred ratings between movies a and b over their common
    # users; NA when fewer than two users rated both.
    item.cor <- function(a, b) {
      both <- merge(by.movie[[as.character(a)]], by.movie[[as.character(b)]],
                    by = "userId")
      if (nrow(both) < 2) {
        return(NA_real_)
      }
      cor(both$D.Ru.x, both$D.Ru.y)
    }

    # One column per unordered pair; combn() needs at least two ids.
    pairs <- if (n.items >= 2) combn(ids, 2) else matrix(ids[0], nrow = 2, ncol = 0)
    s <- vapply(
      seq_len(ncol(pairs)),
      function(p) item.cor(pairs[1, p], pairs[2, p]),
      numeric(1)
    )
    names(s) <- paste(pairs[1, ], pairs[2, ], sep = "-")
    # NOTE(review): cor() re-centres each vector on its own mean, so `s` is a
    # Pearson correlation of user-centred ratings.  An adjusted cosine would be
    # sum(x * y) / sqrt(sum(x^2) * sum(y^2)) on the same vectors - confirm
    # which of the two is intended.

    # Worked example on a single pair: the second and fourth movies in `mvi`,
    # which is the pair the earlier loop finished on.
    ma1 <- subset(m.r, m.r$movieId == mvi$movieId[2])
    ma2 <- subset(m.r, m.r$movieId == mvi$movieId[4])
    mering <- merge(ma1, ma2, by = "userId")
    cor(mering$D.Ru.x, mering$D.Ru.y)
## [1] 0.08631851

For example, movie 2 and movie 29 are similar. The movie.csv file lists every movie title with its corresponding ID: Jumanji and The City of Lost Children are two adventure movies for children that also involve fantasy.