This is based on the article linked below by Muffaddal Qutbuddin.
https://towardsdatascience.com/comprehensive-guide-on-item-based-recommendation-systems-d67e40e2b75d
The data set in this exercise has 20 users and 20 movies.
# Load the ratings workbook: the first column is the user id and each
# remaining column holds the ratings for one movie.
data <- read_excel(path = 'IBCF - Movie Ratings.xlsx')
Please refer to the article above to download this sample data set.
Adjust the movie ratings by subtracting each user's average rating from every rating given by that user. This corrects for user bias.
## Mean-centre every user's ratings to remove user bias.
##
## Input:  `data` - column 1 is the user id, every other column is one movie;
##         a rating of 0 or NA means "not rated".
## Output: `data.adjusted` - same layout as `data`; each rated cell holds the
##         rating minus that user's average over the movies they rated, and
##         each unrated cell holds 0.
##
## The work is done on a numeric matrix instead of growing a data frame one
## cell at a time, and `seq_len()` keeps the code safe for empty input.

# rating columns are everything except the leading user-id column
rating.cols <- seq_len(ncol(data))[-1]
ratings <- as.matrix(data[, rating.cols, drop = FALSE])

# TRUE where the user actually rated the movie (neither NA nor 0)
is.rated <- !is.na(ratings) & ratings != 0
ratings[!is.rated] <- NA

# average rating per user over rated movies only
# (NaN for a user without any rating; such a row ends up all zero below)
user.means <- vapply(
  seq_len(nrow(ratings)),
  function(u) mean(ratings[u, ], na.rm = TRUE),
  numeric(1)
)

# matrices are stored column by column, so recycling `user.means` subtracts
# user u's mean from every element of row u
adjusted <- ratings - user.means

# store zero in case of no rating
adjusted[!is.rated] <- 0

# copy the layout (including the user ids) and overwrite the rating columns
data.adjusted <- data
for (k in seq_along(rating.cols)) {
  data.adjusted[[rating.cols[k]]] <- unname(adjusted[, k])
}
Create the `data.ibs` and `data.adjusted.ibs` data frames, which are `data` and `data.adjusted` without the `User` column. Replace zero ratings with `NA` in the `data` and `data.ibs` data frames.
# Item-only view of the adjusted ratings (user-id column dropped).
data.adjusted.ibs <- data.adjusted[, -1]

# A rating of 0 means "not rated": mark it as missing in the raw ratings ...
data[data == 0] <- NA

# ... and take the item-only view of the raw ratings afterwards, so it
# carries the same NA markers.
data.ibs <- data[, -1]
Create the function `calCosine` to calculate cosine similarity.
The function defined below is based on this formula:

$$\mathrm{sim}(i, j) = \frac{\sum_{u} r_{i,\mathrm{adj}}(u)\, r_{j,\mathrm{adj}}(u)}{\sqrt{\sum_{u} r_{i,\mathrm{adj}}(u)^{2}}\;\sqrt{\sum_{u} r_{j,\mathrm{adj}}(u)^{2}}}$$

- `r_i_adj` - adjusted rating for the ith item across all users
- `r_j_adj` - adjusted rating for the jth item across all users

NOTE: this function was modified so that it is consistent with the formula above. The denominator should use the adjusted ratings as well. This was not the case with the original function.
# Cosine similarity between two items.
#
# r_i_adj, r_j_adj: numeric vectors (or one-column matrices) of equal length
#   holding the adjusted ratings of item i and item j.
# Returns a single number: the dot product divided by the product of the two
#   Euclidean norms. The result is NaN when either input is all zeros.
calCosine <- function(r_i_adj, r_j_adj) {
  dot.product <- sum(r_i_adj * r_j_adj)
  norm.i <- sqrt(sum(r_i_adj * r_i_adj))
  norm.j <- sqrt(sum(r_j_adj * r_j_adj))
  dot.product / (norm.i * norm.j)
}
# Build the item-to-item similarity table `data.ibs.similarity`.
#
# Result layout: one row per movie; column `items` holds the movie name and
# every other column (named after a movie) holds the cosine similarity
# between the row's movie and that column's movie, computed by calCosine()
# on the adjusted ratings in `data.adjusted.ibs`.
#
# The similarities are collected in a preallocated numeric matrix and turned
# into a data frame once at the end, instead of growing an empty table cell
# by cell.

adjusted.ratings <- as.matrix(data.adjusted.ibs)
movie.names <- colnames(data.adjusted.ibs)
movie.count <- length(movie.names)

similarity <- matrix(
  NA_real_,
  nrow = movie.count,
  ncol = movie.count,
  dimnames = list(NULL, movie.names)
)

# The cosine is symmetric (both the dot product and the product of the norms
# are commutative), so each pair is computed once and mirrored.
for (i in seq_len(movie.count)) {
  for (j in i:movie.count) {
    similarity[i, j] <- calCosine(adjusted.ratings[, i], adjusted.ratings[, j])
    similarity[j, i] <- similarity[i, j]
  }
}

# check.names = FALSE keeps the movie titles verbatim as column names
data.ibs.similarity <- data.frame(
  items = movie.names,
  similarity,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
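Create the function `calScore` to calculate the recommendation score.
This function is based on this formula:

$$\mathrm{score}(u, i) = \bar{r}_{i} + \frac{\sum_{j \in N(i)} \mathrm{sim}(i, j)\,\bigl(r_{u,j} - \bar{r}_{j}\bigr)}{\sum_{j \in N(i)} \mathrm{sim}(i, j)}$$

Here $N(i)$ is the set of movies most similar to movie $i$, $r_{u,j}$ is user $u$'s rating of movie $j$, and $\bar{r}_{j}$ is the average rating of movie $j$. `calScore` computes the fraction; the target movie's average rating $\bar{r}_{i}$ is added by the code that calls it.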
# Similarity-weighted average of mean-centred ratings.
#
# history:      the user's ratings of the neighbouring items.
# similarities: similarity of each neighbouring item to the target item.
# avgRating:    average rating of each neighbouring item.
# Returns sum((history - avgRating) * similarities) / sum(similarities).
calScore <- function(history, similarities, avgRating) {
  centred.history <- history - avgRating
  weighted.total <- sum(centred.history * similarities)
  weighted.total / sum(similarities)
}
NOTE: The code below was modified so that the target item's overall average rating (`targetItem.rating.avg`) is taken into account instead of the user's average rating. I think that this modification aligns the calculation with the formula above.
# Compute a recommendation score for every (user, movie) pair.
#
# Reads:  data                - ratings with the user id in column 1 and NA
#                               for "not rated"
#         data.ibs            - the same ratings without the user-id column
#         data.ibs.similarity - item-to-item similarities (column `items`
#                               plus one column per movie)
#         calScore()          - similarity-weighted average of centred ratings
# Writes: data.ibs.user.score - same layout as `data`; -1 for movies the user
#                               has already rated, otherwise the predicted
#                               score
#         user1577_ToyStory   - the inputs behind the score of user 1577 for
#                               Toy Story
#
# Changes compared with the earlier version of this chunk (results are the
# same for unique user ids and distinct similarities):
#   * the target movie is removed from its neighbour list by name instead of
#     by "first row after sorting", which is fragile for ties;
#   * the user's ratings are read by row index instead of matching on the
#     user id, so duplicate or missing ids cannot select the wrong rows;
#   * the neighbourhood size is a single variable instead of 11 and 10
#     scattered through the code;
#   * per-movie averages are computed once, outside the loops.

#create empty dataframe for score
data.ibs.user.score <- data[FALSE, ]

# number of most similar movies used for each prediction
neighbour.count <- 10

# average rating of every movie over the users who rated it
item.means <- colMeans(data.ibs, na.rm = TRUE)

# Loop through the users (rows)
for (i in seq_len(nrow(data.ibs))) {
  #get user id for which to calculate score
  users <- as.numeric(data[i, 1])
  data.ibs.user.score[i, 1] <- users

  # Loop through the movies (columns); column 1 is the user id
  for (j in seq_len(ncol(data))[-1]) {
    # Get the movie's name
    item <- colnames(data)[j]
    user.rating <- data[[j]][i]

    # We do not want to recommend movies the user has already rated:
    # those cells receive -1.
    if (!is.na(user.rating) && user.rating > 0) {
      data.ibs.user.score[i, j] <- -1
    } else {
      # candidate neighbours: every movie except the target itself
      is.other.movie <- data.ibs.similarity[["items"]] != item
      candidates <- data.ibs.similarity[is.other.movie, c("items", item)]

      # keep the `neighbour.count` movies most similar to the target
      similarity.order <- order(candidates[, item], decreasing = TRUE)
      topN <- head(candidates[similarity.order, ], n = neighbour.count)
      topN.similarities <- as.numeric(topN[, item])

      # the user's ratings for those neighbours (row i is the current user)
      topN.userPurchases <- as.numeric(data[i, topN$items])

      # average rating of each neighbour and of the target movie
      item.rating.avg <- as.numeric(item.means[topN$items])
      targetItem.rating.avg <- item.means[item]

      # replace NA with zero: a neighbour the user has not rated enters the
      # score as a rating of 0
      topN.userPurchases[is.na(topN.userPurchases)] <- 0

      #calculate score for the given movie and the user
      data.ibs.user.score[i, j] <- targetItem.rating.avg +
        calScore(
          similarities = topN.similarities,
          history = topN.userPurchases,
          avgRating = item.rating.avg
        )

      #Get the most similar movies for Toy Story and relevant ratings for User 1577
      if (item == '1: Toy Story (1995)' && isTRUE(users == '1577')) {
        user1577_ToyStory <- cbind(
          topN$items,
          topN.similarities,
          topN.userPurchases,
          item.rating.avg,
          rep(targetItem.rating.avg, length(topN.similarities))
        )
      }
    } # close else statement
  } # end product for loop
} # end user for loop
User 1577 has a recommendation score of 1.3441 for Toy Story.
The table below provides the information you need to calculate the recommendation score for Toy Story for user 1577.
Remember, movies that were already rated by a user receive a score of -1.
# Save the item-to-item similarity table and the per-user recommendation
# scores as CSV files in the working directory.
write.csv(x = data.ibs.similarity, file = 'similarity matrix.csv')
write.csv(x = data.ibs.user.score, file = 'IBCF score.csv')