library(readr)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
setwd("C:/Users/etital/Desktop/final project -big data")
data.germany <- read_csv("lastfm-matrix-germany.csv")
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
#Removing all users who didn't listen to anything (the user column is dropped in the split below)
data.germany <- data.germany[rowSums(data.germany[,-1]) > 0,]
train.data.germany <- data.germany[1:859,-1] #859 observations (~70% of the 1226 users)
test.data.germany <- data.germany[-(1:859),-1] #367 observations (~30%)
#Finding the most frequent items
#Replace 0s with NA so that, after the factor conversion, only "listened" (1)
#entries become items in the transactions object
train.data.germany.2 <- train.data.germany
for(i in 1:ncol(train.data.germany.2)){
  train.data.germany.2[which(train.data.germany.2[,i]==0),i] <- NA
}
train.data.germany.2 <- apply(train.data.germany.2,2,as.factor)
train.data.germany.2 <- as.data.frame(train.data.germany.2)
#Create an item frequency plot for the top 20 items
trnsc <- as(train.data.germany.2,"transactions")
itemFrequencyPlot(trnsc,topN=20,type="relative")
The top 5 most frequent artists are Coldplay, Linkin Park, Rammstein, Red Hot Chili Peppers and System of a Down. It is noticeable that even the most frequent artist does not appear often relative to the size of the data set (n = 859 users, 285 items). This suggests that the data set is sparse, which can affect the accuracy of our predictions.
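As a quick check of this claim (a minimal sketch using the training matrix defined above), the fill rate of the 0/1 user-item matrix can be computed directly:
#Share of non-zero entries in the user-item matrix; values near 0 indicate sparsity
sum(train.data.germany != 0)/(nrow(train.data.germany)*ncol(train.data.germany))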
#Let's have a look at the summary of the item frequencies
summary(itemFrequency(trnsc,type="absolute"))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 22.00 28.00 37.17 43.00 138.00
par(mfrow=c(1,2))
boxplot(itemFrequency(trnsc,type="absolute"))
hist(itemFrequency(trnsc,type="absolute"),main = "Frequency of items",xlab = "Frequency",ylab = "Number of items")
The distribution of the item frequencies is right-skewed, with only about 2% of the items appearing more than 100 times in the data set. As mentioned before, this is probably due to the sparseness of the data.
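The 2% figure can be verified directly from the item frequencies:
#Proportion of items appearing more than 100 times in the training set
mean(itemFrequency(trnsc,type="absolute") > 100)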
library(rmr2)
## Please review your hadoop settings. See help(hadoop.settings)
rmr.options(backend = "local")
## NULL
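With the backend set to "local", rmr2 runs the map-reduce jobs in-memory on the local machine rather than on a Hadoop cluster, which is convenient for developing and testing the jobs below on a data set of this size.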
data.germany.ibs <- train.data.germany
#Create a map function to count how many times each pair of items appears together in an itemset
naive_mapper <- function(v){
  #The keys are all pairs (i,j) with i <= j of items the user listened to
  index <- which(v!=0)
  z <- expand.grid(index,index)
  key <- data.matrix(z[z$Var1 <= z$Var2,])
  val <- 1
  return(keyval(key,val))
}
map <- function(k,v){
  return(do.call(c.keyval,apply(v,1,naive_mapper)))
}
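To see what the pair mapper emits, here is a small sanity check on a toy user vector (the vector is illustrative only; keyval() comes from rmr2):
#A toy user who listened to items 2 and 4: the keys are the pairs (2,2), (2,4) and (4,4)
toy <- c(0,1,0,1,0)
naive_mapper(toy)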
#Create a map function to calculate the frequency of each item
naive_mapper2 <- function(v){
  index <- which(v!=0)
  key <- index
  val <- 1
  return(keyval(key,val))
}
map_freq <- function(k,v){
  return(do.call(c.keyval,apply(v,1,naive_mapper2)))
}
#For both map functions the reduce function is the same
rd <- function(k,vv){
  keyval(k,sum(vv))
}
dat <- to.dfs(as.matrix(train.data.germany))
output_intersection <- from.dfs(mapreduce(input = dat,map = map,reduce = rd))
output_solo <- from.dfs(mapreduce(input = dat,map = map_freq,reduce = rd))
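The similarity used below is the Jaccard index. For items i and j it is m11/(m10 + m01 - m11), where m11 is the number of users who listened to both (from output_intersection) and m10, m01 are the total counts of each item (from output_solo), so the denominator is the size of the union. For example, if 40 users listened to item i, 30 to item j and 10 to both, the similarity is 10/(40 + 30 - 10) ≈ 0.17.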
#Calculating the Jaccard similarity matrix
data.germany.ibs.jaccard.similarity <- matrix(0,285,285)
for(i in 1:nrow(keys(output_intersection))){
  m11 <- values(output_intersection)[i] #number of users who listened to both items
  m10 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,1]] #total count of item 1
  m01 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,2]] #total count of item 2
  data.germany.ibs.jaccard.similarity[keys(output_intersection)[i,1],
                                      keys(output_intersection)[i,2]] <- m11/(m10+m01-m11)
  data.germany.ibs.jaccard.similarity[keys(output_intersection)[i,2],
                                      keys(output_intersection)[i,1]] <- m11/(m10+m01-m11)
}
data.germany.ibs.jaccard.similarity <- as.data.frame(
data.germany.ibs.jaccard.similarity,row.names = colnames(data.germany.ibs))
#Get the top 5 recommendations for each artist
data.germany.CF.recommendations <- matrix(NA,nrow=ncol(data.germany.ibs.jaccard.similarity),
                                          ncol=6,dimnames=list(colnames(data.germany.ibs)))
for(i in 1:ncol(data.germany.ibs)){
  #Order artists by similarity to artist i and keep the 6 most similar
  data.germany.CF.recommendations[i,] <- t(head(n=6,
    rownames(data.germany.ibs.jaccard.similarity[order(data.germany.ibs.jaccard.similarity[,i],
                                                       decreasing=TRUE),][i])))
}
data.germany.CF.recommendations <- data.germany.CF.recommendations[,-1] #the first column is the artist itself
colnames(data.germany.CF.recommendations) <- c("Option 1","Option 2","Option 3",
"Option 4","Option 5")
data.germany.ar <- train.data.germany
#Calculating the support matrix (only the pair count m11 is needed for support)
data.germany.ar.support <- matrix(0,285,285,
                                  dimnames = list(names(data.germany.ar),names(data.germany.ar)))
for(i in 1:nrow(keys(output_intersection))){
  n <- 859 #number of users in the training set
  m11 <- values(output_intersection)[i]
  data.germany.ar.support[keys(output_intersection)[i,1],keys(output_intersection)[i,2]] <-
    m11/n
  data.germany.ar.support[keys(output_intersection)[i,2],keys(output_intersection)[i,1]] <-
    m11/n
}
#Calculating the confidence matrix: conf(i -> j) = supp(i,j)/supp(i), which is not symmetric
data.germany.ar.confidence <- matrix(0,285,285,
                                     dimnames = list(names(data.germany.ar),names(data.germany.ar)))
for(i in 1:nrow(keys(output_intersection))){
  n <- 859
  m11 <- values(output_intersection)[i]
  m10 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,1]]
  m01 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,2]]
  data.germany.ar.confidence[keys(output_intersection)[i,1],keys(output_intersection)[i,2]] <-
    (m11/n)/(m10/n)
  data.germany.ar.confidence[keys(output_intersection)[i,2],keys(output_intersection)[i,1]] <-
    (m11/n)/(m01/n)
}
#Calculating the lift matrix: lift(i,j) = supp(i,j)/(supp(i)*supp(j)), which is symmetric
data.germany.ar.lift <- matrix(0,285,285,
                               dimnames = list(names(data.germany.ar),names(data.germany.ar)))
for(i in 1:nrow(keys(output_intersection))){
  n <- 859
  m11 <- values(output_intersection)[i]
  m10 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,1]]
  m01 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,2]]
  data.germany.ar.lift[keys(output_intersection)[i,1],keys(output_intersection)[i,2]] <-
    (m11/n)/((m10/n)*(m01/n))
  data.germany.ar.lift[keys(output_intersection)[i,2],keys(output_intersection)[i,1]] <-
    (m11/n)/((m10/n)*(m01/n))
}
#Calculating the conviction matrix: conviction(i -> j) = (1 - supp(j))/(1 - conf(i -> j))
data.germany.ar.conviction <- matrix(0,285,285,
                                     dimnames = list(names(data.germany.ar),names(data.germany.ar)))
for(i in 1:nrow(keys(output_intersection))){
  n <- 859
  m11 <- values(output_intersection)[i]
  m10 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,1]]
  m01 <- values(output_solo)[keys(output_solo)==keys(output_intersection)[i,2]]
  data.germany.ar.conviction[keys(output_intersection)[i,1],keys(output_intersection)[i,2]] <-
    (1-(m01/n))/(1-((m11/n)/(m10/n)))
  data.germany.ar.conviction[keys(output_intersection)[i,2],keys(output_intersection)[i,1]] <-
    (1-(m10/n))/(1-((m11/n)/(m01/n)))
}
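As a concrete example of these formulas with n = 859: if item i appears m10 = 100 times, item j appears m01 = 50 times and they co-occur m11 = 20 times, then supp = 20/859 ≈ 0.023, conf(i -> j) = 20/100 = 0.2, lift = (20/859)/((100/859)*(50/859)) ≈ 3.44, and conviction(i -> j) = (1 - 50/859)/(1 - 0.2) ≈ 1.18.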
#The distribution of the top 5 values for each measure
#(sorted positions 280:284 are the five largest values per artist, excluding the self-pair at position 285)
par(mfrow=c(2,2))
hist(apply(data.germany.ar.confidence,1,sort)[280:284,],main = "Confidence top 5 distribution",
     xlab = "Values")
hist(apply(data.germany.ar.support,1,sort)[280:284,],main = "Support top 5 distribution",
     xlab = "Values")
hist(apply(data.germany.ar.lift,1,sort)[280:284,],main = "Lift top 5 distribution",
     xlab = "Values")
hist(apply(data.germany.ar.conviction,1,sort)[280:284,],main = "Conviction top 5 distribution",
     xlab = "Values")
Looking at the distribution of each measure, I conclude that the size of the data set might not be enough to extract “strong” recommendations. In addition, a recommendation is very likely to have a support of 0.009-0.017, and in the best case a maximum support of 0.057. Support this low implies that recommendations may occur simply by chance, therefore we should interpret each recommendation with caution.
#Function for generating recommendations for each measure
rec_fun <- function(measure_matrix){
  x <- matrix(NA,nrow=ncol(measure_matrix),ncol=6,dimnames=list(colnames(data.germany.ar)))
  for(i in 1:ncol(data.germany.ar)){
    x[i,] <- head(n=6,names(measure_matrix[,order(measure_matrix[i,],decreasing=TRUE)][i,]))
  }
  x <- x[,-1] #the first column is the artist itself
  colnames(x) <- c("Option 1","Option 2","Option 3","Option 4","Option 5")
  return(x)
}
#Generating recommendations
data.germany.AR.support.recommendations <- rec_fun(data.germany.ar.support)
data.germany.AR.confidence.recommendations <- rec_fun(data.germany.ar.confidence)
data.germany.AR.lift.recommendations <- rec_fun(data.germany.ar.lift)
data.germany.AR.conviction.recommendations <- rec_fun(data.germany.ar.conviction)
#Let's have a look at what the recommendations look like
head(data.germany.AR.confidence.recommendations)
## Option 1 Option 2 Option 3
## a perfect circle "tool" "incubus" "system of a down"
## abba "the beatles" "madonna" "queen"
## ac/dc "red hot chili peppers" "rammstein" "metallica"
## adam green "the strokes" "coldplay" "foo fighters"
## aerosmith "red hot chili peppers" "jack johnson" "ac/dc"
## afi "rise against" "billy talent" "system of a down"
## Option 4 Option 5
## a perfect circle "deftones" "dredg"
## abba "elvis presley" "michael jackson"
## ac/dc "the offspring" "die toten hosen"
## adam green "the beatles" "the kooks"
## aerosmith "linkin park" "metallica"
## afi "the offspring" "die toten hosen"
#Precision calculating function
precision_fun <- function(rec_data){
  set.seed(123)
  mean_precision <- numeric()
  for(j in 1:20){
    user_precision <- numeric()
    for(i in 1:nrow(test.data.germany)){
      user <- test.data.germany[i,]
      #Sample one of the user's artists; indexing into the index vector avoids
      #sample()'s 1:n behavior when the user listened to a single artist
      listened <- which(user==1)
      sampled.artist <- names(user)[listened[sample(length(listened),1)]]
      recommandations_AR <- rec_data[which(rownames(rec_data)==sampled.artist),]
      user_precision <- c(user_precision,sum(recommandations_AR%in%names(user)[listened])/5)
    }
    mean_precision <- c(mean_precision,mean(user_precision))
  }
  return(mean(mean_precision))
}
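For each test user, the function samples one artist the user actually listened to, looks up the five recommendations for that artist, and computes precision@5 = (number of recommended artists the user also listened to)/5. This is averaged over all 367 test users and then over 20 random draws to reduce the sampling noise.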
#Calculating precisions
precision_CF <- precision_fun(data.germany.CF.recommendations)
precision_AR_support <- precision_fun(data.germany.AR.support.recommendations)
precision_AR_confidence <- precision_fun(data.germany.AR.confidence.recommendations)
precision_AR_lift <- precision_fun(data.germany.AR.lift.recommendations)
precision_AR_conviction <- precision_fun(data.germany.AR.conviction.recommendations)
library(knitr)
p.table <- rbind(data.frame(precision_CF),precision_AR_support,
precision_AR_confidence,precision_AR_lift,precision_AR_conviction)
rownames(p.table) <- c("CF","AR_support","AR_confidence","AR_lift","AR_conviction")
kable(p.table,caption = "Precision Table",col.names = "Precision",row.names =T )
Table: Precision Table

|              |  Precision|
|:-------------|----------:|
|CF            |  0.1737330|
|AR_support    |  0.1992371|
|AR_confidence |  0.1992371|
|AR_lift       |  0.1311444|
|AR_conviction |  0.1956948|
The support and confidence measures give the highest precision; therefore I have decided to build the association-rules recommendation engine with the confidence measure.
library(shiny)
ui <- fluidPage(
tags$h1("Music Recommendation System"),
selectInput(inputId = "artist",label = "Choose one of your favorite artists",
choices = colnames(data.germany.ar)),
tags$hr(),
tags$h2("You might like these artists too!"),
tags$br(),
tags$h3("Item-Based Collaborative Filtering"),
tableOutput("table"),
tags$br(),
tags$h3("Association Rules"),
tableOutput("table2")
)
server <- function(input,output){
  output$table <- renderTable(t(data.germany.CF.recommendations[
    rownames(data.germany.CF.recommendations)==input$artist,]))
  output$table2 <- renderTable(t(data.germany.AR.confidence.recommendations[
    rownames(data.germany.AR.confidence.recommendations)==input$artist,]))
}
shinyApp(ui = ui,server = server)
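Running shinyApp() launches the app locally in a browser; the drop-down lists every artist in the training matrix, and the two tables show the item-based CF and confidence-based recommendations for the selected artist.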
Here is a link to the app