# R Markdown
######################################################################################
####### #########
####### ******* LastFM ********* #########
####### #########
######################################################################################
# Change the working directory to the folder that contains lastfm.csv.
# NOTE(review): this absolute path is machine-specific; adjust it before
# running the script on another machine.
setwd("D:/lastfm-dataset-360K.tar/lastfm-dataset-360K")
# Load the lastfm.csv data (comma-separated, first row holds the column names).
# TRUE is spelled out because the shorthand `T` is an ordinary variable that
# can be reassigned.
last.fm <- read.csv(file = "lastfm.csv", sep = ",", header = TRUE)
####### data preprocessing and explore the data ######################################
# Install any missing dependency, then attach both packages.
# requireNamespace() only checks that a package is available; it does not
# attach it, so library() below is the single place where each package is
# attached and where a missing package raises an error.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules", dependencies = TRUE)
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", dependencies = TRUE)
}
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Preview the first three rows of the raw data.
head(last.fm, 3)
## user artist sex country
## 1 1 red hot chili peppers f Germany
## 2 1 the black dahlia murder f Germany
## 3 1 goldfrapp f Germany
# Dimensions of the data (rows, columns).
dim(last.fm)
## [1] 289955 4
# Keep only the user and artist columns. They are selected by name rather than
# by position so the right columns are kept even if the CSV column order
# changes; the transaction-building step reads these columns by name.
last.fm <- last.fm[, c("user", "artist")]
# Drop duplicate user-artist combinations so each pair appears only once.
last.fm <- distinct(last.fm)
# Group the artist values by user and coerce the resulting list into an arules
# "transactions" object (one transaction per user, its items are the artists).
fm.trans <- as(
  split(last.fm[["artist"]], last.fm[["user"]]),
  "transactions"
)
# Print the first three transactions to check the conversion.
inspect(fm.trans[seq_len(3)])
## items transactionID
## [1] {dropkick murphys,
## edguy,
## eluveitie,
## goldfrapp,
## guano apes,
## jack johnson,
## john mayer,
## judas priest,
## le tigre,
## red hot chili peppers,
## rob zombie,
## schandmaul,
## the black dahlia murder,
## the killers,
## the rolling stones,
## the who} 1
## [2] {aesop rock,
## air,
## amon tobin,
## animal collective,
## aphex twin,
## arcade fire,
## atmosphere,
## autechre,
## beastie boys,
## boards of canada,
## broken social scene,
## cocorosie,
## devendra banhart,
## four tet,
## goldfrapp,
## joanna newsom,
## m83,
## massive attack,
## max richter,
## mf doom,
## neutral milk hotel,
## pavement,
## plaid,
## portishead,
## prefuse 73,
## radiohead,
## sage francis,
## the books,
## the flashbulb} 3
## [3] {a tribe called quest,
## air,
## battles,
## beck,
## bon iver,
## bonobo,
## dj shadow,
## fleetwood mac,
## flight of the conchords,
## kyuss,
## late of the pier,
## led zeppelin,
## mgmt,
## michael jackson,
## muse,
## pink floyd,
## röyksopp,
## rjd2,
## simian mobile disco,
## snow patrol,
## the cinematic orchestra,
## the decemberists,
## the flaming lips,
## the prodigy,
## the rolling stones,
## tool,
## tv on the radio} 4
# Plot the frequency of the items (artists) whose support is at least 10%.
itemFrequencyPlot(x = fm.trans, support = 0.1)

# Plot the ten most frequent items (artists).
itemFrequencyPlot(x = fm.trans, topN = 10)

####### building model ###############################################################
# Mine association rules with apriori using a minimum support of 1%, a minimum
# confidence of 40%, and a minimum rule length of 2 items.
association.model <- apriori(
  data = fm.trans,
  parameter = list(
    support = 0.01,
    confidence = 0.40,
    minlen = 2
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 150
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1004 item(s), 15000 transaction(s)] done [0.12s].
## sorting and recoding items ... [655 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.03s].
## writing ... [211 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summarise the mined rule set (the recorded run produced 211 rules).
summary(object = association.model)
## set of 211 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 113 98
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.464 3.000 3.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.01000 Min. :0.4009 Min. : 2.232 Min. :150.0
## 1st Qu.:0.01093 1st Qu.:0.4273 1st Qu.: 2.604 1st Qu.:164.0
## Median :0.01273 Median :0.4630 Median : 2.938 Median :191.0
## Mean :0.01457 Mean :0.4726 Mean : 3.642 Mean :218.5
## 3rd Qu.:0.01550 3rd Qu.:0.4984 3rd Qu.: 4.043 3rd Qu.:232.5
## Max. :0.04107 Max. :0.6627 Max. :14.053 Max. :616.0
##
## mining info:
## data ntransactions support confidence
## fm.trans 15000 0.01 0.4
# Inspect the ten rules with the highest lift. sort() orders the rules by lift
# and head() keeps at most ten of them, so the call still works when fewer than
# ten rules were mined (indexing with [1:10] assumes at least ten exist).
inspect(head(sort(association.model, by = "lift"), n = 10))
## lhs rhs support confidence
## [1] {nas} => {jay-z} 0.01060000 0.4262735
## [2] {the pussycat dolls} => {rihanna} 0.01040000 0.5777778
## [3] {beyoncé} => {rihanna} 0.01393333 0.4686099
## [4] {morrissey} => {the smiths} 0.01126667 0.4655647
## [5] {t.i.} => {kanye west} 0.01040000 0.5672727
## [6] {kylie minogue} => {madonna} 0.01093333 0.4781341
## [7] {a perfect circle} => {tool} 0.01626667 0.4428312
## [8] {judas priest} => {iron maiden} 0.01353333 0.5075000
## [9] {panic at the disco} => {fall out boy} 0.01153333 0.4346734
## [10] {sonata arctica} => {nightwish} 0.01346667 0.5101010
## lift count
## [1] 14.052971 159
## [2] 13.415893 156
## [3] 10.881034 209
## [4] 8.896141 169
## [5] 8.854413 156
## [6] 8.757035 164
## [7] 8.717150 244
## [8] 8.562992 203
## [9] 8.413033 173
## [10] 8.236292 202
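# To interpret the result (lift compares a rule's confidence with the overall
# popularity of the right-hand-side artist):
# people who listen to Nas are about 14 times more likely to listen to Jay-Z than listeners in general, and
# Kylie Minogue listeners are almost 9 times more likely to listen to Madonna than listeners in general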