Unit 5 - IP

R Markdown

######################################################################################
#######                                                                      #########
####### *******                      LastFM                        ********* #########
#######                                                                      #########
######################################################################################

# Directory that holds lastfm.csv; adjust this path for your machine.
data.dir <- "D:/lastfm-dataset-360K.tar/lastfm-dataset-360K"

# Load lastfm.csv through an explicit path instead of changing the working
# directory with setwd(), so the session's global state is left untouched.
# encoding = "UTF-8" marks the strings as UTF-8; the file appears to be UTF-8
# encoded, and without this accented artist names come out garbled on
# platforms whose native encoding is not UTF-8.
last.fm <- read.csv(
  file = file.path(data.dir, "lastfm.csv"),
  sep = ",",
  header = TRUE,
  encoding = "UTF-8"
)



####### data preprocessing and explore the data ######################################
# Install arules only when it is missing. requireNamespace() checks whether the
# package is available without attaching it; attaching is done by the explicit
# library(arules) call further down.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules", dependencies = TRUE)
}

## Loading required package: arules

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

# Install dplyr only when it is missing. requireNamespace() checks whether the
# package is available without attaching it; attaching is done by the explicit
# library(dplyr) call further down.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr", dependencies = TRUE)
}

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:arules':
## 
##     intersect, recode, setdiff, setequal, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(arules)
library(dplyr)

# show the first three rows of the loaded data
head(last.fm, n = 3)

##   user                  artist sex country
## 1    1   red hot chili peppers   f Germany
## 2    1 the black dahlia murder   f Germany
## 3    1               goldfrapp   f Germany

# dimensions of the data: number of rows, then number of columns
dim(last.fm)

## [1] 289955      4

# keep only the user and artist columns; selecting them by name rather than
# by position keeps this correct even if the column order of the CSV changes
last.fm <- last.fm[, c("user", "artist")]

# remove duplicate user-artist combinations
last.fm <- distinct(last.fm)

# group the artists by user and coerce the resulting list to an arules
# "transactions" object (one transaction per user)
fm.trans <- as(split(last.fm$artist, last.fm$user), "transactions")

# look at the first three transactions to verify the transformation
inspect(fm.trans[seq_len(3)])

##     items                     transactionID
## [1] {dropkick murphys,                     
##      edguy,                                
##      eluveitie,                            
##      goldfrapp,                            
##      guano apes,                           
##      jack johnson,                         
##      john mayer,                           
##      judas priest,                         
##      le tigre,                             
##      red hot chili peppers,                
##      rob zombie,                           
##      schandmaul,                           
##      the black dahlia murder,              
##      the killers,                          
##      the rolling stones,                   
##      the who}                             1
## [2] {aesop rock,                           
##      air,                                  
##      amon tobin,                           
##      animal collective,                    
##      aphex twin,                           
##      arcade fire,                          
##      atmosphere,                           
##      autechre,                             
##      beastie boys,                         
##      boards of canada,                     
##      broken social scene,                  
##      cocorosie,                            
##      devendra banhart,                     
##      four tet,                             
##      goldfrapp,                            
##      joanna newsom,                        
##      m83,                                  
##      massive attack,                       
##      max richter,                          
##      mf doom,                              
##      neutral milk hotel,                   
##      pavement,                             
##      plaid,                                
##      portishead,                           
##      prefuse 73,                           
##      radiohead,                            
##      sage francis,                         
##      the books,                            
##      the flashbulb}                       3
## [3] {a tribe called quest,                 
##      air,                                  
##      battles,                              
##      beck,                                 
##      bon iver,                             
##      bonobo,                               
##      dj shadow,                            
##      fleetwood mac,                        
##      flight of the conchords,              
##      kyuss,                                
##      late of the pier,                     
##      led zeppelin,                         
##      mgmt,                                 
##      michael jackson,                      
##      muse,                                 
##      pink floyd,                           
##      röyksopp,                             
##      rjd2,                                 
##      simian mobile disco,                  
##      snow patrol,                          
##      the cinematic orchestra,              
##      the decemberists,                     
##      the flaming lips,                     
##      the prodigy,                          
##      the rolling stones,                   
##      tool,                                 
##      tv on the radio}                     4

# plot the frequency of every item (artist) with at least 10% support
itemFrequencyPlot(fm.trans, support = 0.10)

# plot the ten items (artists) with the highest support
itemFrequencyPlot(fm.trans, topN = 10)

####### building model ###############################################################

# mine association rules with apriori: minimum support 1%, minimum
# confidence 40%, and at least 2 items per rule (lhs + rhs)
association.model <- apriori(
  fm.trans,
  parameter = list(
    support = 0.01,
    confidence = 0.40,
    minlen = 2
  )
)

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 150 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1004 item(s), 15000 transaction(s)] done [0.12s].
## sorting and recoding items ... [655 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.03s].
## writing ... [211 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# 211 rules were created; summarize their length distribution and
# quality measures (support, confidence, lift, count)
summary(association.model)

## set of 211 rules
## 
## rule length distribution (lhs + rhs):sizes
##   2   3 
## 113  98 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.000   2.464   3.000   3.000 
## 
## summary of quality measures:
##     support          confidence          lift            count      
##  Min.   :0.01000   Min.   :0.4009   Min.   : 2.232   Min.   :150.0  
##  1st Qu.:0.01093   1st Qu.:0.4273   1st Qu.: 2.604   1st Qu.:164.0  
##  Median :0.01273   Median :0.4630   Median : 2.938   Median :191.0  
##  Mean   :0.01457   Mean   :0.4726   Mean   : 3.642   Mean   :218.5  
##  3rd Qu.:0.01550   3rd Qu.:0.4984   3rd Qu.: 4.043   3rd Qu.:232.5  
##  Max.   :0.04107   Max.   :0.6627   Max.   :14.053   Max.   :616.0  
## 
## mining info:
##      data ntransactions support confidence
##  fm.trans         15000    0.01        0.4

# inspect the (up to) 10 rules with the highest lift; head() truncates
# safely, whereas indexing with [1:10] fails when fewer than 10 rules exist
inspect(head(sort(association.model, by = "lift"), n = 10))

##      lhs                     rhs            support    confidence
## [1]  {nas}                => {jay-z}        0.01060000 0.4262735 
## [2]  {the pussycat dolls} => {rihanna}      0.01040000 0.5777778 
## [3]  {beyoncé}            => {rihanna}      0.01393333 0.4686099 
## [4]  {morrissey}          => {the smiths}   0.01126667 0.4655647 
## [5]  {t.i.}               => {kanye west}   0.01040000 0.5672727 
## [6]  {kylie minogue}      => {madonna}      0.01093333 0.4781341 
## [7]  {a perfect circle}   => {tool}         0.01626667 0.4428312 
## [8]  {judas priest}       => {iron maiden}  0.01353333 0.5075000 
## [9]  {panic at the disco} => {fall out boy} 0.01153333 0.4346734 
## [10] {sonata arctica}     => {nightwish}    0.01346667 0.5101010 
##      lift      count
## [1]  14.052971 159  
## [2]  13.415893 156  
## [3]  10.881034 209  
## [4]   8.896141 169  
## [5]   8.854413 156  
## [6]   8.757035 164  
## [7]   8.717150 244  
## [8]   8.562992 203  
## [9]   8.413033 173  
## [10]  8.236292 202

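# To interpret the result: lift compares how often the rhs artist occurs among
# listeners of the lhs artist with how often it occurs among all users.
# People who listen to Nas are about 14 times as likely to listen to Jay-Z as
# the average user, and Kylie Minogue listeners are almost 9 times as likely
# to listen to Madonna.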

Unit 5 - IP

Biz Nigatu

March 9, 2019

R Markdown