DATA 612 Project 4

Dataset

We use a TidyTuesday data set comprising the best hip hop songs of all time. The data set was compiled by BBC News and includes a file for polls and a file for rankings.

# Load the two TidyTuesday (2020-04-14) hip hop files directly from GitHub:
# the individual critic votes (`polls`) and the aggregated `rankings`.
polls <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-14/polls.csv"
)
rankings <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-14/rankings.csv"
)

Our Game Plan

We will use the polls data set to create recommenders that, given a list of favorite songs, suggest other songs that the user might (or should) like. We will execute the game plan by training three alternative models: UBCF, IBCF and Random.

EDA

We will perform some EDA to develop a better understanding of the data.

# Ten most frequently nominated song titles, rendered as a styled table.
count(polls, title, sort = TRUE) %>%
  head(10) %>%
  kable() %>%
  kable_styling()

title	n
Juicy	18
Nuthin’ But A ‘G’ Thang	14
The Message	14
Shook Ones (Part II)	13
Fight The Power	11
C.R.E.A.M.	10
93 ’Til Infinity	7
N.Y. State Of Mind	7
Dear Mama	6
Jesus Walks	6

Polls over Time

It appears that the majority of the votes went to songs from the 1990s (the `year` column holds each song's release year).

# Number of poll rows per `year`, drawn as bars coloured by decade.
polls %>%
  count(year) %>%
  # Round each year down to its decade and make it discrete for the fill scale.
  mutate(decade = factor(floor(year / 10) * 10)) %>%
  ggplot(aes(x = year, y = n, fill = decade)) +
  geom_col()

# Density of the number of poll rows each artist appears in.
ggplot(count(polls, artist, sort = TRUE), aes(x = n)) +
  geom_density()

# Top ten artists ordered by `n1`, after summing the vote-count columns
# (n, n1..n5) per artist.
# Collaboration credits ("X ft. Y") are excluded. The pattern is anchored with
# word boundaries so that only the standalone token "ft" is matched; a bare
# "ft" would also drop any artist whose name merely contains those letters.
# NOTE(review): assumes collaborations are always credited with "ft" -- confirm
# no other spelling (e.g. "feat.") occurs in the data.
rankings %>%
  select(artist, n, n1, n2, n3, n4, n5) %>%
  group_by(artist) %>%
  summarise_all(sum) %>%
  filter(!str_detect(artist, "\\bft\\b")) %>%
  arrange(desc(n1)) %>%
  slice(1:10)

## # A tibble: 10 x 7
##    artist                                   n    n1    n2    n3    n4    n5
##    <chr>                                <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 The Notorious B.I.G.                    22    10     3     4     2     3
##  2 Public Enemy                            18     9     4     2     2     1
##  3 Wu-Tang Clan                            20     6     1     3     7     3
##  4 Grandmaster Flash & The Furious Five    14     5     3     1     0     5
##  5 2Pac                                    13     4     3     1     4     1
##  6 Eric B & Rakim                          11     4     1     5     0     1
##  7 Mobb Deep                               14     4     5     1     2     2
##  8 Nas                                     13     3     5     2     1     2
##  9 The Pharcyde                             6     3     2     0     0     1
## 10 Clipse                                   5     2     0     1     1     1

# Print the poll tibble to inspect its columns and a sample of rows.
polls

## # A tibble: 535 x 9
##     rank title artist gender  year critic_name critic_rols critic_country
##    <dbl> <chr> <chr>  <chr>  <dbl> <chr>       <chr>       <chr>         
##  1     1 Term~ Publi~ male    1998 Joseph Aba~ Fat Beats   US            
##  2     2 4th ~ Gza f~ male    1995 Joseph Aba~ Fat Beats   US            
##  3     3 Pete~ Run D~ male    1986 Joseph Aba~ Fat Beats   US            
##  4     4 Play~ GLOBE~ male    2001 Joseph Aba~ Fat Beats   US            
##  5     5 Time~ O.C.   male    1994 Joseph Aba~ Fat Beats   US            
##  6     1 Play~ Slum ~ male    1997 Biba Adams  Critic      US            
##  7     2 Self~ Stop ~ mixed   1989 Biba Adams  Critic      US            
##  8     3 Push~ Salt-~ female  1986 Biba Adams  Critic      US            
##  9     4 Ambi~ 2Pac   male    1996 Biba Adams  Critic      US            
## 10     5 Big ~ JAY-Z~ male    1999 Biba Adams  Critic      US            
## # ... with 525 more rows, and 1 more variable: critic_country2 <chr>

# Number of songs nominated by critics from exactly one country, that country
# not being the US, divided by the total number of poll rows.
# NOTE(review): the numerator counts songs while the denominator counts votes
# (rows of `polls`) -- confirm this is the intended ratio.
(
  polls %>%
    count(title, critic_country, name = "song_nom") %>%
    add_count(title, name = "number_of_countries") %>%
    filter(number_of_countries == 1, critic_country != "US") %>%
    nrow()
) / nrow(polls)

## [1] 0.1551402

Create Binary Hip Hop Matrix

We create a binary rating matrix with one row per critic and one column per song; an entry is 1 when the critic included the song in their top-five list (ranks 1 to 5) and 0 otherwise.

# Critic x song incidence matrix: 1 when the critic voted for the song,
# 0 otherwise. Rows are sorted by title first so the song columns are created
# in sorted title order; the critic identifier is dropped before coercion.
rap_matrix <- polls %>%
  transmute(critic_name, title, n = 1) %>%
  arrange(title) %>%
  pivot_wider(
    names_from = title,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  select(-critic_name) %>%
  as.matrix() %>%
  as("binaryRatingMatrix")

Create a Training Schema

We create a training schema that uses 80% of the data for training.

# Fix the RNG so the 80/20 split is reproducible. `given = -1` withholds one
# item per test user (an "all-but-1" scheme).
set.seed(4763)
training_schema <- evaluationScheme(
  data = rap_matrix,
  method = "split",
  train = 0.8,
  given = -1
)
training_schema

## Evaluation scheme using all-but-1 items
## Method: 'split' with 1 run(s).
## Training set proportion: 0.800
## Good ratings: NA
## Data set: 107 x 309 rating matrix of class 'binaryRatingMatrix' with 535 ratings.

Train Models

Per our gameplan, we train UBCF, IBCF and Random models.

User-based Filtering Model

# Evaluate user-based CF top-N lists at several list lengths.
# NOTE(review): this grid goes up to 25, whereas the IBCF/RANDOM evaluations
# and the combined evaluation stop at 20.
UBCF_Mod <- evaluate(
  training_schema,
  method = "UBCF",
  type = "topNList",
  n = c(1, 5, 10, 15, 20, 25)
)

## UBCF run fold/sample [model time/prediction time]
##   1  [0.03sec/0.35sec]

# Evaluate item-based CF top-N lists on the same evaluation scheme.
IBCF_Mod <- evaluate(
  training_schema,
  method = "IBCF",
  type = "topNList",
  n = c(1, 5, 10, 15, 20)
)

## IBCF run fold/sample [model time/prediction time]
##   1  [0.54sec/0.11sec]

# Evaluate the random baseline's top-N lists on the same evaluation scheme.
RAND_Mod <- evaluate(
  training_schema,
  method = "RANDOM",
  type = "topNList",
  n = c(1, 5, 10, 15, 20)
)

## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.02sec]

Evaluate Models

# Algorithm specifications keyed by display label; each entry names the
# recommenderlab method and leaves its parameters at their defaults.
models <- lapply(
  c(UBCF = "UBCF", IBCF = "IBCF", Random = "RANDOM"),
  function(algorithm) list(name = algorithm, param = NULL)
)

# Evaluate all three algorithms in one call so their results can be compared.
evalResults <- evaluate(
  training_schema,
  method = models,
  n = c(1, 5, 10, 15, 20)
)

## UBCF run fold/sample [model time/prediction time]
##   1  [0sec/0.25sec] 
## IBCF run fold/sample [model time/prediction time]
##   1  [0.36sec/0.01sec] 
## RANDOM run fold/sample [model time/prediction time]
##   1  [0sec/0.01sec]

ROC Curve Plots

The UBCF appears to be the superior model.

# Plot the combined evaluation results (titled as ROC curves), annotating the
# points and placing the legend in the top-left corner.
plot(
  evalResults,
  annotate = TRUE,
  legend = "topleft",
  main = "ROC Curve"
)

Create Final Models Using Optimized Parameters

We create final models using the best parameter values as derived from the ROC curves.

# Fit the final collaborative-filtering models on the training split.
# UBCF takes its neighbourhood size as `nn`. IBCF has no `nn` parameter (its
# available parameters are k, method, normalize_sim_matrix, alpha and verbose),
# so the neighbourhood size is passed as `k`; `nn` would not be applied.
# The `parameter` argument name is spelled out rather than abbreviated.
UBCF_Final_model <- Recommender(
  getData(training_schema, "train"),
  method = "UBCF",
  parameter = list(nn = 20)
)
IBCF_Final_model <- Recommender(
  getData(training_schema, "train"),
  method = "IBCF",
  parameter = list(k = 20)
)

## Available parameter (with default values):
## k     =  30
## method    =  Jaccard
## normalize_sim_matrix  =  FALSE
## alpha     =  0.5
## verbose   =  FALSE

# Random baseline fitted on the training split. No parameters are passed:
# a neighbourhood size (`nn`) has no meaning for random recommendations.
# NOTE(review): assumes RANDOM ignores extra parameters, so dropping `nn`
# leaves the model unchanged -- confirm against the recommenderlab registry.
RAND_Final_model <- Recommender(
  getData(training_schema, "train"),
  method = "RANDOM"
)

Make Prediction and Evaluate Accuracy

# Top-N recommendations for the test users, generated by each fitted model
# from the users' "known" items.
Upredictions <- predict(
  UBCF_Final_model,
  newdata = getData(training_schema, "known"),
  type = "topNList"
)
Ipredictions <- predict(
  IBCF_Final_model,
  newdata = getData(training_schema, "known"),
  type = "topNList"
)
Rpredictions <- predict(
  RAND_Final_model,
  newdata = getData(training_schema, "known"),
  type = "topNList"
)

# Score each model's recommendations against the withheld ("unknown") items.
accU <- calcPredictionAccuracy(
  Upredictions,
  data = getData(training_schema, "unknown"),
  given = -1
)
accI <- calcPredictionAccuracy(
  Ipredictions,
  data = getData(training_schema, "unknown"),
  given = -1
)
accR <- calcPredictionAccuracy(
  Rpredictions,
  data = getData(training_schema, "unknown"),
  given = -1
)

Let's take a look at the accuracy of our three models. Overall, UBCF remains the best model.

# Stack the three accuracy vectors into one matrix, one labelled row per model.
accuracy <- rbind(UBCF = accU, IBCF = accI, Random = accR)

# Render the accuracy comparison as a striped, hover-highlighted HTML table.
accuracy %>%
  kable(format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))

	TP	FP	FN	TN	precision	recall	TPR	FPR
UBCF	0.1818182	9.363636	0.8181818	299.6364	0.0190476	0.1818182	0.1818182	0.0303030
IBCF	0.0909091	9.909091	0.9090909	299.0909	0.0090909	0.0909091	0.0909091	0.0320683
Random	0.0000000	10.000000	1.0000000	299.0000	0.0000000	0.0000000	0.0000000	0.0323625

Create Recommender Engines

Next we use the Recommender function to create recommender engines that utilize our three models.

# Refit each algorithm on the full rating matrix for use as recommendation
# engines. Parameter names are written out in full: an abbreviated `n` is
# resolved by partial matching, which picks `nn` for UBCF but
# `normalize_sim_matrix` for IBCF, so IBCF would not get a neighbourhood size
# at all (NOTE(review): confirm against the recommenderlab version in use).
# UBCF uses `nn`, IBCF uses `k`, and RANDOM takes no tuning parameter.
Urec_engine <- Recommender(rap_matrix, method = "UBCF", parameter = list(nn = 20))
Irec_engine <- Recommender(rap_matrix, method = "IBCF", parameter = list(k = 20))
Rrec_engine <- Recommender(rap_matrix, method = "RANDOM")

Create a List of Favorite Songs

The Recommender requires a list of songs to produce recommendations. Given the list, the engines should recommend some alternative songs that you might like. Five songs are chosen and transformed into binaryRatingMatrix format.

# Encode five favourite songs as a one-row binary rating matrix over the full
# song catalogue. Stacking the favourites on top of the complete title list
# makes each favourite appear twice and every other title once, so
# `count - 1` yields the 1/0 indicator. `count()` also orders the titles,
# which fixes the column order of the resulting matrix.
test_songs <- polls %>%
  distinct(title) %>%
  filter(
    title %in% c(
      "In Da Club",
      "Alright",
      "Bitch Don’t Kill My Vibe",
      "Still D.R.E.",
      "Changes"
    )
  ) %>%
  bind_rows(distinct(polls, title)) %>%
  count(title) %>%
  mutate(n = n - 1) %>%
  pivot_wider(
    names_from = title,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  as.matrix() %>%
  as("binaryRatingMatrix")

Apply Favorite Song List to Recommendation Engines

Once the Recommendation Engines are given five favorite songs, they recommend a list of songs that the user might (or should) like. I’m not a big hip hop fan, but my son gave the recommender a try and said the UBCF did in fact do the best job.

UBCF Recommendations

# Recommendations from the UBCF engine for the favourite-song profile,
# shown as a one-column data frame.
Urec_engine %>%
  predict(test_songs) %>%
  as("list") %>%
  as.data.frame()

##                         X1
## 1                    Juicy
## 2       N.Y. State Of Mind
## 3          Bring Da Ruckus
## 4                Dear Mama
## 5          Fuck Tha Police
## 6              Jesus Walks
## 7            Lose Yourself
## 8  Nuthin’ But A ‘G’ Thang
## 9     Shook Ones (Part II)
## 10  America’s Most Blunted

IBCF Recommendations

# Recommendations from the IBCF engine for the favourite-song profile,
# shown as a one-column data frame.
Irec_engine %>%
  predict(test_songs) %>%
  as("list") %>%
  as.data.frame()

##                                        X1
## 1                                     DNA
## 2         Quiet Storm (Remix ft. Lil Kim)
## 3                       I've Seen Footage
## 4                     Concrete Schoolyard
## 5                        Guess Who’s Back
## 6                           Puppet Master
## 7  Wu-Tang Clan Ain’t Nuthing Ta Fuck Wit
## 8                         Bring Da Ruckus
## 9                           m.A.A.d. city
## 10                 Straight Outta Compton

Random Recommendations

# Recommendations from the random engine for the favourite-song profile,
# shown as a one-column data frame.
Rrec_engine %>%
  predict(test_songs) %>%
  as("list") %>%
  as.data.frame()

##                                                      X1
## 1                               Nuthin’ But A ‘G’ Thang
## 2                Wu-Tang Clan Ain’t Nuthing Ta Fuck Wit
## 3                                               Push It
## 4  B.I.B.L.E. (Basic Instructions Before Leaving Earth)
## 5                                     Bible On The Dash
## 6                                         m.A.A.d. city
## 7                                           Look At Me!
## 8                                           White Lines
## 9                                           Mass Appeal
## 10                       Da Art of Storytellin’ (Pt. 2)

Business User Experience Goal

As the President of the Hip Hop Artist Organization, one of my objectives is to promote all of our artists. Market research shows that most Hip Hop fans have 3 to 5 artists that they mainly follow and support. In an attempt to increase that number and champion all hip hop music, I have asked my crackerjack data scientist to create a Recommender Engine that provides the user with some new content suggestions. The data scientist has accomplished this by creating a Hybrid Recommender that combines two models - UBCF and Random. What’s even better, the Hybrid model has weights that allow me to determine how strongly one model influences the recommendations relative to the other modeling approach. The higher the UBCF weighting, the more the recommendations will be like the UBCF model, and vice versa.

Below you can see modeling results for the Hybrid model when weightings are set to 90%/10% and 50%/50%. At the 90% weighting, the Hybrid model recommended 7 songs that were also recommended by the UBCF model. At the 50/50 weighting, the recommender system suggested only two songs that were also on the UBCF list.

Hybrid Model with Weight Set to 90% (UBCF) to 10% (RANDOM)

# Blend the fitted UBCF and RANDOM recommenders, weighting UBCF at 90% and
# RANDOM at 10%.
HYBR_Final_Model <- HybridRecommender(
  UBCF_Final_model, RAND_Final_model,
  weights = c(0.9, 0.1)
)

# Pure-UBCF recommendations for the favourite-song profile, for comparison
# with the hybrid output.
predict(UBCF_Final_model, test_songs) %>%
  as("list")

## $`1`
##  [1] "Juicy"                            "Bring Da Ruckus"                 
##  [3] "Dear Mama"                        "Fuck Tha Police"                 
##  [5] "Jesus Walks"                      "Lose Yourself"                   
##  [7] "N.Y. State Of Mind"               "America’s Most Blunted"          
##  [9] "B.O.B."                           "Black Steel In The Hour Of Chaos"

# Hybrid recommendations for the same favourite-song profile.
predict(HYBR_Final_Model, test_songs) %>%
  as("list")

## $`1`
##  [1] "Juicy"              "N.Y. State Of Mind" "Dear Mama"         
##  [4] "Lose Yourself"      "Bring Da Ruckus"    "Ready Or Not"      
##  [7] "California Love"    "I Am I Be"          "Fuck Tha Police"   
## [10] "Jesus Walks"

Hybrid Model with Weight Set to 50% (UBCF) to 50% (RANDOM)

# Rebuild the hybrid with UBCF and RANDOM weighted equally; this replaces the
# previously assigned HYBR_Final_Model.
HYBR_Final_Model <- HybridRecommender(
  UBCF_Final_model, RAND_Final_model,
  weights = c(0.5, 0.5)
)

# Pure-UBCF recommendations for the favourite-song profile, for comparison
# with the hybrid output.
predict(UBCF_Final_model, test_songs) %>%
  as("list")

## $`1`
##  [1] "Juicy"                            "Bring Da Ruckus"                 
##  [3] "Dear Mama"                        "Fuck Tha Police"                 
##  [5] "Jesus Walks"                      "Lose Yourself"                   
##  [7] "N.Y. State Of Mind"               "America’s Most Blunted"          
##  [9] "B.O.B."                           "Black Steel In The Hour Of Chaos"

# Hybrid recommendations for the same favourite-song profile.
predict(HYBR_Final_Model, test_songs) %>%
  as("list")

## $`1`
##  [1] "N.Y. State Of Mind"                 "Dear Mama"                         
##  [3] "Double Trouble At The Amphitheatre" "Passin’ Me By"                     
##  [5] "Follow The Leader"                  "C.R.E.A.M."                        
##  [7] "Ready Or Not"                       "Walk This Way"                     
##  [9] "Quiet Storm (Remix)"                "Quiet Storm (Remix ft. Lil Kim)"