Introduction

We will use a neural network model to ease music executives' task of sorting through large quantities of song demo submissions at random. The information at hand is that these busy executives spend an average of 6 minutes evaluating a song that is unlikely to be popular, and an average of 10 minutes evaluating a good song that is expected to be popular in its genre. The executives have just 120 minutes in which to make their evaluations.

library(tidyverse)
library(caret)
library(neuralnet)

The dataset

For this analysis, three datasets are available: two will be used to build the model, and for the third we will predict whether each song will be popular in its genre.

First, we load the datasets and check each for missing observations, then we merge the two training datasets by track ID.

songFts<-read.csv(file.choose())
songGenPop<-read.csv(file.choose())
newToRank<-read.csv(file.choose())
#Check for missing observations in the 3 datasets
sum(is.na(songFts));sum(is.na(songGenPop));sum(is.na(newToRank))
## [1] 0
## [1] 0
## [1] 0
#1. Merging the data and removing duplicates
songData<-left_join(songGenPop,songFts, by="track_id") %>% distinct()
songData2<-songData

Visual exploration of data

Before proceeding with the analysis, we do some data visualization to gain insight into the data.

#Data Viz
#Select the 20 genres with the most songs, then tally by popularity
topGenres<-songData %>% count(genre, sort = TRUE) %>% head(20) %>% pull(genre)
xy<-songData %>% 
        filter(genre %in% topGenres) %>% 
        count(genre, popular_in_genre, name = "n")
xy$popular_in_genre<- ifelse(xy$popular_in_genre==1,"Popular","Not Popular")

plotly::ggplotly(ggplot(data=xy,aes(x = reorder(genre, n), n, fill=popular_in_genre)) +
                         geom_bar(stat = "identity" )+
                         coord_flip() +
                         ggtitle("Number of songs for top 20 genres") +
                         xlab("Genres")+ ylab("Number of songs")+ labs(fill ="Popular In Genre")+
                         scale_fill_viridis_d(direction = -1, option = "D")
)

If the executives picked songs at random with replacement, the probability that they pick a song which is popular in its genre would be about 0.32.

knitr::kable(songData %>% 
        group_by(popular_in_genre) %>% 
        summarise(Proportion = n()/nrow(songData)))
| popular_in_genre | Proportion |
|-----------------:|-----------:|
|                0 |   0.678738 |
|                1 |   0.321262 |
ggplot(songData, aes(x=as.factor(popular_in_genre))) +
        ggtitle("Popular in Genre") + xlab("PopInGenre") +
        geom_bar(aes(y = 100*(..count..)/sum(..count..)), 
                 width = 0.5, fill="dark blue") + 
        ylab("Percentage") + coord_flip() + 
        theme_minimal()

In practice, the executives will sample without replacement, and when sampling without replacement the probability is no longer the same after each song has been drawn.
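
A small calculation makes this concrete. Assuming the observed split in the merged data (about 32% of the songs are popular in their genre), each draw without replacement slightly shifts the probability for the songs that remain:

#Sketch: probability that the first three songs drawn are all popular, without replacement
popular<-sum(songData2$popular_in_genre==1)
total<-nrow(songData2)
(popular/total)*((popular-1)/(total-1))*((popular-2)/(total-2))
#With replacement the same event would have probability (popular/total)^3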

Data preprocessing

Before the analysis we need some data preprocessing. We convert the genre feature into dummy variables because the neuralnet package does not support factors as features.

# Preprocessing
songData$genre<-as.factor(songData$genre)
PopularInGenre<-songData[5]
#Coding genre into dummy variables
categoricals<-model.matrix(~0 + genre, data = songData)
songData<-songData[,-c(1,2,3,4,6,7)] #Picking columns needed for analysis
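
As a quick illustration of how model.matrix(~0 + genre) encodes a factor, here is a toy example with made-up genre levels; each level becomes its own 0/1 indicator column:

#Toy example (hypothetical levels) of one-hot encoding with model.matrix
toy<-data.frame(genre=as.factor(c("hiphop","soul","hiphop")))
model.matrix(~0 + genre, data = toy)
#Result: a matrix with columns genrehiphop and genresoul,
#where each row carries a 1 in the column matching its genre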

We proceed to scale the data with min-max normalization so that every numeric feature lies in the range 0-1.

#minMax Normalization
minMax <- function(x) {
        return ((x - min(x)) / (max(x) - min(x)))
}
mM<-sapply(songData, function(x){minMax(x)})
songData<-(cbind(mM, categoricals))

We rename some of the new dummy columns (those whose genre names contain special characters) so they are easy to recognize and reference later.

colnames(songData)[18]<-"genrealternativeRnb"
colnames(songData)[31]<-"genredeepChillOut"
colnames(songData)[48]<-"genregoGo"
colnames(songData)[53]<-"genreJrap"
colnames(songData)[63]<-"genreRnb"
colnames(songData)[77]<-"genreVpop"
#Convert it back to dataframe
songData<-data.frame(songData)
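
As an alternative to renaming columns by hard-coded positions, all names can be sanitised in one step with base R's make.names(); note that it substitutes dots for special characters rather than the camelCase names used above, so this is a sketch of an alternative rather than part of the analysis:

#Alternative: make every column name syntactically valid in one step
#(names containing "&" or "-" get their special characters replaced by dots)
colnames(songData)<-make.names(colnames(songData), unique = TRUE)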

Splitting the dataset for validation

We use the caret package to create a data partition so we can randomly split the data, using the feature “popular_in_genre” as our response. We also set a random seed for reproducibility.

set.seed(40)
trIndex<- createDataPartition(songData$popular_in_genre, p=.7, list=FALSE) #70/30
trainSet<- songData[trIndex,]
testSet<- songData[-trIndex,]

Since the data frame has a large number of features, we will not type every feature into the formula manually. Instead, we build the formula programmatically from the column names.

# Set up formula
frm <- as.formula(paste(names(trainSet)[1], " ~ ", 
                        paste(names(trainSet)[-1], collapse= "+")))
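
Base R's reformulate() builds the same formula without pasting strings by hand; an equivalent one-liner:

#Equivalent formula: first column is the response, remaining columns are predictors
frm<-reformulate(names(trainSet)[-1], response = names(trainSet)[1])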

We build two models with this formula, varying the settings to find the better-performing one.

set.seed(40)
#The model in different params to select best
nNet1<-neuralnet(frm, data = trainSet, hidden = 2,
                threshold = 1, linear.output = F, err.fct = "ce",
                act.fct = "logistic", algorithm = "rprop-")

nNet2<-neuralnet(frm, data = trainSet, hidden = 3,
                 threshold = 1, linear.output = F, err.fct = "ce",
                 act.fct = "logistic", algorithm = "rprop+")

In the first model (nNet1), we use a single hidden layer with 2 neurons and resilient backpropagation without weight backtracking (rprop-). In the second model (nNet2), we use a single hidden layer with 3 neurons and resilient backpropagation with weight backtracking (rprop+).

We apply both models to the training set and compute their accuracy.

#Prediction1 on train data
pred<- compute(nNet1, trainSet[,-1])$net.result
pred<- ifelse(pred >=.5,1,0)

#Accuracy1
crossTab<- table(pred,trainSet[,1]);crossTab
##     
## pred    0    1
##    0 1604  537
##    1  130  281
accuracy<- sum(diag(crossTab))/sum(crossTab);accuracy
## [1] 0.7386364
#Prediction2 on train data
pred2<-compute(nNet2, trainSet[,-1])$net.result
pred2<- ifelse(pred2 >=.5,1,0)

#Accuracy2 [slightly lower than nNet1 on the training set]
crossTab2<- table(pred2,trainSet[,1]);crossTab2
##      
## pred2    0    1
##     0 1576  520
##     1  158  298
accuracy2<- sum(diag(crossTab2))/sum(crossTab2);accuracy2
## [1] 0.734326

We apply the first model, which had the higher training accuracy, to our test set to estimate its accuracy on unseen data.

#Prediction on test data
tPred<-compute(nNet1, testSet[,-1])$net.result
tPred<- ifelse(tPred >=.5,1,0)

#Accuracy
tcrossTab<- table(tPred,testSet[,1]);tcrossTab
##      
## tPred   0   1
##     0 648 273
##     1  92  80
taccuracy<- sum(diag(tcrossTab))/sum(tcrossTab);taccuracy
## [1] 0.6660567
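
Accuracy alone can hide the class imbalance noted earlier (only about 32% of songs are popular in their genre). Since caret is already loaded, its confusionMatrix() also reports sensitivity, specificity and precision; a quick sketch:

#Fuller picture than raw accuracy, treating "1" (popular in genre) as the positive class
confusionMatrix(factor(tPred, levels = c(0,1)),
                factor(testSet[,1], levels = c(0,1)),
                positive = "1")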

Ranking songs by their predicted popularity

To rank songs by predicted popularity, we apply the model to the entire dataset of 3,645 tracks and use the model output as each song's predicted popularity score.

#3. Ranking songs: we apply nNet2 to the full song data
Ranks<- compute(nNet2, songData[,-1])$net.result
#Keep Ranks numeric so that desc() sorts by value rather than by character string
RankData<-data.frame(TrackID = songData2$track_id,
                     ArtistName = songData2$artist_name,
                     TrackTitle = songData2$track_name.x,
                     Genre = songData2$genre,
                     Ranks = as.numeric(Ranks))

RankData <- RankData %>%
        arrange(desc(Ranks))

The table below displays the top 15 songs in the dataset, ranked by their predicted popularity.

#Top 15 songs
knitr::kable(head(RankData,15))      
|      | TrackID | ArtistName | TrackTitle | Genre | Ranks |
|------|---------|------------|------------|-------|-------|
| 3188 | 3oh6SCCeLuXhFpEyepla6G | Blueface | Thotiana | hiphop | 0.920879217937792 |
| 2658 | 1ThmUihH9dF8EV08ku5AXN | Ski Mask The Slump God | Faucet Failure | hiphop | 0.920650726822785 |
| 1206 | 6flur5o18COKeafnjLHRiJ | Lil Pump | Be Like Me (feat. Lil Wayne) | hiphop | 0.920073198847862 |
| 2711 | 1wJRveJZLSb1rjhnUHQiv6 | A Boogie Wit da Hoodie | Swervin (feat. 6ix9ine) | hiphop | 0.919854947897454 |
| 2192 | 01vv2AjxgP4uUyb8waYO5Y | Twenty One Pilots | Morph | reading | 0.919378167954494 |
| 1138 | 4bi9YrkOvmg4Eb7h00AW2c | Rich The Kid | Fall Threw (feat. Young Thug & Gunna) | hiphop | 0.919336210970099 |
| 2653 | 79OEIr4J4FHV0O3KrhaXRb | Rich The Kid | Splashin | hiphop | 0.919313130898429 |
| 2611 | 577YBGuskWkVDCxZrLRB4v | Lil Baby | Pure Cocaine | hiphop | 0.919186198335507 |
| 847  | 6fIMvZ2JA9kuQG8IuDgWMM | YBN Cordae | Have Mercy | hiphop | 0.918942026344218 |
| 2645 | 1jo3TwNGCXfNdB5uba3ZVv | Eminem | Killshot | hiphop | 0.918896048327624 |
| 2494 | 116H0KvKr2Zl4RPuVBruDO | Bad Bunny | MIA (feat. Drake) | skyroom | 0.918618896551828 |
| 1171 | 0fcq51a3gOI6gPvzc0YLsk | Post Malone | Wow. - Remix | hiphop | 0.918111174607048 |
| 1572 | 1TULEHv0wpIZYI7zqmhzEg | Octavian | Bet (feat. Skepta & Michael Phantom) | hiphop | 0.918102975513267 |
| 1190 | 1q9jq5X5vwmewjOa2mHtQ1 | Gunna | Same Yung Nigga (feat. Playboi Carti) | hiphop | 0.917991970703094 |
| 2475 | 2IRZnDFmlqMuOrYOLnZZyc | Meek Mill | Going Bad (feat. Drake) | hiphop | 0.917814238603503 |

Using the neural network model, we help the executives select songs that are likely to be popular in their genre. The model is roughly 66-74% accurate, a clear improvement over the 32% chance of success when picking at random, although overall accuracy is not the same as the hit rate among the songs the model selects. With the model, the executives no longer have to select songs at random.

Our model helps the executives shrink the pool of songs to review while raising the probability that a reviewed song is popular in its genre. This saves time otherwise spent on weak submissions: they review fewer songs in total, but a larger number of good ones.
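
To make the time saving concrete, here is a back-of-the-envelope sketch using the figures from the introduction (6 minutes for a weak song, 10 minutes for a good one, 120 minutes in total). The model-guided hit rate used below is illustrative only, not a measured value:

#Expected number of good songs reviewed in a 120-minute session
#p is the probability that a reviewed song turns out to be popular in its genre
goodSongsReviewed<-function(p, budget = 120, tGood = 10, tBad = 6) {
        timePerSong<-p*tGood + (1-p)*tBad  #expected minutes per song
        songs<-budget/timePerSong          #expected number of songs reviewed
        p*songs                            #expected good songs among them
}
goodSongsReviewed(p = 0.32)  #random selection: about 5 good songs
goodSongsReviewed(p = 0.70)  #hypothetical model-guided hit rate: about 9-10 good songs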

Using our model to select the best 15 new songs for review

Now that we have the data for the song submissions, it is time to put our model to use. We cannot apply the model to the new data directly; some feature engineering is needed first. Because the new submissions do not cover every genre present in the training data, we add zero-filled dummy columns for the missing genres so that the new data matches the model's inputs. Finally, we apply the model to the completed dataset and make predictions.

#5. Probability of being popular [ranking new songs]
#USing the new data to rank, we engineer the data first
newToRank$genre<-as.factor(newToRank$genre)
ID<-newToRank[1]
Genre<-newToRank$genre
newToRank<-newToRank[,-1]
mMat<-model.matrix(~0 + genre, data = newToRank)

newToRank<-newToRank[,-14]
mMax<-sapply(newToRank, function(x){minMax(x)})
new<-cbind(mMax,mMat)

colnames(new)[46]<-"genregoGo"
colnames(new)[50]<-"genreJrap"
colnames(new)[30]<-"genredeepChillOut"
colnames(new)[58]<-"genreRnb"
colnames(new)[17]<-"genrealternativeRnb"


## Note that the new data contains fewer genres
## than the training data

#Adding lost features in the dummies
lostMat<-setdiff(colnames(songData[,2:80]), colnames(new))

for (i in seq_along(lostMat)) {
        temp.matrix <- matrix(rep(0, nrow(new)), ncol = 1)
        colnames(temp.matrix) <- lostMat[i]
        new <- cbind(new, temp.matrix)
}
#Use the model to predict new data
probRank<-compute(nNet2, new)$net.result
Rank<-cbind(probRank,ID, Genre)


#5. Best 15 songs to be evaluated
Rank1<- Rank %>% 
        arrange(desc(probRank)) %>% 
        head(15)

The table below shows the top 15 songs predicted by our model to be popular in their genre.

knitr::kable(Rank1)
|     | probRank  | track_id               | Genre             |
|-----|-----------|------------------------|-------------------|
| 16  | 0.9213698 | 370BNSvDvj8ceBlUV1qnD7 | tropicalhouse     |
| 205 | 0.9213698 | 1NFnSycFY2NNT4NjB9SUNl | tropicalhouse     |
| 71  | 0.9213698 | 4gK4g9BSjc0s8oF8nlUbJ9 | tropicalhouse     |
| 280 | 0.9213698 | 3vbkMO3A7nKlEsz2Uay8nU | tropicalhouse     |
| 214 | 0.9213698 | 6GyR41k0C1zaWtehhZuR1X | tropicalhouse     |
| 117 | 0.9205441 | 1wJRveJZLSb1rjhnUHQiv6 | j-rap             |
| 33  | 0.9143601 | 58q2HKrzhC3ozto2nDdN4z | finnishhiphop     |
| 189 | 0.9096627 | 6YZQx90VEuzf0lRY9TYh7j | soul              |
| 74  | 0.9033456 | 01IQ4aQgOf0Kkq3a273hmO | j-rap             |
| 121 | 0.9028643 | 3iIZvct75tkk1CwbqVjsry | porro             |
| 20  | 0.8776455 | 5qe9D2JqiNkhHksPhkVl0y | r&b               |
| 145 | 0.8767790 | 2cytBOLpwFRX7J9URCrFIe | j-rap             |
| 171 | 0.8606896 | 5qe9D2JqiNkhHksPhkVl0y | soul              |
| 42  | 0.8327934 | 3GCdLUSnKSMJhs4Tj6CV3s | urbancontemporary |
| 190 | 0.7865318 | 6JqpFboOH2bq9pzaYbkKBn | channelpop        |