1 Library Definition

library(tidyverse)
library(feather)
library(data.table)
library(lubridate) library(viridis)
library(DT)
library(magrittr)
library(RMySQL)

2 Read CSV Files into Data Frames

Data Files are sourced from https://www.kaggle.com/c/kkbox-music-recommendation-challenge/data. The files train.csv, songs.csv, test.csv and train.csv have been downloaded and saved into local drive.

3 Insert CSV Files into MySQL Tables

mydb <- dbConnect(MySQL(), user='data607', password='testpassword', dbname='music', host='localhost',use_unicode = TRUE, charset = 'utf8')

dbSendQuery(mydb, "LOAD DATA LOCAL INFILE 'members.csv'
                  INTO TABLE members
                  FIELDS TERMINATED by ','
                  LINES TERMINATED BY '\n'
                  IGNORE 1 LINES");

dbSendQuery(mydb, "LOAD DATA LOCAL INFILE 'train.csv'
                  INTO TABLE train
                  FIELDS TERMINATED by ','
                  LINES TERMINATED BY '\n'
                  IGNORE 1 LINES");


dbSendQuery(mydb, "LOAD DATA LOCAL INFILE 'test.csv'
                  INTO TABLE test
                  FIELDS TERMINATED by ','
                  LINES TERMINATED BY '\n'
                  IGNORE 1 LINES");

dbSendQuery(mydb, "LOAD DATA LOCAL INFILE 'songs.csv'
                  INTO TABLE songs
                  FIELDS TERMINATED by ','
                  LINES TERMINATED BY '\n'
                  IGNORE 1 LINES");
#ins_query <- "INSERT INTO songs(song_id, song_length, genre_ids,language) VALUES (songs$song_id, songs$song_length, songs$genre_ids,songs$language)";

#sqlExecute(mydb, ins_query, songs) ;

#remove(train)

4 Analysis of individual data frames

4.1 TRAIN

We will see how each variable in the train DF affects the target.

Let’s look at the data,

## Observations: 7,377,418
## Variables: 6
## $ msno               <chr> "FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kC...
## $ song_id            <chr> "BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6ti...
## $ source_system_tab  <chr> "explore", "my library", "my library", "my ...
## $ source_screen_name <chr> "Explore", "Local playlist more", "Local pl...
## $ source_type        <chr> "online-playlist", "local-playlist", "local...
## $ target             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1...

source_system_tab, source_screen_name and source_type are categorical.

Functions tgt_vs_col and tgt_vs_colcnt are defined as follows

## Readable labels
readble_labels <- theme(axis.text=element_text(size=13),
                     axis.title=element_text(size=15),
                     plot.title = element_text(hjust = 0.5))

# Function to dislpay count of each category of the column and plot how it affects target
tgt_vs_col  <-function(df, col_name, x , y, title)
                  {
                    tmp_df <- df %>% 
                          group_by_(col_name) %>% summarize(count = n(), mean_target = mean(target)) %>% 
                          arrange(desc(mean_target)) 
                  
                  df_graph <- tmp_df %>% ggplot(aes_string(col_name, "mean_target")) + 
                            geom_col(aes(fill=count)) + scale_fill_gradient(low='darkgreen', high = 'violet')+
                            labs(x = x, y = y,title= title) +
                            readble_labels  + coord_flip()
                          
                  print(df_graph)
                  return (tmp_df)
                  
                  }

# Function to group songs and user by count and check it agains mean_target
tgt_vs_colcnt <- function(df, col_name, x, y, title)
                    { 
  
                    df %>% 
                      group_by_(col_name) %>% summarize(count = n(), mean_target = mean(target)) %>% 
                      group_by(count) %>%  summarize(new_count = n(), avg_target = mean(mean_target)) %>% 
                      rename(no_of_items = new_count, occurence = count) %>% 
                      print %>%  ggplot(aes(occurence, avg_target)) +
                        geom_line(color='brown') +
                        geom_smooth(color='turquoise') +
                        labs(x = x,
                             y = y,
                             title= title) +
                        readble_labels

                  
                  }

4.1.1 Column Count of Train and its effect on target

4.1.1.1 source_system_tab

For this column ‘setting’ has the least count and ‘my library’ the most count in the data set. So songs are played mostly through my library, search, radio and discover. An interesting thing is that, if the song is from ‘my library’ then it is more likely to be replayed within a month and if it is from radio then it is less likely to be replayed within a month. ‘My library’ is where the user stores their songs locally and hence they really love that song, and that leads to high mean_ target. On the contrary, radio is a random shuffle of songs and hence the user likeability is not predefined leading to low mean_target.

tgt_vs_col(train, col_name = "source_system_tab",
                  x = 'Frequency',
                  y = 'Target',
                  title = 'Count of source_system_tab vs Target')

4.1.1.2 source_screen_name

Similar to column source system tab, it is found that screens associated with ‘my library’ have the most count. Looks like KKBox users prefer downloaded music than live streaming. Payment (purchasing a single song) has the highest repeatability but the count of that category is only 12 in the entire data set. Local songs in general has higher repeatability.

tgt_vs_col(train, col_name = "source_screen_name",
                  x = 'Frequency',
                  y = 'Target',
                  title = 'Count of source_screen_name vs Target')

4.1.1.3 source_type

Songs appearing in ‘local-playlist’ has a slightly more repeatability than ‘local-library’.
The user might have liked the song so much to include in their local playlist that positively affects repeatability.

tgt_vs_col(train, col_name = "source_type",
                  x = 'Frequency',
                  y = 'Target',
                  title = 'Count of source_type vs Target')
glimpse(train)
## Observations: 7,377,418
## Variables: 6
## $ msno               <chr> "FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kC...
## $ song_id            <chr> "BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6ti...
## $ source_system_tab  <chr> "explore", "my library", "my library", "my ...
## $ source_screen_name <chr> "Explore", "Local playlist more", "Local pl...
## $ source_type        <chr> "online-playlist", "local-playlist", "local...
## $ target             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1...

4.1.2 Song count and User count vs target

‘Song-id’ + ‘msno’ is unique in the train data set

4.1.2.1 Song count vs Target

Songs are grouped together and their count is checked against the ‘target’ variable. The count of a song present in the train data set is almost linearly associated with the mean_target. Assuming the train data set is randomnly drawn from the population, the more the song occurs the more it is discoverable by the user. This plots shows the relationship between discoverability vs mean_target.

You could see that there are 166766 songs that are appearing only once and has a lower mean_target and a single song that is appearing 13293 times that has a higher mean_target.

tgt_vs_colcnt(train, "song_id", "Song Occurence", "Target", "Song Occurence vs Target")
## # A tibble: 1,798 x 3
##   occurence no_of_items avg_target
##       <int>       <int>      <dbl>
## 1         1      166766  0.3776309
## 2         2       48444  0.3787363
## 3         3       26319  0.3860202
## 4         4       16789  0.3852671
## 5         5       12023  0.3876071
## # ... with 1,793 more rows

4.1.3 Count of target shows its balanced

train %>%   group_by(target) %>%   count

4.2 SONGS

Songs Data Frame

glimpse(songs) 
## Observations: 2,296,320
## Variables: 7
## $ song_id     <chr> "CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=", "o...
## $ song_length <int> 247640, 197328, 231781, 273554, 140329, 235520, 22...
## $ genre_ids   <chr> "465", "444", "465", "465", "726", "864|857|850|84...
## $ artist_name <chr> "<U+5F35><U+4FE1><U+54F2> (Jeff Chang)", "BLACKPINK", "SUPER JUNIOR", "...
## $ composer    <chr> "<U+8463><U+8C9E>", "TEDDY|  FUTURE BOUNCE|  Bekuh BOOM", NA, "<U+6E6F>...
## $ lyricist    <chr> "<U+4F55><U+555F><U+5F18>", "TEDDY", NA, "<U+5F90><U+4E16><U+73CD>", "Traditional", "Hayao M...
## $ language    <dbl> 3, 31, 31, 3, 52, 17, 3, 3, 3, 52, 10, 3, -1, 3, 5...

4.2.1 Top Items

Let’s see most frequent items in each category,

top <- function(df, col_name)
{
  temp_df <- df %>%  group_by_(col_name) %>%  count %>% arrange(desc(n)) %>% print
  
  return(temp_df)
}

4.2.1.1 Top Artists

artist_cnt <- top(songs, "artist_name")
## # A tibble: 222,363 x 2
## # Groups:   artist_name [222,363]
##                                                                artist_name
##                                                                      <chr>
## 1                                                          Various Artists
## 2      <U+8A3C><U+8072><U+97F3><U+6A02><U+5716><U+66F8><U+9928> ECHO MUSIC
## 3                                                             Billy Vaughn
## 4 <U+0E23><U+0E27><U+0E21><U+0E28><U+0E34><U+0E25><U+0E1B><U+0E34><U+0E19>
## 5                                                       Richard Clayderman
## # ... with 2.224e+05 more rows, and 1 more variables: n <int>

4.2.1.2 Top Lyricists

lyricist_count <- top(songs, "lyricist")
## # A tibble: 110,577 x 2
## # Groups:   lyricist [110,577]
##           lyricist       n
##              <chr>   <int>
## 1             <NA> 1945425
## 2      Traditional    1751
## 3         <U+2015>    1530
## 4 <U+6797><U+5915>    1044
## 5   Michael Ruland     832
## # ... with 1.106e+05 more rows

4.2.1.3 Top composers

composer_count <- top(songs, "composer")
## # A tibble: 329,297 x 2
## # Groups:   composer [329,297]
##                  composer       n
##                     <chr>   <int>
## 1                    <NA> 1071350
## 2             Neuromancer   17888
## 3   Johann Sebastian Bach   12105
## 4 Wolfgang Amadeus Mozart   10839
## 5           Marco Rinaldo   10803
## # ... with 3.293e+05 more rows

4.2.1.4 Top Language

language_count <- top(songs, "language")
## # A tibble: 11 x 2
## # Groups:   language [11]
##   language       n
##      <dbl>   <int>
## 1       52 1336694
## 2       -1  639467
## 3        3  106295
## 4       17   92518
## 5       24   41744
## # ... with 6 more rows
head(songs)

Genre_id is a multi label column with a minumum label of ‘one’ to a maximum label of ‘eight’. There are 192 unique genres. There are some missing values as well.

genids <- songs %>% separate(genre_ids, c("one", "two", "three", "four", "five", "six", "seven", "eight"), extra="merge")

genre_cnt <- genids %>%  select(one:eight)%>% gather(one:eight, key="nth_id", value="genre_ids", na.rm=TRUE) %>% 
                  group_by(genre_ids) %>% count %>% arrange(desc(n)) %>% print()
## # A tibble: 191 x 2
## # Groups:   genre_ids [191]
##   genre_ids      n
##       <chr>  <int>
## 1       465 589220
## 2       958 182836
## 3      1609 177258
## 4      2022 176531
## 5      2122 149608
## # ... with 186 more rows
4.2.1.4.1 Analysis of ‘song_length’
(min(songs$song_length)/1000)/60;(max(songs$song_length)/1000)/60
## [1] 0.003083333
## [1] 202.8975
#Records that have more than 15 minutes
 subset(songs, ((songs$song_length/1000)/60) > 15) %>% count

Song length range from 0.003 minutes to 202.89 minutes. There are 13623 records that have length more than 15 minutes.

songs %>% 
  mutate(song_length = song_length/6e4) %>%   ggplot(aes(song_length)) +   geom_histogram(binwidth = 0.25, fill='darkolivegreen') +
  labs(x='Song Length', y = 'Frequency', title = 'Distribution of song length') +   xlim(0, 15)

4.3 MEMBERS

Let’s look at the members df,

## Observations: 34,403
## Variables: 7
## $ msno                   <chr> "XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMT...
## $ city                   <int> 1, 1, 1, 1, 1, 13, 1, 1, 1, 1, 12, 1, 1...
## $ bd                     <int> 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 28, 0, 0...
## $ gender                 <chr> NA, NA, NA, NA, NA, "female", NA, NA, N...
## $ registered_via         <int> 7, 7, 4, 9, 4, 9, 4, 7, 7, 7, 9, 7, 7, ...
## $ registration_init_time <int> 20110820, 20150628, 20160411, 20150906,...
## $ expiration_date        <int> 20170920, 20170622, 20170712, 20150907,...

In members data fram, city, bd, gender, registered via are categorical and registration init and expiration date are dates.

city_1<-subset(members, city==1)
age_0 <- subset(members,bd==0)

The functions memb_colgrp and memb_dt_cnt are defined as follows

memb_colgrp <- function(df,col_name, x, y, title, xmin, xmax, ymin, ymax)
                    {
                      
                    temp_df <- df %>% group_by_(col_name) %>% count() %>% arrange(desc(n))
                    
                    df_plot <- temp_df %>% ggplot(aes_string(col_name, "n")) + geom_col(fill='goldenrod2') + 
                                    labs(x = x, y = y, title = title) + xlim(xmin, xmax) +  ylim(ymin, ymax) + readble_labels
                    
                    print(df_plot)
                    return(temp_df)

}

memb_dt_cnt <- function(df, col_name, x, y, title)
{
                            df %>%   group_by_(month = month(col_name), year = year(col_name)) %>% 
                                count() %>% ungroup %>% 
                                mutate(date = as.Date(paste(as.character(year), as.character(month), '01', sep='-')))
                                ggplot(aes(date, n))+ geom_line(color='goldenrod2', size=1) +
                                labs(x = x,  y = y, title= title) +
                                xlim(xmin, xmax) +  readble_labels
}

4.3.1 Distribution of city, bd(age), gender

4.3.1.1 City

glimpse(city_1)
## Observations: 19,445
## Variables: 7
## $ msno                   <chr> "XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMT...
## $ city                   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ bd                     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ gender                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ registered_via         <int> 7, 7, 4, 9, 4, 4, 7, 7, 7, 7, 7, 3, 7, ...
## $ registration_init_time <int> 20110820, 20150628, 20160411, 20150906,...
## $ expiration_date        <int> 20170920, 20170622, 20170712, 20150907,...
members%>%group_by(city)%>%count

The value ‘1’ have the most count. But the number 19445 seems suspicious as it is close to the number of records with zero age.
City = 1 is also far from other city counts.

4.3.1.2 Age

glimpse(age_0)
## Observations: 19,932
## Variables: 7
## $ msno                   <chr> "XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMT...
## $ city                   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ bd                     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ gender                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ registered_via         <int> 7, 7, 4, 9, 4, 4, 7, 7, 7, 7, 7, 3, 7, ...
## $ registration_init_time <int> 20110820, 20150628, 20160411, 20150906,...
## $ expiration_date        <int> 20170920, 20170622, 20170712, 20150907,...

There are observations with age greater than 1000 and there are observations with age less than or equal to 0 and those are ouliers as described in the data dictionary. Sorted bd vs Frequency is shown in the tibble as well as the plot. There are 19932 records with 0 as age. This could be either outliers or missing values.
Plotting in the age range 1 -100 to show the real distribution.

memb_colgrp(members, "bd", "Age", "Frequency", "Age Distribution", 1, 100, 0, 1000)
## Warning: Removed 16 rows containing missing values (position_stack).

4.3.1.3 Gender

As we see the counts of ‘target’ values in the train data frame, in members, the Male and female counts are almost equal. We have a lot of missing gender (19902).

members %>% group_by(gender) %>%  count

4.3.1.4 Registered_via

members %>% group_by(registered_via)%>% count

Registration methods seem to be dominated mainly by 4,7,9 and 3

Convert the data types of the variables ‘registration_init_time’ and ‘expiration_date’ from int to date.

# This is one of the items in the checklist. ymd is used to convert
members %<>%  mutate(registration_init_time = ymd(registration_init_time),
                     expiration_date = ymd(expiration_date))
glimpse(members)  # Notice the changes inthe data type
## Observations: 34,403
## Variables: 7
## $ msno                   <chr> "XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMT...
## $ city                   <int> 1, 1, 1, 1, 1, 13, 1, 1, 1, 1, 12, 1, 1...
## $ bd                     <int> 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 28, 0, 0...
## $ gender                 <chr> NA, NA, NA, NA, NA, "female", NA, NA, N...
## $ registered_via         <int> 7, 7, 4, 9, 4, 9, 4, 7, 7, 7, 9, 7, 7, ...
## $ registration_init_time <date> 2011-08-20, 2015-06-28, 2016-04-11, 20...
## $ expiration_date        <date> 2017-09-20, 2017-06-22, 2017-07-12, 20...

4.3.2 Signup vs Expiration

There are members signed up in 2004. But mostly users signed up between later part of 2016 and early part of 2017.

registration_cnt <- members %>% 
    group_by(month = month(registration_init_time), year = year(registration_init_time)) %>% 
    count() %>%  ungroup %>%  
   mutate(date = as.Date(paste(as.character(year), as.character(month), '01', sep='-'))) %>% 
    arrange(desc(n)) %>%  print
## # A tibble: 155 x 4
##   month  year     n       date
##   <dbl> <dbl> <int>     <date>
## 1     1  2017  2573 2017-01-01
## 2    12  2016  2545 2016-12-01
## 3     2  2017  2109 2017-02-01
## 4    11  2016  1432 2016-11-01
## 5    10  2016   978 2016-10-01
## # ... with 150 more rows
expiration_count <- members %>% 
    group_by(month = month(expiration_date), year = year(expiration_date)) %>% 
    count() %>% ungroup %>% 
    mutate(date = as.Date(paste(as.character(year), as.character(month), '01', sep='-'))) %>% 
    arrange(desc(n)) %>%  print
## # A tibble: 138 x 4
##   month  year     n       date
##   <dbl> <dbl> <int>     <date>
## 1     9  2017 10681 2017-09-01
## 2    10  2017  4924 2017-10-01
## 3    12  2016  2407 2016-12-01
## 4     1  2017  2260 2017-01-01
## 5     2  2017  1918 2017-02-01
## # ... with 133 more rows
registration_cnt %>% 
  left_join(expiration_count, by="date") %>% 
  ggplot() + geom_line(aes(date, n.x), color='goldenrod2') +
  geom_line(aes(date, n.y), color='mediumorchid') +
  labs(y="Frequency", title="Registration and Expiration Distribution")+
  readble_labels

4.3.3 Missingness in members

For City == 1, the values of gender and age shows that there are missing values. There are 18516 records that match all three condition. There is certainly relationship between these missingness.

members %>% 
  mutate(cit_gen_age = if_else(((city == 1) & (bd == 0) & (gender == "")), 1, 0),
         cit_gen =  if_else(((city == 1) & (gender == "")), 1, 0),
         cit_age = if_else(((city == 1) & (bd == 0)), 1, 0),
         gen_age =  if_else(((bd == 0) & (gender == "")), 1, 0)) %>% 
  summarize(city_gender_age = sum(cit_gen_age),
            city_gender = sum(cit_gen),
            city_age = sum(cit_age),
            gender_age =sum(gen_age))

4.4 TEST

Compare the test and train data frames.

4.4.1 Distribution of test and train on columns

train_test_graph <- function(train, test, col_name, x, y)
  {
  test %>%  group_by_(col_name) %>%   summarize(count = n()) %>% 
  left_join(train %>% group_by_(col_name) %>% summarize(count = n()) , by=col_name) %>% 
  mutate(ratio = count.x/count.y) %>% 
  rename(test_cnt = count.x, train_cnt = count.y) %>% 
  arrange(ratio) %>%   print %>% 
  ggplot() +
  geom_col(aes_string(col_name, "train_cnt"), fill='brown', alpha = 0.5) +
  geom_col(aes_string(col_name, "test_cnt"), fill='blue', alpha = 0.5) +
  coord_flip() +  labs(x = x, y= y)+  readble_labels
}

4.4.1.1 Source_system_tab

Train had more records from with value ‘my library’ compared to test.

x<-'source system tab'
y<-'Test/Train record Count'

test %>%  group_by_("source_system_tab") %>%   summarize(count = n()) %>% 
  left_join(train %>% group_by_("source_system_tab") %>% summarize(count = n()) , by='source_system_tab') %>% 
  mutate(ratio = count.x/count.y) %>% 
  rename(test_cnt = count.x, train_cnt = count.y) %>% 
  arrange(ratio) %>%   print
## # A tibble: 10 x 4
##   source_system_tab test_cnt train_cnt     ratio
##               <chr>    <int>     <int>     <dbl>
## 1        my library  1019492   3684730 0.2766802
## 2              <NA>     5096     18371 0.2773937
## 3          settings      633      2200 0.2877273
## 4      notification     2124      6185 0.3434115
## 5           explore    66023    167949 0.3931134
## # ... with 5 more rows

4.4.1.2 Source_screen_name

train_test_graph(train, test, col_name = "source_screen_name", "Source Screen Name", "Test/Train Count")
## # A tibble: 23 x 4
##    source_screen_name test_cnt train_cnt     ratio
##                 <chr>    <int>     <int>     <dbl>
## 1 Local playlist more   845115   3228202 0.2617912
## 2             Concert       13        47 0.2765957
## 3   My library_Search     2114      6451 0.3277011
## 4        Discover New     5277     15955 0.3307427
## 5          My library    25559     75980 0.3363912
## # ... with 18 more rows
## Warning: Removed 2 rows containing missing values (position_stack).

5 Feature Engineering

5.1 Songs Features

Features in the song data frame that shows the frequency of a particular item in the data set.

# <> is from magrittr package that is used for assiging it back the result
songs %<>% 
  left_join(artist_cnt, by='artist_name') %>% left_join(lyricist_count, by='lyricist') %>% 
  left_join(composer_count, by='composer') %>%  left_join(language_count, by='language') %>% 
  rename(art_cnt = n.x, lyr_cnt = n.y, cmp_cnt = n.x.x, lng_cnt = n.y.y)

glimpse(songs)
## Observations: 2,296,320
## Variables: 11
## $ song_id     <chr> "CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=", "o...
## $ song_length <int> 247640, 197328, 231781, 273554, 140329, 235520, 22...
## $ genre_ids   <chr> "465", "444", "465", "465", "726", "864|857|850|84...
## $ artist_name <chr> "<U+5F35><U+4FE1><U+54F2> (Jeff Chang)", "BLACKPINK", "SUPER JUNIOR", "...
## $ composer    <chr> "<U+8463><U+8C9E>", "TEDDY|  FUTURE BOUNCE|  Bekuh BOOM", NA, "<U+6E6F>...
## $ lyricist    <chr> "<U+4F55><U+555F><U+5F18>", "TEDDY", NA, "<U+5F90><U+4E16><U+73CD>", "Traditional", "Hayao M...
## $ language    <dbl> 3, 31, 31, 3, 52, 17, 3, 3, 3, 52, 10, 3, -1, 3, 5...
## $ art_cnt     <int> 352, 10, 375, 349, 2419, 2419, 445, 163, 279, 100,...
## $ lyr_cnt     <int> 167, 8, 1945425, 103, 1751, 29, 1945425, 9, 17, 9,...
## $ cmp_cnt     <int> 3, 3, 1071350, 25, 10106, 223, 8, 249, 51, 9, 104,...
## $ lng_cnt     <int> 106295, 39201, 39201, 106295, 1336694, 92518, 1062...

5.2 Train Features

count_frame <- function(df, col_name, new_name)
{
  return(df %>% group_by_(col_name) %>% count %>% rename_(.dots=setNames('n', new_name)))
}
train_song_cnt <- count_frame(train, 'song_id', 'song_cnt')
train_sst <- count_frame(train, 'source_system_tab', 'sst_cnt')
train_ssn <- count_frame(train, 'source_screen_name', 'ssn_cnt')
train_st <- count_frame(train, 'source_type', 'st_cnt')
train_song_cnt%>%arrange(desc(song_cnt))%>%head(10)
train_sst%>%arrange(desc(sst_cnt))%>%head(10)
train_ssn%>%arrange(desc(ssn_cnt))%>%head(10)
train_st%>%arrange(desc(st_cnt))%>%head(10)

Bibliography : https://www.kaggle.com/adiamaan/eda-and-feature-engineering