R Notebook

# Load library
library(ggplot2)
library(plyr)

## Warning: package 'plyr' was built under R version 3.4.2

library(plotly)

## Warning: package 'plotly' was built under R version 3.4.2

## 
## Attaching package: 'plotly'

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(tibble)
library(data.table)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load data sets ----
# NOTE(review): the data directory is hard-coded and setwd() changes the
# working directory for the rest of the session. It is kept so that every
# relative path used by this notebook keeps resolving the same way.
setwd('D:/WSDM_task2/')
# fread() returns a data.table; as_tibble() converts it to a tibble.
# as_tibble() is the supported spelling of the deprecated as.tibble().
train <- as_tibble(fread('train.csv'))

## 
Read 0.0% of 7377418 rows
Read 15.6% of 7377418 rows
Read 29.4% of 7377418 rows
Read 44.1% of 7377418 rows
Read 57.2% of 7377418 rows
Read 69.1% of 7377418 rows
Read 84.7% of 7377418 rows
Read 7377418 rows and 6 (of 6) columns from 0.905 GB file in 00:00:14

# Song metadata, read with fread() and converted to a tibble.
# as_tibble() is the supported spelling of the deprecated as.tibble().
songs <- as_tibble(fread('songs.csv'))

## 
Read 0.0% of 2296833 rows
Read 25.7% of 2296833 rows
Read 33.1% of 2296833 rows
Read 48.3% of 2296833 rows
Read 64.0% of 2296833 rows
Read 66.2% of 2296833 rows
Read 89.3% of 2296833 rows
Read 2296320 rows and 7 (of 7) columns from 0.207 GB file in 00:00:13

# Member attributes and the test set, read with fread() and converted to
# tibbles. as_tibble() is the supported spelling of the deprecated as.tibble().
members <- as_tibble(fread('members.csv'))
test <- as_tibble(fread('test.csv'))

## 
Read 0.0% of 2556790 rows
Read 41.8% of 2556790 rows
Read 84.9% of 2556790 rows
Read 2556790 rows and 6 (of 6) columns from 0.324 GB file in 00:00:05

# Basic cardinality statistics of the raw train and test sets ----
# Each count is the number of distinct values in one column.

print("Unique songs in train set:")

## [1] "Unique songs in train set:"

length(unique(train$song_id))

## [1] 359966

print("Unique users in train set:")

## [1] "Unique users in train set:"

length(unique(train$msno))

## [1] 30755

print("Unique songs in test set:")

## [1] "Unique songs in test set:"

length(unique(test$song_id))

## [1] 224753

# Attach song metadata to the train and test rows ----
# Natural inner joins: the key is whatever columns the two tables share, as
# reported in the "Joining, by" messages. Rows without a match in `songs`
# are dropped.
train_songs <- inner_join(x = train, y = songs)

## Joining, by = "song_id"

test_songs <- inner_join(x = test, y = songs)

## Joining, by = "song_id"

# Distinct artists / languages / genre strings in the joined train set
print("Unique artists in train set:")

## [1] "Unique artists in train set:"

length(unique(train_songs$artist_name))

## [1] 40582

print("Unique languages in train set:")

## [1] "Unique languages in train set:"

length(unique(train_songs$language))

## [1] 11

print("Unique genres in train set:")

## [1] "Unique genres in train set:"

length(unique(train_songs$genre_ids))

## [1] 573

# Same three counts for the joined test set
print("Unique artists in test set:")

## [1] "Unique artists in test set:"

length(unique(test_songs$artist_name))

## [1] 27563

print("Unique languages in test set:")

## [1] "Unique languages in test set:"

length(unique(test_songs$language))

## [1] 11

print("Unique genres in test set:")

## [1] "Unique genres in test set:"

length(unique(test_songs$genre_ids))

## [1] 502

# How many distinct test-set values never occur in the train set ----
# Each figure is (distinct values in test) - (distinct values shared by train
# and test).
#
# NOTE(review): the song and user counts take the distinct values from the
# un-joined `test` table but the shared values from the song-joined tables
# (`train_songs`, `test_songs`). Any id whose rows were dropped by the inner
# join with `songs` is therefore counted as "not in train" even if it does
# occur in `train`. The artist/language/genre counts use the joined tables on
# both sides and do not have this mismatch. Confirm whether this is intended.
print("Numbers of unique songs in test set is excluded in train set:")

## [1] "Numbers of unique songs in test set is excluded in train set:"

length(unique(test$song_id)) -
  length(intersect(train_songs$song_id, test_songs$song_id))

## [1] 59881

print("Numbers of unique users in test set is excluded in train set:")

## [1] "Numbers of unique users in test set is excluded in train set:"

length(unique(test$msno)) -
  length(intersect(train_songs$msno, test_songs$msno))

## [1] 3648

print("Numbers of unique artists in test set is excluded in train set:")

## [1] "Numbers of unique artists in test set is excluded in train set:"

length(unique(test_songs$artist_name)) -
  length(intersect(train_songs$artist_name, test_songs$artist_name))

## [1] 5790

print("Numbers of unique languages in test set is excluded in train set:")

## [1] "Numbers of unique languages in test set is excluded in train set:"

length(unique(test_songs$language)) -
  length(intersect(train_songs$language, test_songs$language))

## [1] 0

print("Numbers of unique genres in test set is excluded in train set:")

## [1] "Numbers of unique genres in test set is excluded in train set:"

length(unique(test_songs$genre_ids)) -
  length(intersect(train_songs$genre_ids, test_songs$genre_ids))

## [1] 36

# Build the full listening table: train rows + member + song attributes ----
# Two natural inner joins; the keys are reported in the messages below.
listens <- inner_join(x = train, y = members)

## Joining, by = "msno"

listens <- inner_join(x = listens, y = songs)

## Joining, by = "song_id"

print("table joined successfully")

## [1] "table joined successfully"

# Keep only the rows whose target is 1 (rows with a missing target are dropped)
listens1 <- subset(listens, target == 1)

# Bar chart: number of target == 1 rows per source_system_tab value
source_system_tab1 <- as.data.frame(table(listens1$source_system_tab))
plot_ly(
  source_system_tab1,
  x = ~Var1,
  y = ~Freq,
  type = "bar",
  text = "Source_System_Tab Frequency - Target 1"
)

# Bar chart: number of target == 1 rows per source_type value
source_type1 <- as.data.frame(table(listens1$source_type))
plot_ly(
  source_type1,
  x = ~Var1,
  y = ~Freq,
  type = "bar",
  text = "Source_Type Frequency - Target 1"
)

# Percentage of rows per source category that are NOT in the target == 1 ----
# subset, i.e. (all rows - target == 1 rows) * 100 / all rows, computed for
# source_system_tab, source_screen_name and source_type.

# Row counts per category over the whole listening table
total.source_system_tab <- table(listens$source_system_tab)
total.source_screen_name <- table(listens$source_screen_name)
total.source_type <- table(listens$source_type)

# Row counts per category within the target == 1 subset
source_system_tab1.b <- table(listens1$source_system_tab)
source_screen_name1.b <- table(listens1$source_screen_name)
source_type1.b <- table(listens1$source_type)

# Converting the resulting tables yields the columns Var1 (category) and
# Freq (percentage) used by the plots below.
c <- as.data.frame(
  (total.source_system_tab - source_system_tab1.b) * 100 /
    total.source_system_tab
)
d <- as.data.frame(
  (total.source_screen_name - source_screen_name1.b) * 100 /
    total.source_screen_name
)
e <- as.data.frame(
  (total.source_type - source_type1.b) * 100 / total.source_type
)

# Horizontal bar charts: percentage on x, category on y
plot_ly(c, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of Source_System_Tab Type")

plot_ly(d, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of Source_Screen_Name Type")

plot_ly(e, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of App Source Type")

# Share of female users among the target == 1 listens, per source ----
# For every category of source_system_tab, source_screen_name and source_type
# this computes
#   (target == 1 rows by users with gender "female") * 100 /
#   (all target == 1 rows)
# The earlier version plotted (total - female) * 100 / total, which is the
# share of listens NOT made by female users, under a "for a Female User"
# label.
listens1 <- subset(listens, target == 1)
listens2 <- subset(listens, target == 1 & gender == "female")

# Row counts per category over all target == 1 rows
total.source_system_tab <- table(listens1$source_system_tab)
total.source_screen_name <- table(listens1$source_screen_name)
total.source_type <- table(listens1$source_type)

# Row counts per category for female users. The factor levels are taken from
# the totals so both tables always have the same categories in the same order,
# even when a category has no female listens (a plain table() would drop it
# and the division below would fail or misalign).
source_system_tab.f <- table(
  factor(listens2$source_system_tab, levels = names(total.source_system_tab))
)
source_screen_name.f <- table(
  factor(listens2$source_screen_name, levels = names(total.source_screen_name))
)
source_type.f <- table(
  factor(listens2$source_type, levels = names(total.source_type))
)

# data.frame() on a one-dimensional table yields the columns Var1 (category)
# and Freq (percentage) used by the plots below.
c <- data.frame(source_system_tab.f * 100 / total.source_system_tab)
d <- data.frame(source_screen_name.f * 100 / total.source_screen_name)
e <- data.frame(source_type.f * 100 / total.source_type)

plot_ly(c, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of Source_System_Tab Type for a Female User")

plot_ly(d, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of Source_Screen_Name Type for a Female User")

plot_ly(e, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of App Source Type for a Female User")

# Share of male users among the target == 1 listens, per source ----
# For every category of source_system_tab, source_screen_name and source_type
# this computes
#   (target == 1 rows by users with gender "male") * 100 /
#   (all target == 1 rows)
# The earlier version plotted (total - male) * 100 / total, which is the share
# of listens NOT made by male users, under a "for a Male User" label, and
# stored the male counts in variables suffixed ".f".
listens1 <- subset(listens, target == 1)
listens2 <- subset(listens, target == 1 & gender == "male")

# Row counts per category over all target == 1 rows
total.source_system_tab <- table(listens1$source_system_tab)
total.source_screen_name <- table(listens1$source_screen_name)
total.source_type <- table(listens1$source_type)

# Row counts per category for male users. The factor levels are taken from
# the totals so both tables always have the same categories in the same order,
# even when a category has no male listens (a plain table() would drop it and
# the division below would fail or misalign).
source_system_tab.m <- table(
  factor(listens2$source_system_tab, levels = names(total.source_system_tab))
)
source_screen_name.m <- table(
  factor(listens2$source_screen_name, levels = names(total.source_screen_name))
)
source_type.m <- table(
  factor(listens2$source_type, levels = names(total.source_type))
)

# data.frame() on a one-dimensional table yields the columns Var1 (category)
# and Freq (percentage) used by the plots below.
c <- data.frame(source_system_tab.m * 100 / total.source_system_tab)
d <- data.frame(source_screen_name.m * 100 / total.source_screen_name)
e <- data.frame(source_type.m * 100 / total.source_type)

plot_ly(c, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of Source_System_Tab Type for a Male User")

plot_ly(d, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of Source_Screen_Name Type for a Male User")

plot_ly(e, x = ~Freq, y = ~Var1, type = "bar",
        text = "Probability of App Source Type for a Male User")