# Load library
library(ggplot2)
library(plyr)
## Warning: package 'plyr' was built under R version 3.4.2
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.2
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tibble)
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load data sets ----
# NOTE(review): setwd() ties the script to one machine's directory layout. It is
# kept because later code may rely on the working directory being set here.
setwd('D:/WSDM_task2/')

# Each CSV is parsed with data.table::fread() and converted to a tibble.
# as_tibble() replaces as.tibble(), which is a deprecated alias of it.
# The fread progress output recorded below is commented out so that the file
# parses as R.
train <- as_tibble(fread('train.csv'))
##
## Read 0.0% of 7377418 rows
## Read 15.6% of 7377418 rows
## Read 29.4% of 7377418 rows
## Read 44.1% of 7377418 rows
## Read 57.2% of 7377418 rows
## Read 69.1% of 7377418 rows
## Read 84.7% of 7377418 rows
## Read 7377418 rows and 6 (of 6) columns from 0.905 GB file in 00:00:14
songs <- as_tibble(fread('songs.csv'))
##
## Read 0.0% of 2296833 rows
## Read 25.7% of 2296833 rows
## Read 33.1% of 2296833 rows
## Read 48.3% of 2296833 rows
## Read 64.0% of 2296833 rows
## Read 66.2% of 2296833 rows
## Read 89.3% of 2296833 rows
## Read 2296320 rows and 7 (of 7) columns from 0.207 GB file in 00:00:13
members <- as_tibble(fread('members.csv'))
test <- as_tibble(fread('test.csv'))
##
## Read 0.0% of 2556790 rows
## Read 41.8% of 2556790 rows
## Read 84.9% of 2556790 rows
## Read 2556790 rows and 6 (of 6) columns from 0.324 GB file in 00:00:05
# Basic counts on the raw data sets ----
# Each figure is the number of distinct values in one id column.

# Distinct song ids in the training table.
print("Unique songs in train set:")
## [1] "Unique songs in train set:"
length(unique(train$song_id))
## [1] 359966

# Distinct user ids (msno) in the training table.
print("Unique users in train set:")
## [1] "Unique users in train set:"
length(unique(train$msno))
## [1] 30755

# Distinct song ids in the test table.
print("Unique songs in test set:")
## [1] "Unique songs in test set:"
length(unique(test$song_id))
## [1] 224753
# Attach song attributes to the train and test rows ----
# Natural inner joins: the key is whatever column(s) the two tables share,
# which the recorded messages below report as "song_id".
train_songs <- train %>% inner_join(songs)
## Joining, by = "song_id"
test_songs <- test %>% inner_join(songs)
## Joining, by = "song_id"

# Distinct artists / languages / genre strings among the joined train rows.
print("Unique artists in train set:")
## [1] "Unique artists in train set:"
length(unique(train_songs$artist_name))
## [1] 40582
print("Unique languages in train set:")
## [1] "Unique languages in train set:"
length(unique(train_songs$language))
## [1] 11
print("Unique genres in train set:")
## [1] "Unique genres in train set:"
length(unique(train_songs$genre_ids))
## [1] 573

# The same three counts for the joined test rows.
print("Unique artists in test set:")
## [1] "Unique artists in test set:"
length(unique(test_songs$artist_name))
## [1] 27563
print("Unique languages in test set:")
## [1] "Unique languages in test set:"
length(unique(test_songs$language))
## [1] 11
print("Unique genres in test set:")
## [1] "Unique genres in test set:"
length(unique(test_songs$genre_ids))
## [1] 502
# Test-set values that never occur in the training data ----
# Every count is the size of a set difference: (test values) minus (train
# values). setdiff() already de-duplicates its result.
#
# Songs and users are compared on the raw train/test tables. The earlier formula
# subtracted an intersection taken from the song-joined tables from a unique
# count taken from the raw test table, so ids that the join with `songs` had
# dropped were reported as unseen whether or not they occur in train.
print("Numbers of unique songs in test set is excluded in train set:")
## [1] "Numbers of unique songs in test set is excluded in train set:"
length(setdiff(test$song_id, train$song_id))
## [1] 59881
## (recorded with the earlier formula; re-run to refresh)
print("Numbers of unique users in test set is excluded in train set:")
## [1] "Numbers of unique users in test set is excluded in train set:"
length(setdiff(test$msno, train$msno))
## [1] 3648
## (recorded with the earlier formula; re-run to refresh)

# Song attributes only exist on the joined tables, so both sides of these
# comparisons come from the joined data (same result as the earlier formula).
print("Numbers of unique artists in test set is excluded in train set:")
## [1] "Numbers of unique artists in test set is excluded in train set:"
length(setdiff(test_songs$artist_name, train_songs$artist_name))
## [1] 5790
print("Numbers of unique languages in test set is excluded in train set:")
## [1] "Numbers of unique languages in test set is excluded in train set:"
length(setdiff(test_songs$language, train_songs$language))
## [1] 0
print("Numbers of unique genres in test set is excluded in train set:")
## [1] "Numbers of unique genres in test set is excluded in train set:"
length(setdiff(test_songs$genre_ids, train_songs$genre_ids))
## [1] 36
# Build the combined listening table ----
# Training rows are inner-joined first with the member table and then with the
# song table; both joins pick their key from the shared column names (see the
# recorded messages below).
listens <- train %>%
  inner_join(members) %>%
  inner_join(songs)
## Joining, by = "msno"
## Joining, by = "song_id"
print("table joined successfully")
## [1] "table joined successfully"
# Source frequencies among rows with target == 1 ----
# Keep only the rows whose target is 1.
listens1 <- subset(listens, target == 1)

# Count of target-1 rows per source_system_tab value (columns Var1 / Freq).
# The caption is set as the chart title: passed as `text`, a single string is
# attached to every bar rather than labelling the chart.
source_system_tab1 <- data.frame(table(listens1$source_system_tab))
plot_ly(source_system_tab1, x = ~Var1, y = ~Freq, type = "bar") %>%
  plotly::layout(title = "Source_System_Tab Frequency - Target 1")

# Count of target-1 rows per source_type value.
source_type1 <- data.frame(table(listens1$source_type))
plot_ly(source_type1, x = ~Var1, y = ~Freq, type = "bar") %>%
  plotly::layout(title = "Source_Type Frequency - Target 1")
# Percentage of rows with target == 0 for each source value ----
# Variable names are kept as-is because later code may read them. Note that
# `c` shadows base::c as a variable; calls to c() still resolve to the function.

# Row counts per category over all joined listens.
total.source_system_tab <- table(listens$source_system_tab)
total.source_screen_name <- table(listens$source_screen_name)
total.source_type <- table(listens$source_type)

# Target-1 counts, tabulated over the categories of the matching total. A
# category that never occurs with target == 1 is then counted as 0; two tables
# built independently can differ in length and cannot be subtracted.
source_system_tab1.b <- table(factor(listens1$source_system_tab,
                                     levels = names(total.source_system_tab)))
source_screen_name1.b <- table(factor(listens1$source_screen_name,
                                      levels = names(total.source_screen_name)))
source_type1.b <- table(factor(listens1$source_type,
                               levels = names(total.source_type)))

# (total - target 1) * 100 / total = percentage of target-0 rows per category.
c <- data.frame((total.source_system_tab - source_system_tab1.b) * 100 /
                  total.source_system_tab)
d <- data.frame((total.source_screen_name - source_screen_name1.b) * 100 /
                  total.source_screen_name)
e <- data.frame((total.source_type - source_type1.b) * 100 /
                  total.source_type)

# Horizontal bars: percentage on x, category on y. The caption is set as the
# chart title; passed as `text`, it would be attached to every bar instead.
plot_ly(c, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(title = "Probability of Source_System_Tab Type")
plot_ly(d, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(title = "Probability of Source_Screen_Name Type")
plot_ly(e, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(title = "Probability of App Source Type")
# Source usage by gender among rows with target == 1 ----
# Female users: percentage of the target-1 rows of each source value that come
# from users whose gender is "female".
# Variable names are kept as-is because later code may read them.
listens1 <- subset(listens, target == 1)
listens2 <- subset(listens, target == 1 & gender == "female")

# Target-1 row counts per category (the denominators).
total.source_system_tab <- table(listens1$source_system_tab)
total.source_screen_name <- table(listens1$source_screen_name)
total.source_type <- table(listens1$source_type)

# Female counts, tabulated over the categories of the matching total so that a
# category with no female rows is counted as 0 and both tables always line up.
source_system_tab.f <- table(factor(listens2$source_system_tab,
                                    levels = names(total.source_system_tab)))
source_screen_name.f <- table(factor(listens2$source_screen_name,
                                     levels = names(total.source_screen_name)))
source_type.f <- table(factor(listens2$source_type,
                              levels = names(total.source_type)))

# female * 100 / total. The earlier formula, (total - female) * 100 / total,
# plotted the share of rows NOT from female users under a "Female User" caption.
c <- data.frame(source_system_tab.f * 100 / total.source_system_tab)
d <- data.frame(source_screen_name.f * 100 / total.source_screen_name)
e <- data.frame(source_type.f * 100 / total.source_type)

# The caption is set as the chart title; passed as `text`, it would be attached
# to every bar instead.
plot_ly(c, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(
    title = "Probability of Source_System_Tab Type for a Female User"
  )
plot_ly(d, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(
    title = "Probability of Source_Screen_Name Type for a Female User"
  )
plot_ly(e, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(title = "Probability of App Source Type for a Female User")
# Male users: percentage of the target-1 rows of each source value that come
# from users whose gender is "male".
# Variable names (including the `.f` suffix) are kept as-is because later code
# may read them.
listens1 <- subset(listens, target == 1)
listens2 <- subset(listens, target == 1 & gender == "male")

# Target-1 row counts per category (the denominators).
total.source_system_tab <- table(listens1$source_system_tab)
total.source_screen_name <- table(listens1$source_screen_name)
total.source_type <- table(listens1$source_type)

# Male counts, tabulated over the categories of the matching total so that a
# category with no male rows is counted as 0 and both tables always line up.
source_system_tab.f <- table(factor(listens2$source_system_tab,
                                    levels = names(total.source_system_tab)))
source_screen_name.f <- table(factor(listens2$source_screen_name,
                                     levels = names(total.source_screen_name)))
source_type.f <- table(factor(listens2$source_type,
                              levels = names(total.source_type)))

# male * 100 / total. The earlier formula, (total - male) * 100 / total,
# plotted the share of rows NOT from male users under a "Male User" caption.
c <- data.frame(source_system_tab.f * 100 / total.source_system_tab)
d <- data.frame(source_screen_name.f * 100 / total.source_screen_name)
e <- data.frame(source_type.f * 100 / total.source_type)

# The caption is set as the chart title; passed as `text`, it would be attached
# to every bar instead.
plot_ly(c, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(
    title = "Probability of Source_System_Tab Type for a Male User"
  )
plot_ly(d, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(
    title = "Probability of Source_Screen_Name Type for a Male User"
  )
plot_ly(e, x = ~Freq, y = ~Var1, type = "bar") %>%
  plotly::layout(title = "Probability of App Source Type for a Male User")