library(mongolite)
library(data.table)
library(tidyverse)
library(sparklyr)
library(xgboost)
setwd("~/Data_607_Final_Project/")
members <- fread('members.csv')
extra_song <- fread('song_extra_info.csv')
## Read 2296869 rows and 3 (of 3) columns from 0.169 GB file in 00:00:11
songs = fread('songs.csv')
## Read 2296320 rows and 7 (of 7) columns from 0.207 GB file in 00:00:06
train = fread('train.csv')
## Read 7377418 rows and 6 (of 6) columns from 0.905 GB file in 00:00:15
test <- fread('test.csv')
## Read 2556790 rows and 6 (of 6) columns from 0.324 GB file in 00:00:04
library(ggplot2)
membership.date <- as.Date(as.character(members$registration_init_time), "%Y%m%d")
df <- data.frame(membership.date)
df$day <- weekdays(as.Date(df$membership.date))
x <- df %>%
  group_by(membership.date) %>%
  summarise(n = n()) %>%
  mutate(dow = weekdays(as.Date(membership.date))) %>%
  group_by(dow) %>%
  summarise(xBar = mean(n),
            xSD = sd(n))
ggplot(data = x, aes(x = dow, y = xBar)) +
  geom_bar(stat = 'identity', fill = 'light blue')
y <- df %>%
  group_by(membership.date) %>%
  summarise(n = n()) %>%
  mutate(dow = weekdays(as.Date(membership.date)))
ggplot(data = y, aes(x = dow, y = n)) +
  geom_violin(fill = 'light blue')
ggplot(data = y, aes(x = dow, y = n)) +
  geom_bar(stat = 'identity', fill = 'light blue')
Given our preferred package, xgboost, we need all of the data to be numeric.
members$registration_init_time <- gsub("(\\d{4})(\\d{2})(\\d{2})$",
                                       "\\1-\\2-\\3",
                                       members$registration_init_time) %>%
  as.Date() %>% as.integer()
members$expiration_date <- gsub("(\\d{4})(\\d{2})(\\d{2})$",
                                "\\1-\\2-\\3",
                                members$expiration_date) %>%
  as.Date() %>% as.integer()
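As a quick sanity check on a single hypothetical value, the regex inserts dashes to form an ISO date, which as.Date() then turns into days since 1970-01-01:
gsub("(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", "20170321") %>%
  as.Date() %>% as.integer()
## [1] 17246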
# # Setting up MongoDB
# m <- mongo(url = "mongodb://127.0.0.1:27017" ) # Database ports
#
# jsonlite::stream_out(df, file("skillsDF.json"), verbose = FALSE)
# mt <- mongo("skillsDF")
# mt$import(file("skillsDF.json"))
# mt$find() %>% head(30) %>% kable()
empty_as_na <- function(x){
  if("factor" %in% class(x)) x <- as.character(x) # since ifelse won't work with factors
  ifelse(as.character(x) != "", x, NA)
}
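A toy illustration on hypothetical values:
empty_as_na(c("Jay Chou", "", "Eric Clapton"))
## [1] "Jay Chou"     NA             "Eric Clapton"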
This changes all "" values to NA. Afterwards, the NA values are replaced with the artist name or composer name. This compensates for the many composer values left blank: we hypothesized that if the composer or lyricist was blank, the artist might actually be the composer, and if not, would still be a good proxy, as opposed to deleting the NA values.
songs <- songs %>% mutate_all(empty_as_na)
songs$composer <- ifelse(is.na(songs$composer), songs$artist_name, songs$composer)
songs$lyricist <- ifelse(is.na(songs$lyricist), songs$composer, songs$lyricist)
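The same two fallbacks can also be spelled with dplyr::coalesce(), which returns the first non-NA value at each position; this sketch is equivalent to the ifelse() calls above, not a change in logic.
songs$composer <- coalesce(songs$composer, songs$artist_name)
songs$lyricist <- coalesce(songs$lyricist, songs$composer)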
While it is efficient to store the data in .csv files or separate database tables, the tables need to be joined to perform the actual analysis.
# Train Join
train <- merge(train, members, by = 'msno')
train <- merge(train, songs, by = 'song_id')
# Test Join
test <- merge(test, members, by = 'msno')
test <- merge(test, songs, by = 'song_id')
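Note that merge() with its defaults performs an inner join, so any row whose msno or song_id has no match in the lookup table is dropped. For reference, the train join expressed with dplyr verbs would be:
train <- train %>%
  inner_join(members, by = "msno") %>%
  inner_join(songs, by = "song_id")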
To make sure that no empty strings remain after the joins, we apply the same transformation to the combined tables.
# Transform all columns.
train <- train %>% mutate_all(empty_as_na)
test <- test %>% mutate_all(empty_as_na)
Convert all NA values to -1; this matches the missing = -1 flag we pass to xgb.DMatrix later, so xgboost treats these cells as missing.
train[is.na(train)] <- -1
test[is.na(test)] <- -1
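A toy illustration of the matrix-style replacement on a hypothetical two-column frame:
dt <- data.frame(a = c(1, NA), b = c(NA, 4))
dt[is.na(dt)] <- -1
dt
##    a  b
## 1  1 -1
## 2 -1  4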
This was probably the hardest part: xgboost does not handle factor variables, so we have to substitute each factor level with a unique integer.
# This converts all data to numeric:
# in effect, it treats everything as categorical and then assigns a number to each level.
train <- as.data.table(train)
test <- as.data.table(test)
for (f in names(train)){
  if (class(train[[f]]) == "character"){
    train[is.na(train[[f]]), (f) := ""]
    train[, (f) := as.integer(as.factor(train[[f]]))]
  } else train[is.na(train[[f]]), (f) := -1]
}
for (f in names(test)){
  if (class(test[[f]]) == "character"){
    test[is.na(test[[f]]), (f) := ""]
    test[, (f) := as.integer(as.factor(test[[f]]))]
  } else test[is.na(test[[f]]), (f) := -1]
}
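A toy illustration of the encoding on a hypothetical character column (factor levels sort alphabetically, so jazz = 1, pop = 2, rock = 3):
as.integer(as.factor(c("pop", "rock", "pop", "jazz")))
## [1] 2 3 2 1
One caveat of encoding train and test separately, as above, is that the same string can map to different integers in the two tables; building the factor levels from the combined values would keep the codes consistent.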
trainY <- train$target
trainX <- train
trainX$target <- NULL
testID <- test$id
test <- test[,-'id']
set.seed(101) # For reproducibility.
n <- sample(nrow(trainX), .2*nrow(trainX))
val <- trainX[n, ]
Yval <- trainY[n]
trainX <- trainX[-n, ]
trainY <- trainY[-n]
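A quick check that the split leaves roughly 20% of the rows in the validation set:
length(Yval) / (length(Yval) + length(trainY)) # ~0.2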
# Standardize every feature, using the mean and sd taken from the full train table.
for (f in names(trainX)){
  xbar <- mean(train[[f]])
  xsd <- sd(train[[f]])
  trainX[[f]] <- (trainX[[f]] - xbar)/xsd
  test[[f]] <- (test[[f]] - xbar)/xsd
  val[[f]] <- (val[[f]] - xbar)/xsd
}
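As a sanity check on a hypothetical vector, standardizing with its own mean and sd should give mean 0 and sd 1:
v <- c(10, 20, 30, 40)
z <- (v - mean(v)) / sd(v)
round(c(mean(z), sd(z)), 10)
## [1] 0 1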
param <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  subsample = 0.95,
  colsample_bytree = 0.45,
  max_depth = 10,
  min_child_weight = 6,
  tree_method = "approx",
  eta = 0.9,
  nthread = 8
)
x_train <- xgb.DMatrix(as.matrix(trainX), label = trainY, missing = -1)
x_val <- xgb.DMatrix(as.matrix(val), label = Yval, missing = -1)
x_test <- xgb.DMatrix(as.matrix(test), missing = -1)
model <- xgb.train(
  data = x_train,
  nrounds = 100,
  params = param,
  maximize = TRUE,
  watchlist = list(val = x_val),
  print_every_n = 50
)
## [18:11:39] Tree method is selected to be 'approx'
## [1] val-auc:0.664067
## [51] val-auc:0.719945
## [100] val-auc:0.724626
pred_3_e <- predict(model, x_val)
pred_3_t <- predict(model, x_test)
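To double-check the watchlist figure, we can recompute the validation AUC from pred_3_e (a minimal sketch, assuming the pROC package is installed) and write out the test predictions, assuming the expected submission format is an id column and a target column:
library(pROC)
auc(roc(Yval, pred_3_e)) # should agree with val-auc above, ~0.72
submission <- data.table(id = testID, target = pred_3_t)
fwrite(submission, 'submission.csv')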
As you can see from our results, we did an OK job of predicting: the validation AUC improves from 0.664 on the first round to roughly 0.725 by round 100.