library(mongolite)
library(data.table)
library(tidyverse)
library(sparklyr)
library(xgboost)
setwd("~/Data_607_Final_Project/")
members <- fread('members.csv')
extra_song <- fread('song_extra_info.csv')
## Read 2296869 rows and 3 (of 3) columns from 0.169 GB file in 00:00:11
songs = fread('songs.csv')
## Read 2296320 rows and 7 (of 7) columns from 0.207 GB file in 00:00:06
train = fread('train.csv')
## Read 7377418 rows and 6 (of 6) columns from 0.905 GB file in 00:00:15
test <- fread('test.csv')
## Read 2556790 rows and 6 (of 6) columns from 0.324 GB file in 00:00:04
library(ggplot2)
membership.date <- as.Date(as.character(members$registration_init_time), "%Y%m%d")
df <- data.frame(membership.date)
df$day <- weekdays(as.Date(df$membership.date))
x <- df %>%
  group_by(membership.date) %>%
  summarise(n = n()) %>%
  mutate(dow = weekdays(as.Date(membership.date))) %>%
  group_by(dow) %>%
  summarise(xBar = mean(n),
            xSD = sd(n))
ggplot(data = x, aes(x = dow, y = xBar)) +
  geom_bar(stat = 'identity', fill = 'light blue')
y <- df %>%
  group_by(membership.date) %>%
  summarise(n = n()) %>%
  mutate(dow = weekdays(as.Date(membership.date)))
ggplot(data = y, aes(x = dow, y = n)) +
  geom_violin(fill = 'light blue')
ggplot(data = y, aes(x = dow, y = n)) +
  geom_bar(stat = 'identity', fill = 'light blue')
Given our preferred package, xgboost, we need all of the data to be numeric.
members$registration_init_time <- gsub("(\\d{4})(\\d{2})(\\d{2})$",
                                       "\\1-\\2-\\3",
                                       members$registration_init_time) %>%
  as.Date() %>% as.integer()
members$expiration_date <- gsub("(\\d{4})(\\d{2})(\\d{2})$",
                                "\\1-\\2-\\3",
                                members$expiration_date) %>%
  as.Date() %>% as.integer()
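As a quick sanity check on a single hypothetical value, the regex inserts dashes to form an ISO date, which as.Date() then turns into days since 1970-01-01:
gsub("(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", "20170321") %>%
  as.Date() %>% as.integer()
## [1] 17246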
# # Setting up MongoDB
# m <- mongo(url = "mongodb://127.0.0.1:27017" ) # Database ports
#
# jsonlite::stream_out(df, file("skillsDF.json"), verbose = FALSE)
# mt <- mongo("skillsDF")
# mt$import(file("skillsDF.json"))
# mt$find() %>% head(30) %>% kable()
empty_as_na <- function(x){
  if("factor" %in% class(x)) x <- as.character(x) # since ifelse won't work with factors
  ifelse(as.character(x) != "", x, NA)
}
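A toy illustration on hypothetical values:
empty_as_na(c("Jay Chou", "", "Eric Clapton"))
## [1] "Jay Chou"     NA             "Eric Clapton"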
This changes all "" values to NA. Afterwards, the NA values are replaced with the artist name or composer name. This compensates for the many composer values left blank: we hypothesized that if the composer or lyricist was blank, the artist might actually be the composer, and if not, would still be a good proxy, as opposed to deleting the NA values.
songs <- songs %>% mutate_all(empty_as_na)
songs$composer <- ifelse(is.na(songs$composer), songs$artist_name, songs$composer)
songs$lyricist <- ifelse(is.na(songs$lyricist), songs$composer, songs$lyricist)
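The same two fallbacks can also be spelled with dplyr::coalesce(), which returns the first non-NA value at each position; this sketch is equivalent to the ifelse() calls above, not a change in logic.
songs$composer <- coalesce(songs$composer, songs$artist_name)
songs$lyricist <- coalesce(songs$lyricist, songs$composer)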
While it is efficient to store the data in .csv files or separate database tables, the tables need to be joined to perform the actual analysis.
# Train Join
train <- merge(train, members, by = 'msno')
train <- merge(train, songs, by = 'song_id')
# Test Join
test <- merge(test, members, by = 'msno')
test <- merge(test, songs, by = 'song_id')
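Note that merge() with its defaults performs an inner join, so any row whose msno or song_id has no match in the lookup table is dropped. For reference, the train join expressed with dplyr verbs would be:
train <- train %>%
  inner_join(members, by = "msno") %>%
  inner_join(songs, by = "song_id")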
To make sure that no empty strings remain after the joins, we apply the same transformation to the combined tables.
# Transform all columns.
train <- train %>% mutate_all(empty_as_na)
test <- test %>% mutate_all(empty_as_na)
Convert all NA values to -1; this matches the missing = -1 flag we pass to xgb.DMatrix later, so xgboost treats these cells as missing.
train[is.na(train)] <- -1
test[is.na(test)] <- -1
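A toy illustration of the matrix-style replacement on a hypothetical two-column frame:
dt <- data.frame(a = c(1, NA), b = c(NA, 4))
dt[is.na(dt)] <- -1
dt
##    a  b
## 1  1 -1
## 2 -1  4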
This was probably the hardest part: xgboost does not handle factor variables, so we have to substitute each factor level with a unique integer.
# This converts all data to numeric:
# in effect, it treats everything as categorical and then assigns a number to each level.
train <- as.data.table(train)
test <- as.data.table(test)
for (f in names(train)){
  if (class(train[[f]]) == "character"){
    train[is.na(train[[f]]), (f) := ""]
    train[, (f) := as.integer(as.factor(train[[f]]))]
  } else train[is.na(train[[f]]), (f) := -1]
}
for (f in names(test)){
  if (class(test[[f]]) == "character"){
    test[is.na(test[[f]]), (f) := ""]
    test[, (f) := as.integer(as.factor(test[[f]]))]
  } else test[is.na(test[[f]]), (f) := -1]
}
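A toy illustration of the encoding on a hypothetical character column (factor levels sort alphabetically, so jazz = 1, pop = 2, rock = 3):
as.integer(as.factor(c("pop", "rock", "pop", "jazz")))
## [1] 2 3 2 1
One caveat of encoding train and test separately, as above, is that the same string can map to different integers in the two tables; building the factor levels from the combined values would keep the codes consistent.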
trainY <- train$target
trainX <- train
trainX$target <- NULL
testID <- test$id
test <- test[,-'id']
set.seed(101) # For reproducibility.
n <- sample(nrow(trainX), .2*nrow(trainX))
val <- trainX[n, ]
Yval <- trainY[n]
trainX <- trainX[-n, ]
trainY <- trainY[-n]
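A quick check that the split leaves roughly 20% of the rows in the validation set:
length(Yval) / (length(Yval) + length(trainY)) # ~0.2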
# Standardize every feature, using the mean and sd taken from the full train table.
for (f in names(trainX)){
  xbar <- mean(train[[f]])
  xsd <- sd(train[[f]])
  trainX[[f]] <- (trainX[[f]] - xbar)/xsd
  test[[f]] <- (test[[f]] - xbar)/xsd
  val[[f]] <- (val[[f]] - xbar)/xsd
}
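As a sanity check on a hypothetical vector, standardizing with its own mean and sd should give mean 0 and sd 1:
v <- c(10, 20, 30, 40)
z <- (v - mean(v)) / sd(v)
round(c(mean(z), sd(z)), 10)
## [1] 0 1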
param <- list(
  objective = "binary:logistic",
  eval_metric = "auc",
  subsample = 0.95,
  colsample_bytree = 0.45,
  max_depth = 10,
  min_child_weight = 6,
  tree_method = "approx",
  eta = 0.9,
  nthread = 8
)
x_train <- xgb.DMatrix(as.matrix(trainX), label = trainY, missing = -1)
x_val <- xgb.DMatrix(as.matrix(val), label = Yval, missing = -1)
x_test <- xgb.DMatrix(as.matrix(test), missing = -1)
model <- xgb.train(
  data = x_train,
  nrounds = 100,
  params = param,
  maximize = TRUE,
  watchlist = list(val = x_val),
  print_every_n = 50
)
## [18:11:39] Tree method is selected to be 'approx'
## [1] val-auc:0.664067
## [51] val-auc:0.719945
## [100] val-auc:0.724626
pred_3_e <- predict(model, x_val)
pred_3_t <- predict(model, x_test)
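To double-check the watchlist figure, we can recompute the validation AUC from pred_3_e (a minimal sketch, assuming the pROC package is installed) and write out the test predictions, assuming the expected submission format is an id column and a target column:
library(pROC)
auc(roc(Yval, pred_3_e)) # should agree with val-auc above, ~0.72
submission <- data.table(id = testID, target = pred_3_t)
fwrite(submission, 'submission.csv')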
As you can see from our results, we did an OK job of predicting: the validation AUC improves from 0.664 on the first round to roughly 0.725 by round 100.