library(stringr)
suppressMessages(library(dplyr))
library(reshape2)
library(ggplot2)
suppressMessages(library(caret))
library(coefplot)
options(stringsAsFactors = F)
setwd('/Users/Runze/Google Drive/quora_classification_challenge')
# read in the training data (skip the first line of the file and read the next 4500 rows)
train = read.table('input00.txt', sep = ' ', skip = 1, nrows = 4500)
str(train)
## 'data.frame': 4500 obs. of 25 variables:
## $ V1 : chr "Nt8FJ" "VCaTF" "gParY" "DtWDw" ...
## $ V2 : int 1 1 1 1 1 -1 1 -1 1 1 ...
## $ V3 : chr "1:12087620705283" "1:282114466020" "1:173284955" "1:4708728355523" ...
## $ V4 : chr "2:4.797982" "2:3.151926" "2:1.785813" "2:2.394989" ...
## $ V5 : chr "3:1" "3:1" "3:1" "3:1" ...
## $ V6 : chr "4:4.990433" "4:3.737670" "4:1.791759" "4:3.091042" ...
## $ V7 : chr "5:6.492240" "5:3.891820" "5:1.945910" "5:2.890372" ...
## $ V8 : chr "6:80.000000" "6:83.000000" "6:0.000000" "6:0.000000" ...
## $ V9 : chr "7:0.000000" "7:0.000000" "7:0.000000" "7:0.000000" ...
## $ V10: chr "8:0.000000" "8:0.000000" "8:0.000000" "8:0.000000" ...
## $ V11: chr "9:0" "9:1" "9:0" "9:0" ...
## $ V12: chr "10:0" "10:0" "10:0" "10:0" ...
## $ V13: chr "11:3.850148" "11:2.564949" "11:4.262680" "11:4.804021" ...
## $ V14: chr "12:0" "12:0" "12:2" "12:8" ...
## $ V15: chr "13:1" "13:1" "13:1" "13:1" ...
## $ V16: chr "14:0" "14:0" "14:0" "14:0" ...
## $ V17: chr "15:0" "15:0" "15:0" "15:0" ...
## $ V18: chr "16:1" "16:1" "16:1" "16:1" ...
## $ V19: chr "17:1" "17:0" "17:0" "17:1" ...
## $ V20: chr "18:0" "18:0" "18:0" "18:0" ...
## $ V21: chr "19:0" "19:0" "19:0" "19:0" ...
## $ V22: chr "20:14" "20:2" "20:6" "20:5" ...
## $ V23: chr "21:4.653960" "21:2.890372" "21:3.637586" "21:2.772589" ...
## $ V24: chr "22:0.000000" "22:0.000000" "22:0.000000" "22:0.000000" ...
## $ V25: chr "23:0.000000" "23:0.000000" "23:0.000000" "23:0.000000" ...
# remove the answer id (the first column) and rename the others
train = dplyr::select(train, -V1)
names(train)[1] = 'rating'
names(train)[2:ncol(train)] = paste0('v_', seq_len(ncol(train)-1))
train = data.frame(lapply(train, function(x) gsub('[0-9]+:', '', x)))
str(train)
## 'data.frame': 4500 obs. of 24 variables:
## $ rating: chr "1" "1" "1" "1" ...
## $ v_1 : chr "12087620705283" "282114466020" "173284955" "4708728355523" ...
## $ v_2 : chr "4.797982" "3.151926" "1.785813" "2.394989" ...
## $ v_3 : chr "1" "1" "1" "1" ...
## $ v_4 : chr "4.990433" "3.737670" "1.791759" "3.091042" ...
## $ v_5 : chr "6.492240" "3.891820" "1.945910" "2.890372" ...
## $ v_6 : chr "80.000000" "83.000000" "0.000000" "0.000000" ...
## $ v_7 : chr "0.000000" "0.000000" "0.000000" "0.000000" ...
## $ v_8 : chr "0.000000" "0.000000" "0.000000" "0.000000" ...
## $ v_9 : chr "0" "1" "0" "0" ...
## $ v_10 : chr "0" "0" "0" "0" ...
## $ v_11 : chr "3.850148" "2.564949" "4.262680" "4.804021" ...
## $ v_12 : chr "0" "0" "2" "8" ...
## $ v_13 : chr "1" "1" "1" "1" ...
## $ v_14 : chr "0" "0" "0" "0" ...
## $ v_15 : chr "0" "0" "0" "0" ...
## $ v_16 : chr "1" "1" "1" "1" ...
## $ v_17 : chr "1" "0" "0" "1" ...
## $ v_18 : chr "0" "0" "0" "0" ...
## $ v_19 : chr "0" "0" "0" "0" ...
## $ v_20 : chr "14" "2" "6" "5" ...
## $ v_21 : chr "4.653960" "2.890372" "3.637586" "2.772589" ...
## $ v_22 : chr "0.000000" "0.000000" "0.000000" "0.000000" ...
## $ v_23 : chr "0.000000" "0.000000" "0.000000" "0.000000" ...
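# sanity check (a minimal sketch): confirm the 'index:' prefixes were all stripped by the gsub above
stopifnot(!any(vapply(train, function(x) any(grepl(':', x, fixed = TRUE)), logical(1))))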
# select and transform variables
# count the number of distinct values of each variable
sapply(train, function(x) length(table(x)))
## rating v_1 v_2 v_3 v_4 v_5 v_6 v_7 v_8 v_9
## 2 4500 2531 2 415 524 172 161 227 7
## v_10 v_11 v_12 v_13 v_14 v_15 v_16 v_17 v_18 v_19
## 2 336 45 2 2 2 2 2 12 2
## v_20 v_21 v_22 v_23
## 79 232 1 1
# v_1 takes a different value for every observation (an id-like column),
# while v_22 and v_23 are constant across all observations,
# so remove all three
train = dplyr::select(train, -v_1, -v_22, -v_23)
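# the same rule can be applied programmatically as a guard (a sketch): after the removal,
# no remaining column should be constant or take a different value for every row
n_distinct_vals = sapply(train, function(x) length(unique(x)))
stopifnot(all(n_distinct_vals > 1 & n_distinct_vals < nrow(train)))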
# identify factor and numeric variables based on
# the distinct-value counts above and convert them accordingly
fac_vars = c('rating', 'v_3', 'v_9', 'v_10', 'v_13', 'v_14',
             'v_15', 'v_16', 'v_17', 'v_19')
num_vars = names(train)[!(names(train) %in% fac_vars)]
train[, fac_vars] = data.frame(lapply(train[, fac_vars], as.factor))
train[, num_vars] = data.frame(lapply(train[, num_vars], as.numeric))
levels(train$rating) = list('down' = -1, 'up' = 1)
str(train)
## 'data.frame': 4500 obs. of 21 variables:
## $ rating: Factor w/ 2 levels "down","up": 2 2 2 2 2 1 2 1 2 2 ...
## $ v_2 : num 4.8 3.15 1.79 2.39 2 ...
## $ v_3 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ v_4 : num 4.99 3.74 1.79 3.09 4.84 ...
## $ v_5 : num 6.49 3.89 1.95 2.89 4.16 ...
## $ v_6 : num 80 83 0 0 0 0 65 0 0 0 ...
## $ v_7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ v_8 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ v_9 : Factor w/ 7 levels "0","1","2","3",..: 1 2 1 1 1 1 1 1 1 1 ...
## $ v_10 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ v_11 : num 3.85 2.56 4.26 4.8 4.52 ...
## $ v_12 : num 0 0 2 8 10 0 4 1 12 1 ...
## $ v_13 : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 2 2 2 2 ...
## $ v_14 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ v_15 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ v_16 : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 1 1 1 2 ...
## $ v_17 : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 2 1 ...
## $ v_18 : num 0 0 0 0 0 0 0 1 1 0 ...
## $ v_19 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ v_20 : num 14 2 6 5 7 2 0 3 1 3 ...
## $ v_21 : num 4.65 2.89 3.64 2.77 2.4 ...
# split into training and test sets
set.seed(2014)
train_ind = createDataPartition(train$rating, p = .75, list = F)
train_tr = train[train_ind, ]
train_te = train[-train_ind, ]
# check for missing values
table(sapply(train_tr, function(x) sum(is.na(x))))
##
## 0
## 21
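# createDataPartition() samples within each class of the outcome; a quick sketch to
# confirm the class balance is preserved in both splits
round(rbind(train = prop.table(table(train_tr$rating)),
            test  = prop.table(table(train_te$rating))), 3)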
# analyze the training set
# factor variables first: visualize the level distributions,
# then the relationship with the response variable
train_fac = train_tr[, fac_vars] %>%
  melt(id = 'rating') %>%
  group_by(variable, value) %>%
  summarise(freq = n())
## Warning: attributes are not identical across measure variables; they will
## be dropped
train_fac %>% head(10)
## Source: local data frame [10 x 3]
## Groups: variable
##
## variable value freq
## 1 v_3 0 241
## 2 v_3 1 3135
## 3 v_9 0 3057
## 4 v_9 1 263
## 5 v_9 2 42
## 6 v_9 3 8
## 7 v_9 4 5
## 8 v_9 7 1
## 9 v_10 0 3133
## 10 v_10 1 243
ggplot(train_fac, aes(x = value, y = freq)) +
  geom_bar(stat = 'identity', fill = 'light blue') +
  facet_wrap(~ variable, scales = 'free')

# v_19 is not helpful
train_tr = dplyr::select(train_tr, -v_19)
fac_vars = fac_vars[fac_vars != 'v_19']
# v_9's higher levels can be binned together
levels(train_tr$v_9) = c('0', rep('1', 6))
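# quick check (a sketch): the six sparse levels are now collapsed into a single '1' level
table(train_tr$v_9)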
# visualize relationship with the response variable
train_fac = train_tr[, fac_vars] %>%
  melt(id = 'rating') %>%
  group_by(variable, value, rating) %>%
  summarise(freq = n()) %>%
  mutate(success_rate = freq / sum(freq)) %>%
  filter(rating == 'up')
train_fac %>% head(10)
## Source: local data frame [10 x 5]
## Groups: variable, value
##
## variable value rating freq success_rate
## 1 v_3 0 up 77 0.31950207
## 2 v_3 1 up 1611 0.51387560
## 3 v_9 0 up 1603 0.52437030
## 4 v_9 1 up 85 0.26645768
## 5 v_10 0 up 1647 0.52569422
## 6 v_10 1 up 41 0.16872428
## 7 v_13 0 up 153 0.32553191
## 8 v_13 1 up 1535 0.52821748
## 9 v_14 0 up 1685 0.51340646
## 10 v_14 1 up 3 0.03191489
ggplot(train_fac, aes(x = value, y = success_rate)) +
  geom_bar(fill = 'light pink', stat = 'identity') +
  facet_wrap(~ variable, scales = 'free')

# although some variables look very indicative,
# the apparent effects are actually driven by how infrequent certain levels are
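# to make that concrete, a sketch that recomputes the success rates together with the
# level counts, so levels supported by only a handful of observations stand out
train_fac_check = train_tr[, fac_vars] %>%
  melt(id = 'rating') %>%
  group_by(variable, value) %>%
  summarise(n = n(), success_rate = mean(rating == 'up'))
train_fac_check %>% filter(n < 50)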
# analyze numeric variables
# visualize the distributions
train_num = train_tr[, c('rating', num_vars)] %>%
  melt(id = 'rating')
hist = ggplot(train_num, aes(x = value)) +
  geom_histogram(fill = 'light green') +
  facet_wrap(~ variable, scales = 'free')
suppressMessages(print(hist))

# many variables are heavily skewed
# try a Box-Cox transformation
# first add 1 to the numeric variables so every value is strictly positive
# (Box-Cox requires positive data)
train_tr[, num_vars] = data.frame(lapply(train_tr[, num_vars],
                                         function(x) x + 1))
# Box-Cox transformation
bc = preProcess(train_tr[, num_vars], method = 'BoxCox')
## Loading required namespace: e1071
train_tr[, num_vars] = predict(bc, train_tr[, num_vars])
# plot again
train_num_2 = train_tr[, c('rating', num_vars)] %>%
  melt(id = 'rating')
hist_2 = ggplot(train_num_2, aes(x = value)) +
  geom_histogram(fill = 'light green') +
  facet_wrap(~ variable, scales = 'free')
suppressMessages(print(hist_2))

# better but still not good enough
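# a numeric check (sketch): e1071::skewness on the transformed columns quantifies
# how much skew remains (the e1071 namespace was already loaded by preProcess above)
round(sapply(train_tr[, num_vars], e1071::skewness), 2)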
# visualize relationship with the response variable
ggplot(train_num_2, aes(x = rating, y = value)) +
  geom_boxplot(fill = 'light yellow') +
  facet_wrap(~ variable, scales = 'free')

# train models
# first convert factor variables to dummy variables
rating_tr = train_tr$rating
train_tr_m = model.matrix(rating ~ ., data = train_tr)[, -1]
# check for near-zero-variance variables
nz_var = nearZeroVar(train_tr_m)
colnames(train_tr_m[, nz_var])
## [1] "v_6" "v_7" "v_8" "v_141"
train_tr_m = train_tr_m[, -nz_var]
# check for highly correlated variables
high_corr = findCorrelation(cor(train_tr_m)) # none
# final variables (used to subset the test sets)
final_vars = colnames(train_tr_m)
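# optional diagnostics (a sketch): saveMetrics = TRUE shows the frequency ratio and
# percent-unique values behind the near-zero-variance flag for the dropped columns,
# and a quick summary of the pairwise correlations backs up the findCorrelation() result
nearZeroVar(model.matrix(rating ~ ., data = train_tr)[, -1], saveMetrics = TRUE)[nz_var, ]
cor_m = cor(train_tr_m)
summary(abs(cor_m[upper.tri(cor_m)]))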
# 10-fold cv
ctrl = trainControl(method = 'cv', number = 10, classProbs = T,
                    summaryFunction = twoClassSummary)
# logistic regression
set.seed(2014)
logit_m = train(x = train_tr_m, y = rating_tr,
                method = 'glm', family = 'binomial',
                metric = 'ROC', trControl = ctrl)
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
logit_m
## Generalized Linear Model
##
## 3376 samples
## 15 predictor
## 2 classes: 'down', 'up'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ...
##
## Resampling results
##
## ROC Sens Spec ROC SD Sens SD Spec SD
## 0.874399 0.7778036 0.8192977 0.02242158 0.0372472 0.02692868
##
##
coefplot(logit_m)

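# the same coefficients on the odds-ratio scale (a sketch; the underlying glm fit is
# stored in logit_m$finalModel)
round(exp(coef(logit_m$finalModel)), 2)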
# svm
set.seed(2014)
svm_g = expand.grid(sigma = .0638, C = seq(1, 5, 1))
svm_m = train(x = train_tr_m, y = rating_tr,
              method = 'svmRadial', preProcess = c('center', 'scale'),
              metric = 'ROC', trControl = ctrl, tuneGrid = svm_g)
## Loading required package: kernlab
svm_m
## Support Vector Machines with Radial Basis Function Kernel
##
## 3376 samples
## 15 predictor
## 2 classes: 'down', 'up'
##
## Pre-processing: centered, scaled
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ...
##
## Resampling results across tuning parameters:
##
## C ROC Sens Spec ROC SD Sens SD Spec SD
## 1 0.8781984 0.7843019 0.8092103 0.02147495 0.04344180 0.03277575
## 2 0.8790228 0.7866582 0.8133629 0.02146116 0.04198433 0.03440929
## 3 0.8793649 0.7872605 0.8186919 0.02132392 0.04138149 0.03939910
## 4 0.8789324 0.7884580 0.8145428 0.02173274 0.03920484 0.03747870
## 5 0.8778745 0.7920048 0.8145464 0.02179709 0.03940769 0.03207315
##
## Tuning parameter 'sigma' was held constant at a value of 0.0638
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.0638 and C = 3.
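# a sigma value like the one fixed above is usually estimated from the data; caret's
# default grid for svmRadial relies on kernlab::sigest(), which returns the 10th, 50th,
# and 90th percentiles of an estimate of sigma (a sketch)
kernlab::sigest(train_tr_m)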
# rf
set.seed(2014)
rf_m = train(x = train_tr_m, y = rating_tr,
             method = 'rf', ntree = 1000,
             metric = 'ROC', trControl = ctrl)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
rf_m
## Random Forest
##
## 3376 samples
## 15 predictor
## 2 classes: 'down', 'up'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ...
##
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec ROC SD Sens SD Spec SD
## 2 0.8933915 0.8145428 0.8370773 0.0170604 0.03392681 0.01758824
## 8 0.8877196 0.8216646 0.8264194 0.0168596 0.02767132 0.02498314
## 15 0.8853535 0.8187060 0.8204917 0.0174110 0.03003836 0.02639266
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
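# variable importance from the final forest (a sketch; varImp() rescales to 0-100 by default)
plot(varImp(rf_m))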
# gbm
set.seed(2014)
gbm_g = expand.grid(n.trees = seq(100, 300, 50), interaction.depth = 1:3,
                    shrinkage = c(.01, .1))
gbm_m = train(x = train_tr_m, y = rating_tr,
              method = 'gbm', metric = 'ROC',
              trControl = ctrl, tuneGrid = gbm_g, verbose = F)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: parallel
## Loaded gbm 2.1
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, desc, failwith, id, mutate, summarise, summarize
gbm_m
## Stochastic Gradient Boosting
##
## 3376 samples
## 15 predictor
## 2 classes: 'down', 'up'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
##
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ...
##
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees ROC Sens Spec
## 0.01 1 100 0.8456391 0.8163074 0.8145921
## 0.01 1 150 0.8648524 0.8163074 0.8145921
## 0.01 1 200 0.8685021 0.8180896 0.8145921
## 0.01 1 250 0.8740923 0.8180896 0.8157756
## 0.01 1 300 0.8773860 0.8180861 0.8163673
## 0.01 2 100 0.8742952 0.8163074 0.8175578
## 0.01 2 150 0.8770465 0.8151205 0.8181530
## 0.01 2 200 0.8796859 0.8163039 0.8211116
## 0.01 2 250 0.8837060 0.8133418 0.8217033
## 0.01 2 300 0.8866441 0.8151169 0.8222950
## 0.01 3 100 0.8792243 0.8157087 0.8187447
## 0.01 3 150 0.8810543 0.8168991 0.8205199
## 0.01 3 200 0.8839067 0.8163074 0.8222950
## 0.01 3 250 0.8875358 0.8157157 0.8228867
## 0.01 3 300 0.8898244 0.8151240 0.8222915
## 0.10 1 100 0.8921089 0.8157087 0.8270182
## 0.10 1 150 0.8939147 0.8192801 0.8264159
## 0.10 1 200 0.8939814 0.8204600 0.8311531
## 0.10 1 250 0.8955161 0.8210482 0.8335200
## 0.10 1 300 0.8958001 0.8198648 0.8364786
## 0.10 2 100 0.8953815 0.8222281 0.8317484
## 0.10 2 150 0.8974218 0.8228198 0.8359010
## 0.10 2 200 0.8979736 0.8234256 0.8376761
## 0.10 2 250 0.8994132 0.8281699 0.8400465
## 0.10 2 300 0.8995559 0.8263983 0.8441885
## 0.10 3 100 0.8974352 0.8239962 0.8358833
## 0.10 3 150 0.8982634 0.8245950 0.8352952
## 0.10 3 200 0.8981669 0.8263701 0.8388419
## 0.10 3 250 0.8989624 0.8311144 0.8382537
## 0.10 3 300 0.8977685 0.8317132 0.8394372
## ROC SD Sens SD Spec SD
## 0.02305909 0.03881962 0.02633707
## 0.02166967 0.03881962 0.02633707
## 0.02162610 0.03735787 0.02603640
## 0.02030779 0.03735787 0.02625481
## 0.02022877 0.03727873 0.02619309
## 0.01975820 0.03871928 0.02729714
## 0.01950225 0.03931490 0.02759838
## 0.01883351 0.03894598 0.02896173
## 0.01853346 0.03966835 0.02823837
## 0.01889184 0.03874413 0.02762304
## 0.01882633 0.03921156 0.02952992
## 0.01942253 0.03795885 0.02912486
## 0.01842703 0.03739154 0.02872764
## 0.01794237 0.03836131 0.02906182
## 0.01809610 0.03630920 0.02664253
## 0.01936783 0.03891278 0.02377690
## 0.01959909 0.03692184 0.02457565
## 0.01982246 0.03766279 0.02296321
## 0.02028236 0.03661786 0.01974606
## 0.02077241 0.03686670 0.01763419
## 0.01921394 0.03772253 0.02096281
## 0.02016502 0.03838636 0.01786787
## 0.01996026 0.03982450 0.02288798
## 0.02034797 0.03479619 0.02498193
## 0.02009380 0.03196787 0.02113366
## 0.01883021 0.03910850 0.01932625
## 0.01940493 0.03867477 0.02239307
## 0.01991107 0.03671176 0.02523935
## 0.02021803 0.03814465 0.02830819
## 0.02150769 0.03880135 0.02592652
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 300,
## interaction.depth = 2 and shrinkage = 0.1.
resamps = resamples(list(logistic = logit_m, svm = svm_m, random_forest = rf_m, gbm = gbm_m))
parallelplot(resamps)

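# a tabular view of the same comparison (a sketch): summary() prints the resampled
# metric distributions and dotplot() plots them with confidence intervals
summary(resamps)
dotplot(resamps, metric = 'ROC')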
save(logit_m, file = 'logit_m.RData')
save(svm_m, file = 'svm_m.RData')
save(rf_m, file = 'rf_m.RData')
save(gbm_m, file = 'gbm_m.RData')
# apply to the held-out test set (train_te)
# bin v_9's higher levels
levels(train_te$v_9) = c('0', rep('1', 6))
# add 1 to numeric variables
train_te[, num_vars] = data.frame(lapply(train_te[, num_vars],
                                         function(x) x + 1))
# Box-Cox transformation (using the transformation fit on the training split)
train_te[, num_vars] = predict(bc, train_te[, num_vars])
# convert to model matrix
rating_te = train_te$rating
train_te_m = model.matrix(rating ~ ., data = train_te)[, -1]
train_te_m = train_te_m[, final_vars]
# apply only the random forest and gbm models and average their predicted probabilities
rf_f = predict(rf_m, train_te_m, type = 'prob')
gbm_f = predict(gbm_m, train_te_m, type = 'prob')
mean_f = (rf_f$up + gbm_f$up) / 2
roc_te = roc(rating_te, mean_f)
plot(roc_te) # .9017

##
## Call:
## roc.default(response = rating_te, predictor = mean_f)
##
## Data: mean_f in 562 controls (rating_te down) < 562 cases (rating_te up).
## Area under the curve: 0.9017
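# a hard-label view of the same ensemble (a sketch, using a plain 0.5 cutoff)
mean_class = factor(ifelse(mean_f > .5, 'up', 'down'), levels = levels(rating_te))
confusionMatrix(mean_class, rating_te, positive = 'up')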
# apply to the real test set (the remaining rows in the file)
test = read.table('input00.txt', sep = ' ', skip = 4502)
test = dplyr::select(test, -V1)
names(test) = paste0('v_', seq_len(ncol(test)))
test = data.frame(lapply(test, function(x) gsub('[0-9]+:', '', x)))
fac_vars = fac_vars[fac_vars != 'rating']
test = test[, c(fac_vars, num_vars)]
test[, fac_vars] = data.frame(lapply(test[, fac_vars], as.factor))
test[, num_vars] = data.frame(lapply(test[, num_vars], as.numeric))
str(test)
## 'data.frame': 500 obs. of 19 variables:
## $ v_3 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ v_9 : Factor w/ 6 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 1 1 ...
## $ v_10: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ v_13: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 1 2 2 2 ...
## $ v_14: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ v_15: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 2 1 1 1 ...
## $ v_16: Factor w/ 2 levels "0","1": 1 1 1 2 2 2 1 2 1 2 ...
## $ v_17: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ v_2 : num 1.85 0 0 3.68 1.8 ...
## $ v_4 : num 1.1 5.51 4.85 5.02 3.97 ...
## $ v_5 : num 1.95 4.52 4.09 5.31 4.42 ...
## $ v_6 : num 34.3 0 82 70 0 ...
## $ v_7 : num 0 0 0 0 0 ...
## $ v_8 : num 0 0 0 0 0 ...
## $ v_11: num 2.48 3.47 2.56 4.11 3.14 ...
## $ v_12: num 1 1 0 2 0 1 0 0 3 1 ...
## $ v_18: num 1 0 0 0 0 0 0 0 0 0 ...
## $ v_20: num 46 2 0 4 4 20 7 41 5 10 ...
## $ v_21: num 4.91 2.83 2.08 2.77 2.89 ...
# apply models
# bin v_9's higher levels
levels(test$v_9) = c('0', rep('1', 5))
# add 1 to numeric variables
test[, num_vars] = data.frame(lapply(test[, num_vars],
                                     function(x) x + 1))
# Box-Cox transformation (again using the fit from the training split)
test[, num_vars] = predict(bc, test[, num_vars])
# convert to model matrix
test_m = model.matrix(~ ., data = test)[, -1]
test_m = test_m[, final_vars]
# again apply only the random forest and gbm models and average their probabilities
rf_f_test = predict(rf_m, test_m, type = 'prob')
gbm_f_test = predict(gbm_m, test_m, type = 'prob')
mean_f_test = (rf_f_test$up + gbm_f_test$up) / 2
# compare with test outcome
test_outcome = read.table('output00.txt', sep = ' ')
test_rating = as.factor(test_outcome$V2)
levels(test_rating) = list('down' = -1, 'up' = 1)
# roc
roc_test = roc(test_rating, mean_f_test)
plot(roc_test) # .8972

##
## Call:
## roc.default(response = test_rating, predictor = mean_f_test)
##
## Data: mean_f_test in 250 controls (test_rating down) < 250 cases (test_rating up).
## Area under the curve: 0.8972
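# if the challenge expects the original -1/1 labels back, the averaged probabilities can
# be mapped with the same 0.5 cutoff (a sketch; treating V1 in output00.txt as the answer
# id is an assumption)
pred_labels = ifelse(mean_f_test > .5, 1, -1)
head(data.frame(id = test_outcome$V1, pred = pred_labels))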