library(stringr)
suppressMessages(library(dplyr))
library(reshape2)
library(ggplot2)
suppressMessages(library(caret))
library(coefplot)
options(stringsAsFactors = F)
setwd('/Users/Runze/Google Drive/quora_classification_challenge')

# read in training data (the first 4500 rows)
train = read.table('input00.txt', sep = ' ', skip = 1, nrows = 4500)
str(train)
## 'data.frame':    4500 obs. of  25 variables:
##  $ V1 : chr  "Nt8FJ" "VCaTF" "gParY" "DtWDw" ...
##  $ V2 : int  1 1 1 1 1 -1 1 -1 1 1 ...
##  $ V3 : chr  "1:12087620705283" "1:282114466020" "1:173284955" "1:4708728355523" ...
##  $ V4 : chr  "2:4.797982" "2:3.151926" "2:1.785813" "2:2.394989" ...
##  $ V5 : chr  "3:1" "3:1" "3:1" "3:1" ...
##  $ V6 : chr  "4:4.990433" "4:3.737670" "4:1.791759" "4:3.091042" ...
##  $ V7 : chr  "5:6.492240" "5:3.891820" "5:1.945910" "5:2.890372" ...
##  $ V8 : chr  "6:80.000000" "6:83.000000" "6:0.000000" "6:0.000000" ...
##  $ V9 : chr  "7:0.000000" "7:0.000000" "7:0.000000" "7:0.000000" ...
##  $ V10: chr  "8:0.000000" "8:0.000000" "8:0.000000" "8:0.000000" ...
##  $ V11: chr  "9:0" "9:1" "9:0" "9:0" ...
##  $ V12: chr  "10:0" "10:0" "10:0" "10:0" ...
##  $ V13: chr  "11:3.850148" "11:2.564949" "11:4.262680" "11:4.804021" ...
##  $ V14: chr  "12:0" "12:0" "12:2" "12:8" ...
##  $ V15: chr  "13:1" "13:1" "13:1" "13:1" ...
##  $ V16: chr  "14:0" "14:0" "14:0" "14:0" ...
##  $ V17: chr  "15:0" "15:0" "15:0" "15:0" ...
##  $ V18: chr  "16:1" "16:1" "16:1" "16:1" ...
##  $ V19: chr  "17:1" "17:0" "17:0" "17:1" ...
##  $ V20: chr  "18:0" "18:0" "18:0" "18:0" ...
##  $ V21: chr  "19:0" "19:0" "19:0" "19:0" ...
##  $ V22: chr  "20:14" "20:2" "20:6" "20:5" ...
##  $ V23: chr  "21:4.653960" "21:2.890372" "21:3.637586" "21:2.772589" ...
##  $ V24: chr  "22:0.000000" "22:0.000000" "22:0.000000" "22:0.000000" ...
##  $ V25: chr  "23:0.000000" "23:0.000000" "23:0.000000" "23:0.000000" ...
# remove answer id (the first column) and rename others
train = dplyr::select(train, -V1)
names(train)[1] = 'rating'
names(train)[2:ncol(train)] = paste0('v_', seq_len(ncol(train)-1))
train = data.frame(lapply(train, function(x) gsub('[0-9]+:', '', x)))
str(train)
## 'data.frame':    4500 obs. of  24 variables:
##  $ rating: chr  "1" "1" "1" "1" ...
##  $ v_1   : chr  "12087620705283" "282114466020" "173284955" "4708728355523" ...
##  $ v_2   : chr  "4.797982" "3.151926" "1.785813" "2.394989" ...
##  $ v_3   : chr  "1" "1" "1" "1" ...
##  $ v_4   : chr  "4.990433" "3.737670" "1.791759" "3.091042" ...
##  $ v_5   : chr  "6.492240" "3.891820" "1.945910" "2.890372" ...
##  $ v_6   : chr  "80.000000" "83.000000" "0.000000" "0.000000" ...
##  $ v_7   : chr  "0.000000" "0.000000" "0.000000" "0.000000" ...
##  $ v_8   : chr  "0.000000" "0.000000" "0.000000" "0.000000" ...
##  $ v_9   : chr  "0" "1" "0" "0" ...
##  $ v_10  : chr  "0" "0" "0" "0" ...
##  $ v_11  : chr  "3.850148" "2.564949" "4.262680" "4.804021" ...
##  $ v_12  : chr  "0" "0" "2" "8" ...
##  $ v_13  : chr  "1" "1" "1" "1" ...
##  $ v_14  : chr  "0" "0" "0" "0" ...
##  $ v_15  : chr  "0" "0" "0" "0" ...
##  $ v_16  : chr  "1" "1" "1" "1" ...
##  $ v_17  : chr  "1" "0" "0" "1" ...
##  $ v_18  : chr  "0" "0" "0" "0" ...
##  $ v_19  : chr  "0" "0" "0" "0" ...
##  $ v_20  : chr  "14" "2" "6" "5" ...
##  $ v_21  : chr  "4.653960" "2.890372" "3.637586" "2.772589" ...
##  $ v_22  : chr  "0.000000" "0.000000" "0.000000" "0.000000" ...
##  $ v_23  : chr  "0.000000" "0.000000" "0.000000" "0.000000" ...
# select and transform variables
# check value frequencies for each variable
sapply(train, function(x) length(table(x)))
## rating    v_1    v_2    v_3    v_4    v_5    v_6    v_7    v_8    v_9 
##      2   4500   2531      2    415    524    172    161    227      7 
##   v_10   v_11   v_12   v_13   v_14   v_15   v_16   v_17   v_18   v_19 
##      2    336     45      2      2      2      2      2     12      2 
##   v_20   v_21   v_22   v_23 
##     79    232      1      1
# v_1 has a different value for each observation and
# v_22 and v_23 have the same value for all observations
# hence, remove all 3 of them
train = dplyr::select(train, -v_1, -v_22, -v_23)

# identify and convert factor and numerical variables
# based on the value frequencies calculated above
fac_vars = c('rating', 'v_3', 'v_9', 'v_10', 'v_13', 'v_14', 
             'v_15', 'v_16', 'v_17', 'v_19')
num_vars = names(train)[!(names(train) %in% fac_vars)]

train[, fac_vars] = data.frame(lapply(train[, fac_vars], as.factor))
train[, num_vars] = data.frame(lapply(train[, num_vars], as.numeric))
levels(train$rating) = list('down' = -1, 'up' = 1)
str(train)
## 'data.frame':    4500 obs. of  21 variables:
##  $ rating: Factor w/ 2 levels "down","up": 2 2 2 2 2 1 2 1 2 2 ...
##  $ v_2   : num  4.8 3.15 1.79 2.39 2 ...
##  $ v_3   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ v_4   : num  4.99 3.74 1.79 3.09 4.84 ...
##  $ v_5   : num  6.49 3.89 1.95 2.89 4.16 ...
##  $ v_6   : num  80 83 0 0 0 0 65 0 0 0 ...
##  $ v_7   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ v_8   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ v_9   : Factor w/ 7 levels "0","1","2","3",..: 1 2 1 1 1 1 1 1 1 1 ...
##  $ v_10  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ v_11  : num  3.85 2.56 4.26 4.8 4.52 ...
##  $ v_12  : num  0 0 2 8 10 0 4 1 12 1 ...
##  $ v_13  : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 2 2 2 2 ...
##  $ v_14  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ v_15  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ v_16  : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 1 1 1 2 ...
##  $ v_17  : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 2 1 ...
##  $ v_18  : num  0 0 0 0 0 0 0 1 1 0 ...
##  $ v_19  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ v_20  : num  14 2 6 5 7 2 0 3 1 3 ...
##  $ v_21  : num  4.65 2.89 3.64 2.77 2.4 ...
# split into training and test sets
set.seed(2014)
train_ind = createDataPartition(train$rating, p = .75, list = F)
train_tr = train[train_ind, ]
train_te = train[-train_ind, ]
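# createDataPartition stratifies on the outcome, so the class balance should
# be roughly preserved in both splits; a quick sanity check:
prop.table(table(train_tr$rating))
prop.table(table(train_te$rating))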

# check for missing values
table(sapply(train_tr, function(x) sum(is.na(x))))
## 
##  0 
## 21
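# no missing values here; had there been any, caret's preProcess could impute
# them (a sketch only, assuming median imputation is acceptable for this data)
if (any(is.na(train_tr))) {
  imp = preProcess(train_tr[, num_vars], method = 'medianImpute')
  train_tr[, num_vars] = predict(imp, train_tr[, num_vars])
}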
# analyze the training set
# visualize relationships with the response variable

# analyze factor variables
# visualize level distributions
train_fac = train_tr[, fac_vars] %>%
  melt(id = 'rating') %>%
  group_by(variable, value) %>%
  summarise(freq = n())
## Warning: attributes are not identical across measure variables; they will
## be dropped
train_fac %>% head(10)
## Source: local data frame [10 x 3]
## Groups: variable
## 
##    variable value freq
## 1       v_3     0  241
## 2       v_3     1 3135
## 3       v_9     0 3057
## 4       v_9     1  263
## 5       v_9     2   42
## 6       v_9     3    8
## 7       v_9     4    5
## 8       v_9     7    1
## 9      v_10     0 3133
## 10     v_10     1  243
ggplot(train_fac, aes(x = value, y = freq)) +
  geom_bar(stat = 'identity', fill = 'light blue') +
  facet_wrap(~ variable, scales = 'free')

# based on the level distributions above, v_19 is not helpful, so drop it
train_tr = dplyr::select(train_tr, -v_19)
fac_vars = fac_vars[fac_vars != 'v_19']

# v_9's higher levels are rare, so bin them together
levels(train_tr$v_9) = c('0', rep('1', 6))

# visualize relationship with the response variable
train_fac = train_tr[, fac_vars] %>%
  melt(id = 'rating') %>%
  group_by(variable, value, rating) %>%
  summarise(freq = n()) %>%
  mutate(success_rate  = freq / sum(freq)) %>%
  filter(rating == 'up')
train_fac %>% head(10)
## Source: local data frame [10 x 5]
## Groups: variable, value
## 
##    variable value rating freq success_rate
## 1       v_3     0     up   77   0.31950207
## 2       v_3     1     up 1611   0.51387560
## 3       v_9     0     up 1603   0.52437030
## 4       v_9     1     up   85   0.26645768
## 5      v_10     0     up 1647   0.52569422
## 6      v_10     1     up   41   0.16872428
## 7      v_13     0     up  153   0.32553191
## 8      v_13     1     up 1535   0.52821748
## 9      v_14     0     up 1685   0.51340646
## 10     v_14     1     up    3   0.03191489
ggplot(train_fac, aes(x = value, y = success_rate)) +
  geom_bar(fill = 'light pink', stat = 'identity') + 
  facet_wrap(~ variable, scales = 'free')

# although some variables look very indicative,
# their apparent effects are actually due to the infrequency of certain levels
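# a quick way to confirm this is to line the success rates up against the raw
# level counts (a sketch; this check is not part of the original analysis)
level_counts = train_tr[, fac_vars] %>%
  melt(id = 'rating') %>%
  group_by(variable, value) %>%
  summarise(n_obs = n())
left_join(train_fac, level_counts, by = c('variable', 'value'))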

# analyze numeric variables
# visualize distributions
train_num = train_tr[, c('rating', num_vars)] %>%
  melt(id = 'rating')

hist = ggplot(train_num, aes(x = value)) + 
  geom_histogram(fill = 'light green') +
  facet_wrap(~ variable, scales = 'free')
suppressMessages(print(hist))

# many variables are heavily skewed
# try a Box-Cox transformation
# first add 1 to the variables since Box-Cox requires strictly positive values
train_tr[, num_vars] = data.frame(lapply(train_tr[, num_vars],
                                         function(x) x + 1))

# Box-Cox transformation
bc = preProcess(train_tr[, num_vars], method = 'BoxCox')
## Loading required namespace: e1071
train_tr[, num_vars] = predict(bc, train_tr[, num_vars])
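# printing the preProcess object shows how many variables were actually
# Box-Cox transformed (a quick check, not part of the original write-up)
bc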

# plot again
train_num_2 = train_tr[, c('rating', num_vars)] %>%
  melt(id = 'rating')

hist_2 = ggplot(train_num_2, aes(x = value)) + 
  geom_histogram(fill = 'light green') +
  facet_wrap(~ variable, scales = 'free')
suppressMessages(print(hist_2))

# better but still not good enough
# visualize relationship with the response variable
ggplot(train_num_2, aes(x = rating, y = value)) +
  geom_boxplot(fill = 'light yellow') +
  facet_wrap(~ variable, scales = 'free')

# train models
# first convert factor variables to dummy variables
rating_tr = train_tr$rating
train_tr_m = model.matrix(rating ~ ., data = train_tr)[, -1]

# check for near-zero-variance variables
nz_var = nearZeroVar(train_tr_m)
colnames(train_tr_m[, nz_var])
## [1] "v_6"   "v_7"   "v_8"   "v_141"
train_tr_m = train_tr_m[, -nz_var]

# check for highly correlated variables
high_corr = findCorrelation(cor(train_tr_m)) # none
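# nothing to drop here; had findCorrelation flagged any columns, they could be
# removed the same way as the near-zero-variance ones (a sketch)
if (length(high_corr) > 0) {
  train_tr_m = train_tr_m[, -high_corr]
}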

# final variables (used to subset the test sets)
final_vars = colnames(train_tr_m)

# 10-fold cv
ctrl = trainControl(method = 'cv', number = 10, classProbs = T,
                    summaryFunction = twoClassSummary)

# logistic regression
set.seed(2014)
logit_m = train(x = train_tr_m, y = rating_tr,
                method = 'glm', family = 'binomial',
                metric = 'ROC', trControl = ctrl)
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
logit_m
## Generalized Linear Model 
## 
## 3376 samples
##   15 predictor
##    2 classes: 'down', 'up' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ... 
## 
## Resampling results
## 
##   ROC       Sens       Spec       ROC SD      Sens SD    Spec SD   
##   0.874399  0.7778036  0.8192977  0.02242158  0.0372472  0.02692868
## 
## 
coefplot(logit_m)

# svm
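# the fixed sigma used in the tuning grid below was presumably estimated
# beforehand; as a rough sketch, kernlab's sigest() suggests a plausible range
# of sigma values from the predictors (this check is not in the original code)
sig_range = kernlab::sigest(train_tr_m)
mean(sig_range[-2])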
set.seed(2014)
svm_g = expand.grid(sigma = .0638, C = seq(1, 5, 1))
svm_m = train(x = train_tr_m, y = rating_tr,
              method = 'svmRadial', preProcess = c('center', 'scale'),
              metric = 'ROC', trControl = ctrl, tuneGrid = svm_g)
## Loading required package: kernlab
svm_m
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 3376 samples
##   15 predictor
##    2 classes: 'down', 'up' 
## 
## Pre-processing: centered, scaled 
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ... 
## 
## Resampling results across tuning parameters:
## 
##   C  ROC        Sens       Spec       ROC SD      Sens SD     Spec SD   
##   1  0.8781984  0.7843019  0.8092103  0.02147495  0.04344180  0.03277575
##   2  0.8790228  0.7866582  0.8133629  0.02146116  0.04198433  0.03440929
##   3  0.8793649  0.7872605  0.8186919  0.02132392  0.04138149  0.03939910
##   4  0.8789324  0.7884580  0.8145428  0.02173274  0.03920484  0.03747870
##   5  0.8778745  0.7920048  0.8145464  0.02179709  0.03940769  0.03207315
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0638
## ROC was used to select the optimal model using  the largest value.
## The final values used for the model were sigma = 0.0638 and C = 3.
# rf
set.seed(2014)
rf_m = train(x = train_tr_m, y = rating_tr,
              method = 'rf', ntree = 1000,
              metric = 'ROC', trControl = ctrl)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
rf_m
## Random Forest 
## 
## 3376 samples
##   15 predictor
##    2 classes: 'down', 'up' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ... 
## 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec       ROC SD     Sens SD     Spec SD   
##    2    0.8933915  0.8145428  0.8370773  0.0170604  0.03392681  0.01758824
##    8    0.8877196  0.8216646  0.8264194  0.0168596  0.02767132  0.02498314
##   15    0.8853535  0.8187060  0.8204917  0.0174110  0.03003836  0.02639266
## 
## ROC was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.
# gbm
set.seed(2014)
gbm_g = expand.grid(n.trees = seq(100, 300, 50), interaction.depth = 1:3, 
                    shrinkage = c(.01, .1))
gbm_m = train(x = train_tr_m, y = rating_tr,
              method = 'gbm', metric = 'ROC', 
              trControl = ctrl, tuneGrid = gbm_g, verbose = F)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: parallel
## Loaded gbm 2.1
## Loading required package: plyr
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, desc, failwith, id, mutate, summarise, summarize
gbm_m
## Stochastic Gradient Boosting 
## 
## 3376 samples
##   15 predictor
##    2 classes: 'down', 'up' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## 
## Summary of sample sizes: 3038, 3038, 3039, 3038, 3039, 3038, ... 
## 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  ROC        Sens       Spec     
##   0.01       1                  100      0.8456391  0.8163074  0.8145921
##   0.01       1                  150      0.8648524  0.8163074  0.8145921
##   0.01       1                  200      0.8685021  0.8180896  0.8145921
##   0.01       1                  250      0.8740923  0.8180896  0.8157756
##   0.01       1                  300      0.8773860  0.8180861  0.8163673
##   0.01       2                  100      0.8742952  0.8163074  0.8175578
##   0.01       2                  150      0.8770465  0.8151205  0.8181530
##   0.01       2                  200      0.8796859  0.8163039  0.8211116
##   0.01       2                  250      0.8837060  0.8133418  0.8217033
##   0.01       2                  300      0.8866441  0.8151169  0.8222950
##   0.01       3                  100      0.8792243  0.8157087  0.8187447
##   0.01       3                  150      0.8810543  0.8168991  0.8205199
##   0.01       3                  200      0.8839067  0.8163074  0.8222950
##   0.01       3                  250      0.8875358  0.8157157  0.8228867
##   0.01       3                  300      0.8898244  0.8151240  0.8222915
##   0.10       1                  100      0.8921089  0.8157087  0.8270182
##   0.10       1                  150      0.8939147  0.8192801  0.8264159
##   0.10       1                  200      0.8939814  0.8204600  0.8311531
##   0.10       1                  250      0.8955161  0.8210482  0.8335200
##   0.10       1                  300      0.8958001  0.8198648  0.8364786
##   0.10       2                  100      0.8953815  0.8222281  0.8317484
##   0.10       2                  150      0.8974218  0.8228198  0.8359010
##   0.10       2                  200      0.8979736  0.8234256  0.8376761
##   0.10       2                  250      0.8994132  0.8281699  0.8400465
##   0.10       2                  300      0.8995559  0.8263983  0.8441885
##   0.10       3                  100      0.8974352  0.8239962  0.8358833
##   0.10       3                  150      0.8982634  0.8245950  0.8352952
##   0.10       3                  200      0.8981669  0.8263701  0.8388419
##   0.10       3                  250      0.8989624  0.8311144  0.8382537
##   0.10       3                  300      0.8977685  0.8317132  0.8394372
##   ROC SD      Sens SD     Spec SD   
##   0.02305909  0.03881962  0.02633707
##   0.02166967  0.03881962  0.02633707
##   0.02162610  0.03735787  0.02603640
##   0.02030779  0.03735787  0.02625481
##   0.02022877  0.03727873  0.02619309
##   0.01975820  0.03871928  0.02729714
##   0.01950225  0.03931490  0.02759838
##   0.01883351  0.03894598  0.02896173
##   0.01853346  0.03966835  0.02823837
##   0.01889184  0.03874413  0.02762304
##   0.01882633  0.03921156  0.02952992
##   0.01942253  0.03795885  0.02912486
##   0.01842703  0.03739154  0.02872764
##   0.01794237  0.03836131  0.02906182
##   0.01809610  0.03630920  0.02664253
##   0.01936783  0.03891278  0.02377690
##   0.01959909  0.03692184  0.02457565
##   0.01982246  0.03766279  0.02296321
##   0.02028236  0.03661786  0.01974606
##   0.02077241  0.03686670  0.01763419
##   0.01921394  0.03772253  0.02096281
##   0.02016502  0.03838636  0.01786787
##   0.01996026  0.03982450  0.02288798
##   0.02034797  0.03479619  0.02498193
##   0.02009380  0.03196787  0.02113366
##   0.01883021  0.03910850  0.01932625
##   0.01940493  0.03867477  0.02239307
##   0.01991107  0.03671176  0.02523935
##   0.02021803  0.03814465  0.02830819
##   0.02150769  0.03880135  0.02592652
## 
## ROC was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 300,
##  interaction.depth = 2 and shrinkage = 0.1.
resamps = resamples(list(logistic = logit_m, svm = svm_m, random_forest = rf_m, gbm = gbm_m))
parallelplot(resamps)
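# a numeric summary of the resampling distributions complements the plot;
# dotplot() is another quick way to compare the models on ROC (a sketch)
summary(resamps)
dotplot(resamps, metric = 'ROC')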

save(logit_m, file = 'logit_m.RData')
save(svm_m, file = 'svm_m.RData')
save(rf_m, file = 'rf_m.RData')
save(gbm_m, file = 'gbm_m.RData')

# apply to the held-out test set
# bin v_9's higher levels
levels(train_te$v_9) = c('0', rep('1', 6))

# add 1 to numeric variables
train_te[, num_vars] = data.frame(lapply(train_te[, num_vars],
                                         function(x) x + 1))

# Box-Cox transformation
train_te[, num_vars] = predict(bc, train_te[, num_vars])

# convert to model matrix
rating_te = train_te$rating
train_te_m = model.matrix(rating ~ ., data = train_te)[, -1]
train_te_m = train_te_m[, final_vars]

# only apply random forest and gbm, the two best-performing models
rf_f = predict(rf_m, train_te_m, type = 'prob')
gbm_f = predict(gbm_m, train_te_m, type = 'prob')

mean_f = (rf_f$up + gbm_f$up) / 2
roc_te = roc(rating_te, mean_f)
plot(roc_te) # .9017

## 
## Call:
## roc.default(response = rating_te, predictor = mean_f)
## 
## Data: mean_f in 562 controls (rating_te down) < 562 cases (rating_te up).
## Area under the curve: 0.9017
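# beyond AUC, a confusion matrix at a .5 cutoff gives a sense of accuracy,
# sensitivity, and specificity (a sketch; the .5 cutoff is an assumption)
class_f = factor(ifelse(mean_f > .5, 'up', 'down'), levels = levels(rating_te))
confusionMatrix(class_f, rating_te, positive = 'up')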
# apply to the real test set (the remaining rows in the file)
test = read.table('input00.txt', sep = ' ', skip = 4502)
test = dplyr::select(test, -V1)
names(test) = paste0('v_', seq_len(ncol(test)))
test = data.frame(lapply(test, function(x) gsub('[0-9]+:', '', x)))

fac_vars = fac_vars[fac_vars != 'rating']
test = test[, c(fac_vars, num_vars)]

test[, fac_vars] = data.frame(lapply(test[, fac_vars], as.factor))
test[, num_vars] = data.frame(lapply(test[, num_vars], as.numeric))
str(test)
## 'data.frame':    500 obs. of  19 variables:
##  $ v_3 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ v_9 : Factor w/ 6 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 1 1 ...
##  $ v_10: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ v_13: Factor w/ 2 levels "0","1": 2 2 2 1 2 2 1 2 2 2 ...
##  $ v_14: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ v_15: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 2 1 1 1 ...
##  $ v_16: Factor w/ 2 levels "0","1": 1 1 1 2 2 2 1 2 1 2 ...
##  $ v_17: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ v_2 : num  1.85 0 0 3.68 1.8 ...
##  $ v_4 : num  1.1 5.51 4.85 5.02 3.97 ...
##  $ v_5 : num  1.95 4.52 4.09 5.31 4.42 ...
##  $ v_6 : num  34.3 0 82 70 0 ...
##  $ v_7 : num  0 0 0 0 0 ...
##  $ v_8 : num  0 0 0 0 0 ...
##  $ v_11: num  2.48 3.47 2.56 4.11 3.14 ...
##  $ v_12: num  1 1 0 2 0 1 0 0 3 1 ...
##  $ v_18: num  1 0 0 0 0 0 0 0 0 0 ...
##  $ v_20: num  46 2 0 4 4 20 7 41 5 10 ...
##  $ v_21: num  4.91 2.83 2.08 2.77 2.89 ...
# apply models
# bin v_9's higher levels (only 6 levels appear in this set, hence rep('1', 5))
levels(test$v_9) = c('0', rep('1', 5))

# add 1 to numeric variables
test[, num_vars] = data.frame(lapply(test[, num_vars],
                                     function(x) x + 1))

# Box-Cox transformation
test[, num_vars] = predict(bc, test[, num_vars])

# convert to model matrix
test_m = model.matrix(~ ., data = test)[, -1]
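# defensive check: every column kept during training must also be present in
# the test model matrix since factor levels can differ between files (a sketch)
stopifnot(all(final_vars %in% colnames(test_m)))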
test_m = test_m[, final_vars]

# only apply random forest and gbm
rf_f_test = predict(rf_m, test_m, type = 'prob')
gbm_f_test = predict(gbm_m, test_m, type = 'prob')

mean_f_test = (rf_f_test$up + gbm_f_test$up) / 2

# compare with test outcome
test_outcome = read.table('output00.txt', sep = ' ')
test_rating = as.factor(test_outcome$V2)
levels(test_rating) = list('down' = -1, 'up' = 1)

# roc
roc_test = roc(test_rating, mean_f_test)
plot(roc_test) # .8972

## 
## Call:
## roc.default(response = test_rating, predictor = mean_f_test)
## 
## Data: mean_f_test in 250 controls (test_rating down) < 250 cases (test_rating up).
## Area under the curve: 0.8972