trainData = read.csv('class_train_in.csv')
trainLabels = read.csv('class_train_out.csv')
train = merge(trainData, trainLabels, by='Point_ID')
train = train[names(train) != "Point_ID"] # drop ID column since we already joined outputs
train$Output = as.factor(train$Output) # convert to categorical labels
str(train$Output)
##  Factor w/ 2 levels "0","1": 2 2 2 2 1 2 1 2 2 2 ...
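A quick class-balance check before modeling (a minimal sketch; this output is not part of the original run):

table(train$Output)              # raw counts per class
prop.table(table(train$Output))  # class proportions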

Visualization

Run a PCA on the predictors to see how the variance is distributed across components

plot(princomp(train[names(train) != "Output"]))

TODO: Explore reducing to a 5-dimensional subspace
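A possible starting point for that TODO (a sketch using base R's prcomp; keeping 5 components is taken from the TODO above, not a tuned choice):

# Sketch: project the predictors onto the first 5 principal components.
pca <- prcomp(train[names(train) != "Output"], center = TRUE, scale. = TRUE)
train5 <- as.data.frame(pca$x[, 1:5])   # scores for PC1..PC5
train5$Output <- train$Output
summary(pca)$importance[, 1:5]          # variance explained by PC1..PC5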

Model Training

Train/test split, 3-fold CV

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(3456)
trainIndex <- createDataPartition(train$Output, p = .8,
                                  list = FALSE,
                                  times = 1)
training = train[trainIndex,]
testing = train[-trainIndex,]

fitControl <- trainControl(method = "cv",
                           number = 3)
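createDataPartition stratifies on the outcome, which is worth verifying (a sketch; output not shown in the original):

# Sketch: confirm class proportions are preserved in both splits.
prop.table(table(training$Output))
prop.table(table(testing$Output))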

Logistic Regression

library(e1071)
set.seed(825)
lrFit <- train(Output ~ ., data = training,
               method = "glm",
               trControl = fitControl,
               family = binomial(link = "logit"))
lrFit
## Generalized Linear Model 
## 
## 1570 samples
##  265 predictors
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 1047, 1046, 1047 
## Resampling results
## 
##   Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.5598621  0.1197112  0.01429869   0.02860541
## 
## 

Training confusion matrix

confusionMatrix.train(lrFit)
## Cross-Validated (3 fold) Confusion Matrix 
## 
## (entries are percentages of table totals)
##  
##           Reference
## Prediction    0    1
##          0 28.6 22.6
##          1 21.4 27.4
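The matrix above is aggregated over the cross-validation folds; the held-out 20% gives an independent check (a sketch; these predictions were not run in the original):

# Sketch: evaluate the logistic regression on the held-out test split.
lrPred <- predict(lrFit, newdata = testing)
confusionMatrix(lrPred, testing$Output)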

GBDT (Gradient-Boosted Decision Trees)

library(gbm)
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
set.seed(825)
gbmFit <- train(Output ~ ., data = training,
                method = "gbm",
                trControl = fitControl,
                verbose = FALSE)
## Loading required package: plyr
gbmFit
## Stochastic Gradient Boosting 
## 
## 1570 samples
##  265 predictors
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 1047, 1046, 1047 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa      Accuracy SD  Kappa SD  
##   1                   50      0.5821645  0.1643084  0.007806296  0.01546084
##   1                  100      0.6057488  0.2114816  0.027486561  0.05490827
##   1                  150      0.6038246  0.2075921  0.025172153  0.05028519
##   2                   50      0.6248291  0.2496076  0.023394768  0.04673111
##   2                  100      0.6305652  0.2610545  0.012774408  0.02556440
##   2                  150      0.6394747  0.2789225  0.024232901  0.04848187
##   3                   50      0.6203859  0.2407543  0.012552386  0.02497865
##   3                  100      0.6375919  0.2751496  0.018776823  0.03752200
##   3                  150      0.6452243  0.2904436  0.006016099  0.01200638
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
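caret can also plot the tuning profile, showing how resampled accuracy varies with n.trees and interaction.depth (a sketch; the plot is not reproduced here):

plot(gbmFit)  # accuracy vs. boosting iterations, one line per tree depth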

Training confusion matrix

confusionMatrix.train(gbmFit)
## Cross-Validated (3 fold) Confusion Matrix 
## 
## (entries are percentages of table totals)
##  
##           Reference
## Prediction    0    1
##          0 32.4 17.8
##          1 17.6 32.2
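As with the logistic model, a held-out evaluation plus a fold-by-fold comparison of the two models would look like this (a sketch; output not shown in the original):

# Sketch: test-set performance for the GBM, then compare the resampling
# distributions of the two models over the same 3 CV folds.
gbmPred <- predict(gbmFit, newdata = testing)
confusionMatrix(gbmPred, testing$Output)
summary(resamples(list(glm = lrFit, gbm = gbmFit)))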