trainData = read.csv('class_train_in.csv')   # input features, one row per point
trainLabels = read.csv('class_train_out.csv') # class labels keyed by Point_ID
train = merge(trainData, trainLabels, by = 'Point_ID') # join features to labels
train = train[names(train) != "Point_ID"] # drop ID column since we already joined outputs
train$Output = as.factor(train$Output) # convert to categorical labels
str(train$Output)
## Factor w/ 2 levels "0","1": 2 2 2 2 1 2 1 2 2 2 ...
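Quick sanity check on class balance (not part of the original run, but consistent with the roughly 50/50 reference totals in the confusion matrices below):
table(train$Output)             # raw counts per class
prop.table(table(train$Output)) # class proportions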
Do a PCA on the predictors to get a feel for their variance structure
plot(princomp(train[names(train) != "Output"]))
TODO: explore reducing to a 5-dimensional subspace
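A minimal sketch of what that reduction could look like (keeping exactly 5 components is an assumption here, not something tuned):
# Sketch: project the predictors onto the first 5 principal components
pca <- princomp(train[names(train) != "Output"])
trainReduced <- data.frame(pca$scores[, 1:5], Output = train$Output) # assumed 5-D projection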
Train/test split, 3-fold CV
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(3456)
trainIndex <- createDataPartition(train$Output, p = .8,
                                  list = FALSE,
                                  times = 1)
training = train[trainIndex,]
testing = train[-trainIndex,]
fitControl <- trainControl(method = "cv",
                           number = 3)
library(e1071)
set.seed(825)
lrFit <- train(Output ~ ., data = training,
               method = "glm",
               trControl = fitControl,
               family = binomial(link = "logit"))
lrFit
## Generalized Linear Model
##
## 1570 samples
## 265 predictors
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 1047, 1046, 1047
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.5598621 0.1197112 0.01429869 0.02860541
##
##
Training confusion matrix
confusionMatrix.train(lrFit)
## Cross-Validated (3 fold) Confusion Matrix
##
## (entries are percentages of table totals)
##
## Reference
## Prediction 0 1
## 0 28.6 22.6
## 1 21.4 27.4
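The held-out 20% hasn't been touched yet; a sketch of how the logistic model would be scored on it (not run in the original):
lrPred <- predict(lrFit, newdata = testing) # predict held-out labels
confusionMatrix(lrPred, testing$Output)     # test-set confusion matrix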
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
set.seed(825)
gbmFit <- train(Output ~ ., data = training,
                method = "gbm",
                trControl = fitControl,
                verbose = FALSE)
## Loading required package: plyr
gbmFit
## Stochastic Gradient Boosting
##
## 1570 samples
## 265 predictors
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 1047, 1046, 1047
## Resampling results across tuning parameters:
##
##   interaction.depth  n.trees  Accuracy   Kappa      Accuracy SD  Kappa SD
##   1                   50      0.5821645  0.1643084  0.007806296  0.01546084
##   1                  100      0.6057488  0.2114816  0.027486561  0.05490827
##   1                  150      0.6038246  0.2075921  0.025172153  0.05028519
##   2                   50      0.6248291  0.2496076  0.023394768  0.04673111
##   2                  100      0.6305652  0.2610545  0.012774408  0.02556440
##   2                  150      0.6394747  0.2789225  0.024232901  0.04848187
##   3                   50      0.6203859  0.2407543  0.012552386  0.02497865
##   3                  100      0.6375919  0.2751496  0.018776823  0.03752200
##   3                  150      0.6452243  0.2904436  0.006016099  0.01200638
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
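caret can also show the tuning profile and predictor influence; a sketch, not part of the original output:
plot(gbmFit)   # accuracy across n.trees and interaction.depth
varImp(gbmFit) # relative influence of each predictor in the boosted model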
Training confusion matrix
confusionMatrix.train(gbmFit)
## Cross-Validated (3 fold) Confusion Matrix
##
## (entries are percentages of table totals)
##
## Reference
## Prediction 0 1
## 0 32.4 17.8
## 1 17.6 32.2
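To compare the two models directly and finally use the held-out set, a sketch (this assumes both fits share the same CV folds, which the matching set.seed(825) calls should give):
resamps <- resamples(list(logistic = lrFit, gbm = gbmFit)) # paired comparison of CV results
summary(resamps)
confusionMatrix(predict(gbmFit, newdata = testing), testing$Output) # test-set check of the gbm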